commit
a38aa56e76
|
@ -94,3 +94,4 @@ THUNDERX2T99
|
|||
9.System Z:
|
||||
ZARCH_GENERIC
|
||||
Z13
|
||||
Z14
|
||||
|
|
|
@ -114,7 +114,14 @@ void get_cpuconfig(void)
|
|||
break;
|
||||
case CPU_Z14:
|
||||
printf("#define Z14\n");
|
||||
printf("#define L1_DATA_SIZE 131072\n");
|
||||
printf("#define L1_DATA_LINESIZE 256\n");
|
||||
printf("#define L1_DATA_ASSOCIATIVE 8\n");
|
||||
printf("#define L2_SIZE 4194304\n");
|
||||
printf("#define L2_LINESIZE 256\n");
|
||||
printf("#define L2_ASSOCIATIVE 8\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
|
10
getarch.c
10
getarch.c
|
@ -1085,6 +1085,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define CORENAME "Z13"
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_Z14
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "ZARCH"
|
||||
#define SUBARCHITECTURE "Z14"
|
||||
#define ARCHCONFIG "-DZ14 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64"
|
||||
#define LIBNAME "z14"
|
||||
#define CORENAME "Z14"
|
||||
#endif
|
||||
|
||||
#ifndef FORCE
|
||||
|
||||
#ifdef USER_TARGET
|
||||
|
|
|
@ -48,6 +48,10 @@ ifeq ($(ARCH), zarch)
|
|||
USE_TRMM = 1
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), Z14)
|
||||
USE_TRMM = 1
|
||||
endif
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -147,14 +147,14 @@ CSWAPKERNEL = cswap.c
|
|||
ZSWAPKERNEL = zswap.c
|
||||
#
|
||||
|
||||
#SGEMVNKERNEL = ../arm/gemv_n.c
|
||||
SGEMVNKERNEL = sgemv_n.c
|
||||
DGEMVNKERNEL = dgemv_n.c
|
||||
#CGEMVNKERNEL = ../arm/zgemv_n.c
|
||||
CGEMVNKERNEL = cgemv_n.c
|
||||
ZGEMVNKERNEL = zgemv_n_4.c
|
||||
#
|
||||
#SGEMVTKERNEL = ../arm/gemv_t.c
|
||||
SGEMVTKERNEL = sgemv_t.c
|
||||
DGEMVTKERNEL = dgemv_t.c
|
||||
#CGEMVTKERNEL = ../arm/zgemv_t.c
|
||||
CGEMVTKERNEL = cgemv_t.c
|
||||
ZGEMVTKERNEL = zgemv_t_4.c
|
||||
|
||||
|
||||
|
|
|
@ -0,0 +1,585 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2019, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
#include <altivec.h>
|
||||
#define NBMAX 1024
|
||||
|
||||
|
||||
static const unsigned char swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11};
|
||||
|
||||
|
||||
/*
 * y += A[:,0..3] * x[0..3] for a panel of four complex columns of A
 * (column-major, interleaved re/im). NOTE: no alpha scaling here -- the
 * caller accumulates into a zeroed buffer and applies alpha in add_y().
 *
 * n   : number of complex rows in the panel (each loop iteration consumes
 *       2 vectors = 4 complex elements; assumes n is a multiple of 4 --
 *       TODO confirm against the NB blocking in CNAME)
 * lda : column stride in FLOAT units (already doubled for complex)
 * ap  : first column of the panel
 * x   : four complex scalars (8 floats)
 * y   : accumulator of 2*n floats
 */
static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) {

    FLOAT *a0, *a1, *a2, *a3;
    a0 = ap;
    a1 = ap + lda;
    a2 = a1 + lda;
    a3 = a2 + lda;
    /* permute mask that swaps the real/imag halves of each complex pair */
    __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr);
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
    /* plain (or doubly-conjugated) product: vx*_r broadcasts Re(x[k]);
       vx*_i carries (-Im,+Im) so that, combined with the swap_mask permute
       of A, the lanes compute (ar*br - ai*bi, ar*bi + ai*br) */
    register __vector float vx0_r = {x[0], x[0],x[0], x[0]};
    register __vector float vx0_i = {-x[1], x[1],-x[1], x[1]};
    register __vector float vx1_r = {x[2], x[2],x[2], x[2]};
    register __vector float vx1_i = {-x[3], x[3],-x[3], x[3]};
    register __vector float vx2_r = {x[4], x[4],x[4], x[4]};
    register __vector float vx2_i = {-x[5], x[5],-x[5], x[5]};
    register __vector float vx3_r = {x[6], x[6],x[6], x[6]};
    register __vector float vx3_i = {-x[7], x[7],-x[7], x[7]};
#else
    /* conjugated variant: sign pattern moves to the real broadcast instead */
    register __vector float vx0_r = {x[0], -x[0],x[0], -x[0]};
    register __vector float vx0_i = {x[1], x[1],x[1], x[1]};
    register __vector float vx1_r = {x[2], -x[2],x[2], -x[2]};
    register __vector float vx1_i = {x[3], x[3],x[3], x[3]};
    register __vector float vx2_r = {x[4], -x[4],x[4], -x[4]};
    register __vector float vx2_i = {x[5], x[5],x[5], x[5]};
    register __vector float vx3_r = {x[6], -x[6],x[6], -x[6]};
    register __vector float vx3_i = {x[7], x[7],x[7], x[7]};
#endif
    register __vector float *vy = (__vector float *) y;
    register __vector float *vptr_a0 = (__vector float *) a0;
    register __vector float *vptr_a1 = (__vector float *) a1;
    register __vector float *vptr_a2 = (__vector float *) a2;
    register __vector float *vptr_a3 = (__vector float *) a3;
    BLASLONG i = 0;
    /* 2 vectors (4 complex) per iteration, unrolled over i and i+1 */
    for (;i< n / 2; i+=2) {
        register __vector float vy_0 = vy[i];
        register __vector float vy_1 = vy[i + 1];
        register __vector float va0 = vptr_a0[i];
        register __vector float va1 = vptr_a1[i];
        register __vector float va2 = vptr_a2[i];
        register __vector float va3 = vptr_a3[i];
        register __vector float va0_1 = vptr_a0[i + 1];
        register __vector float va1_1 = vptr_a1[i + 1];
        register __vector float va2_1 = vptr_a2[i + 1];
        register __vector float va3_1 = vptr_a3[i + 1];

        /* real-broadcast contribution */
        vy_0 += va0*vx0_r + va1*vx1_r + va2*vx2_r + va3*vx3_r;
        vy_1 += va0_1*vx0_r + va1_1*vx1_r + va2_1*vx2_r + va3_1*vx3_r;
        /* swap re/im of A, then add the imaginary-broadcast contribution */
        va0 = vec_perm(va0, va0,swap_mask);
        va0_1 = vec_perm(va0_1, va0_1,swap_mask);
        va1 = vec_perm(va1, va1,swap_mask);
        va1_1 = vec_perm(va1_1, va1_1,swap_mask);
        va2 = vec_perm(va2, va2,swap_mask);
        va2_1 = vec_perm(va2_1, va2_1,swap_mask);
        va3 = vec_perm(va3, va3,swap_mask);
        va3_1 = vec_perm(va3_1, va3_1,swap_mask);
        vy_0 += va0*vx0_i + va1*vx1_i + va2*vx2_i + va3*vx3_i;
        vy_1 += va0_1*vx0_i + va1_1*vx1_i + va2_1*vx2_i + va3_1*vx3_i;

        vy[i] = vy_0;
        vy[i + 1] = vy_1;
    }

}
|
||||
|
||||
|
||||
|
||||
/*
 * y += A[:,0] * x[0] + A[:,1] * x[1] for two complex columns of A
 * (alpha is NOT applied here; the caller scales in add_y()).
 */
static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) {

    FLOAT *col0 = ap;
    FLOAT *col1 = ap + lda;
    /* mask swapping the real/imag halves of each complex pair */
    __vector unsigned char swap_mask = *((__vector unsigned char *) swap_mask_arr);
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
    /* broadcast Re(x[k]); carry (-Im,+Im) so the permuted-A term gets the
       correct complex-multiply signs */
    register __vector float vx0_r = {x[0], x[0],x[0], x[0]};
    register __vector float vx0_i = {-x[1], x[1],-x[1], x[1]};
    register __vector float vx1_r = {x[2], x[2],x[2], x[2]};
    register __vector float vx1_i = {-x[3], x[3],-x[3], x[3]};
#else
    /* conjugated variant: alternating sign lives on the real broadcast */
    register __vector float vx0_r = {x[0], -x[0],x[0], -x[0]};
    register __vector float vx0_i = {x[1], x[1],x[1], x[1]};
    register __vector float vx1_r = {x[2], -x[2],x[2], -x[2]};
    register __vector float vx1_i = {x[3], x[3],x[3], x[3]};
#endif
    register __vector float *vy = (__vector float *) y;
    register __vector float *vcol0 = (__vector float *) col0;
    register __vector float *vcol1 = (__vector float *) col1;
    BLASLONG j = 0;
    /* two vectors (= four complex elements) per pass */
    while (j < n / 2) {
        register __vector float acc_lo = vy[j];
        register __vector float acc_hi = vy[j + 1];
        register __vector float a0_lo = vcol0[j];
        register __vector float a1_lo = vcol1[j];
        register __vector float a0_hi = vcol0[j + 1];
        register __vector float a1_hi = vcol1[j + 1];
        register __vector float a0s_lo = vec_perm(a0_lo, a0_lo, swap_mask);
        register __vector float a0s_hi = vec_perm(a0_hi, a0_hi, swap_mask);
        register __vector float a1s_lo = vec_perm(a1_lo, a1_lo, swap_mask);
        register __vector float a1s_hi = vec_perm(a1_hi, a1_hi, swap_mask);
        acc_lo += a0_lo*vx0_r + a1_lo*vx1_r + a0s_lo*vx0_i + a1s_lo*vx1_i;
        acc_hi += a0_hi*vx0_r + a1_hi*vx1_r + a0s_hi*vx0_i + a1s_hi*vx1_i;

        vy[j] = acc_lo;
        vy[j + 1] = acc_hi;
        j += 2;
    }

}
|
||||
|
||||
|
||||
|
||||
/*
 * y += A[:,0] * x[0] for a single complex column of A
 * (alpha is NOT applied here; the caller scales in add_y()).
 */
static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) {

    /* mask swapping the real/imag halves of each complex pair */
    __vector unsigned char swap_mask = *((__vector unsigned char *) swap_mask_arr);
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
    register __vector float vx0_r = {x[0], x[0],x[0], x[0]};
    register __vector float vx0_i = {-x[1], x[1],-x[1], x[1]};
#else
    register __vector float vx0_r = {x[0], -x[0],x[0], -x[0]};
    register __vector float vx0_i = {x[1], x[1],x[1], x[1]};
#endif
    register __vector float *vy = (__vector float *) y;
    register __vector float *vcol = (__vector float *) ap;
    BLASLONG j = 0;
    /* two vectors (= four complex elements) per pass */
    while (j < n / 2) {
        register __vector float acc_lo = vy[j];
        register __vector float acc_hi = vy[j + 1];
        register __vector float a_lo = vcol[j];
        register __vector float a_hi = vcol[j + 1];
        register __vector float as_lo = vec_perm(a_lo, a_lo, swap_mask);
        register __vector float as_hi = vec_perm(a_hi, a_hi, swap_mask);
        acc_lo += a_lo*vx0_r + as_lo*vx0_i;
        acc_hi += a_hi*vx0_r + as_hi*vx0_i;

        vy[j] = acc_lo;
        vy[j + 1] = acc_hi;
        j += 2;
    }
}
|
||||
|
||||
|
||||
|
||||
|
||||
/*
 * dest += alpha * src for n complex elements.
 * src is the contiguous accumulation buffer filled by the kernels above;
 * dest is the user's y with stride inc_dest (in FLOAT units, already
 * doubled for complex -- so inc_dest == 2 means contiguous).
 * Scalar path for strided y; vectorized path for the contiguous case.
 */
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) {
    BLASLONG i;


    if (inc_dest != 2) {
        /* strided y: plain scalar complex multiply-accumulate */
        FLOAT temp_r;
        FLOAT temp_i;
        for ( i=0; i<n; i++ )
        {
#if !defined(XCONJ)
            temp_r = alpha_r * src[0] - alpha_i * src[1];
            temp_i = alpha_r * src[1] + alpha_i * src[0];
#else
            /* XCONJ: conj(alpha) * src */
            temp_r = alpha_r * src[0] + alpha_i * src[1];
            temp_i = -alpha_r * src[1] + alpha_i * src[0];
#endif

            *dest += temp_r;
            *(dest+1) += temp_i;

            src+=2;
            dest += inc_dest;
        }
        return;
    } else {
        /* contiguous y: same complex multiply done 2 complex per vector */
        __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr);
#if !defined(XCONJ)

        register __vector float valpha_r = {alpha_r, alpha_r, alpha_r, alpha_r};
        register __vector float valpha_i = {-alpha_i, alpha_i, -alpha_i, alpha_i};

#else
        register __vector float valpha_r = {alpha_r, -alpha_r, alpha_r, -alpha_r};
        register __vector float valpha_i = {alpha_i, alpha_i, alpha_i, alpha_i};
#endif

        register __vector float *vptr_src = (__vector float *) src;
        register __vector float *vptr_y = (__vector float *) dest;
        /* assumes n is a multiple of 4 (NB blocking) -- TODO confirm */
        for (i = 0; i < n/2; i += 2 ){

            register __vector float vy_0 = vptr_y[i];
            register __vector float vy_1 = vptr_y[i +1];

            register __vector float vsrc = vptr_src[i];
            register __vector float vsrc_1 = vptr_src[i + 1];
            register __vector float vsrcx = vec_perm(vsrc, vsrc, swap_mask);
            register __vector float vsrcx_1 = vec_perm(vsrc_1, vsrc_1, swap_mask);

            vy_0 += vsrc*valpha_r + vsrcx*valpha_i;
            vy_1 += vsrc_1*valpha_r + vsrcx_1*valpha_i;
            vptr_y[i] = vy_0;
            vptr_y[i+1 ] = vy_1;

        }

    }
    return;
}
|
||||
|
||||
|
||||
|
||||
/*
 * CGEMV (no-transpose): y := alpha * A * x + y for single-precision complex.
 *
 * m, n       : matrix dimensions (A is m x n, column-major)
 * dummy1     : unused (kept for the uniform kernel signature)
 * alpha_r/i  : complex scalar alpha
 * a, lda     : matrix and leading dimension (in complex elements on entry)
 * x, inc_x   : input vector and stride (in complex elements on entry)
 * y, inc_y   : output vector and stride (in complex elements on entry)
 * buffer     : caller-provided workspace for the partial result
 *
 * Strategy: process rows in blocks of NBMAX, accumulating the unscaled
 * product into a zeroed buffer via the 4x4/4x2/4x1 kernels, then fold
 * alpha and the y stride in via add_y(). Rows left over (m % 4) are
 * handled by dedicated scalar tails for m3 == 1, 2, 3.
 *
 * NOTE(review): relies on memset(); presumably pulled in via "common.h" --
 * confirm, since <string.h> is not included directly.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT * buffer) {
    BLASLONG i;
    FLOAT *a_ptr;
    FLOAT *x_ptr;
    FLOAT *y_ptr;

    BLASLONG n1;
    BLASLONG m1;
    BLASLONG m2;
    BLASLONG m3;
    BLASLONG n2;

    FLOAT xbuffer[8], *ybuffer;

    if (m < 1) return (0);
    if (n < 1) return (0);

    ybuffer = buffer;

    /* switch from complex-element strides to FLOAT strides */
    inc_x *= 2;
    inc_y *= 2;
    lda *= 2;

    n1 = n / 4;        /* full groups of 4 columns */
    n2 = n % 4;        /* leftover columns */

    m3 = m % 4;        /* scalar row tail */
    m1 = m - (m % 4);  /* rows handled by the blocked loop */
    m2 = (m % NBMAX) - (m % 4);  /* size of the final partial block */

    y_ptr = y;

    BLASLONG NB = NBMAX;

    /* loop over row blocks; the last (partial) block shrinks NB to m2,
       which also terminates the loop */
    while (NB == NBMAX) {

        m1 -= NB;
        if (m1 < 0) {
            if (m2 == 0) break;
            NB = m2;
        }

        a_ptr = a;

        x_ptr = x;

        /* kernels accumulate, so start from a zero partial result */
        memset(ybuffer, 0, NB * 2*sizeof(FLOAT));

        if (inc_x == 2) {
            /* contiguous x: feed it straight to the vector kernels */
            for (i = 0; i < n1; i++) {
                cgemv_kernel_4x4(NB, lda, a_ptr, x_ptr, ybuffer);

                a_ptr += lda << 2;
                x_ptr += 8;
            }

            if (n2 & 2) {
                cgemv_kernel_4x2(NB, lda, a_ptr, x_ptr, ybuffer);
                x_ptr += 4;
                a_ptr += 2 * lda;

            }

            if (n2 & 1) {
                cgemv_kernel_4x1(NB, a_ptr, x_ptr, ybuffer);
                x_ptr += 2;
                a_ptr += lda;

            }
        } else {
            /* strided x: gather 4 complex elements into xbuffer first */
            for (i = 0; i < n1; i++) {

                xbuffer[0] = x_ptr[0];
                xbuffer[1] = x_ptr[1];
                x_ptr += inc_x;
                xbuffer[2] = x_ptr[0];
                xbuffer[3] = x_ptr[1];
                x_ptr += inc_x;
                xbuffer[4] = x_ptr[0];
                xbuffer[5] = x_ptr[1];
                x_ptr += inc_x;
                xbuffer[6] = x_ptr[0];
                xbuffer[7] = x_ptr[1];
                x_ptr += inc_x;

                cgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer);

                a_ptr += lda << 2;
            }

            for (i = 0; i < n2; i++) {
                xbuffer[0] = x_ptr[0];
                xbuffer[1] = x_ptr[1];
                x_ptr += inc_x;
                cgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer);
                a_ptr += lda;

            }

        }

        /* fold alpha (and y's stride) into the user's y */
        add_y(NB, ybuffer, y_ptr, inc_y, alpha_r, alpha_i);
        a += 2 * NB;
        y_ptr += NB * inc_y;
    }

    if (m3 == 0) return (0);

    /* --- scalar tails: last m % 4 rows, computed as dot products --- */

    if (m3 == 1) {
        a_ptr = a;
        x_ptr = x;
        FLOAT temp_r = 0.0;
        FLOAT temp_i = 0.0;

        if (lda == 2 && inc_x == 2) {
            /* fully contiguous: unroll the dot product by 2 */
            for (i = 0; i < (n & -2); i += 2) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
                temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
                temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
                temp_r += a_ptr[2] * x_ptr[2] - a_ptr[3] * x_ptr[3];
                temp_i += a_ptr[2] * x_ptr[3] + a_ptr[3] * x_ptr[2];
#else
                temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
                temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
                temp_r += a_ptr[2] * x_ptr[2] + a_ptr[3] * x_ptr[3];
                temp_i += a_ptr[2] * x_ptr[3] - a_ptr[3] * x_ptr[2];
#endif

                a_ptr += 4;
                x_ptr += 4;
            }

            for (; i < n; i++) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
                temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
                temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
#else
                temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
                temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
#endif

                a_ptr += 2;
                x_ptr += 2;
            }

        } else {
            /* general strides */
            for (i = 0; i < n; i++) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
                temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
                temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
#else
                temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
                temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
#endif

                a_ptr += lda;
                x_ptr += inc_x;
            }

        }
#if !defined(XCONJ)
        y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i;
        y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r;
#else
        y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i;
        y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r;
#endif
        return (0);
    }

    if (m3 == 2) {
        a_ptr = a;
        x_ptr = x;
        FLOAT temp_r0 = 0.0;
        FLOAT temp_i0 = 0.0;
        FLOAT temp_r1 = 0.0;
        FLOAT temp_i1 = 0.0;

        if (lda == 4 && inc_x == 2) {
            /* two contiguous rows: unroll by 2 columns */
            for (i = 0; i < (n & -2); i += 2) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )

                temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
                temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
                temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
                temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];

                temp_r0 += a_ptr[4] * x_ptr[2] - a_ptr[5] * x_ptr[3];
                temp_i0 += a_ptr[4] * x_ptr[3] + a_ptr[5] * x_ptr[2];
                temp_r1 += a_ptr[6] * x_ptr[2] - a_ptr[7] * x_ptr[3];
                temp_i1 += a_ptr[6] * x_ptr[3] + a_ptr[7] * x_ptr[2];
#else
                temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
                temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
                temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
                temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];

                temp_r0 += a_ptr[4] * x_ptr[2] + a_ptr[5] * x_ptr[3];
                temp_i0 += a_ptr[4] * x_ptr[3] - a_ptr[5] * x_ptr[2];
                temp_r1 += a_ptr[6] * x_ptr[2] + a_ptr[7] * x_ptr[3];
                temp_i1 += a_ptr[6] * x_ptr[3] - a_ptr[7] * x_ptr[2];
#endif

                a_ptr += 8;
                x_ptr += 4;
            }

            for (; i < n; i++) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
                temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
                temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
                temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
                temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
#else
                temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
                temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
                temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
                temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
#endif

                a_ptr += 4;
                x_ptr += 2;
            }

        } else {
            /* general strides */
            for (i = 0; i < n; i++) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
                temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
                temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
                temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
                temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
#else
                temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
                temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
                temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
                temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
#endif

                a_ptr += lda;
                x_ptr += inc_x;
            }

        }
#if !defined(XCONJ)
        y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
        y_ptr[1] += alpha_r * temp_i0 + alpha_i * temp_r0;
        y_ptr += inc_y;
        y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1;
        y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1;
#else
        y_ptr[0] += alpha_r * temp_r0 + alpha_i * temp_i0;
        y_ptr[1] -= alpha_r * temp_i0 - alpha_i * temp_r0;
        y_ptr += inc_y;
        y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1;
        y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1;
#endif
        return (0);
    }

    if (m3 == 3) {
        a_ptr = a;
        x_ptr = x;
        FLOAT temp_r0 = 0.0;
        FLOAT temp_i0 = 0.0;
        FLOAT temp_r1 = 0.0;
        FLOAT temp_i1 = 0.0;
        FLOAT temp_r2 = 0.0;
        FLOAT temp_i2 = 0.0;

        if (lda == 6 && inc_x == 2) {
            /* three contiguous rows */
            for (i = 0; i < n; i++) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
                temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
                temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
                temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
                temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
                temp_r2 += a_ptr[4] * x_ptr[0] - a_ptr[5] * x_ptr[1];
                temp_i2 += a_ptr[4] * x_ptr[1] + a_ptr[5] * x_ptr[0];
#else
                temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
                temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
                temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
                temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
                temp_r2 += a_ptr[4] * x_ptr[0] + a_ptr[5] * x_ptr[1];
                temp_i2 += a_ptr[4] * x_ptr[1] - a_ptr[5] * x_ptr[0];
#endif

                a_ptr += 6;
                x_ptr += 2;
            }

        } else {
            /* general strides */
            for (i = 0; i < n; i++) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
                temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
                temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
                temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
                temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
                temp_r2 += a_ptr[4] * x_ptr[0] - a_ptr[5] * x_ptr[1];
                temp_i2 += a_ptr[4] * x_ptr[1] + a_ptr[5] * x_ptr[0];
#else
                temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
                temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
                temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
                temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
                temp_r2 += a_ptr[4] * x_ptr[0] + a_ptr[5] * x_ptr[1];
                temp_i2 += a_ptr[4] * x_ptr[1] - a_ptr[5] * x_ptr[0];
#endif

                a_ptr += lda;
                x_ptr += inc_x;
            }

        }
#if !defined(XCONJ)
        y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
        y_ptr[1] += alpha_r * temp_i0 + alpha_i * temp_r0;
        y_ptr += inc_y;
        y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1;
        y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1;
        y_ptr += inc_y;
        y_ptr[0] += alpha_r * temp_r2 - alpha_i * temp_i2;
        y_ptr[1] += alpha_r * temp_i2 + alpha_i * temp_r2;
#else
        y_ptr[0] += alpha_r * temp_r0 + alpha_i * temp_i0;
        y_ptr[1] -= alpha_r * temp_i0 - alpha_i * temp_r0;
        y_ptr += inc_y;
        y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1;
        y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1;
        y_ptr += inc_y;
        y_ptr[0] += alpha_r * temp_r2 + alpha_i * temp_i2;
        y_ptr[1] -= alpha_r * temp_i2 - alpha_i * temp_r2;
#endif
        return (0);
    }

    return (0);
}
|
||||
|
|
@ -0,0 +1,571 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2019, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#define NBMAX 1024
|
||||
#include <altivec.h>
|
||||
static const unsigned char swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11};
|
||||
|
||||
/*
 * Transposed CGEMV kernel: y[0..3] += alpha * (A[:,0..3]^T * x) for four
 * complex columns of A. Unlike the no-transpose kernel, alpha is applied
 * here directly (there is no separate add_y stage for the T case).
 *
 * n assumed a multiple of 4 (each iteration consumes 2 vectors = 4 complex
 * elements -- TODO confirm against the caller's blocking).
 */
static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) {
    BLASLONG i;
    FLOAT *a0, *a1, *a2, *a3;
    a0 = ap;
    a1 = ap + lda;
    a2 = a1 + lda;
    a3 = a2 + lda;
    /* mask swapping real/imag halves of each complex pair */
    __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr);
    /* vtemp*_p accumulates the "plain" lane products (re*re, im*im, ...);
       vtemp*_r accumulates the cross products against the swapped x
       (re*im, im*re, ...). Signs are resolved in the reductions below. */
    register __vector float vtemp0_p = {0.0, 0.0,0.0,0.0};
    register __vector float vtemp0_r = {0.0, 0.0,0.0,0.0};
    register __vector float vtemp1_p = {0.0, 0.0,0.0,0.0};
    register __vector float vtemp1_r = {0.0, 0.0,0.0,0.0};
    register __vector float vtemp2_p = {0.0, 0.0,0.0,0.0};
    register __vector float vtemp2_r = {0.0, 0.0,0.0,0.0};
    register __vector float vtemp3_p = {0.0, 0.0,0.0,0.0};
    register __vector float vtemp3_r = {0.0, 0.0,0.0,0.0};
    __vector float* va0 = (__vector float*) a0;
    __vector float* va1 = (__vector float*) a1;
    __vector float* va2 = (__vector float*) a2;
    __vector float* va3 = (__vector float*) a3;
    __vector float* v_x = (__vector float*) x;

    for (i = 0; i < n / 2; i+=2) {
        register __vector float vx_0 = v_x[i];
        register __vector float vx_1 = v_x[i+1];
        register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask);
        register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask);

        vtemp0_p += vx_0*va0[i] + vx_1*va0[i+1] ;
        vtemp0_r += vxr_0*va0[i] + vxr_1*va0[i+1];
        vtemp1_p += vx_0*va1[i] + vx_1*va1[i+1];
        vtemp1_r += vxr_0*va1[i] + vxr_1*va1[i+1];
        vtemp2_p += vx_0*va2[i] + vx_1*va2[i+1];
        vtemp2_r += vxr_0*va2[i] + vxr_1*va2[i+1];
        vtemp3_p += vx_0*va3[i] + vx_1*va3[i+1];
        vtemp3_r += vxr_0*va3[i] + vxr_1*va3[i+1];

    }

#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
    /* non-conjugated reduction: re = rr - ii, im = ri + ir per pair */

    register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1] + vtemp0_p[2] - vtemp0_p[3];
    register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1] + vtemp0_r[2] + vtemp0_r[3];

    register FLOAT temp_r1 = vtemp1_p[0] - vtemp1_p[1] + vtemp1_p[2] - vtemp1_p[3];
    register FLOAT temp_i1 = vtemp1_r[0] + vtemp1_r[1] + vtemp1_r[2] + vtemp1_r[3];

    register FLOAT temp_r2 = vtemp2_p[0] - vtemp2_p[1] + vtemp2_p[2] - vtemp2_p[3];
    register FLOAT temp_i2 = vtemp2_r[0] + vtemp2_r[1] + vtemp2_r[2] + vtemp2_r[3];

    register FLOAT temp_r3 = vtemp3_p[0] - vtemp3_p[1] + vtemp3_p[2] - vtemp3_p[3];
    register FLOAT temp_i3 = vtemp3_r[0] + vtemp3_r[1] + vtemp3_r[2] + vtemp3_r[3];

#else
    /* conjugated reduction: signs move from the p-sum to the r-sum */
    register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1] + vtemp0_p[2] + vtemp0_p[3];
    register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1] + vtemp0_r[2] - vtemp0_r[3];

    register FLOAT temp_r1 = vtemp1_p[0] + vtemp1_p[1] + vtemp1_p[2] + vtemp1_p[3];
    register FLOAT temp_i1 = vtemp1_r[0] - vtemp1_r[1] + vtemp1_r[2] - vtemp1_r[3];

    register FLOAT temp_r2 = vtemp2_p[0] + vtemp2_p[1] + vtemp2_p[2] + vtemp2_p[3];
    register FLOAT temp_i2 = vtemp2_r[0] - vtemp2_r[1] + vtemp2_r[2] - vtemp2_r[3];

    register FLOAT temp_r3 = vtemp3_p[0] + vtemp3_p[1] + vtemp3_p[2] + vtemp3_p[3];
    register FLOAT temp_i3 = vtemp3_r[0] - vtemp3_r[1] + vtemp3_r[2] - vtemp3_r[3];

#endif

#if !defined(XCONJ)
    /* y += alpha * temp (complex multiply) */

    y[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
    y[1] += alpha_r * temp_i0 + alpha_i * temp_r0;
    y[2] += alpha_r * temp_r1 - alpha_i * temp_i1;
    y[3] += alpha_r * temp_i1 + alpha_i * temp_r1;
    y[4] += alpha_r * temp_r2 - alpha_i * temp_i2;
    y[5] += alpha_r * temp_i2 + alpha_i * temp_r2;
    y[6] += alpha_r * temp_r3 - alpha_i * temp_i3;
    y[7] += alpha_r * temp_i3 + alpha_i * temp_r3;

#else
    /* y += conj(alpha) * temp */

    y[0] += alpha_r * temp_r0 + alpha_i * temp_i0;
    y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0;
    y[2] += alpha_r * temp_r1 + alpha_i * temp_i1;
    y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1;
    y[4] += alpha_r * temp_r2 + alpha_i * temp_i2;
    y[5] -= alpha_r * temp_i2 - alpha_i * temp_r2;
    y[6] += alpha_r * temp_r3 + alpha_i * temp_i3;
    y[7] -= alpha_r * temp_i3 - alpha_i * temp_r3;

#endif

}
|
||||
|
||||
|
||||
/*
 * Transposed CGEMV kernel: y[0..1] += alpha * (A[:,0..1]^T * x) for two
 * complex columns of A. Alpha is applied directly here.
 * n assumed a multiple of 4 -- TODO confirm against the caller's blocking.
 */
static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) {
    BLASLONG i;
    FLOAT *a0, *a1;
    a0 = ap;
    a1 = ap + lda;
    /* mask swapping real/imag halves of each complex pair */
    __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr);
    /* vtemp*_p: plain lane products (re*re, im*im, ...);
       vtemp*_r: cross products against swapped x (re*im, im*re, ...) */
    register __vector float vtemp0_p = {0.0, 0.0,0.0,0.0};
    register __vector float vtemp0_r = {0.0, 0.0,0.0,0.0};
    register __vector float vtemp1_p = {0.0, 0.0,0.0,0.0};
    register __vector float vtemp1_r = {0.0, 0.0,0.0,0.0};
    __vector float* va0 = (__vector float*) a0;
    __vector float* va1 = (__vector float*) a1;
    __vector float* v_x = (__vector float*) x;

    for (i = 0; i < n / 2; i+=2) {
        register __vector float vx_0 = v_x[i];
        register __vector float vx_1 = v_x[i+1];
        register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask);
        register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask);

        vtemp0_p += vx_0*va0[i] + vx_1*va0[i+1] ;
        vtemp0_r += vxr_0*va0[i] + vxr_1*va0[i+1];
        vtemp1_p += vx_0*va1[i] + vx_1*va1[i+1];
        vtemp1_r += vxr_0*va1[i] + vxr_1*va1[i+1];

    }

#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
    /* non-conjugated reduction: re = rr - ii, im = ri + ir per pair */

    register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1] + vtemp0_p[2] - vtemp0_p[3];
    register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1] + vtemp0_r[2] + vtemp0_r[3];

    register FLOAT temp_r1 = vtemp1_p[0] - vtemp1_p[1] + vtemp1_p[2] - vtemp1_p[3];
    register FLOAT temp_i1 = vtemp1_r[0] + vtemp1_r[1] + vtemp1_r[2] + vtemp1_r[3];


#else
    /* conjugated reduction */
    register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1] + vtemp0_p[2] + vtemp0_p[3];
    register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1] + vtemp0_r[2] - vtemp0_r[3];

    register FLOAT temp_r1 = vtemp1_p[0] + vtemp1_p[1] + vtemp1_p[2] + vtemp1_p[3];
    register FLOAT temp_i1 = vtemp1_r[0] - vtemp1_r[1] + vtemp1_r[2] - vtemp1_r[3];

#endif

#if !defined(XCONJ)
    /* y += alpha * temp */

    y[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
    y[1] += alpha_r * temp_i0 + alpha_i * temp_r0;
    y[2] += alpha_r * temp_r1 - alpha_i * temp_i1;
    y[3] += alpha_r * temp_i1 + alpha_i * temp_r1;

#else
    /* y += conj(alpha) * temp */

    y[0] += alpha_r * temp_r0 + alpha_i * temp_i0;
    y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0;
    y[2] += alpha_r * temp_r1 + alpha_i * temp_i1;
    y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1;

#endif

}
|
||||
|
||||
|
||||
/* One-column kernel of the transposed complex GEMV:
 * accumulates the dot product of one column of A with x and updates the
 * single complex output element y[0..1] with the alpha-scaled result.
 *
 * n        : number of complex elements in the column; the loop consumes
 *            4 complex elements (two 4-float vectors) per iteration, so the
 *            caller must guarantee the matching divisibility
 * ap       : column of A, interleaved (re,im) FLOATs
 * x        : input vector, contiguous interleaved (re,im) FLOATs
 * y        : complex output element (re at y[0], im at y[1]), updated in place
 * alpha_r/alpha_i : complex scale factor applied to the accumulated dot product
 *
 * Conjugation of A and/or x is selected at compile time via CONJ/XCONJ,
 * which flips the signs used when the per-lane partial products are reduced.
 * swap_mask_arr is a file-scope permute table (defined earlier in this file)
 * that swaps the (re,im) halves of each complex pair. */
static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) {
    BLASLONG i;
    __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr);
    //p for positive(real*real,image*image,real*real,image*image) r for image (real*image,image*real,real*image,image*real)
    register __vector float vtemp0_p = {0.0, 0.0,0.0,0.0};
    register __vector float vtemp0_r = {0.0, 0.0,0.0,0.0};
    __vector float* va0 = (__vector float*) ap;
    __vector float* v_x = (__vector float*) x;

    /* x holds n complex = n/2 vectors of 4 floats; step 2 vectors per pass */
    for (i = 0; i < n / 2; i+=2) {
        register __vector float vx_0 = v_x[i];
        register __vector float vx_1 = v_x[i+1];
        /* (re,im) swapped copies of x, used for the cross terms */
        register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask);
        register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask);

        vtemp0_p += vx_0*va0[i] + vx_1*va0[i+1] ;
        vtemp0_r += vxr_0*va0[i] + vxr_1*va0[i+1];

    }

    /* horizontal reduction; lane signs depend on the conjugation variant */
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )

    register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1] + vtemp0_p[2] - vtemp0_p[3];
    register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1] + vtemp0_r[2] + vtemp0_r[3];

#else
    register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1] + vtemp0_p[2] + vtemp0_p[3];
    register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1] + vtemp0_r[2] - vtemp0_r[3];

#endif

    /* y += alpha * temp (complex multiply; sign pattern follows XCONJ) */
#if !defined(XCONJ)

    y[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
    y[1] += alpha_r * temp_i0 + alpha_i * temp_r0;

#else

    y[0] += alpha_r * temp_r0 + alpha_i * temp_i0;
    y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0;

#endif

}
|
||||
|
||||
/* Gather n complex (re,im) pairs from a strided source vector into a
 * contiguous destination buffer.  inc_src is the source stride measured
 * in FLOAT units (already doubled by the caller for the complex case). */
static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) {
    BLASLONG k;
    for (k = 0; k < n; k++) {
        dest[2 * k]     = src[0];
        dest[2 * k + 1] = src[1];
        src += inc_src;
    }
}
|
||||
|
||||
/* Transposed complex GEMV driver: y += alpha * op(A)^T * x for an m-by-n
 * panel, with op() applying CONJ/XCONJ conjugation selected at compile time.
 *
 * m, n        : matrix dimensions (complex elements)
 * dummy1      : unused (kept for the common GEMV kernel signature)
 * alpha_r/_i  : complex scale factor
 * a, lda      : column-major matrix and its leading dimension (complex units
 *               on entry; doubled below to FLOAT units)
 * x, inc_x    : input vector and stride (complex units on entry)
 * y, inc_y    : output vector and stride (complex units on entry)
 * buffer      : caller-provided workspace used to pack x contiguously when
 *               inc_x != 1
 *
 * The m dimension is blocked in chunks of NBMAX rows; within each chunk the
 * n columns are processed 4/2/1 at a time by the cgemv_kernel_4x* helpers.
 * Rows left over (m % 4) are handled by scalar tail loops at the end. */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) {
    BLASLONG i;
    BLASLONG j;
    FLOAT *a_ptr;
    FLOAT *x_ptr;
    FLOAT *y_ptr;

    BLASLONG n1;
    BLASLONG m1;
    BLASLONG m2;
    BLASLONG m3;
    BLASLONG n2;

    FLOAT ybuffer[8], *xbuffer;

    if (m < 1) return (0);
    if (n < 1) return (0);

    /* convert complex strides/ld to FLOAT units (2 floats per element) */
    inc_x <<= 1;
    inc_y <<= 1;
    lda <<= 1;

    xbuffer = buffer;

    /* column blocking: n1 groups of 4 columns, n2 leftover columns */
    n1 = n >> 2;
    n2 = n & 3;

    /* row blocking: m3 leftover rows (m % 4), m2 size of the last partial
       NBMAX chunk, m1 full-chunk row count */
    m3 = m & 3;
    m1 = m - m3;
    m2 = (m & (NBMAX - 1)) - m3;

    BLASLONG NB = NBMAX;

    while (NB == NBMAX) {

        m1 -= NB;
        if (m1 < 0) {
            if (m2 == 0) break;  /* no partial chunk left */
            NB = m2;             /* last, smaller chunk */
        }

        y_ptr = y;
        a_ptr = a;
        x_ptr = x;

        /* pack x contiguously when strided; otherwise use it in place */
        if (inc_x != 2)
            copy_x(NB, x_ptr, xbuffer, inc_x);
        else
            xbuffer = x_ptr;

        if (inc_y == 2) {
            /* contiguous y: kernels update y directly */

            for (i = 0; i < n1; i++) {
                cgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i);
                a_ptr += lda << 2;
                y_ptr += 8;

            }

            if (n2 & 2) {
                cgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i);
                a_ptr += lda << 1;
                y_ptr += 4;

            }

            if (n2 & 1) {
                cgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i);
                a_ptr += lda;
                y_ptr += 2;

            }

        } else {
            /* strided y: accumulate into ybuffer, then scatter-add */

            for (i = 0; i < n1; i++) {
                memset(ybuffer, 0, sizeof (ybuffer));
                cgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha_r, alpha_i);

                a_ptr += lda << 2;

                y_ptr[0] += ybuffer[0];
                y_ptr[1] += ybuffer[1];
                y_ptr += inc_y;
                y_ptr[0] += ybuffer[2];
                y_ptr[1] += ybuffer[3];
                y_ptr += inc_y;
                y_ptr[0] += ybuffer[4];
                y_ptr[1] += ybuffer[5];
                y_ptr += inc_y;
                y_ptr[0] += ybuffer[6];
                y_ptr[1] += ybuffer[7];
                y_ptr += inc_y;

            }

            for (i = 0; i < n2; i++) {
                memset(ybuffer, 0, sizeof (ybuffer));
                cgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, alpha_r, alpha_i);
                a_ptr += lda;
                y_ptr[0] += ybuffer[0];
                y_ptr[1] += ybuffer[1];
                y_ptr += inc_y;

            }

        }
        /* advance to the next row chunk */
        a += 2 * NB;
        x += NB * inc_x;
    }

    if (m3 == 0) return (0);

    /* ---- scalar tails for the last m % 4 rows of every column ---- */
    x_ptr = x;
    j = 0;
    a_ptr = a;
    y_ptr = y;

    if (m3 == 3) {

        FLOAT temp_r;
        FLOAT temp_i;
        FLOAT x0 = x_ptr[0];
        FLOAT x1 = x_ptr[1];
        x_ptr += inc_x;
        FLOAT x2 = x_ptr[0];
        FLOAT x3 = x_ptr[1];
        x_ptr += inc_x;
        FLOAT x4 = x_ptr[0];
        FLOAT x5 = x_ptr[1];
        while (j < n) {
            /* 3-element complex dot product, conjugation per CONJ/XCONJ */
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
            temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
            temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
            temp_r += a_ptr[2] * x2 - a_ptr[3] * x3;
            temp_i += a_ptr[2] * x3 + a_ptr[3] * x2;
            temp_r += a_ptr[4] * x4 - a_ptr[5] * x5;
            temp_i += a_ptr[4] * x5 + a_ptr[5] * x4;
#else

            temp_r = a_ptr[0] * x0 + a_ptr[1] * x1;
            temp_i = a_ptr[0] * x1 - a_ptr[1] * x0;
            temp_r += a_ptr[2] * x2 + a_ptr[3] * x3;
            temp_i += a_ptr[2] * x3 - a_ptr[3] * x2;
            temp_r += a_ptr[4] * x4 + a_ptr[5] * x5;
            temp_i += a_ptr[4] * x5 - a_ptr[5] * x4;
#endif

#if !defined(XCONJ)
            y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i;
            y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r;
#else
            y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i;
            y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r;
#endif

            a_ptr += lda;
            y_ptr += inc_y;
            j++;
        }
        return (0);
    }

    if (m3 == 2) {

        FLOAT temp_r;
        FLOAT temp_i;
        FLOAT temp_r1;
        FLOAT temp_i1;
        FLOAT x0 = x_ptr[0];
        FLOAT x1 = x_ptr[1];
        x_ptr += inc_x;
        FLOAT x2 = x_ptr[0];
        FLOAT x3 = x_ptr[1];

        /* two columns per iteration, then a remainder loop */
        while (j < (n & -2)) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
            temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
            temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
            temp_r += a_ptr[2] * x2 - a_ptr[3] * x3;
            temp_i += a_ptr[2] * x3 + a_ptr[3] * x2;
            a_ptr += lda;
            temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1;
            temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0;
            temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3;
            temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2;
#else

            temp_r = a_ptr[0] * x0 + a_ptr[1] * x1;
            temp_i = a_ptr[0] * x1 - a_ptr[1] * x0;
            temp_r += a_ptr[2] * x2 + a_ptr[3] * x3;
            temp_i += a_ptr[2] * x3 - a_ptr[3] * x2;
            a_ptr += lda;
            temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1;
            temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0;
            temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3;
            temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2;
#endif

#if !defined(XCONJ)
            y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i;
            y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r;
            y_ptr += inc_y;
            y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1;
            y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1;
#else
            y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i;
            y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r;
            y_ptr += inc_y;
            y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1;
            y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1;
#endif

            a_ptr += lda;
            y_ptr += inc_y;
            j += 2;
        }

        while (j < n) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
            temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
            temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
            temp_r += a_ptr[2] * x2 - a_ptr[3] * x3;
            temp_i += a_ptr[2] * x3 + a_ptr[3] * x2;
#else

            temp_r = a_ptr[0] * x0 + a_ptr[1] * x1;
            temp_i = a_ptr[0] * x1 - a_ptr[1] * x0;
            temp_r += a_ptr[2] * x2 + a_ptr[3] * x3;
            temp_i += a_ptr[2] * x3 - a_ptr[3] * x2;
#endif

#if !defined(XCONJ)
            y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i;
            y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r;
#else
            y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i;
            y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r;
#endif

            a_ptr += lda;
            y_ptr += inc_y;
            j++;
        }

        return (0);
    }

    if (m3 == 1) {

        FLOAT temp_r;
        FLOAT temp_i;
        FLOAT temp_r1;
        FLOAT temp_i1;
        FLOAT x0 = x_ptr[0];
        FLOAT x1 = x_ptr[1];

        /* two columns per iteration, then a remainder loop */
        while (j < (n & -2)) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
            temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
            temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
            a_ptr += lda;
            temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1;
            temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0;
#else

            temp_r = a_ptr[0] * x0 + a_ptr[1] * x1;
            temp_i = a_ptr[0] * x1 - a_ptr[1] * x0;
            a_ptr += lda;
            temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1;
            temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0;
#endif

#if !defined(XCONJ)
            y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i;
            y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r;
            y_ptr += inc_y;
            y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1;
            y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1;
#else
            y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i;
            y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r;
            y_ptr += inc_y;
            y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1;
            y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1;
#endif

            a_ptr += lda;
            y_ptr += inc_y;
            j += 2;
        }

        while (j < n) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
            temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
            temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
#else

            temp_r = a_ptr[0] * x0 + a_ptr[1] * x1;
            temp_i = a_ptr[0] * x1 - a_ptr[1] * x0;
#endif

#if !defined(XCONJ)
            y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i;
            y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r;
#else
            y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i;
            y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r;
#endif

            a_ptr += lda;
            y_ptr += inc_y;
            j++;
        }
        return (0);
    }

    return (0);

}
|
||||
|
|
@ -27,8 +27,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#include "common.h"
|
||||
|
||||
#define NBMAX 8192
|
||||
#define PREFETCH 1
|
||||
#define NBMAX 1024
|
||||
//#define PREFETCH 1
|
||||
#include <altivec.h>
|
||||
|
||||
#define HAVE_KERNEL4x8_ASM 1
|
||||
|
|
|
@ -36,9 +36,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#endif
|
||||
#define CABS1(x,i) ABS(x[i])+ABS(x[i+1])
|
||||
|
||||
#define USE_MASK_PERMUTATIONS 1 //with this type of permutation gcc output a little faster code
|
||||
|
||||
#if !defined(USE_MASK_PERMUTATIONS)
|
||||
|
||||
static inline __attribute__((always_inline)) __vector float mvec_mergee(__vector float a,__vector float b ){
|
||||
__vector float result;
|
||||
__asm__ (
|
||||
"vmrgew %0,%1,%2;\n"
|
||||
: "=v" (result)
|
||||
: "v" (a),
|
||||
"v" (b)
|
||||
: );
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline __attribute__((always_inline)) __vector float mvec_mergeo(__vector float a,__vector float b ){
|
||||
__vector float result;
|
||||
__asm__ (
|
||||
"vmrgow %0,%1,%2;\n"
|
||||
: "=v" (result)
|
||||
: "v" (a),
|
||||
"v" (b)
|
||||
: );
|
||||
return result;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
/**
|
||||
* Find maximum index
|
||||
* Warning: requirements n>0 and n % 32 == 0
|
||||
|
@ -51,12 +76,16 @@ static BLASLONG ciamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
|||
|
||||
BLASLONG index;
|
||||
BLASLONG i;
|
||||
#if defined(USE_MASK_PERMUTATIONS)
|
||||
register __vector unsigned int static_index0 = {0,1,2,3};
|
||||
#else
|
||||
register __vector unsigned int static_index0 = {2,0,3,1};
|
||||
#endif
|
||||
register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register
|
||||
register __vector unsigned int temp1= temp0<<1; //{8,8,8,8}
|
||||
register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7};
|
||||
register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11};
|
||||
register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15};
|
||||
register __vector unsigned int static_index1=static_index0 +temp0;
|
||||
register __vector unsigned int static_index2=static_index0 +temp1;
|
||||
register __vector unsigned int static_index3=static_index1 +temp1;
|
||||
temp0=vec_xor(temp0,temp0);
|
||||
temp1=temp1 <<1 ; //{16,16,16,16}
|
||||
register __vector unsigned int temp_add=temp1 <<1; //{32,32,32,32}
|
||||
|
@ -64,9 +93,11 @@ static BLASLONG ciamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
|||
register __vector float quadruple_values={0,0,0,0};
|
||||
|
||||
register __vector float * v_ptrx=(__vector float *)x;
|
||||
#if defined(USE_MASK_PERMUTATIONS)
|
||||
register __vector unsigned char real_pack_mask = { 0,1,2,3,8,9,10,11,16,17,18,19, 24,25,26,27};
|
||||
register __vector unsigned char image_pack_mask= {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31};
|
||||
for(; i<n; i+=32){
|
||||
#endif
|
||||
for(; i<n; i+=32 ){
|
||||
//absolute temporary complex vectors
|
||||
register __vector float v0=vec_abs(v_ptrx[0]);
|
||||
register __vector float v1=vec_abs(v_ptrx[1]);
|
||||
|
@ -78,8 +109,10 @@ static BLASLONG ciamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
|||
register __vector float v7=vec_abs(v_ptrx[7]);
|
||||
|
||||
//pack complex real and imaginary parts together to sum real+image
|
||||
#if defined(USE_MASK_PERMUTATIONS)
|
||||
register __vector float t1=vec_perm(v0,v1,real_pack_mask);
|
||||
register __vector float ti=vec_perm(v0,v1,image_pack_mask);
|
||||
register __vector float ti=vec_perm(v0,v1,image_pack_mask);
|
||||
|
||||
v0=t1+ti; //sum quadruple real with quadruple image
|
||||
register __vector float t2=vec_perm(v2,v3,real_pack_mask);
|
||||
register __vector float ti2=vec_perm(v2,v3,image_pack_mask);
|
||||
|
@ -90,6 +123,22 @@ static BLASLONG ciamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
|||
t2=vec_perm(v6,v7,real_pack_mask);
|
||||
ti2=vec_perm(v6,v7,image_pack_mask);
|
||||
v3=t2+ti2;
|
||||
#else
|
||||
register __vector float t1=mvec_mergee(v0,v1);
|
||||
register __vector float ti=mvec_mergeo(v0,v1);
|
||||
|
||||
v0=t1+ti; //sum quadruple real with quadruple image
|
||||
register __vector float t2= mvec_mergee(v2,v3);
|
||||
register __vector float ti2=mvec_mergeo(v2,v3);
|
||||
v1=t2+ti2;
|
||||
t1=mvec_mergee(v4,v5);
|
||||
ti=mvec_mergeo(v4,v5);
|
||||
v2=t1+ti; //sum
|
||||
t2=mvec_mergee(v6,v7);
|
||||
ti2=mvec_mergeo(v6,v7);
|
||||
v3=t2+ti2;
|
||||
|
||||
#endif
|
||||
// now we have 16 summed elements . lets compare them
|
||||
v_ptrx+=8;
|
||||
register __vector bool int r1=vec_cmpgt(v1,v0);
|
||||
|
@ -114,8 +163,10 @@ static BLASLONG ciamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
|||
v7=vec_abs(v_ptrx[7]);
|
||||
|
||||
//pack complex real and imaginary parts together to sum real+image
|
||||
#if defined(USE_MASK_PERMUTATIONS)
|
||||
t1=vec_perm(v0,v1,real_pack_mask);
|
||||
ti=vec_perm(v0,v1,image_pack_mask);
|
||||
ti=vec_perm(v0,v1,image_pack_mask);
|
||||
|
||||
v0=t1+ti; //sum quadruple real with quadruple image
|
||||
t2=vec_perm(v2,v3,real_pack_mask);
|
||||
ti2=vec_perm(v2,v3,image_pack_mask);
|
||||
|
@ -126,6 +177,22 @@ static BLASLONG ciamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
|||
t2=vec_perm(v6,v7,real_pack_mask);
|
||||
ti2=vec_perm(v6,v7,image_pack_mask);
|
||||
v3=t2+ti2;
|
||||
#else
|
||||
t1=mvec_mergee(v0,v1);
|
||||
ti=mvec_mergeo(v0,v1);
|
||||
|
||||
v0=t1+ti; //sum quadruple real with quadruple image
|
||||
t2=mvec_mergee(v2,v3);
|
||||
ti2=mvec_mergeo(v2,v3);
|
||||
v1=t2+ti2;
|
||||
t1=mvec_mergee(v4,v5);
|
||||
ti=mvec_mergeo(v4,v5);
|
||||
v2=t1+ti; //sum
|
||||
t2=mvec_mergee(v6,v7);
|
||||
ti2=mvec_mergeo(v6,v7);
|
||||
v3=t2+ti2;
|
||||
|
||||
#endif
|
||||
// now we have 16 summed elements {from 16 to 31} . lets compare them
|
||||
v_ptrx+=8;
|
||||
r1=vec_cmpgt(v1,v0);
|
||||
|
|
|
@ -0,0 +1,465 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2019, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#define NBMAX 4096
|
||||
|
||||
/* Non-transposed single-precision GEMV kernel over 8 columns:
 *   y[0..n-1] += alpha * (A(:,0..7) * x[0..7])
 * n     : number of rows; processed 4 at a time (one 4-float vector),
 *         caller guarantees the required divisibility
 * ap    : pointers to the first 4 column starts; columns 4..7 are reached
 *         by adding lda4 (4 columns' worth of FLOATs) to each
 * xo    : the 8 x entries for these columns
 * y     : output vector, contiguous, updated in place
 * alpha : scale factor, folded into the broadcast x values up front */
static void sgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha)
{

    BLASLONG i;
    FLOAT *a0,*a1,*a2,*a3,*b0,*b1,*b2,*b3;
    FLOAT x0,x1,x2,x3,x4,x5,x6,x7;
    a0 = ap[0];
    a1 = ap[1];
    a2 = ap[2];
    a3 = ap[3];
    /* columns 4..7 sit lda4 FLOATs past columns 0..3 */
    b0 = a0 + lda4 ;
    b1 = a1 + lda4 ;
    b2 = a2 + lda4 ;
    b3 = a3 + lda4 ;
    /* pre-scale x by alpha so the loop is pure multiply-add */
    x0 = xo[0] * *alpha;
    x1 = xo[1] * *alpha;
    x2 = xo[2] * *alpha;
    x3 = xo[3] * *alpha;
    x4 = xo[4] * *alpha;
    x5 = xo[5] * *alpha;
    x6 = xo[6] * *alpha;
    x7 = xo[7] * *alpha;
    __vector float* va0 = (__vector float*)a0;
    __vector float* va1 = (__vector float*)a1;
    __vector float* va2 = (__vector float*)a2;
    __vector float* va3 = (__vector float*)a3;
    __vector float* vb0 = (__vector float*)b0;
    __vector float* vb1 = (__vector float*)b1;
    __vector float* vb2 = (__vector float*)b2;
    __vector float* vb3 = (__vector float*)b3;

    /* broadcast each scaled x entry across a vector */
    __vector float v_x0 = {x0,x0,x0,x0};
    __vector float v_x1 = {x1,x1,x1,x1};
    __vector float v_x2 = {x2,x2,x2,x2};
    __vector float v_x3 = {x3,x3,x3,x3};
    __vector float v_x4 = {x4,x4,x4,x4};
    __vector float v_x5 = {x5,x5,x5,x5};
    __vector float v_x6 = {x6,x6,x6,x6};
    __vector float v_x7 = {x7,x7,x7,x7};
    __vector float* v_y =(__vector float*)y;

    /* 4 rows per iteration: accumulate all 8 columns into y */
    for ( i=0; i< n/4; i++)
    {
        register __vector float vy=v_y[i];
        vy += v_x0 * va0[i] + v_x1 * va1[i] + v_x2 * va2[i] + v_x3 * va3[i] ;
        vy += v_x4 * vb0[i] + v_x5 * vb1[i] + v_x6 * vb2[i] + v_x7 * vb3[i] ;
        v_y[i] =vy;
    }

}
|
||||
|
||||
/* Non-transposed single-precision GEMV kernel over 4 columns:
 *   y[0..n-1] += alpha * (A(:,0..3) * xo[0..3])
 * Rows are processed 4 at a time; the caller guarantees divisibility. */
static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
{
    BLASLONG j;
    FLOAT scale = *alpha;
    /* fold alpha into the x entries once, before the loop */
    FLOAT c0 = xo[0] * scale;
    FLOAT c1 = xo[1] * scale;
    FLOAT c2 = xo[2] * scale;
    FLOAT c3 = xo[3] * scale;
    /* broadcast each scaled coefficient across a 4-lane vector */
    __vector float vc0 = {c0, c0, c0, c0};
    __vector float vc1 = {c1, c1, c1, c1};
    __vector float vc2 = {c2, c2, c2, c2};
    __vector float vc3 = {c3, c3, c3, c3};
    __vector float *col0 = (__vector float*) ap[0];
    __vector float *col1 = (__vector float*) ap[1];
    __vector float *col2 = (__vector float*) ap[2];
    __vector float *col3 = (__vector float*) ap[3];
    __vector float *vy   = (__vector float*) y;

    /* one 4-float vector of y per iteration */
    for (j = 0; j < n / 4; j++) {
        vy[j] += vc0 * col0[j] + vc1 * col1[j] + vc2 * col2[j] + vc3 * col3[j];
    }
}
|
||||
|
||||
/* Non-transposed single-precision GEMV kernel over 2 columns:
 *   y[0..n-1] += alpha * (A(:,0..1) * x[0..1])
 * Rows are processed 4 at a time; the caller guarantees divisibility. */
static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
    BLASLONG j;
    /* fold alpha into the x entries once, before the loop */
    FLOAT c0 = x[0] * *alpha;
    FLOAT c1 = x[1] * *alpha;
    __vector float vc0 = {c0, c0, c0, c0};
    __vector float vc1 = {c1, c1, c1, c1};
    __vector float *col0 = (__vector float*) ap[0];
    __vector float *col1 = (__vector float*) ap[1];
    __vector float *vy   = (__vector float*) y;

    for (j = 0; j < n / 4; j++) {
        vy[j] += vc0 * col0[j] + vc1 * col1[j];
    }
}
|
||||
|
||||
|
||||
/* Non-transposed single-precision GEMV kernel over a single column:
 *   y[0..n-1] += (alpha * x[0]) * A(:,0)
 * Rows are processed 4 at a time; the caller guarantees divisibility. */
static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
    BLASLONG j;
    FLOAT c0 = x[0] * *alpha;               /* alpha folded into x */
    __vector float vc0 = {c0, c0, c0, c0};  /* broadcast coefficient */
    __vector float *col0 = (__vector float*) ap;
    __vector float *vy   = (__vector float*) y;

    for (j = 0; j < n / 4; j++) {
        vy[j] += vc0 * col0[j];
    }
}
|
||||
|
||||
/* Scatter-add a contiguous buffer of n FLOATs into a strided destination:
 *   dest[k * inc_dest] += src[k]  for k in [0, n). */
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
{
    BLASLONG k;
    for (k = 0; k < n; k++) {
        dest[k * inc_dest] += src[k];
    }
}
|
||||
|
||||
/* Non-transposed single-precision GEMV driver: y += alpha * A * x for an
 * m-by-n matrix.
 *
 * m, n      : matrix dimensions
 * dummy1    : unused (kept for the common GEMV kernel signature)
 * alpha     : scale factor
 * a, lda    : column-major matrix and its leading dimension
 * x, inc_x  : input vector and stride
 * y, inc_y  : output vector and stride
 * buffer    : caller-provided workspace; used to accumulate y contiguously
 *             when inc_y != 1
 *
 * The m dimension is blocked in chunks of NBMAX rows; within each chunk the
 * n columns are consumed 8/4/2/1 at a time by the sgemv_kernel_4x* helpers.
 * Rows left over (m % 4) are handled by scalar tail loops at the end.
 *
 * Fix vs. previous revision: the ybuffer clear used a hard-coded element
 * size of 4 bytes (memset(ybuffer, 0, NB*4)); it now uses NB*sizeof(FLOAT)
 * so the code stays correct if FLOAT is ever not 4 bytes.  For this
 * single-precision file the byte count is unchanged. */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
    BLASLONG i;
    FLOAT *a_ptr;
    FLOAT *x_ptr;
    FLOAT *y_ptr;
    FLOAT *ap[4];
    BLASLONG n1;
    BLASLONG m1;
    BLASLONG m2;
    BLASLONG m3;
    BLASLONG n2;
    BLASLONG lda4 = lda << 2;   /* 4 columns in FLOATs */
    BLASLONG lda8 = lda << 3;   /* 8 columns in FLOATs */
    FLOAT xbuffer[8],*ybuffer;

    if ( m < 1 ) return(0);
    if ( n < 1 ) return(0);

    ybuffer = buffer;

    /* column blocking: 8-wide when x is contiguous, else 4-wide */
    if ( inc_x == 1 )
    {
        n1 = n >> 3 ;
        n2 = n & 7 ;
    }
    else
    {
        n1 = n >> 2 ;
        n2 = n & 3 ;

    }

    /* row blocking: m3 leftover rows (m % 4), m2 size of the last partial
       NBMAX chunk, m1 full-chunk row count */
    m3 = m & 3  ;
    m1 = m & -4 ;
    m2 = (m & (NBMAX-1)) - m3 ;


    y_ptr = y;

    BLASLONG NB = NBMAX;

    while ( NB == NBMAX )
    {

        m1 -= NB;
        if ( m1 < 0)
        {
            if ( m2 == 0 ) break;   /* no partial chunk left */
            NB = m2;                /* last, smaller chunk */
        }

        a_ptr = a;
        x_ptr = x;

        ap[0] = a_ptr;
        ap[1] = a_ptr + lda;
        ap[2] = ap[1] + lda;
        ap[3] = ap[2] + lda;

        /* strided y: accumulate into a zeroed contiguous buffer and
           scatter-add at the end of the chunk; contiguous y: write direct */
        if ( inc_y != 1 )
            memset(ybuffer, 0, NB*sizeof(FLOAT));
        else
            ybuffer = y_ptr;

        if ( inc_x == 1 )
        {


            for( i = 0; i < n1 ; i++)
            {
                sgemv_kernel_4x8(NB,ap,x_ptr,ybuffer,lda4,&alpha);
                ap[0] += lda8;
                ap[1] += lda8;
                ap[2] += lda8;
                ap[3] += lda8;
                a_ptr += lda8;
                x_ptr += 8;
            }


            if ( n2 & 4 )
            {
                sgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha);
                ap[0] += lda4;
                ap[1] += lda4;
                ap[2] += lda4;
                ap[3] += lda4;
                a_ptr += lda4;
                x_ptr += 4;
            }

            if ( n2 & 2 )
            {
                sgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha);
                a_ptr += lda*2;
                x_ptr += 2;
            }


            if ( n2 & 1 )
            {
                sgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha);
                a_ptr += lda;
                x_ptr += 1;
            }


        }
        else
        {
            /* strided x: pack 4 entries at a time into xbuffer */

            for( i = 0; i < n1 ; i++)
            {
                xbuffer[0] = x_ptr[0];
                x_ptr += inc_x;
                xbuffer[1] = x_ptr[0];
                x_ptr += inc_x;
                xbuffer[2] = x_ptr[0];
                x_ptr += inc_x;
                xbuffer[3] = x_ptr[0];
                x_ptr += inc_x;
                sgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha);
                ap[0] += lda4;
                ap[1] += lda4;
                ap[2] += lda4;
                ap[3] += lda4;
                a_ptr += lda4;
            }

            for( i = 0; i < n2 ; i++)
            {
                xbuffer[0] = x_ptr[0];
                x_ptr += inc_x;
                sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha);
                a_ptr += lda;

            }

        }

        a += NB;
        if ( inc_y != 1 )
        {
            add_y(NB,ybuffer,y_ptr,inc_y);
            y_ptr += NB * inc_y;
        }
        else
            y_ptr += NB ;

    }

    if ( m3 == 0 ) return(0);

    /* ---- scalar tails for the last m % 4 rows ---- */

    if ( m3 == 3 )
    {
        a_ptr = a;
        x_ptr = x;
        FLOAT temp0 = 0.0;
        FLOAT temp1 = 0.0;
        FLOAT temp2 = 0.0;
        /* fast path: rows are packed (lda == 3) and x is contiguous */
        if ( lda == 3 && inc_x ==1 )
        {

            for( i = 0; i < ( n & -4 ); i+=4 )
            {

                temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1];
                temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1];
                temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1];

                temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3];
                temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3];
                temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3];

                a_ptr += 12;
                x_ptr += 4;
            }

            for( ; i < n; i++ )
            {
                temp0 += a_ptr[0] * x_ptr[0];
                temp1 += a_ptr[1] * x_ptr[0];
                temp2 += a_ptr[2] * x_ptr[0];
                a_ptr += 3;
                x_ptr ++;
            }

        }
        else
        {

            for( i = 0; i < n; i++ )
            {
                temp0 += a_ptr[0] * x_ptr[0];
                temp1 += a_ptr[1] * x_ptr[0];
                temp2 += a_ptr[2] * x_ptr[0];
                a_ptr += lda;
                x_ptr += inc_x;


            }

        }
        y_ptr[0] += alpha * temp0;
        y_ptr += inc_y;
        y_ptr[0] += alpha * temp1;
        y_ptr += inc_y;
        y_ptr[0] += alpha * temp2;
        return(0);
    }


    if ( m3 == 2 )
    {
        a_ptr = a;
        x_ptr = x;
        FLOAT temp0 = 0.0;
        FLOAT temp1 = 0.0;
        /* fast path: rows are packed (lda == 2) and x is contiguous */
        if ( lda == 2 && inc_x ==1 )
        {

            for( i = 0; i < (n & -4) ; i+=4 )
            {
                temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1];
                temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1];
                temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3];
                temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3];
                a_ptr += 8;
                x_ptr += 4;

            }


            for( ; i < n; i++ )
            {
                temp0 += a_ptr[0] * x_ptr[0];
                temp1 += a_ptr[1] * x_ptr[0];
                a_ptr += 2;
                x_ptr ++;
            }

        }
        else
        {

            for( i = 0; i < n; i++ )
            {
                temp0 += a_ptr[0] * x_ptr[0];
                temp1 += a_ptr[1] * x_ptr[0];
                a_ptr += lda;
                x_ptr += inc_x;


            }

        }
        y_ptr[0] += alpha * temp0;
        y_ptr += inc_y;
        y_ptr[0] += alpha * temp1;
        return(0);
    }

    if ( m3 == 1 )
    {
        a_ptr = a;
        x_ptr = x;
        FLOAT temp = 0.0;
        /* fast path: both vectors are effectively contiguous dot product */
        if ( lda == 1 && inc_x ==1 )
        {

            for( i = 0; i < (n & -4); i+=4 )
            {
                temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3];

            }

            for( ; i < n; i++ )
            {
                temp += a_ptr[i] * x_ptr[i];
            }

        }
        else
        {

            for( i = 0; i < n; i++ )
            {
                temp += a_ptr[0] * x_ptr[0];
                a_ptr += lda;
                x_ptr += inc_x;
            }

        }
        y_ptr[0] += alpha * temp;
        return(0);
    }


    return(0);
}
|
||||
|
||||
|
|
@ -0,0 +1,513 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2019, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
/****Note***
|
||||
UnUsed kernel
|
||||
This kernel works. But it was not competitive enough to be added in production
|
||||
It could be used and tested in future or could provide barebone for switching to inline assembly
|
||||
*/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#define NBMAX 4096
|
||||
|
||||
static void sgemv_kernel_8x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha)
|
||||
{
|
||||
|
||||
BLASLONG i;
|
||||
FLOAT *a0,*a1,*a2,*a3,*b0,*b1,*b2,*b3;
|
||||
FLOAT x0,x1,x2,x3,x4,x5,x6,x7;
|
||||
a0 = ap[0];
|
||||
a1 = ap[1];
|
||||
a2 = ap[2];
|
||||
a3 = ap[3];
|
||||
b0 = a0 + lda4 ;
|
||||
b1 = a1 + lda4 ;
|
||||
b2 = a2 + lda4 ;
|
||||
b3 = a3 + lda4 ;
|
||||
x0 = xo[0] * *alpha;
|
||||
x1 = xo[1] * *alpha;
|
||||
x2 = xo[2] * *alpha;
|
||||
x3 = xo[3] * *alpha;
|
||||
x4 = xo[4] * *alpha;
|
||||
x5 = xo[5] * *alpha;
|
||||
x6 = xo[6] * *alpha;
|
||||
x7 = xo[7] * *alpha;
|
||||
__vector float* va0 = (__vector float*)a0;
|
||||
__vector float* va1 = (__vector float*)a1;
|
||||
__vector float* va2 = (__vector float*)a2;
|
||||
__vector float* va3 = (__vector float*)a3;
|
||||
__vector float* vb0 = (__vector float*)b0;
|
||||
__vector float* vb1 = (__vector float*)b1;
|
||||
__vector float* vb2 = (__vector float*)b2;
|
||||
__vector float* vb3 = (__vector float*)b3;
|
||||
|
||||
register __vector float v_x0 = {x0,x0,x0,x0};
|
||||
register __vector float v_x1 = {x1,x1,x1,x1};
|
||||
register __vector float v_x2 = {x2,x2,x2,x2};
|
||||
register __vector float v_x3 = {x3,x3,x3,x3};
|
||||
register __vector float v_x4 = {x4,x4,x4,x4};
|
||||
register __vector float v_x5 = {x5,x5,x5,x5};
|
||||
register __vector float v_x6 = {x6,x6,x6,x6};
|
||||
register __vector float v_x7 = {x7,x7,x7,x7};
|
||||
__vector float* v_y =(__vector float*)y;
|
||||
|
||||
for ( i=0; i< n/4; i+=2)
|
||||
{
|
||||
register __vector float vy_1=v_y[i];
|
||||
register __vector float vy_2=v_y[i+1];
|
||||
register __vector float va0_1=va0[i] ;
|
||||
register __vector float va0_2=va0[i+1] ;
|
||||
register __vector float va1_1=va1[i] ;
|
||||
register __vector float va1_2=va1[i+1] ;
|
||||
register __vector float va2_1=va2[i] ;
|
||||
register __vector float va2_2=va2[i+1] ;
|
||||
register __vector float va3_1=va3[i] ;
|
||||
register __vector float va3_2=va3[i+1] ;
|
||||
register __vector float vb0_1=vb0[i] ;
|
||||
register __vector float vb0_2=vb0[i+1] ;
|
||||
register __vector float vb1_1=vb1[i] ;
|
||||
register __vector float vb1_2=vb1[i+1] ;
|
||||
register __vector float vb2_1=vb2[i] ;
|
||||
register __vector float vb2_2=vb2[i+1] ;
|
||||
register __vector float vb3_1=vb3[i] ;
|
||||
register __vector float vb3_2=vb3[i+1] ;
|
||||
vy_1 += v_x0 * va0_1 + v_x1 * va1_1 + v_x2 * va2_1 + v_x3 * va3_1 ;
|
||||
vy_1 += v_x4 * vb0_1 + v_x5 * vb1_1 + v_x6 * vb2_1 + v_x7 * vb3_1 ;
|
||||
vy_2 += v_x0 * va0_2 + v_x1 * va1_2 + v_x2 * va2_2 + v_x3 * va3_2 ;
|
||||
vy_2 += v_x4 * vb0_2 + v_x5 * vb1_2 + v_x6 * vb2_2 + v_x7 * vb3_2 ;
|
||||
v_y[i] =vy_1;
|
||||
v_y[i+1] =vy_2;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
static void sgemv_kernel_8x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
|
||||
{
|
||||
BLASLONG i;
|
||||
FLOAT x0,x1,x2,x3;
|
||||
x0 = xo[0] * *alpha;
|
||||
x1 = xo[1] * *alpha;
|
||||
x2 = xo[2] * *alpha;
|
||||
x3 = xo[3] * *alpha;
|
||||
__vector float v_x0 = {x0,x0,x0,x0};
|
||||
__vector float v_x1 = {x1,x1,x1,x1};
|
||||
__vector float v_x2 = {x2,x2,x2,x2};
|
||||
__vector float v_x3 = {x3,x3,x3,x3};
|
||||
__vector float* v_y =(__vector float*)y;
|
||||
__vector float* va0 = (__vector float*)ap[0];
|
||||
__vector float* va1 = (__vector float*)ap[1];
|
||||
__vector float* va2 = (__vector float*)ap[2];
|
||||
__vector float* va3 = (__vector float*)ap[3];
|
||||
|
||||
for ( i=0; i< n/4; i+=2 )
|
||||
{
|
||||
register __vector float vy_1=v_y[i];
|
||||
register __vector float vy_2=v_y[i+1];
|
||||
register __vector float va0_1=va0[i] ;
|
||||
register __vector float va0_2=va0[i+1] ;
|
||||
register __vector float va1_1=va1[i] ;
|
||||
register __vector float va1_2=va1[i+1] ;
|
||||
register __vector float va2_1=va2[i] ;
|
||||
register __vector float va2_2=va2[i+1] ;
|
||||
register __vector float va3_1=va3[i] ;
|
||||
register __vector float va3_2=va3[i+1] ;
|
||||
vy_1 += v_x0 * va0_1 + v_x1 * va1_1 + v_x2 * va2_1 + v_x3 * va3_1 ;
|
||||
vy_2 += v_x0 * va0_2 + v_x1 * va1_2 + v_x2 * va2_2 + v_x3 * va3_2 ;
|
||||
v_y[i] =vy_1;
|
||||
v_y[i+1] =vy_2;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
static void sgemv_kernel_8x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||
{
|
||||
|
||||
BLASLONG i;
|
||||
FLOAT x0,x1;
|
||||
x0 = x[0] * *alpha;
|
||||
x1 = x[1] * *alpha;
|
||||
__vector float v_x0 = {x0,x0,x0,x0};
|
||||
__vector float v_x1 = {x1,x1,x1,x1};
|
||||
__vector float* v_y =(__vector float*)y;
|
||||
__vector float* va0 = (__vector float*)ap[0];
|
||||
__vector float* va1 = (__vector float*)ap[1];
|
||||
|
||||
for ( i=0; i< n/4; i+=2 )
|
||||
{
|
||||
v_y[i] += v_x0 * va0[i] + v_x1 * va1[i] ;
|
||||
v_y[i+1] += v_x0 * va0[i+1] + v_x1 * va1[i+1] ;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
static void sgemv_kernel_8x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||
{
|
||||
|
||||
BLASLONG i;
|
||||
FLOAT x0 ;
|
||||
x0 = x[0] * *alpha;
|
||||
__vector float v_x0 = {x0,x0,x0,x0};
|
||||
__vector float* v_y =(__vector float*)y;
|
||||
__vector float* va0 = (__vector float*)ap;
|
||||
|
||||
for ( i=0; i< n/4; i+=2 )
|
||||
{
|
||||
v_y[i] += v_x0 * va0[i] ;
|
||||
v_y[i+1] += v_x0 * va0[i+1] ;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
|
||||
{
|
||||
BLASLONG i;
|
||||
|
||||
for ( i=0; i<n; i++ ){
|
||||
*dest += *src;
|
||||
src++;
|
||||
dest += inc_dest;
|
||||
}
|
||||
return;
|
||||
|
||||
|
||||
}
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
|
||||
{
|
||||
BLASLONG i;
|
||||
FLOAT *a_ptr;
|
||||
FLOAT *x_ptr;
|
||||
FLOAT *y_ptr;
|
||||
FLOAT *ap[4];
|
||||
BLASLONG n1;
|
||||
BLASLONG m1;
|
||||
BLASLONG m2;
|
||||
BLASLONG m3;
|
||||
BLASLONG n2;
|
||||
BLASLONG lda4 = lda << 2;
|
||||
BLASLONG lda8 = lda << 3;
|
||||
FLOAT xbuffer[8],*ybuffer;
|
||||
|
||||
if ( m < 1 ) return(0);
|
||||
if ( n < 1 ) return(0);
|
||||
|
||||
ybuffer = buffer;
|
||||
|
||||
if ( inc_x == 1 )
|
||||
{
|
||||
n1 = n >> 3 ;
|
||||
n2 = n & 7 ;
|
||||
}
|
||||
else
|
||||
{
|
||||
n1 = n >> 2 ;
|
||||
n2 = n & 3 ;
|
||||
|
||||
}
|
||||
|
||||
m3 = m & 7 ;
|
||||
m1 = m - m3;
|
||||
m2 = (m & (NBMAX-1)) - m3 ;
|
||||
|
||||
|
||||
y_ptr = y;
|
||||
|
||||
BLASLONG NB = NBMAX;
|
||||
|
||||
while ( NB == NBMAX )
|
||||
{
|
||||
|
||||
m1 -= NB;
|
||||
if ( m1 < 0)
|
||||
{
|
||||
if ( m2 == 0 ) break;
|
||||
NB = m2;
|
||||
}
|
||||
|
||||
a_ptr = a;
|
||||
x_ptr = x;
|
||||
|
||||
ap[0] = a_ptr;
|
||||
ap[1] = a_ptr + lda;
|
||||
ap[2] = ap[1] + lda;
|
||||
ap[3] = ap[2] + lda;
|
||||
|
||||
if ( inc_y != 1 )
|
||||
memset(ybuffer,0,NB*4);
|
||||
else
|
||||
ybuffer = y_ptr;
|
||||
|
||||
if ( inc_x == 1 )
|
||||
{
|
||||
|
||||
|
||||
for( i = 0; i < n1 ; i++)
|
||||
{
|
||||
sgemv_kernel_8x8(NB,ap,x_ptr,ybuffer,lda4,&alpha);
|
||||
ap[0] += lda8;
|
||||
ap[1] += lda8;
|
||||
ap[2] += lda8;
|
||||
ap[3] += lda8;
|
||||
a_ptr += lda8;
|
||||
x_ptr += 8;
|
||||
}
|
||||
|
||||
|
||||
if ( n2 & 4 )
|
||||
{
|
||||
sgemv_kernel_8x4(NB,ap,x_ptr,ybuffer,&alpha);
|
||||
ap[0] += lda4;
|
||||
ap[1] += lda4;
|
||||
ap[2] += lda4;
|
||||
ap[3] += lda4;
|
||||
a_ptr += lda4;
|
||||
x_ptr += 4;
|
||||
}
|
||||
|
||||
if ( n2 & 2 )
|
||||
{
|
||||
sgemv_kernel_8x2(NB,ap,x_ptr,ybuffer,&alpha);
|
||||
a_ptr += lda*2;
|
||||
x_ptr += 2;
|
||||
}
|
||||
|
||||
|
||||
if ( n2 & 1 )
|
||||
{
|
||||
sgemv_kernel_8x1(NB,a_ptr,x_ptr,ybuffer,&alpha);
|
||||
a_ptr += lda;
|
||||
x_ptr += 1;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
for( i = 0; i < n1 ; i++)
|
||||
{
|
||||
xbuffer[0] = x_ptr[0];
|
||||
x_ptr += inc_x;
|
||||
xbuffer[1] = x_ptr[0];
|
||||
x_ptr += inc_x;
|
||||
xbuffer[2] = x_ptr[0];
|
||||
x_ptr += inc_x;
|
||||
xbuffer[3] = x_ptr[0];
|
||||
x_ptr += inc_x;
|
||||
sgemv_kernel_8x4(NB,ap,xbuffer,ybuffer,&alpha);
|
||||
ap[0] += lda4;
|
||||
ap[1] += lda4;
|
||||
ap[2] += lda4;
|
||||
ap[3] += lda4;
|
||||
a_ptr += lda4;
|
||||
}
|
||||
|
||||
for( i = 0; i < n2 ; i++)
|
||||
{
|
||||
xbuffer[0] = x_ptr[0];
|
||||
x_ptr += inc_x;
|
||||
sgemv_kernel_8x1(NB,a_ptr,xbuffer,ybuffer,&alpha);
|
||||
a_ptr += lda;
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
a += NB;
|
||||
if ( inc_y != 1 )
|
||||
{
|
||||
add_y(NB,ybuffer,y_ptr,inc_y);
|
||||
y_ptr += NB * inc_y;
|
||||
}
|
||||
else
|
||||
y_ptr += NB ;
|
||||
|
||||
}
|
||||
|
||||
|
||||
if ( m3 & 4 )
|
||||
{
|
||||
a_ptr = a;
|
||||
x_ptr = x;
|
||||
FLOAT temp0 = 0.0;
|
||||
FLOAT temp1 = 0.0;
|
||||
FLOAT temp2 = 0.0;
|
||||
FLOAT temp3 = 0.0;
|
||||
if ( lda == 4 && inc_x ==1 )
|
||||
{
|
||||
|
||||
for( i = 0; i < ( n & -4 ); i+=4 )
|
||||
{
|
||||
|
||||
temp0 += a_ptr[0] * x_ptr[0] + a_ptr[4] * x_ptr[1];
|
||||
temp1 += a_ptr[1] * x_ptr[0] + a_ptr[5] * x_ptr[1];
|
||||
temp2 += a_ptr[2] * x_ptr[0] + a_ptr[6] * x_ptr[1];
|
||||
temp3 += a_ptr[3] * x_ptr[0] + a_ptr[7] * x_ptr[1];
|
||||
|
||||
temp0 += a_ptr[8] * x_ptr[2] + a_ptr[12] * x_ptr[3];
|
||||
temp1 += a_ptr[9] * x_ptr[2] + a_ptr[13] * x_ptr[3];
|
||||
temp2 += a_ptr[10] * x_ptr[2] + a_ptr[14] * x_ptr[3];
|
||||
temp3 += a_ptr[11] * x_ptr[2] + a_ptr[15] * x_ptr[3];
|
||||
|
||||
a_ptr += 16;
|
||||
x_ptr += 4;
|
||||
}
|
||||
|
||||
for( ; i < n; i++ )
|
||||
{
|
||||
temp0 += a_ptr[0] * x_ptr[0];
|
||||
temp1 += a_ptr[1] * x_ptr[0];
|
||||
temp2 += a_ptr[2] * x_ptr[0];
|
||||
temp3 += a_ptr[3] * x_ptr[0] ;
|
||||
a_ptr +=4;
|
||||
x_ptr ++;
|
||||
}
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
for( i = 0; i < n; i++ )
|
||||
{
|
||||
temp0 += a_ptr[0] * x_ptr[0];
|
||||
temp1 += a_ptr[1] * x_ptr[0];
|
||||
temp2 += a_ptr[2] * x_ptr[0];
|
||||
temp3 += a_ptr[3] * x_ptr[0];
|
||||
a_ptr += lda;
|
||||
x_ptr += inc_x;
|
||||
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
y_ptr[0] += alpha * temp0;
|
||||
y_ptr += inc_y;
|
||||
y_ptr[0] += alpha * temp1;
|
||||
y_ptr += inc_y;
|
||||
y_ptr[0] += alpha * temp2;
|
||||
y_ptr += inc_y;
|
||||
y_ptr[0] += alpha * temp3;
|
||||
y_ptr += inc_y;
|
||||
a += 4;
|
||||
}
|
||||
|
||||
|
||||
if ( m3 & 2 )
|
||||
{
|
||||
a_ptr = a;
|
||||
x_ptr = x;
|
||||
FLOAT temp0 = 0.0;
|
||||
FLOAT temp1 = 0.0;
|
||||
if ( lda == 2 && inc_x ==1 )
|
||||
{
|
||||
|
||||
for( i = 0; i < (n & -4) ; i+=4 )
|
||||
{
|
||||
temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1];
|
||||
temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1];
|
||||
temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3];
|
||||
temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3];
|
||||
a_ptr += 8;
|
||||
x_ptr += 4;
|
||||
|
||||
}
|
||||
|
||||
|
||||
for( ; i < n; i++ )
|
||||
{
|
||||
temp0 += a_ptr[0] * x_ptr[0];
|
||||
temp1 += a_ptr[1] * x_ptr[0];
|
||||
a_ptr += 2;
|
||||
x_ptr ++;
|
||||
}
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
for( i = 0; i < n; i++ )
|
||||
{
|
||||
temp0 += a_ptr[0] * x_ptr[0];
|
||||
temp1 += a_ptr[1] * x_ptr[0];
|
||||
a_ptr += lda;
|
||||
x_ptr += inc_x;
|
||||
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
y_ptr[0] += alpha * temp0;
|
||||
y_ptr += inc_y;
|
||||
y_ptr[0] += alpha * temp1;
|
||||
y_ptr += inc_y;
|
||||
a += 2;
|
||||
}
|
||||
|
||||
if ( m3 & 1 )
|
||||
{
|
||||
a_ptr = a;
|
||||
x_ptr = x;
|
||||
FLOAT temp = 0.0;
|
||||
if ( lda == 1 && inc_x ==1 )
|
||||
{
|
||||
|
||||
for( i = 0; i < (n & -4); i+=4 )
|
||||
{
|
||||
temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3];
|
||||
|
||||
}
|
||||
|
||||
for( ; i < n; i++ )
|
||||
{
|
||||
temp += a_ptr[i] * x_ptr[i];
|
||||
}
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
for( i = 0; i < n; i++ )
|
||||
{
|
||||
temp += a_ptr[0] * x_ptr[0];
|
||||
a_ptr += lda;
|
||||
x_ptr += inc_x;
|
||||
}
|
||||
|
||||
}
|
||||
y_ptr[0] += alpha * temp;
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
return(0);
|
||||
}
|
||||
|
||||
|
|
@ -0,0 +1,480 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2019, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#define NBMAX 2048
|
||||
|
||||
#include <altivec.h>
|
||||
|
||||
static void sgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) {
|
||||
BLASLONG i;
|
||||
FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7;
|
||||
__vector float *va0, *va1, *va2, *va3, *va4, *va5, *va6, *va7, *v_x;
|
||||
register __vector float temp0 = {0,0,0,0};
|
||||
register __vector float temp1 = {0,0,0,0};
|
||||
register __vector float temp2 = {0,0,0,0};
|
||||
register __vector float temp3 = {0,0,0,0};
|
||||
register __vector float temp4 = {0,0,0,0};
|
||||
register __vector float temp5 = {0,0,0,0};
|
||||
register __vector float temp6 = {0,0,0,0};
|
||||
register __vector float temp7 = {0,0,0,0};
|
||||
|
||||
a0 = ap;
|
||||
a1 = ap + lda;
|
||||
a2 = a1 + lda;
|
||||
a3 = a2 + lda;
|
||||
a4 = a3 + lda;
|
||||
a5 = a4 + lda;
|
||||
a6 = a5 + lda;
|
||||
a7 = a6 + lda;
|
||||
va0 = (__vector float*) a0;
|
||||
va1 = (__vector float*) a1;
|
||||
va2 = (__vector float*) a2;
|
||||
va3 = (__vector float*) a3;
|
||||
va4 = (__vector float*) a4;
|
||||
va5 = (__vector float*) a5;
|
||||
va6 = (__vector float*) a6;
|
||||
va7 = (__vector float*) a7;
|
||||
v_x = (__vector float*) x;
|
||||
|
||||
|
||||
for (i = 0; i < n/4; i ++) {
|
||||
temp0 += v_x[i] * va0[i];
|
||||
temp1 += v_x[i] * va1[i];
|
||||
temp2 += v_x[i] * va2[i];
|
||||
temp3 += v_x[i] * va3[i];
|
||||
temp4 += v_x[i] * va4[i];
|
||||
temp5 += v_x[i] * va5[i];
|
||||
temp6 += v_x[i] * va6[i];
|
||||
temp7 += v_x[i] * va7[i];
|
||||
}
|
||||
|
||||
|
||||
y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]);
|
||||
y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]);
|
||||
y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]);
|
||||
y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]);
|
||||
|
||||
y[4] += alpha * (temp4[0] + temp4[1]+temp4[2] + temp4[3]);
|
||||
y[5] += alpha * (temp5[0] + temp5[1]+temp5[2] + temp5[3]);
|
||||
y[6] += alpha * (temp6[0] + temp6[1]+temp6[2] + temp6[3]);
|
||||
y[7] += alpha * (temp7[0] + temp7[1]+temp7[2] + temp7[3]);
|
||||
|
||||
}
|
||||
|
||||
|
||||
static void sgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) {
|
||||
BLASLONG i = 0;
|
||||
FLOAT *a0, *a1, *a2, *a3;
|
||||
a0 = ap;
|
||||
a1 = ap + lda;
|
||||
a2 = a1 + lda;
|
||||
a3 = a2 + lda;
|
||||
__vector float* va0 = (__vector float*) a0;
|
||||
__vector float* va1 = (__vector float*) a1;
|
||||
__vector float* va2 = (__vector float*) a2;
|
||||
__vector float* va3 = (__vector float*) a3;
|
||||
__vector float* v_x = (__vector float*) x;
|
||||
register __vector float temp0 = {0,0,0,0};
|
||||
register __vector float temp1 = {0,0,0,0};
|
||||
register __vector float temp2 = {0,0,0,0};
|
||||
register __vector float temp3 = {0,0,0,0};
|
||||
|
||||
for (i = 0; i < n / 4; i ++) {
|
||||
temp0 += v_x[i] * va0[i];
|
||||
temp1 += v_x[i] * va1[i];
|
||||
temp2 += v_x[i] * va2[i];
|
||||
temp3 += v_x[i] * va3[i];
|
||||
}
|
||||
|
||||
y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]);
|
||||
y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]);
|
||||
y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]);
|
||||
y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]);
|
||||
|
||||
}
|
||||
|
||||
|
||||
static void sgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha, BLASLONG inc_y) {
|
||||
|
||||
BLASLONG i;
|
||||
FLOAT *a0, *a1;
|
||||
a0 = ap;
|
||||
a1 = ap + lda;
|
||||
__vector float* va0 = (__vector float*) a0;
|
||||
__vector float* va1 = (__vector float*) a1;
|
||||
__vector float* v_x = (__vector float*) x;
|
||||
__vector float temp0 = {0,0,0,0};
|
||||
__vector float temp1 = {0,0,0,0};
|
||||
for (i = 0; i < n / 4; i ++) {
|
||||
temp0 += v_x[i] * va0[i];
|
||||
temp1 += v_x[i] * va1[i];
|
||||
}
|
||||
|
||||
|
||||
|
||||
y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]);
|
||||
y[inc_y] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]);
|
||||
}
|
||||
|
||||
static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) {
|
||||
|
||||
BLASLONG i;
|
||||
FLOAT *a0;
|
||||
a0 = ap;
|
||||
__vector float* va0 = (__vector float*) a0;
|
||||
__vector float* v_x = (__vector float*) x;
|
||||
__vector float temp0 = {0,0,0,0};
|
||||
for (i = 0; i < n / 4; i ++) {
|
||||
temp0 += v_x[i] * va0[i] ;
|
||||
}
|
||||
|
||||
y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]);
|
||||
|
||||
}
|
||||
|
||||
static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) {
|
||||
BLASLONG i;
|
||||
for (i = 0; i < n; i++) {
|
||||
*dest++ = *src;
|
||||
src += inc_src;
|
||||
}
|
||||
}
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) {
|
||||
BLASLONG i;
|
||||
BLASLONG j;
|
||||
FLOAT *a_ptr;
|
||||
FLOAT *x_ptr;
|
||||
FLOAT *y_ptr;
|
||||
|
||||
BLASLONG n1;
|
||||
BLASLONG m1;
|
||||
BLASLONG m2;
|
||||
BLASLONG m3;
|
||||
BLASLONG n2;
|
||||
|
||||
FLOAT ybuffer[8], *xbuffer;
|
||||
|
||||
if (m < 1) return (0);
|
||||
if (n < 1) return (0);
|
||||
|
||||
xbuffer = buffer;
|
||||
|
||||
n1 = n >> 3;
|
||||
n2 = n & 7;
|
||||
|
||||
m3 = m & 3;
|
||||
m1 = m - m3;
|
||||
m2 = (m & (NBMAX - 1)) - m3;
|
||||
|
||||
BLASLONG NB = NBMAX;
|
||||
|
||||
while (NB == NBMAX) {
|
||||
|
||||
m1 -= NB;
|
||||
if (m1 < 0) {
|
||||
if (m2 == 0) break;
|
||||
NB = m2;
|
||||
}
|
||||
|
||||
y_ptr = y;
|
||||
a_ptr = a;
|
||||
x_ptr = x;
|
||||
|
||||
if (inc_x != 1)
|
||||
copy_x(NB, x_ptr, xbuffer, inc_x);
|
||||
else
|
||||
xbuffer = x_ptr;
|
||||
|
||||
BLASLONG lda8 = lda << 3;
|
||||
|
||||
|
||||
if (inc_y == 1) {
|
||||
|
||||
for (i = 0; i < n1; i++) {
|
||||
|
||||
sgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, y_ptr, alpha);
|
||||
|
||||
y_ptr += 8;
|
||||
a_ptr += lda8;
|
||||
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
for (i = 0; i < n1; i++) {
|
||||
ybuffer[0] = 0;
|
||||
ybuffer[1] = 0;
|
||||
ybuffer[2] = 0;
|
||||
ybuffer[3] = 0;
|
||||
ybuffer[4] = 0;
|
||||
ybuffer[5] = 0;
|
||||
ybuffer[6] = 0;
|
||||
ybuffer[7] = 0;
|
||||
sgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, ybuffer, alpha);
|
||||
|
||||
|
||||
|
||||
*y_ptr += ybuffer[0];
|
||||
y_ptr += inc_y;
|
||||
*y_ptr += ybuffer[1];
|
||||
y_ptr += inc_y;
|
||||
*y_ptr += ybuffer[2];
|
||||
y_ptr += inc_y;
|
||||
*y_ptr += ybuffer[3];
|
||||
y_ptr += inc_y;
|
||||
|
||||
*y_ptr += ybuffer[4];
|
||||
y_ptr += inc_y;
|
||||
*y_ptr += ybuffer[5];
|
||||
y_ptr += inc_y;
|
||||
*y_ptr += ybuffer[6];
|
||||
y_ptr += inc_y;
|
||||
*y_ptr += ybuffer[7];
|
||||
y_ptr += inc_y;
|
||||
|
||||
a_ptr += lda8;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
if (n2 & 4) {
|
||||
ybuffer[0] = 0;
|
||||
ybuffer[1] = 0;
|
||||
ybuffer[2] = 0;
|
||||
ybuffer[3] = 0;
|
||||
sgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha);
|
||||
|
||||
a_ptr += lda<<2;
|
||||
|
||||
*y_ptr += ybuffer[0];
|
||||
y_ptr += inc_y;
|
||||
*y_ptr += ybuffer[1];
|
||||
y_ptr += inc_y;
|
||||
*y_ptr += ybuffer[2];
|
||||
y_ptr += inc_y;
|
||||
*y_ptr += ybuffer[3];
|
||||
y_ptr += inc_y;
|
||||
}
|
||||
|
||||
if (n2 & 2) {
|
||||
sgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha, inc_y);
|
||||
a_ptr += lda << 1;
|
||||
y_ptr += 2 * inc_y;
|
||||
|
||||
}
|
||||
|
||||
if (n2 & 1) {
|
||||
sgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha);
|
||||
a_ptr += lda;
|
||||
y_ptr += inc_y;
|
||||
|
||||
}
|
||||
|
||||
a += NB;
|
||||
x += NB * inc_x;
|
||||
|
||||
|
||||
}
|
||||
|
||||
if (m3 == 0) return (0);
|
||||
|
||||
x_ptr = x;
|
||||
a_ptr = a;
|
||||
if (m3 == 3) {
|
||||
FLOAT xtemp0 = *x_ptr * alpha;
|
||||
x_ptr += inc_x;
|
||||
FLOAT xtemp1 = *x_ptr * alpha;
|
||||
x_ptr += inc_x;
|
||||
FLOAT xtemp2 = *x_ptr * alpha;
|
||||
|
||||
FLOAT *aj = a_ptr;
|
||||
y_ptr = y;
|
||||
|
||||
if (lda == 3 && inc_y == 1) {
|
||||
|
||||
for (j = 0; j < (n & -4); j += 4) {
|
||||
|
||||
y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2;
|
||||
y_ptr[j + 1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2;
|
||||
y_ptr[j + 2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2;
|
||||
y_ptr[j + 3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2;
|
||||
aj += 12;
|
||||
}
|
||||
|
||||
for (; j < n; j++) {
|
||||
y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2;
|
||||
aj += 3;
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
if (inc_y == 1) {
|
||||
|
||||
BLASLONG register lda2 = lda << 1;
|
||||
BLASLONG register lda4 = lda << 2;
|
||||
BLASLONG register lda3 = lda2 + lda;
|
||||
|
||||
for (j = 0; j < (n & -4); j += 4) {
|
||||
|
||||
y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2;
|
||||
y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1 + *(aj + lda + 2) * xtemp2;
|
||||
y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1 + *(aj + lda2 + 2) * xtemp2;
|
||||
y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1 + *(aj + lda3 + 2) * xtemp2;
|
||||
aj += lda4;
|
||||
}
|
||||
|
||||
for (; j < n; j++) {
|
||||
|
||||
y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2;
|
||||
aj += lda;
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
for (j = 0; j < n; j++) {
|
||||
*y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2;
|
||||
y_ptr += inc_y;
|
||||
aj += lda;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
return (0);
|
||||
}
|
||||
|
||||
if (m3 == 2) {
|
||||
FLOAT xtemp0 = *x_ptr * alpha;
|
||||
x_ptr += inc_x;
|
||||
FLOAT xtemp1 = *x_ptr * alpha;
|
||||
|
||||
FLOAT *aj = a_ptr;
|
||||
y_ptr = y;
|
||||
|
||||
if (lda == 2 && inc_y == 1) {
|
||||
|
||||
for (j = 0; j < (n & -4); j += 4) {
|
||||
y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1;
|
||||
y_ptr[j + 1] += aj[2] * xtemp0 + aj[3] * xtemp1;
|
||||
y_ptr[j + 2] += aj[4] * xtemp0 + aj[5] * xtemp1;
|
||||
y_ptr[j + 3] += aj[6] * xtemp0 + aj[7] * xtemp1;
|
||||
aj += 8;
|
||||
|
||||
}
|
||||
|
||||
for (; j < n; j++) {
|
||||
y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1;
|
||||
aj += 2;
|
||||
}
|
||||
|
||||
} else {
|
||||
if (inc_y == 1) {
|
||||
|
||||
BLASLONG register lda2 = lda << 1;
|
||||
BLASLONG register lda4 = lda << 2;
|
||||
BLASLONG register lda3 = lda2 + lda;
|
||||
|
||||
for (j = 0; j < (n & -4); j += 4) {
|
||||
|
||||
y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1;
|
||||
y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1;
|
||||
y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1;
|
||||
y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1;
|
||||
aj += lda4;
|
||||
}
|
||||
|
||||
for (; j < n; j++) {
|
||||
|
||||
y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1;
|
||||
aj += lda;
|
||||
}
|
||||
|
||||
} else {
|
||||
for (j = 0; j < n; j++) {
|
||||
*y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1;
|
||||
y_ptr += inc_y;
|
||||
aj += lda;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
return (0);
|
||||
|
||||
}
|
||||
|
||||
FLOAT xtemp = *x_ptr * alpha;
|
||||
FLOAT *aj = a_ptr;
|
||||
y_ptr = y;
|
||||
if (lda == 1 && inc_y == 1) {
|
||||
for (j = 0; j < (n & -4); j += 4) {
|
||||
y_ptr[j] += aj[j] * xtemp;
|
||||
y_ptr[j + 1] += aj[j + 1] * xtemp;
|
||||
y_ptr[j + 2] += aj[j + 2] * xtemp;
|
||||
y_ptr[j + 3] += aj[j + 3] * xtemp;
|
||||
}
|
||||
for (; j < n; j++) {
|
||||
y_ptr[j] += aj[j] * xtemp;
|
||||
}
|
||||
|
||||
|
||||
} else {
|
||||
if (inc_y == 1) {
|
||||
|
||||
BLASLONG register lda2 = lda << 1;
|
||||
BLASLONG register lda4 = lda << 2;
|
||||
BLASLONG register lda3 = lda2 + lda;
|
||||
for (j = 0; j < (n & -4); j += 4) {
|
||||
y_ptr[j] += *aj * xtemp;
|
||||
y_ptr[j + 1] += *(aj + lda) * xtemp;
|
||||
y_ptr[j + 2] += *(aj + lda2) * xtemp;
|
||||
y_ptr[j + 3] += *(aj + lda3) * xtemp;
|
||||
aj += lda4;
|
||||
}
|
||||
|
||||
for (; j < n; j++) {
|
||||
y_ptr[j] += *aj * xtemp;
|
||||
aj += lda;
|
||||
}
|
||||
|
||||
} else {
|
||||
for (j = 0; j < n; j++) {
|
||||
*y_ptr += *aj * xtemp;
|
||||
y_ptr += inc_y;
|
||||
aj += lda;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
return (0);
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,508 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2019, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
/****Note***
|
||||
UnUsed kernel
|
||||
This kernel works. But it was not competitive enough to be added in production
|
||||
It could be used and tested in future or could be used as base for switching to inline assembly
|
||||
*/
|
||||
|
||||
#include "common.h"
|
||||
#include <stdio.h>
|
||||
#define NBMAX 4096
|
||||
|
||||
#include <altivec.h>
|
||||
|
||||
static void sgemv_kernel_8x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) {
|
||||
BLASLONG i;
|
||||
FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7;
|
||||
__vector float *va0, *va1, *va2, *va3, *va4, *va5, *va6, *va7, *v_x;
|
||||
register __vector float temp0 = {0,0,0,0};
|
||||
register __vector float temp1 = {0,0,0,0};
|
||||
register __vector float temp2 = {0,0,0,0};
|
||||
register __vector float temp3 = {0,0,0,0};
|
||||
register __vector float temp4 = {0,0,0,0};
|
||||
register __vector float temp5 = {0,0,0,0};
|
||||
register __vector float temp6 = {0,0,0,0};
|
||||
register __vector float temp7 = {0,0,0,0};
|
||||
|
||||
a0 = ap;
|
||||
a1 = ap + lda;
|
||||
a2 = a1 + lda;
|
||||
a3 = a2 + lda;
|
||||
a4 = a3 + lda;
|
||||
a5 = a4 + lda;
|
||||
a6 = a5 + lda;
|
||||
a7 = a6 + lda;
|
||||
va0 = (__vector float*) a0;
|
||||
va1 = (__vector float*) a1;
|
||||
va2 = (__vector float*) a2;
|
||||
va3 = (__vector float*) a3;
|
||||
va4 = (__vector float*) a4;
|
||||
va5 = (__vector float*) a5;
|
||||
va6 = (__vector float*) a6;
|
||||
va7 = (__vector float*) a7;
|
||||
v_x = (__vector float*) x;
|
||||
|
||||
|
||||
for (i = 0; i < n/4; i +=2) {
|
||||
register __vector float vx1=v_x[i] ;
|
||||
register __vector float vx2=v_x[i+1] ;
|
||||
register __vector float va0_1=va0[i] ;
|
||||
register __vector float va0_2=va0[i+1] ;
|
||||
register __vector float va1_1=va1[i] ;
|
||||
register __vector float va1_2=va1[i+1] ;
|
||||
register __vector float va2_1=va2[i] ;
|
||||
register __vector float va2_2=va2[i+1] ;
|
||||
register __vector float va3_1=va3[i] ;
|
||||
register __vector float va3_2=va3[i+1] ;
|
||||
register __vector float va4_1=va4[i] ;
|
||||
register __vector float va4_2=va4[i+1] ;
|
||||
register __vector float va5_1=va5[i] ;
|
||||
register __vector float va5_2=va5[i+1] ;
|
||||
register __vector float va6_1=va6[i] ;
|
||||
register __vector float va6_2=va6[i+1] ;
|
||||
register __vector float va7_1=va7[i] ;
|
||||
register __vector float va7_2=va7[i+1] ;
|
||||
temp0 += vx1* va0_1 + vx2 * va0_2;
|
||||
temp1 += vx1* va1_1 + vx2 * va1_2;
|
||||
temp2 += vx1* va2_1 + vx2 * va2_2;
|
||||
temp3 += vx1* va3_1 + vx2 * va3_2;
|
||||
temp4 += vx1* va4_1 + vx2 * va4_2;
|
||||
temp5 += vx1* va5_1 + vx2 * va5_2;
|
||||
temp6 += vx1* va6_1 + vx2 * va6_2;
|
||||
temp7 += vx1* va7_1 + vx2 * va7_2;
|
||||
}
|
||||
|
||||
|
||||
y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]);
|
||||
y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]);
|
||||
y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]);
|
||||
y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]);
|
||||
|
||||
y[4] += alpha * (temp4[0] + temp4[1]+temp4[2] + temp4[3]);
|
||||
y[5] += alpha * (temp5[0] + temp5[1]+temp5[2] + temp5[3]);
|
||||
y[6] += alpha * (temp6[0] + temp6[1]+temp6[2] + temp6[3]);
|
||||
y[7] += alpha * (temp7[0] + temp7[1]+temp7[2] + temp7[3]);
|
||||
|
||||
}
|
||||
|
||||
|
||||
static void sgemv_kernel_8x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) {
|
||||
BLASLONG i = 0;
|
||||
FLOAT *a0, *a1, *a2, *a3;
|
||||
a0 = ap;
|
||||
a1 = ap + lda;
|
||||
a2 = a1 + lda;
|
||||
a3 = a2 + lda;
|
||||
__vector float* va0 = (__vector float*) a0;
|
||||
__vector float* va1 = (__vector float*) a1;
|
||||
__vector float* va2 = (__vector float*) a2;
|
||||
__vector float* va3 = (__vector float*) a3;
|
||||
__vector float* v_x = (__vector float*) x;
|
||||
register __vector float temp0 = {0,0,0,0};
|
||||
register __vector float temp1 = {0,0,0,0};
|
||||
register __vector float temp2 = {0,0,0,0};
|
||||
register __vector float temp3 = {0,0,0,0};
|
||||
|
||||
for (i = 0; i < n / 4; i +=2) {
|
||||
temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1];
|
||||
temp1 += v_x[i] * va1[i] + v_x[i+1] * va1[i+1];
|
||||
temp2 += v_x[i] * va2[i] + v_x[i+1] * va2[i+1];
|
||||
temp3 += v_x[i] * va3[i] + v_x[i+1] * va3[i+1];
|
||||
}
|
||||
|
||||
y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]);
|
||||
y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]);
|
||||
y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]);
|
||||
y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]);
|
||||
|
||||
}
|
||||
|
||||
|
||||
/* Two-column SGEMV helper: dot products of x with two consecutive
   columns of A (column stride lda), accumulated into y[0] and y[inc_y]
   so the caller may pass a strided y directly.  n is assumed to be a
   multiple of 8. */
static void sgemv_kernel_8x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha, BLASLONG inc_y) {

    __vector float *vcol0 = (__vector float*) ap;
    __vector float *vcol1 = (__vector float*) (ap + lda);
    __vector float *vx    = (__vector float*) x;

    __vector float acc0 = {0, 0, 0, 0};
    __vector float acc1 = {0, 0, 0, 0};

    BLASLONG k;
    for (k = 0; k < n / 4; k += 2) {
        acc0 += vx[k] * vcol0[k] + vx[k + 1] * vcol0[k + 1];
        acc1 += vx[k] * vcol1[k] + vx[k + 1] * vcol1[k + 1];
    }

    /* Reduce and scatter the two results with the caller's y stride. */
    y[0]     += alpha * (acc0[0] + acc0[1] + acc0[2] + acc0[3]);
    y[inc_y] += alpha * (acc1[0] + acc1[1] + acc1[2] + acc1[3]);
}
|
||||
|
||||
/* Single-column SGEMV helper: y[0] += alpha * dot(x, ap[0..n-1]).
   n is assumed to be a multiple of 8. */
static void sgemv_kernel_8x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) {

    __vector float *vcol = (__vector float*) ap;
    __vector float *vx   = (__vector float*) x;
    __vector float acc   = {0, 0, 0, 0};

    BLASLONG k;
    for (k = 0; k < n / 4; k += 2) {
        acc += vx[k] * vcol[k] + vx[k + 1] * vcol[k + 1];
    }

    /* Horizontal sum, then scaled update of the single output. */
    y[0] += alpha * (acc[0] + acc[1] + acc[2] + acc[3]);
}
|
||||
|
||||
|
||||
/* Gather n elements from src (read with stride inc_src) into the
   contiguous buffer dest, so vector kernels can load with unit stride. */
static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) {
    BLASLONG k;
    for (k = 0; k < n; k++) {
        dest[k] = *src;
        src += inc_src;
    }
}
|
||||
|
||||
/* SGEMV driver: accumulates alpha * (dot of x with each of n columns of A)
   into y, blocking the m (vector-length) dimension into panels of at most
   NBMAX rows and dispatching to the 8x8/8x4/8x2/8x1 vector kernels above.
   Strided x is gathered into `buffer`; strided y is staged via `ybuffer`.
   NOTE(review): comments assume column-major A with leading dimension lda
   and NBMAX a power of two that is a multiple of 8 -- confirm against the
   top of this file (outside this view). */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) {
    BLASLONG i;
    BLASLONG j;
    FLOAT *a_ptr;
    FLOAT *x_ptr;
    FLOAT *y_ptr;

    BLASLONG n1;                 /* n / 8: full groups of 8 columns          */
    BLASLONG m1;                 /* rows covered by full NBMAX panels        */
    BLASLONG m2;                 /* leftover rows below NBMAX (multiple of 8) */
    BLASLONG m3;                 /* trailing m % 8 rows, handled scalar below */
    BLASLONG n2;                 /* n % 8: leftover columns                  */

    FLOAT ybuffer[8], *xbuffer;

    /* Quick return for degenerate sizes. */
    if (m < 1) return (0);
    if (n < 1) return (0);

    xbuffer = buffer;

    n1 = n >> 3;
    n2 = n & 7;

    m3 = m & 7;
    m1 = m - m3;
    m2 = (m & (NBMAX - 1)) - m3;

    BLASLONG NB = NBMAX;

    /* Panel loop over m.  The final short panel runs with NB = m2, after
       which the loop condition fails and we fall through to the m3 tail. */
    while (NB == NBMAX) {

        m1 -= NB;
        if (m1 < 0) {
            if (m2 == 0) break;  /* no partial panel remains */
            NB = m2;             /* last, shorter panel      */
        }

        y_ptr = y;
        a_ptr = a;
        x_ptr = x;

        /* Gather strided x into contiguous scratch for the kernels. */
        if (inc_x != 1)
            copy_x(NB, x_ptr, xbuffer, inc_x);
        else
            xbuffer = x_ptr;

        BLASLONG lda8 = lda << 3;  /* element offset spanning 8 columns */

        if (inc_y == 1) {

            /* Contiguous y: kernels accumulate into y directly. */
            for (i = 0; i < n1; i++) {

                sgemv_kernel_8x8(NB, lda, a_ptr, xbuffer, y_ptr, alpha);

                y_ptr += 8;
                a_ptr += lda8;

            }

        } else {

            /* Strided y: accumulate into ybuffer, then scatter. */
            for (i = 0; i < n1; i++) {
                ybuffer[0] = 0;
                ybuffer[1] = 0;
                ybuffer[2] = 0;
                ybuffer[3] = 0;
                ybuffer[4] = 0;
                ybuffer[5] = 0;
                ybuffer[6] = 0;
                ybuffer[7] = 0;
                sgemv_kernel_8x8(NB, lda, a_ptr, xbuffer, ybuffer, alpha);

                /* Scatter the 8 partial results into strided y. */
                *y_ptr += ybuffer[0];
                y_ptr += inc_y;
                *y_ptr += ybuffer[1];
                y_ptr += inc_y;
                *y_ptr += ybuffer[2];
                y_ptr += inc_y;
                *y_ptr += ybuffer[3];
                y_ptr += inc_y;

                *y_ptr += ybuffer[4];
                y_ptr += inc_y;
                *y_ptr += ybuffer[5];
                y_ptr += inc_y;
                *y_ptr += ybuffer[6];
                y_ptr += inc_y;
                *y_ptr += ybuffer[7];
                y_ptr += inc_y;

                a_ptr += lda8;
            }

        }

        /* Column tails for this panel: 4, then 2, then 1 columns. */
        if (n2 & 4) {
            ybuffer[0] = 0;
            ybuffer[1] = 0;
            ybuffer[2] = 0;
            ybuffer[3] = 0;
            sgemv_kernel_8x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha);

            a_ptr += lda<<2;

            *y_ptr += ybuffer[0];
            y_ptr += inc_y;
            *y_ptr += ybuffer[1];
            y_ptr += inc_y;
            *y_ptr += ybuffer[2];
            y_ptr += inc_y;
            *y_ptr += ybuffer[3];
            y_ptr += inc_y;
        }

        if (n2 & 2) {
            /* 8x2 kernel applies the y stride itself. */
            sgemv_kernel_8x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha, inc_y);
            a_ptr += lda << 1;
            y_ptr += 2 * inc_y;

        }

        if (n2 & 1) {
            sgemv_kernel_8x1(NB, a_ptr, xbuffer, y_ptr, alpha);
            a_ptr += lda;
            y_ptr += inc_y;

        }

        /* Advance to the next panel of rows. */
        a += NB;
        x += NB * inc_x;

    }

    /* Scalar cleanup for the trailing m3 = m % 8 rows, applied across
       all n columns; a and x already point past the processed panels. */
    if (m3 == 0) return (0);

    x_ptr = x;
    a_ptr = a;
    if (m3 & 4) {
        /* Four trailing rows: pre-scale the four x values by alpha. */
        FLOAT xtemp0 = *x_ptr * alpha;
        x_ptr += inc_x;
        FLOAT xtemp1 = *x_ptr * alpha;
        x_ptr += inc_x;
        FLOAT xtemp2 = *x_ptr * alpha;
        x_ptr += inc_x;
        FLOAT xtemp3 = *x_ptr * alpha;
        x_ptr += inc_x;
        FLOAT *aj = a_ptr;
        y_ptr = y;
        if (lda == 4 && inc_y == 1) {

            /* Fully packed case (columns contiguous): unroll by 4. */
            for (j = 0; j < (n & -4); j += 4) {
                y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2 + aj[3] * xtemp3;
                y_ptr[j + 1] += aj[4] * xtemp0 + aj[5] * xtemp1 + aj[6] * xtemp2 + aj[7] * xtemp3;
                y_ptr[j + 2] += aj[8] * xtemp0 + aj[9] * xtemp1 + aj[10] * xtemp2 + aj[11] * xtemp3;
                y_ptr[j + 3] += aj[12] * xtemp0 + aj[13] * xtemp1 + aj[14] * xtemp2 + aj[15] * xtemp3;
                aj += 16;

            }

            for (; j < n; j++) {
                y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2 + aj[3] * xtemp3;
                aj += 4;
            }

        } else if (inc_y == 1) {

            /* Contiguous y, general lda: unroll columns by 4. */
            BLASLONG register lda2 = lda << 1;
            BLASLONG register lda4 = lda << 2;
            BLASLONG register lda3 = lda2 + lda;

            for (j = 0; j < (n & -4); j += 4) {

                y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2 + *(aj + 3) * xtemp3;
                y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1 + *(aj + lda + 2) * xtemp2 + *(aj + lda +3) * xtemp3;
                y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1 + *(aj + lda2 + 2) * xtemp2 + *(aj + lda2 +3) * xtemp3;
                y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1 + *(aj + lda3 + 2) * xtemp2 + *(aj + lda3+3) * xtemp3;
                aj += lda4;
            }

            for (; j < n; j++) {

                y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2+*(aj + 3) * xtemp3;
                aj += lda;
            }

        } else {

            /* Strided y: plain column loop. */
            for (j = 0; j < n; j++) {
                *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2+ *(aj + 3) * xtemp3;
                y_ptr += inc_y;
                aj += lda;
            }

        }
        if (m3==4) return (0);
        a_ptr += 4;
    }

    if (m3 & 2 ) {

        /* Two trailing rows. */
        FLOAT xtemp0 = *x_ptr * alpha;
        x_ptr += inc_x;
        FLOAT xtemp1 = *x_ptr * alpha;
        x_ptr += inc_x;
        FLOAT *aj = a_ptr;
        y_ptr = y;

        if (lda == 2 && inc_y == 1) {

            /* Packed two-row case: unroll by 4 columns. */
            for (j = 0; j < (n & -4); j += 4) {
                y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1;
                y_ptr[j + 1] += aj[2] * xtemp0 + aj[3] * xtemp1;
                y_ptr[j + 2] += aj[4] * xtemp0 + aj[5] * xtemp1;
                y_ptr[j + 3] += aj[6] * xtemp0 + aj[7] * xtemp1;
                aj += 8;

            }

            for (; j < n; j++) {
                y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1;
                aj += 2;
            }

        } else {
            if (inc_y == 1) {

                /* Contiguous y, general lda: unroll columns by 4. */
                BLASLONG register lda2 = lda << 1;
                BLASLONG register lda4 = lda << 2;
                BLASLONG register lda3 = lda2 + lda;

                for (j = 0; j < (n & -4); j += 4) {

                    y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1;
                    y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1;
                    y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1;
                    y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1;
                    aj += lda4;
                }

                for (; j < n; j++) {

                    y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1;
                    aj += lda;
                }

            } else {
                /* Strided y: plain column loop. */
                for (j = 0; j < n; j++) {
                    *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1;
                    y_ptr += inc_y;
                    aj += lda;
                }
            }

        }
        if (m3==2) return (0);
        a_ptr += 2;
    }
    if (m3 & 1) {

        /* Single trailing row. */
        FLOAT xtemp = *x_ptr * alpha;
        x_ptr += inc_x;
        FLOAT *aj = a_ptr;
        y_ptr = y;
        if (lda == 1 && inc_y == 1) {
            /* Fully packed single-row case. */
            for (j = 0; j < (n & -4); j += 4) {
                y_ptr[j] += aj[j] * xtemp;
                y_ptr[j + 1] += aj[j + 1] * xtemp;
                y_ptr[j + 2] += aj[j + 2] * xtemp;
                y_ptr[j + 3] += aj[j + 3] * xtemp;
            }
            for (; j < n; j++) {
                y_ptr[j] += aj[j] * xtemp;
            }

        } else {
            if (inc_y == 1) {

                BLASLONG register lda2 = lda << 1;
                BLASLONG register lda4 = lda << 2;
                BLASLONG register lda3 = lda2 + lda;
                for (j = 0; j < (n & -4); j += 4) {
                    y_ptr[j] += *aj * xtemp;
                    y_ptr[j + 1] += *(aj + lda) * xtemp;
                    y_ptr[j + 2] += *(aj + lda2) * xtemp;
                    y_ptr[j + 3] += *(aj + lda3) * xtemp;
                    aj += lda4;
                }

                for (; j < n; j++) {
                    y_ptr[j] += *aj * xtemp;
                    aj += lda;
                }

            } else {
                /* Strided y: plain column loop. */
                for (j = 0; j < n; j++) {
                    *y_ptr += *aj * xtemp;
                    y_ptr += inc_y;
                    aj += lda;
                }

            }

        }
        a_ptr += 1;
    }
    return (0);

}
|
||||
|
|
@ -389,20 +389,14 @@ static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) {
|
|||
register __vector double va0_2 = vptr_a0[i + 2];
|
||||
register __vector double va0_3 = vptr_a0[i + 3];
|
||||
|
||||
vy_0 += va0*vx0_r;
|
||||
vy_1 += va0_1*vx0_r;
|
||||
vy_2 += va0_2*vx0_r;
|
||||
vy_3 += va0_3*vx0_r;
|
||||
|
||||
va0 = vec_xxpermdi(va0, va0, 2);
|
||||
va0_1 = vec_xxpermdi(va0_1, va0_1, 2);
|
||||
va0_2 = vec_xxpermdi(va0_2, va0_2, 2);
|
||||
va0_3 = vec_xxpermdi(va0_3, va0_3, 2);
|
||||
|
||||
vy_0 += va0*vx0_i;
|
||||
vy_1 += va0_1*vx0_i;
|
||||
vy_2 += va0_2*vx0_i;
|
||||
vy_3 += va0_3*vx0_i;
|
||||
register __vector double va0x = vec_xxpermdi(va0, va0, 2);
|
||||
register __vector double va0x_1 = vec_xxpermdi(va0_1, va0_1, 2);
|
||||
register __vector double va0x_2 = vec_xxpermdi(va0_2, va0_2, 2);
|
||||
register __vector double va0x_3 = vec_xxpermdi(va0_3, va0_3, 2);
|
||||
vy_0 += va0*vx0_r + va0x*vx0_i;
|
||||
vy_1 += va0_1*vx0_r + va0x_1*vx0_i;
|
||||
vy_2 += va0_2*vx0_r + va0x_2*vx0_i;
|
||||
vy_3 += va0_3*vx0_r + va0x_3*vx0_i;
|
||||
|
||||
vy[i] = vy_0;
|
||||
vy[i + 1] = vy_1;
|
||||
|
|
|
@ -59,11 +59,7 @@ static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA
|
|||
i = 0;
|
||||
n = n << 1;
|
||||
while (i < n) {
|
||||
// __builtin_prefetch(&x[i]);
|
||||
// __builtin_prefetch(&a0[i]);
|
||||
// __builtin_prefetch(&a1[i]);
|
||||
// __builtin_prefetch(&a2[i]);
|
||||
// __builtin_prefetch(&a3[i]);
|
||||
|
||||
register __vector double vx_0 = *(__vector double*) (&x[i]);
|
||||
register __vector double vx_1 = *(__vector double*) (&x[i + 2]);
|
||||
register __vector double vx_2 = *(__vector double*) (&x[i + 4]);
|
||||
|
|
|
@ -81,7 +81,7 @@ static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x)
|
|||
|
||||
"vfmaxdb %%v16,%%v16,%%v17,8 \n\t"
|
||||
|
||||
"vfmaxdb %%v0,%%v0,%%16,8 \n\t"
|
||||
"vfmaxdb %%v0,%%v0,%%v16,8 \n\t"
|
||||
|
||||
"agfi %%r1, 256 \n\t"
|
||||
"brctg %%r0, 0b \n\t"
|
||||
|
|
|
@ -81,7 +81,7 @@ static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x)
|
|||
|
||||
"vfmindb %%v16,%%v16,%%v17,8 \n\t"
|
||||
|
||||
"vfmindb %%v0,%%v0,%%16,8 \n\t"
|
||||
"vfmindb %%v0,%%v0,%%v16,8 \n\t"
|
||||
|
||||
"agfi %%r1, 256 \n\t"
|
||||
"brctg %%r0, 0b \n\t"
|
||||
|
|
|
@ -74,7 +74,7 @@ static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x)
|
|||
|
||||
"vfmaxdb %%v16,%%v16,%%v17,0 \n\t"
|
||||
|
||||
"vfmaxdb %%v0,%%v0,%%16,0 \n\t"
|
||||
"vfmaxdb %%v0,%%v0,%%v16,0 \n\t"
|
||||
|
||||
"agfi %%r1, 256 \n\t"
|
||||
"brctg %%r0, 0b \n\t"
|
||||
|
|
|
@ -74,7 +74,7 @@ static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x)
|
|||
|
||||
"vfmindb %%v16,%%v16,%%v17,0 \n\t"
|
||||
|
||||
"vfmindb %%v0,%%v0,%%16,0 \n\t"
|
||||
"vfmindb %%v0,%%v0,%%v16,0 \n\t"
|
||||
|
||||
"agfi %%r1, 256 \n\t"
|
||||
"brctg %%r0, 0b \n\t"
|
||||
|
|
|
@ -132,7 +132,7 @@ double CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y)
|
|||
while(i < n)
|
||||
{
|
||||
|
||||
dot += y[i] * x[i] ;
|
||||
dot += (double) y[i] * (double) x[i] ;
|
||||
i++ ;
|
||||
|
||||
}
|
||||
|
@ -146,7 +146,8 @@ double CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y)
|
|||
while(i < n1)
|
||||
{
|
||||
|
||||
dot += y[iy] * x[ix] + y[iy+inc_y] * x[ix+inc_x];
|
||||
dot += (double) y[iy] * (double) x[ix];
|
||||
dot += (double) y[iy+inc_y] * (double) x[ix+inc_x];
|
||||
ix += inc_x*2 ;
|
||||
iy += inc_y*2 ;
|
||||
i+=2 ;
|
||||
|
@ -156,7 +157,7 @@ double CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y)
|
|||
while(i < n)
|
||||
{
|
||||
|
||||
dot += y[iy] * x[ix] ;
|
||||
dot += (double) y[iy] * (double) x[ix] ;
|
||||
ix += inc_x ;
|
||||
iy += inc_y ;
|
||||
i++ ;
|
||||
|
|
|
@ -81,7 +81,7 @@ static FLOAT samax_kernel_64(BLASLONG n, FLOAT *x)
|
|||
|
||||
"vfmaxsb %%v16,%%v16,%%v17,8 \n\t"
|
||||
|
||||
"vfmaxsb %%v0,%%v0,%%16,8 \n\t"
|
||||
"vfmaxsb %%v0,%%v0,%%v16,8 \n\t"
|
||||
|
||||
"agfi %%r1, 256 \n\t"
|
||||
"brctg %%r0, 0b \n\t"
|
||||
|
|
|
@ -81,7 +81,7 @@ static FLOAT samin_kernel_64(BLASLONG n, FLOAT *x)
|
|||
|
||||
"vfminsb %%v16,%%v16,%%v17,8 \n\t"
|
||||
|
||||
"vfminsb %%v0,%%v0,%%16,8 \n\t"
|
||||
"vfminsb %%v0,%%v0,%%v16,8 \n\t"
|
||||
|
||||
"agfi %%r1, 256 \n\t"
|
||||
"brctg %%r0, 0b \n\t"
|
||||
|
|
|
@ -158,32 +158,24 @@ static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
|
|||
"brctg %%r0,2b \n\t"
|
||||
|
||||
"3: \n\t"
|
||||
"vrepf %%v4,%%v0,1 \n\t"
|
||||
"aebr %%f0,%%f4 \n\t"
|
||||
"vrepf %%v4,%%v0,2 \n\t"
|
||||
"aebr %%f0,%%f4 \n\t"
|
||||
"vrepf %%v4,%%v0,3 \n\t"
|
||||
"veslg %%v4,%%v0,32 \n\t"
|
||||
"vfasb %%v0,%%v0,%%v4 \n\t"
|
||||
"vrepg %%v4,%%v0,1 \n\t"
|
||||
"aebr %%f0,%%f4 \n\t"
|
||||
"ste %%f0,0(%6) \n\t"
|
||||
"vrepf %%v4,%%v1,1 \n\t"
|
||||
"aebr %%f1,%%f4 \n\t"
|
||||
"vrepf %%v4,%%v1,2 \n\t"
|
||||
"aebr %%f1,%%f4 \n\t"
|
||||
"vrepf %%v4,%%v1,3 \n\t"
|
||||
"veslg %%v4,%%v1,32 \n\t"
|
||||
"vfasb %%v1,%%v1,%%v4 \n\t"
|
||||
"vrepg %%v4,%%v1,1 \n\t"
|
||||
"aebr %%f1,%%f4 \n\t"
|
||||
"ste %%f1,4(%6) \n\t"
|
||||
"vrepf %%v4,%%v2,1 \n\t"
|
||||
"aebr %%f2,%%f4 \n\t"
|
||||
"vrepf %%v4,%%v2,2 \n\t"
|
||||
"aebr %%f2,%%f4 \n\t"
|
||||
"vrepf %%v4,%%v2,3 \n\t"
|
||||
"veslg %%v4,%%v2,32 \n\t"
|
||||
"vfasb %%v2,%%v2,%%v4 \n\t"
|
||||
"vrepg %%v4,%%v2,1 \n\t"
|
||||
"aebr %%f2,%%f4 \n\t"
|
||||
"ste %%f2,8(%6) \n\t"
|
||||
"vrepf %%v4,%%v3,1 \n\t"
|
||||
"aebr %%f3,%%f4 \n\t"
|
||||
"vrepf %%v4,%%v3,2 \n\t"
|
||||
"aebr %%f3,%%f4 \n\t"
|
||||
"vrepf %%v4,%%v3,3 \n\t"
|
||||
"veslg %%v4,%%v3,32 \n\t"
|
||||
"vfasb %%v3,%%v3,%%v4 \n\t"
|
||||
"vrepg %%v4,%%v3,1 \n\t"
|
||||
"aebr %%f3,%%f4 \n\t"
|
||||
"ste %%f3,12(%6) "
|
||||
:
|
||||
|
@ -281,18 +273,14 @@ static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
|
|||
"brctg %%r0,2b \n\t"
|
||||
|
||||
"3: \n\t"
|
||||
"vrepf %%v2,%%v0,1 \n\t"
|
||||
"aebr %%f0,%%f2 \n\t"
|
||||
"vrepf %%v2,%%v0,2 \n\t"
|
||||
"aebr %%f0,%%f2 \n\t"
|
||||
"vrepf %%v2,%%v0,3 \n\t"
|
||||
"veslg %%v2,%%v0,32 \n\t"
|
||||
"vfasb %%v0,%%v0,%%v2 \n\t"
|
||||
"vrepg %%v2,%%v0,1 \n\t"
|
||||
"aebr %%f0,%%f2 \n\t"
|
||||
"ste %%f0,0(%4) \n\t"
|
||||
"vrepf %%v2,%%v1,1 \n\t"
|
||||
"aebr %%f1,%%f2 \n\t"
|
||||
"vrepf %%v2,%%v1,2 \n\t"
|
||||
"aebr %%f1,%%f2 \n\t"
|
||||
"vrepf %%v2,%%v1,3 \n\t"
|
||||
"veslg %%v2,%%v1,32 \n\t"
|
||||
"vfasb %%v1,%%v1,%%v2 \n\t"
|
||||
"vrepg %%v2,%%v1,1 \n\t"
|
||||
"aebr %%f1,%%f2 \n\t"
|
||||
"ste %%f1,4(%4) "
|
||||
:
|
||||
|
@ -349,7 +337,7 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y)
|
|||
|
||||
"vl %%v31,112(%%r1,%1) \n\t"
|
||||
"vfmasb %%v0,%%v23,%%v31,%%v0 \n\t"
|
||||
|
||||
|
||||
"agfi %%r1,128 \n\t"
|
||||
"brctg %%r0,0b \n\t"
|
||||
|
||||
|
@ -370,11 +358,9 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y)
|
|||
"brctg %%r0,2b \n\t"
|
||||
|
||||
"3: \n\t"
|
||||
"vrepf %%v1,%%v0,1 \n\t"
|
||||
"aebr %%f0,%%f1 \n\t"
|
||||
"vrepf %%v1,%%v0,2 \n\t"
|
||||
"aebr %%f0,%%f1 \n\t"
|
||||
"vrepf %%v1,%%v0,3 \n\t"
|
||||
"veslg %%v1,%%v0,32 \n\t"
|
||||
"vfasb %%v0,%%v0,%%v1 \n\t"
|
||||
"vrepg %%v1,%%v0,1 \n\t"
|
||||
"aebr %%f0,%%f1 \n\t"
|
||||
"ste %%f0,0(%3) "
|
||||
:
|
||||
|
@ -823,5 +809,3 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
|
|||
|
||||
return(0);
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -74,7 +74,7 @@ static FLOAT smax_kernel_64(BLASLONG n, FLOAT *x)
|
|||
|
||||
"vfmaxsb %%v16,%%v16,%%v17,0 \n\t"
|
||||
|
||||
"vfmaxsb %%v0,%%v0,%%16,0 \n\t"
|
||||
"vfmaxsb %%v0,%%v0,%%v16,0 \n\t"
|
||||
|
||||
"agfi %%r1, 256 \n\t"
|
||||
"brctg %%r0, 0b \n\t"
|
||||
|
|
|
@ -74,7 +74,7 @@ static FLOAT smin_kernel_64(BLASLONG n, FLOAT *x)
|
|||
|
||||
"vfminsb %%v16,%%v16,%%v17,0 \n\t"
|
||||
|
||||
"vfminsb %%v0,%%v0,%%16,0 \n\t"
|
||||
"vfminsb %%v0,%%v0,%%v16,0 \n\t"
|
||||
|
||||
"agfi %%r1, 256 \n\t"
|
||||
"brctg %%r0, 0b \n\t"
|
||||
|
|
40
param.h
40
param.h
|
@ -2915,6 +2915,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#endif
|
||||
|
||||
|
||||
#if defined(Z14)
|
||||
#define SNUMOPT 2
|
||||
#define DNUMOPT 2
|
||||
|
||||
#define GEMM_DEFAULT_OFFSET_A 0
|
||||
#define GEMM_DEFAULT_OFFSET_B 0
|
||||
#define GEMM_DEFAULT_ALIGN 0x03fffUL
|
||||
|
||||
#define SGEMM_DEFAULT_UNROLL_M 8
|
||||
#define SGEMM_DEFAULT_UNROLL_N 4
|
||||
|
||||
#define DGEMM_DEFAULT_UNROLL_M 8
|
||||
#define DGEMM_DEFAULT_UNROLL_N 4
|
||||
|
||||
#define CGEMM_DEFAULT_UNROLL_M 4
|
||||
#define CGEMM_DEFAULT_UNROLL_N 4
|
||||
|
||||
#define ZGEMM_DEFAULT_UNROLL_M 4
|
||||
#define ZGEMM_DEFAULT_UNROLL_N 4
|
||||
|
||||
#define SGEMM_DEFAULT_P 456
|
||||
#define DGEMM_DEFAULT_P 320
|
||||
#define CGEMM_DEFAULT_P 480
|
||||
#define ZGEMM_DEFAULT_P 224
|
||||
|
||||
#define SGEMM_DEFAULT_Q 488
|
||||
#define DGEMM_DEFAULT_Q 384
|
||||
#define CGEMM_DEFAULT_Q 128
|
||||
#define ZGEMM_DEFAULT_Q 352
|
||||
|
||||
#define SGEMM_DEFAULT_R 8192
|
||||
#define DGEMM_DEFAULT_R 4096
|
||||
#define CGEMM_DEFAULT_R 4096
|
||||
#define ZGEMM_DEFAULT_R 2048
|
||||
|
||||
|
||||
#define SYMV_P 16
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
#ifdef GENERIC
|
||||
|
||||
|
|
|
@ -37,4 +37,3 @@ clean:
|
|||
-rm -f *.o $(UTESTBIN)
|
||||
|
||||
libs:
|
||||
|
||||
|
|
Loading…
Reference in New Issue