Merge pull request #1 from xianyi/develop

Update
This commit is contained in:
maamountki 2019-02-05 07:25:38 +02:00 committed by GitHub
commit a38aa56e76
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
27 changed files with 3307 additions and 82 deletions

View File

@ -94,3 +94,4 @@ THUNDERX2T99
9.System Z:
ZARCH_GENERIC
Z13
Z14

View File

@ -114,7 +114,14 @@ void get_cpuconfig(void)
break;
case CPU_Z14:
printf("#define Z14\n");
printf("#define L1_DATA_SIZE 131072\n");
printf("#define L1_DATA_LINESIZE 256\n");
printf("#define L1_DATA_ASSOCIATIVE 8\n");
printf("#define L2_SIZE 4194304\n");
printf("#define L2_LINESIZE 256\n");
printf("#define L2_ASSOCIATIVE 8\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
break;
}
}

View File

@ -1085,6 +1085,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "Z13"
#endif
#ifdef FORCE_Z14
#define FORCE
#define ARCHITECTURE "ZARCH"
#define SUBARCHITECTURE "Z14"
#define ARCHCONFIG "-DZ14 " \
"-DDTB_DEFAULT_ENTRIES=64"
#define LIBNAME "z14"
#define CORENAME "Z14"
#endif
#ifndef FORCE
#ifdef USER_TARGET

View File

@ -48,6 +48,10 @@ ifeq ($(ARCH), zarch)
USE_TRMM = 1
endif
ifeq ($(CORE), Z14)
USE_TRMM = 1
endif

View File

@ -147,14 +147,14 @@ CSWAPKERNEL = cswap.c
ZSWAPKERNEL = zswap.c
#
#SGEMVNKERNEL = ../arm/gemv_n.c
SGEMVNKERNEL = sgemv_n.c
DGEMVNKERNEL = dgemv_n.c
#CGEMVNKERNEL = ../arm/zgemv_n.c
CGEMVNKERNEL = cgemv_n.c
ZGEMVNKERNEL = zgemv_n_4.c
#
#SGEMVTKERNEL = ../arm/gemv_t.c
SGEMVTKERNEL = sgemv_t.c
DGEMVTKERNEL = dgemv_t.c
#CGEMVTKERNEL = ../arm/zgemv_t.c
CGEMVTKERNEL = cgemv_t.c
ZGEMVTKERNEL = zgemv_t_4.c

585
kernel/power/cgemv_n.c Normal file
View File

@ -0,0 +1,585 @@
/***************************************************************************
Copyright (c) 2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdlib.h>
#include <stdio.h>
#include "common.h"
#include <altivec.h>
#define NBMAX 1024
static const unsigned char swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11};
// Accumulate y += A(:,0:3) * x for a four-column panel of a column-major
// single-precision complex matrix, using Altivec/VSX vectors.  Each
// __vector float holds two complex values laid out (re, im, re, im).
//   n   : number of complex rows to process (caller passes a multiple of 4)
//   lda : column stride in FLOAT units (already doubled for complex storage)
//   ap  : pointer to the first column of the panel
//   x   : the four complex multipliers (8 contiguous floats)
//   y   : accumulator holding n complex values (alpha is applied later by add_y)
static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) {
FLOAT *a0, *a1, *a2, *a3;
a0 = ap;
a1 = ap + lda;
a2 = a1 + lda;
a3 = a2 + lda;
// Byte permute that swaps the real/imaginary halves of each complex pair.
__vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr);
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
// Plain complex product (a*x): vxN_r splats Re(x[N]); vxN_i holds
// {-Im, +Im, -Im, +Im} so that after swapping re/im of A the imaginary
// contribution lands with the correct signs in each lane.
register __vector float vx0_r = {x[0], x[0],x[0], x[0]};
register __vector float vx0_i = {-x[1], x[1],-x[1], x[1]};
register __vector float vx1_r = {x[2], x[2],x[2], x[2]};
register __vector float vx1_i = {-x[3], x[3],-x[3], x[3]};
register __vector float vx2_r = {x[4], x[4],x[4], x[4]};
register __vector float vx2_i = {-x[5], x[5],-x[5], x[5]};
register __vector float vx3_r = {x[6], x[6],x[6], x[6]};
register __vector float vx3_i = {-x[7], x[7],-x[7], x[7]};
#else
// Conjugated variant: signs move to the real splat instead.
register __vector float vx0_r = {x[0], -x[0],x[0], -x[0]};
register __vector float vx0_i = {x[1], x[1],x[1], x[1]};
register __vector float vx1_r = {x[2], -x[2],x[2], -x[2]};
register __vector float vx1_i = {x[3], x[3],x[3], x[3]};
register __vector float vx2_r = {x[4], -x[4],x[4], -x[4]};
register __vector float vx2_i = {x[5], x[5],x[5], x[5]};
register __vector float vx3_r = {x[6], -x[6],x[6], -x[6]};
register __vector float vx3_i = {x[7], x[7],x[7], x[7]};
#endif
register __vector float *vy = (__vector float *) y;
register __vector float *vptr_a0 = (__vector float *) a0;
register __vector float *vptr_a1 = (__vector float *) a1;
register __vector float *vptr_a2 = (__vector float *) a2;
register __vector float *vptr_a3 = (__vector float *) a3;
BLASLONG i = 0;
// Two vectors (four complex rows) per iteration; n/2 vectors total.
for (;i< n / 2; i+=2) {
register __vector float vy_0 = vy[i];
register __vector float vy_1 = vy[i + 1];
register __vector float va0 = vptr_a0[i];
register __vector float va1 = vptr_a1[i];
register __vector float va2 = vptr_a2[i];
register __vector float va3 = vptr_a3[i];
register __vector float va0_1 = vptr_a0[i + 1];
register __vector float va1_1 = vptr_a1[i + 1];
register __vector float va2_1 = vptr_a2[i + 1];
register __vector float va3_1 = vptr_a3[i + 1];
// Real-multiplier contribution.
vy_0 += va0*vx0_r + va1*vx1_r + va2*vx2_r + va3*vx3_r;
vy_1 += va0_1*vx0_r + va1_1*vx1_r + va2_1*vx2_r + va3_1*vx3_r;
// Swap re/im of every A element, then add the imaginary contribution.
va0 = vec_perm(va0, va0,swap_mask);
va0_1 = vec_perm(va0_1, va0_1,swap_mask);
va1 = vec_perm(va1, va1,swap_mask);
va1_1 = vec_perm(va1_1, va1_1,swap_mask);
va2 = vec_perm(va2, va2,swap_mask);
va2_1 = vec_perm(va2_1, va2_1,swap_mask);
va3 = vec_perm(va3, va3,swap_mask);
va3_1 = vec_perm(va3_1, va3_1,swap_mask);
vy_0 += va0*vx0_i + va1*vx1_i + va2*vx2_i + va3*vx3_i;
vy_1 += va0_1*vx0_i + va1_1*vx1_i + va2_1*vx2_i + va3_1*vx3_i;
vy[i] = vy_0;
vy[i + 1] = vy_1;
}
}
// Two-column variant of cgemv_kernel_4x4: y += A(:,0:1) * x[0:1].
// Same vector layout and conjugation sign scheme; see 4x4 kernel.
static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) {
FLOAT *a0, *a1;
a0 = ap;
a1 = ap + lda;
// Swaps re/im within each complex pair of a vector.
__vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr);
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
// Non-conjugated: sign alternation on the imaginary splats.
register __vector float vx0_r = {x[0], x[0],x[0], x[0]};
register __vector float vx0_i = {-x[1], x[1],-x[1], x[1]};
register __vector float vx1_r = {x[2], x[2],x[2], x[2]};
register __vector float vx1_i = {-x[3], x[3],-x[3], x[3]};
#else
// Conjugated: sign alternation on the real splats.
register __vector float vx0_r = {x[0], -x[0],x[0], -x[0]};
register __vector float vx0_i = {x[1], x[1],x[1], x[1]};
register __vector float vx1_r = {x[2], -x[2],x[2], -x[2]};
register __vector float vx1_i = {x[3], x[3],x[3], x[3]};
#endif
register __vector float *vy = (__vector float *) y;
register __vector float *vptr_a0 = (__vector float *) a0;
register __vector float *vptr_a1 = (__vector float *) a1;
BLASLONG i = 0;
// Four complex rows per iteration.
for (;i< n / 2; i+=2) {
register __vector float vy_0 = vy[i];
register __vector float vy_1 = vy[i + 1];
register __vector float va0 = vptr_a0[i];
register __vector float va1 = vptr_a1[i];
register __vector float va0_1 = vptr_a0[i + 1];
register __vector float va1_1 = vptr_a1[i + 1];
// Re/im-swapped copies for the imaginary contribution.
register __vector float va0x = vec_perm(va0, va0,swap_mask);
register __vector float va0x_1 = vec_perm(va0_1, va0_1,swap_mask);
register __vector float va1x = vec_perm(va1, va1,swap_mask);
register __vector float va1x_1 = vec_perm(va1_1, va1_1,swap_mask);
vy_0 += va0*vx0_r + va1*vx1_r + va0x*vx0_i + va1x*vx1_i;
vy_1 += va0_1*vx0_r + va1_1*vx1_r + va0x_1*vx0_i + va1x_1*vx1_i;
vy[i] = vy_0;
vy[i + 1] = vy_1;
}
}
// Single-column variant: y += ap[:] * x[0] over n complex rows.
// Identical vector layout and conjugation sign scheme as the wider kernels:
// each __vector float carries two (re, im) pairs, and the byte permute
// swaps the halves of every pair for the imaginary contribution.
static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) {
__vector unsigned char flip = *((__vector unsigned char*)swap_mask_arr);
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
// Plain product: sign alternation lives on the imaginary splat.
register __vector float xr = {x[0], x[0], x[0], x[0]};
register __vector float xi = {-x[1], x[1], -x[1], x[1]};
#else
// Conjugated product: sign alternation lives on the real splat.
register __vector float xr = {x[0], -x[0], x[0], -x[0]};
register __vector float xi = {x[1], x[1], x[1], x[1]};
#endif
register __vector float *yv = (__vector float *) y;
register __vector float *av = (__vector float *) ap;
BLASLONG j;
// Two vectors (four complex rows) per trip; n/2 vectors in total.
for (j = 0; j < n / 2; j += 2) {
register __vector float a_lo = av[j];
register __vector float a_hi = av[j + 1];
register __vector float s_lo = vec_perm(a_lo, a_lo, flip);
register __vector float s_hi = vec_perm(a_hi, a_hi, flip);
register __vector float acc_lo = yv[j];
register __vector float acc_hi = yv[j + 1];
acc_lo += a_lo * xr + s_lo * xi;
acc_hi += a_hi * xr + s_hi * xi;
yv[j] = acc_lo;
yv[j + 1] = acc_hi;
}
}
// Scale the accumulated panel result by alpha = (alpha_r, alpha_i) and add
// it into the user's y vector: dest += alpha * src, for n complex values.
//   inc_dest : stride of dest in FLOAT units (2 == contiguous complex).
// The strided path is scalar; the contiguous path is vectorized with the
// same re/im-swap trick as the kernels.  XCONJ selects conj(alpha) handling.
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) {
BLASLONG i;
if (inc_dest != 2) {
FLOAT temp_r;
FLOAT temp_i;
for ( i=0; i<n; i++ )
{
#if !defined(XCONJ)
temp_r = alpha_r * src[0] - alpha_i * src[1];
temp_i = alpha_r * src[1] + alpha_i * src[0];
#else
temp_r = alpha_r * src[0] + alpha_i * src[1];
temp_i = -alpha_r * src[1] + alpha_i * src[0];
#endif
*dest += temp_r;
*(dest+1) += temp_i;
src+=2;
dest += inc_dest;
}
return;
} else {
// Vector path: two complex values per register.
__vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr);
#if !defined(XCONJ)
register __vector float valpha_r = {alpha_r, alpha_r, alpha_r, alpha_r};
register __vector float valpha_i = {-alpha_i, alpha_i, -alpha_i, alpha_i};
#else
register __vector float valpha_r = {alpha_r, -alpha_r, alpha_r, -alpha_r};
register __vector float valpha_i = {alpha_i, alpha_i, alpha_i, alpha_i};
#endif
register __vector float *vptr_src = (__vector float *) src;
register __vector float *vptr_y = (__vector float *) dest;
// Four complex values per iteration; caller passes n as a multiple of 4.
for (i = 0; i < n/2; i += 2 ){
register __vector float vy_0 = vptr_y[i];
register __vector float vy_1 = vptr_y[i +1];
register __vector float vsrc = vptr_src[i];
register __vector float vsrc_1 = vptr_src[i + 1];
register __vector float vsrcx = vec_perm(vsrc, vsrc, swap_mask);
register __vector float vsrcx_1 = vec_perm(vsrc_1, vsrc_1, swap_mask);
vy_0 += vsrc*valpha_r + vsrcx*valpha_i;
vy_1 += vsrc_1*valpha_r + vsrcx_1*valpha_i;
vptr_y[i] = vy_0;
vptr_y[i+1 ] = vy_1;
}
}
return;
}
// CGEMV (no-transpose) driver: y := alpha * A * x + y for a column-major
// m-by-n single-precision complex matrix A.
// Rows are processed in blocks of up to NBMAX; each block's partial result
// is accumulated unscaled into ybuffer by the 4x4/4x2/4x1 kernels and then
// scaled by alpha and merged into y by add_y.  The final m%4 rows are
// handled by dedicated scalar tails (m3 == 1/2/3).
// dummy1 is unused (kept for the common GEMV kernel signature).
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT * buffer) {
BLASLONG i;
FLOAT *a_ptr;
FLOAT *x_ptr;
FLOAT *y_ptr;
BLASLONG n1;
BLASLONG m1;
BLASLONG m2;
BLASLONG m3;
BLASLONG n2;
FLOAT xbuffer[8], *ybuffer;
if (m < 1) return (0);
if (n < 1) return (0);
ybuffer = buffer;
// Convert complex strides to FLOAT units (2 floats per complex element).
inc_x *= 2;
inc_y *= 2;
lda *= 2;
// Column blocking: n1 groups of 4 columns plus n2 remainder columns.
n1 = n / 4;
n2 = n % 4;
// Row blocking: m1 full-sized work, m2 last partial block, m3 scalar tail.
m3 = m % 4;
m1 = m - (m % 4);
m2 = (m % NBMAX) - (m % 4);
y_ptr = y;
BLASLONG NB = NBMAX;
while (NB == NBMAX) {
m1 -= NB;
if (m1 < 0) {
if (m2 == 0) break;
NB = m2;     // last (smaller) row block
}
a_ptr = a;
x_ptr = x;
// Zero the per-block accumulator (NB complex values).
memset(ybuffer, 0, NB * 2*sizeof(FLOAT));
if (inc_x == 2) {
// x is contiguous: feed it to the kernels directly.
for (i = 0; i < n1; i++) {
cgemv_kernel_4x4(NB, lda, a_ptr, x_ptr, ybuffer);
a_ptr += lda << 2;
x_ptr += 8;
}
if (n2 & 2) {
cgemv_kernel_4x2(NB, lda, a_ptr, x_ptr, ybuffer);
x_ptr += 4;
a_ptr += 2 * lda;
}
if (n2 & 1) {
cgemv_kernel_4x1(NB, a_ptr, x_ptr, ybuffer);
x_ptr += 2;
a_ptr += lda;
}
} else {
// Strided x: gather 4 complex values into xbuffer per kernel call.
for (i = 0; i < n1; i++) {
xbuffer[0] = x_ptr[0];
xbuffer[1] = x_ptr[1];
x_ptr += inc_x;
xbuffer[2] = x_ptr[0];
xbuffer[3] = x_ptr[1];
x_ptr += inc_x;
xbuffer[4] = x_ptr[0];
xbuffer[5] = x_ptr[1];
x_ptr += inc_x;
xbuffer[6] = x_ptr[0];
xbuffer[7] = x_ptr[1];
x_ptr += inc_x;
cgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer);
a_ptr += lda << 2;
}
for (i = 0; i < n2; i++) {
xbuffer[0] = x_ptr[0];
xbuffer[1] = x_ptr[1];
x_ptr += inc_x;
cgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer);
a_ptr += lda;
}
}
// Scale the block result by alpha and merge into y.
add_y(NB, ybuffer, y_ptr, inc_y, alpha_r, alpha_i);
// Advance to the next row block (NB complex rows = 2*NB floats).
a += 2 * NB;
y_ptr += NB * inc_y;
}
// Scalar tail for the final m%4 rows; alpha is applied inline here.
if (m3 == 0) return (0);
if (m3 == 1) {
a_ptr = a;
x_ptr = x;
FLOAT temp_r = 0.0;
FLOAT temp_i = 0.0;
// Fast path: contiguous single-row matrix and contiguous x,
// unrolled by two columns.
if (lda == 2 && inc_x == 2) {
for (i = 0; i < (n & -2); i += 2) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
temp_r += a_ptr[2] * x_ptr[2] - a_ptr[3] * x_ptr[3];
temp_i += a_ptr[2] * x_ptr[3] + a_ptr[3] * x_ptr[2];
#else
temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
temp_r += a_ptr[2] * x_ptr[2] + a_ptr[3] * x_ptr[3];
temp_i += a_ptr[2] * x_ptr[3] - a_ptr[3] * x_ptr[2];
#endif
a_ptr += 4;
x_ptr += 4;
}
for (; i < n; i++) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
#else
temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
#endif
a_ptr += 2;
x_ptr += 2;
}
} else {
for (i = 0; i < n; i++) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
#else
temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
#endif
a_ptr += lda;
x_ptr += inc_x;
}
}
#if !defined(XCONJ)
y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i;
y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r;
#else
y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i;
y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r;
#endif
return (0);
}
if (m3 == 2) {
a_ptr = a;
x_ptr = x;
FLOAT temp_r0 = 0.0;
FLOAT temp_i0 = 0.0;
FLOAT temp_r1 = 0.0;
FLOAT temp_i1 = 0.0;
// Fast path: two contiguous rows (lda == 4 floats) and contiguous x.
if (lda == 4 && inc_x == 2) {
for (i = 0; i < (n & -2); i += 2) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
temp_r0 += a_ptr[4] * x_ptr[2] - a_ptr[5] * x_ptr[3];
temp_i0 += a_ptr[4] * x_ptr[3] + a_ptr[5] * x_ptr[2];
temp_r1 += a_ptr[6] * x_ptr[2] - a_ptr[7] * x_ptr[3];
temp_i1 += a_ptr[6] * x_ptr[3] + a_ptr[7] * x_ptr[2];
#else
temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
temp_r0 += a_ptr[4] * x_ptr[2] + a_ptr[5] * x_ptr[3];
temp_i0 += a_ptr[4] * x_ptr[3] - a_ptr[5] * x_ptr[2];
temp_r1 += a_ptr[6] * x_ptr[2] + a_ptr[7] * x_ptr[3];
temp_i1 += a_ptr[6] * x_ptr[3] - a_ptr[7] * x_ptr[2];
#endif
a_ptr += 8;
x_ptr += 4;
}
for (; i < n; i++) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
#else
temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
#endif
a_ptr += 4;
x_ptr += 2;
}
} else {
for (i = 0; i < n; i++) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
#else
temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
#endif
a_ptr += lda;
x_ptr += inc_x;
}
}
#if !defined(XCONJ)
y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
y_ptr[1] += alpha_r * temp_i0 + alpha_i * temp_r0;
y_ptr += inc_y;
y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1;
y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1;
#else
y_ptr[0] += alpha_r * temp_r0 + alpha_i * temp_i0;
y_ptr[1] -= alpha_r * temp_i0 - alpha_i * temp_r0;
y_ptr += inc_y;
y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1;
y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1;
#endif
return (0);
}
if (m3 == 3) {
a_ptr = a;
x_ptr = x;
FLOAT temp_r0 = 0.0;
FLOAT temp_i0 = 0.0;
FLOAT temp_r1 = 0.0;
FLOAT temp_i1 = 0.0;
FLOAT temp_r2 = 0.0;
FLOAT temp_i2 = 0.0;
// Fast path: three contiguous rows (lda == 6 floats) and contiguous x.
if (lda == 6 && inc_x == 2) {
for (i = 0; i < n; i++) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
temp_r2 += a_ptr[4] * x_ptr[0] - a_ptr[5] * x_ptr[1];
temp_i2 += a_ptr[4] * x_ptr[1] + a_ptr[5] * x_ptr[0];
#else
temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
temp_r2 += a_ptr[4] * x_ptr[0] + a_ptr[5] * x_ptr[1];
temp_i2 += a_ptr[4] * x_ptr[1] - a_ptr[5] * x_ptr[0];
#endif
a_ptr += 6;
x_ptr += 2;
}
} else {
for (i = 0; i < n; i++) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
temp_r2 += a_ptr[4] * x_ptr[0] - a_ptr[5] * x_ptr[1];
temp_i2 += a_ptr[4] * x_ptr[1] + a_ptr[5] * x_ptr[0];
#else
temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
temp_r2 += a_ptr[4] * x_ptr[0] + a_ptr[5] * x_ptr[1];
temp_i2 += a_ptr[4] * x_ptr[1] - a_ptr[5] * x_ptr[0];
#endif
a_ptr += lda;
x_ptr += inc_x;
}
}
#if !defined(XCONJ)
y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
y_ptr[1] += alpha_r * temp_i0 + alpha_i * temp_r0;
y_ptr += inc_y;
y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1;
y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1;
y_ptr += inc_y;
y_ptr[0] += alpha_r * temp_r2 - alpha_i * temp_i2;
y_ptr[1] += alpha_r * temp_i2 + alpha_i * temp_r2;
#else
y_ptr[0] += alpha_r * temp_r0 + alpha_i * temp_i0;
y_ptr[1] -= alpha_r * temp_i0 - alpha_i * temp_r0;
y_ptr += inc_y;
y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1;
y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1;
y_ptr += inc_y;
y_ptr[0] += alpha_r * temp_r2 + alpha_i * temp_i2;
y_ptr[1] -= alpha_r * temp_i2 - alpha_i * temp_r2;
#endif
return (0);
}
return (0);
}

571
kernel/power/cgemv_t.c Normal file
View File

@ -0,0 +1,571 @@
/***************************************************************************
Copyright (c) 2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#define NBMAX 1024
#include <altivec.h>
static const unsigned char swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11};
// Transposed-GEMV kernel: computes four dot products y[0:3] += alpha *
// conj?(A(:,0:3))^T * x for a four-column panel, vectorized with Altivec.
// Each __vector float holds two (re, im) complex values; per column two
// accumulators are kept and the sign-alternating horizontal sums at the
// end recombine them into the real/imaginary parts of the dot product.
//   n : complex length of each column (caller passes a multiple of 4)
static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) {
BLASLONG i;
FLOAT *a0, *a1, *a2, *a3;
a0 = ap;
a1 = ap + lda;
a2 = a1 + lda;
a3 = a2 + lda;
// Byte permute that swaps the real/imag halves of each complex pair.
__vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr);
// _p accumulates element-wise products (re*re, im*im, ...); _r accumulates
// cross products (re*im, im*re, ...) using the re/im-swapped x.
register __vector float vtemp0_p = {0.0, 0.0,0.0,0.0};
register __vector float vtemp0_r = {0.0, 0.0,0.0,0.0};
register __vector float vtemp1_p = {0.0, 0.0,0.0,0.0};
register __vector float vtemp1_r = {0.0, 0.0,0.0,0.0};
register __vector float vtemp2_p = {0.0, 0.0,0.0,0.0};
register __vector float vtemp2_r = {0.0, 0.0,0.0,0.0};
register __vector float vtemp3_p = {0.0, 0.0,0.0,0.0};
register __vector float vtemp3_r = {0.0, 0.0,0.0,0.0};
__vector float* va0 = (__vector float*) a0;
__vector float* va1 = (__vector float*) a1;
__vector float* va2 = (__vector float*) a2;
__vector float* va3 = (__vector float*) a3;
__vector float* v_x = (__vector float*) x;
// Four complex elements per iteration.
for (i = 0; i < n / 2; i+=2) {
register __vector float vx_0 = v_x[i];
register __vector float vx_1 = v_x[i+1];
register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask);
register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask);
vtemp0_p += vx_0*va0[i] + vx_1*va0[i+1] ;
vtemp0_r += vxr_0*va0[i] + vxr_1*va0[i+1];
vtemp1_p += vx_0*va1[i] + vx_1*va1[i+1];
vtemp1_r += vxr_0*va1[i] + vxr_1*va1[i+1];
vtemp2_p += vx_0*va2[i] + vx_1*va2[i+1];
vtemp2_r += vxr_0*va2[i] + vxr_1*va2[i+1];
vtemp3_p += vx_0*va3[i] + vx_1*va3[i+1];
vtemp3_r += vxr_0*va3[i] + vxr_1*va3[i+1];
}
// Horizontal reduction; sign pattern selects plain vs conjugated product.
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1] + vtemp0_p[2] - vtemp0_p[3];
register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1] + vtemp0_r[2] + vtemp0_r[3];
register FLOAT temp_r1 = vtemp1_p[0] - vtemp1_p[1] + vtemp1_p[2] - vtemp1_p[3];
register FLOAT temp_i1 = vtemp1_r[0] + vtemp1_r[1] + vtemp1_r[2] + vtemp1_r[3];
register FLOAT temp_r2 = vtemp2_p[0] - vtemp2_p[1] + vtemp2_p[2] - vtemp2_p[3];
register FLOAT temp_i2 = vtemp2_r[0] + vtemp2_r[1] + vtemp2_r[2] + vtemp2_r[3];
register FLOAT temp_r3 = vtemp3_p[0] - vtemp3_p[1] + vtemp3_p[2] - vtemp3_p[3];
register FLOAT temp_i3 = vtemp3_r[0] + vtemp3_r[1] + vtemp3_r[2] + vtemp3_r[3];
#else
register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1] + vtemp0_p[2] + vtemp0_p[3];
register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1] + vtemp0_r[2] - vtemp0_r[3];
register FLOAT temp_r1 = vtemp1_p[0] + vtemp1_p[1] + vtemp1_p[2] + vtemp1_p[3];
register FLOAT temp_i1 = vtemp1_r[0] - vtemp1_r[1] + vtemp1_r[2] - vtemp1_r[3];
register FLOAT temp_r2 = vtemp2_p[0] + vtemp2_p[1] + vtemp2_p[2] + vtemp2_p[3];
register FLOAT temp_i2 = vtemp2_r[0] - vtemp2_r[1] + vtemp2_r[2] - vtemp2_r[3];
register FLOAT temp_r3 = vtemp3_p[0] + vtemp3_p[1] + vtemp3_p[2] + vtemp3_p[3];
register FLOAT temp_i3 = vtemp3_r[0] - vtemp3_r[1] + vtemp3_r[2] - vtemp3_r[3];
#endif
// Apply alpha (or conj(alpha) under XCONJ) and accumulate into y.
#if !defined(XCONJ)
y[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
y[1] += alpha_r * temp_i0 + alpha_i * temp_r0;
y[2] += alpha_r * temp_r1 - alpha_i * temp_i1;
y[3] += alpha_r * temp_i1 + alpha_i * temp_r1;
y[4] += alpha_r * temp_r2 - alpha_i * temp_i2;
y[5] += alpha_r * temp_i2 + alpha_i * temp_r2;
y[6] += alpha_r * temp_r3 - alpha_i * temp_i3;
y[7] += alpha_r * temp_i3 + alpha_i * temp_r3;
#else
y[0] += alpha_r * temp_r0 + alpha_i * temp_i0;
y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0;
y[2] += alpha_r * temp_r1 + alpha_i * temp_i1;
y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1;
y[4] += alpha_r * temp_r2 + alpha_i * temp_i2;
y[5] -= alpha_r * temp_i2 - alpha_i * temp_r2;
y[6] += alpha_r * temp_r3 + alpha_i * temp_i3;
y[7] -= alpha_r * temp_i3 - alpha_i * temp_r3;
#endif
}
// Two-column transposed kernel: y[0:1] += alpha * conj?(A(:,0:1))^T * x.
// Same accumulator scheme as the 4x4 transposed kernel.
static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) {
BLASLONG i;
FLOAT *a0, *a1;
a0 = ap;
a1 = ap + lda;
__vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr);
// _p: element-wise products (re*re, im*im); _r: cross products via swapped x.
register __vector float vtemp0_p = {0.0, 0.0,0.0,0.0};
register __vector float vtemp0_r = {0.0, 0.0,0.0,0.0};
register __vector float vtemp1_p = {0.0, 0.0,0.0,0.0};
register __vector float vtemp1_r = {0.0, 0.0,0.0,0.0};
__vector float* va0 = (__vector float*) a0;
__vector float* va1 = (__vector float*) a1;
__vector float* v_x = (__vector float*) x;
// Four complex elements per iteration.
for (i = 0; i < n / 2; i+=2) {
register __vector float vx_0 = v_x[i];
register __vector float vx_1 = v_x[i+1];
register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask);
register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask);
vtemp0_p += vx_0*va0[i] + vx_1*va0[i+1] ;
vtemp0_r += vxr_0*va0[i] + vxr_1*va0[i+1];
vtemp1_p += vx_0*va1[i] + vx_1*va1[i+1];
vtemp1_r += vxr_0*va1[i] + vxr_1*va1[i+1];
}
// Sign-alternating horizontal sums pick plain vs conjugated product.
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1] + vtemp0_p[2] - vtemp0_p[3];
register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1] + vtemp0_r[2] + vtemp0_r[3];
register FLOAT temp_r1 = vtemp1_p[0] - vtemp1_p[1] + vtemp1_p[2] - vtemp1_p[3];
register FLOAT temp_i1 = vtemp1_r[0] + vtemp1_r[1] + vtemp1_r[2] + vtemp1_r[3];
#else
register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1] + vtemp0_p[2] + vtemp0_p[3];
register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1] + vtemp0_r[2] - vtemp0_r[3];
register FLOAT temp_r1 = vtemp1_p[0] + vtemp1_p[1] + vtemp1_p[2] + vtemp1_p[3];
register FLOAT temp_i1 = vtemp1_r[0] - vtemp1_r[1] + vtemp1_r[2] - vtemp1_r[3];
#endif
// Scale by alpha (conj(alpha) under XCONJ) and accumulate.
#if !defined(XCONJ)
y[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
y[1] += alpha_r * temp_i0 + alpha_i * temp_r0;
y[2] += alpha_r * temp_r1 - alpha_i * temp_i1;
y[3] += alpha_r * temp_i1 + alpha_i * temp_r1;
#else
y[0] += alpha_r * temp_r0 + alpha_i * temp_i0;
y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0;
y[2] += alpha_r * temp_r1 + alpha_i * temp_i1;
y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1;
#endif
}
// Single-column transposed kernel: y[0] += alpha * conj?(ap)^T * x.
// Same accumulator scheme as the wider transposed kernels.
static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) {
BLASLONG i;
__vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr);
// _p: element-wise products (re*re, im*im); _r: cross products via swapped x.
register __vector float vtemp0_p = {0.0, 0.0,0.0,0.0};
register __vector float vtemp0_r = {0.0, 0.0,0.0,0.0};
__vector float* va0 = (__vector float*) ap;
__vector float* v_x = (__vector float*) x;
// Four complex elements per iteration.
for (i = 0; i < n / 2; i+=2) {
register __vector float vx_0 = v_x[i];
register __vector float vx_1 = v_x[i+1];
register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask);
register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask);
vtemp0_p += vx_0*va0[i] + vx_1*va0[i+1] ;
vtemp0_r += vxr_0*va0[i] + vxr_1*va0[i+1];
}
// Sign-alternating horizontal sums pick plain vs conjugated product.
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1] + vtemp0_p[2] - vtemp0_p[3];
register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1] + vtemp0_r[2] + vtemp0_r[3];
#else
register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1] + vtemp0_p[2] + vtemp0_p[3];
register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1] + vtemp0_r[2] - vtemp0_r[3];
#endif
// Scale by alpha (conj(alpha) under XCONJ) and accumulate.
#if !defined(XCONJ)
y[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
y[1] += alpha_r * temp_i0 + alpha_i * temp_r0;
#else
y[0] += alpha_r * temp_r0 + alpha_i * temp_i0;
y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0;
#endif
}
// Gather n strided complex values (re, im pairs) from src into a
// contiguous destination buffer.  inc_src is the source stride in
// FLOAT units; dest is always packed (stride 2).
static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) {
BLASLONG k;
for (k = 0; k < n; k++) {
dest[0] = src[0];
dest[1] = src[1];
dest += 2;
src += inc_src;
}
}
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) {
BLASLONG i;
BLASLONG j;
FLOAT *a_ptr;
FLOAT *x_ptr;
FLOAT *y_ptr;
BLASLONG n1;
BLASLONG m1;
BLASLONG m2;
BLASLONG m3;
BLASLONG n2;
FLOAT ybuffer[8], *xbuffer;
if (m < 1) return (0);
if (n < 1) return (0);
inc_x <<= 1;
inc_y <<= 1;
lda <<= 1;
xbuffer = buffer;
n1 = n >> 2;
n2 = n & 3;
m3 = m & 3;
m1 = m - m3;
m2 = (m & (NBMAX - 1)) - m3;
BLASLONG NB = NBMAX;
while (NB == NBMAX) {
m1 -= NB;
if (m1 < 0) {
if (m2 == 0) break;
NB = m2;
}
y_ptr = y;
a_ptr = a;
x_ptr = x;
if (inc_x != 2)
copy_x(NB, x_ptr, xbuffer, inc_x);
else
xbuffer = x_ptr;
if (inc_y == 2) {
for (i = 0; i < n1; i++) {
cgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i);
a_ptr += lda << 2;
y_ptr += 8;
}
if (n2 & 2) {
cgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i);
a_ptr += lda << 1;
y_ptr += 4;
}
if (n2 & 1) {
cgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i);
a_ptr += lda;
y_ptr += 2;
}
} else {
for (i = 0; i < n1; i++) {
memset(ybuffer, 0, sizeof (ybuffer));
cgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha_r, alpha_i);
a_ptr += lda << 2;
y_ptr[0] += ybuffer[0];
y_ptr[1] += ybuffer[1];
y_ptr += inc_y;
y_ptr[0] += ybuffer[2];
y_ptr[1] += ybuffer[3];
y_ptr += inc_y;
y_ptr[0] += ybuffer[4];
y_ptr[1] += ybuffer[5];
y_ptr += inc_y;
y_ptr[0] += ybuffer[6];
y_ptr[1] += ybuffer[7];
y_ptr += inc_y;
}
for (i = 0; i < n2; i++) {
memset(ybuffer, 0, sizeof (ybuffer));
cgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, alpha_r, alpha_i);
a_ptr += lda;
y_ptr[0] += ybuffer[0];
y_ptr[1] += ybuffer[1];
y_ptr += inc_y;
}
}
a += 2 * NB;
x += NB * inc_x;
}
if (m3 == 0) return (0);
x_ptr = x;
j = 0;
a_ptr = a;
y_ptr = y;
if (m3 == 3) {
FLOAT temp_r;
FLOAT temp_i;
FLOAT x0 = x_ptr[0];
FLOAT x1 = x_ptr[1];
x_ptr += inc_x;
FLOAT x2 = x_ptr[0];
FLOAT x3 = x_ptr[1];
x_ptr += inc_x;
FLOAT x4 = x_ptr[0];
FLOAT x5 = x_ptr[1];
while (j < n) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
temp_r += a_ptr[2] * x2 - a_ptr[3] * x3;
temp_i += a_ptr[2] * x3 + a_ptr[3] * x2;
temp_r += a_ptr[4] * x4 - a_ptr[5] * x5;
temp_i += a_ptr[4] * x5 + a_ptr[5] * x4;
#else
temp_r = a_ptr[0] * x0 + a_ptr[1] * x1;
temp_i = a_ptr[0] * x1 - a_ptr[1] * x0;
temp_r += a_ptr[2] * x2 + a_ptr[3] * x3;
temp_i += a_ptr[2] * x3 - a_ptr[3] * x2;
temp_r += a_ptr[4] * x4 + a_ptr[5] * x5;
temp_i += a_ptr[4] * x5 - a_ptr[5] * x4;
#endif
#if !defined(XCONJ)
y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i;
y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r;
#else
y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i;
y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r;
#endif
a_ptr += lda;
y_ptr += inc_y;
j++;
}
return (0);
}
if (m3 == 2) {
FLOAT temp_r;
FLOAT temp_i;
FLOAT temp_r1;
FLOAT temp_i1;
FLOAT x0 = x_ptr[0];
FLOAT x1 = x_ptr[1];
x_ptr += inc_x;
FLOAT x2 = x_ptr[0];
FLOAT x3 = x_ptr[1];
while (j < (n & -2)) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
temp_r += a_ptr[2] * x2 - a_ptr[3] * x3;
temp_i += a_ptr[2] * x3 + a_ptr[3] * x2;
a_ptr += lda;
temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1;
temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0;
temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3;
temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2;
#else
temp_r = a_ptr[0] * x0 + a_ptr[1] * x1;
temp_i = a_ptr[0] * x1 - a_ptr[1] * x0;
temp_r += a_ptr[2] * x2 + a_ptr[3] * x3;
temp_i += a_ptr[2] * x3 - a_ptr[3] * x2;
a_ptr += lda;
temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1;
temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0;
temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3;
temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2;
#endif
#if !defined(XCONJ)
y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i;
y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r;
y_ptr += inc_y;
y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1;
y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1;
#else
y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i;
y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r;
y_ptr += inc_y;
y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1;
y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1;
#endif
a_ptr += lda;
y_ptr += inc_y;
j += 2;
}
while (j < n) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
temp_r += a_ptr[2] * x2 - a_ptr[3] * x3;
temp_i += a_ptr[2] * x3 + a_ptr[3] * x2;
#else
temp_r = a_ptr[0] * x0 + a_ptr[1] * x1;
temp_i = a_ptr[0] * x1 - a_ptr[1] * x0;
temp_r += a_ptr[2] * x2 + a_ptr[3] * x3;
temp_i += a_ptr[2] * x3 - a_ptr[3] * x2;
#endif
#if !defined(XCONJ)
y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i;
y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r;
#else
y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i;
y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r;
#endif
a_ptr += lda;
y_ptr += inc_y;
j++;
}
return (0);
}
if (m3 == 1) {
FLOAT temp_r;
FLOAT temp_i;
FLOAT temp_r1;
FLOAT temp_i1;
FLOAT x0 = x_ptr[0];
FLOAT x1 = x_ptr[1];
while (j < (n & -2)) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
a_ptr += lda;
temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1;
temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0;
#else
temp_r = a_ptr[0] * x0 + a_ptr[1] * x1;
temp_i = a_ptr[0] * x1 - a_ptr[1] * x0;
a_ptr += lda;
temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1;
temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0;
#endif
#if !defined(XCONJ)
y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i;
y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r;
y_ptr += inc_y;
y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1;
y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1;
#else
y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i;
y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r;
y_ptr += inc_y;
y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1;
y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1;
#endif
a_ptr += lda;
y_ptr += inc_y;
j += 2;
}
while (j < n) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
#else
temp_r = a_ptr[0] * x0 + a_ptr[1] * x1;
temp_i = a_ptr[0] * x1 - a_ptr[1] * x0;
#endif
#if !defined(XCONJ)
y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i;
y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r;
#else
y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i;
y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r;
#endif
a_ptr += lda;
y_ptr += inc_y;
j++;
}
return (0);
}
return (0);
}

View File

@ -27,8 +27,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#define NBMAX 8192
#define PREFETCH 1
#define NBMAX 1024
//#define PREFETCH 1
#include <altivec.h>
#define HAVE_KERNEL4x8_ASM 1

View File

@ -36,8 +36,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
#define CABS1(x,i) ABS(x[i])+ABS(x[i+1])
#define USE_MASK_PERMUTATIONS 1 //with this type of permutation gcc output a little faster code
#if !defined(USE_MASK_PERMUTATIONS)
/*
 * Merge the even-indexed 32-bit word elements of a and b using the
 * "vmrgew" (Vector Merge Even Word) instruction, i.e. the result
 * interleaves a and b's words 0 and 2.  Written as inline asm rather
 * than an intrinsic; only compiled when USE_MASK_PERMUTATIONS is not
 * defined (the mask-permutation path uses vec_perm instead).
 */
static inline __attribute__((always_inline)) __vector float mvec_mergee(__vector float a,__vector float b ){
__vector float result;
__asm__ (
"vmrgew %0,%1,%2;\n"
: "=v" (result)
: "v" (a),
"v" (b)
: );
return result;
}
/*
 * Merge the odd-indexed 32-bit word elements of a and b using the
 * "vmrgow" (Vector Merge Odd Word) instruction — the odd-word
 * counterpart of mvec_mergee above.  Only compiled when
 * USE_MASK_PERMUTATIONS is not defined.
 */
static inline __attribute__((always_inline)) __vector float mvec_mergeo(__vector float a,__vector float b ){
__vector float result;
__asm__ (
"vmrgow %0,%1,%2;\n"
: "=v" (result)
: "v" (a),
"v" (b)
: );
return result;
}
#endif
/**
* Find maximum index
@ -51,12 +76,16 @@ static BLASLONG ciamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
BLASLONG index;
BLASLONG i;
#if defined(USE_MASK_PERMUTATIONS)
register __vector unsigned int static_index0 = {0,1,2,3};
#else
register __vector unsigned int static_index0 = {2,0,3,1};
#endif
register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register
register __vector unsigned int temp1= temp0<<1; //{8,8,8,8}
register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7};
register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11};
register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15};
register __vector unsigned int static_index1=static_index0 +temp0;
register __vector unsigned int static_index2=static_index0 +temp1;
register __vector unsigned int static_index3=static_index1 +temp1;
temp0=vec_xor(temp0,temp0);
temp1=temp1 <<1 ; //{16,16,16,16}
register __vector unsigned int temp_add=temp1 <<1; //{32,32,32,32}
@ -64,9 +93,11 @@ static BLASLONG ciamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
register __vector float quadruple_values={0,0,0,0};
register __vector float * v_ptrx=(__vector float *)x;
#if defined(USE_MASK_PERMUTATIONS)
register __vector unsigned char real_pack_mask = { 0,1,2,3,8,9,10,11,16,17,18,19, 24,25,26,27};
register __vector unsigned char image_pack_mask= {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31};
for(; i<n; i+=32){
#endif
for(; i<n; i+=32 ){
//absolute temporary complex vectors
register __vector float v0=vec_abs(v_ptrx[0]);
register __vector float v1=vec_abs(v_ptrx[1]);
@ -78,8 +109,10 @@ static BLASLONG ciamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
register __vector float v7=vec_abs(v_ptrx[7]);
//pack complex real and imaginary parts together to sum real+image
#if defined(USE_MASK_PERMUTATIONS)
register __vector float t1=vec_perm(v0,v1,real_pack_mask);
register __vector float ti=vec_perm(v0,v1,image_pack_mask);
v0=t1+ti; //sum quadruple real with quadruple image
register __vector float t2=vec_perm(v2,v3,real_pack_mask);
register __vector float ti2=vec_perm(v2,v3,image_pack_mask);
@ -90,6 +123,22 @@ static BLASLONG ciamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
t2=vec_perm(v6,v7,real_pack_mask);
ti2=vec_perm(v6,v7,image_pack_mask);
v3=t2+ti2;
#else
register __vector float t1=mvec_mergee(v0,v1);
register __vector float ti=mvec_mergeo(v0,v1);
v0=t1+ti; //sum quadruple real with quadruple image
register __vector float t2= mvec_mergee(v2,v3);
register __vector float ti2=mvec_mergeo(v2,v3);
v1=t2+ti2;
t1=mvec_mergee(v4,v5);
ti=mvec_mergeo(v4,v5);
v2=t1+ti; //sum
t2=mvec_mergee(v6,v7);
ti2=mvec_mergeo(v6,v7);
v3=t2+ti2;
#endif
// now we have 16 summed elements . lets compare them
v_ptrx+=8;
register __vector bool int r1=vec_cmpgt(v1,v0);
@ -114,8 +163,10 @@ static BLASLONG ciamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
v7=vec_abs(v_ptrx[7]);
//pack complex real and imaginary parts together to sum real+image
#if defined(USE_MASK_PERMUTATIONS)
t1=vec_perm(v0,v1,real_pack_mask);
ti=vec_perm(v0,v1,image_pack_mask);
v0=t1+ti; //sum quadruple real with quadruple image
t2=vec_perm(v2,v3,real_pack_mask);
ti2=vec_perm(v2,v3,image_pack_mask);
@ -126,6 +177,22 @@ static BLASLONG ciamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
t2=vec_perm(v6,v7,real_pack_mask);
ti2=vec_perm(v6,v7,image_pack_mask);
v3=t2+ti2;
#else
t1=mvec_mergee(v0,v1);
ti=mvec_mergeo(v0,v1);
v0=t1+ti; //sum quadruple real with quadruple image
t2=mvec_mergee(v2,v3);
ti2=mvec_mergeo(v2,v3);
v1=t2+ti2;
t1=mvec_mergee(v4,v5);
ti=mvec_mergeo(v4,v5);
v2=t1+ti; //sum
t2=mvec_mergee(v6,v7);
ti2=mvec_mergeo(v6,v7);
v3=t2+ti2;
#endif
// now we have 16 summed elements {from 16 to 31} . lets compare them
v_ptrx+=8;
r1=vec_cmpgt(v1,v0);

465
kernel/power/sgemv_n.c Normal file
View File

@ -0,0 +1,465 @@
/***************************************************************************
Copyright (c) 2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#define NBMAX 4096
/*
 * Panel update for GEMV-N: y[0:n] += A[:,0:8] * (alpha * x[0:8]).
 * ap[0..3] address the first four columns; the remaining four columns
 * sit lda4 elements further along each pointer.  Rows are processed
 * four at a time with vector loads (n is a multiple of 4 here).
 */
static void sgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha)
{
    BLASLONG r;
    /* column pointers for the eight columns of this panel */
    __vector float *col0 = (__vector float*) ap[0];
    __vector float *col1 = (__vector float*) ap[1];
    __vector float *col2 = (__vector float*) ap[2];
    __vector float *col3 = (__vector float*) ap[3];
    __vector float *col4 = (__vector float*) (ap[0] + lda4);
    __vector float *col5 = (__vector float*) (ap[1] + lda4);
    __vector float *col6 = (__vector float*) (ap[2] + lda4);
    __vector float *col7 = (__vector float*) (ap[3] + lda4);
    __vector float *vy   = (__vector float*) y;
    /* splat alpha*x[k] across all four vector lanes */
    FLOAT s0 = xo[0] * *alpha;
    FLOAT s1 = xo[1] * *alpha;
    FLOAT s2 = xo[2] * *alpha;
    FLOAT s3 = xo[3] * *alpha;
    FLOAT s4 = xo[4] * *alpha;
    FLOAT s5 = xo[5] * *alpha;
    FLOAT s6 = xo[6] * *alpha;
    FLOAT s7 = xo[7] * *alpha;
    __vector float vx0 = {s0, s0, s0, s0};
    __vector float vx1 = {s1, s1, s1, s1};
    __vector float vx2 = {s2, s2, s2, s2};
    __vector float vx3 = {s3, s3, s3, s3};
    __vector float vx4 = {s4, s4, s4, s4};
    __vector float vx5 = {s5, s5, s5, s5};
    __vector float vx6 = {s6, s6, s6, s6};
    __vector float vx7 = {s7, s7, s7, s7};
    for (r = 0; r < n / 4; r++) {
        register __vector float acc = vy[r];
        acc += vx0 * col0[r] + vx1 * col1[r] + vx2 * col2[r] + vx3 * col3[r];
        acc += vx4 * col4[r] + vx5 * col5[r] + vx6 * col6[r] + vx7 * col7[r];
        vy[r] = acc;
    }
}
/* Panel update: y[0:n] += A[:,0:4] * (alpha * x[0:4]), four rows per step. */
static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
{
    BLASLONG r;
    /* pre-scale the four x entries by alpha and broadcast them */
    FLOAT s0 = xo[0] * *alpha;
    FLOAT s1 = xo[1] * *alpha;
    FLOAT s2 = xo[2] * *alpha;
    FLOAT s3 = xo[3] * *alpha;
    __vector float vx0 = {s0, s0, s0, s0};
    __vector float vx1 = {s1, s1, s1, s1};
    __vector float vx2 = {s2, s2, s2, s2};
    __vector float vx3 = {s3, s3, s3, s3};
    __vector float *col0 = (__vector float*) ap[0];
    __vector float *col1 = (__vector float*) ap[1];
    __vector float *col2 = (__vector float*) ap[2];
    __vector float *col3 = (__vector float*) ap[3];
    __vector float *vy   = (__vector float*) y;
    for (r = 0; r < n / 4; r++) {
        vy[r] += vx0 * col0[r] + vx1 * col1[r] + vx2 * col2[r] + vx3 * col3[r];
    }
}
/* Panel update: y[0:n] += alpha * (x[0]*A(:,0) + x[1]*A(:,1)). */
static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
    BLASLONG r;
    FLOAT s0 = x[0] * *alpha;
    FLOAT s1 = x[1] * *alpha;
    __vector float vx0 = {s0, s0, s0, s0};
    __vector float vx1 = {s1, s1, s1, s1};
    __vector float *col0 = (__vector float*) ap[0];
    __vector float *col1 = (__vector float*) ap[1];
    __vector float *vy   = (__vector float*) y;
    for (r = 0; r < n / 4; r++) {
        vy[r] += vx0 * col0[r] + vx1 * col1[r];
    }
}
/* Panel update for a single column: y[0:n] += alpha * x[0] * A(:,0). */
static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
    BLASLONG r;
    FLOAT s0 = x[0] * *alpha;
    __vector float vx0 = {s0, s0, s0, s0};
    __vector float *col0 = (__vector float*) ap;
    __vector float *vy   = (__vector float*) y;
    for (r = 0; r < n / 4; r++) {
        vy[r] += vx0 * col0[r];
    }
}
/* Scatter-add the contiguously buffered results back into strided y:
   dest[i*inc_dest] += src[i] for i in [0, n). */
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
{
    BLASLONG i;
    for (i = 0; i < n; i++) {
        dest[i * inc_dest] += src[i];
    }
}
/*
 * SGEMV, no-transpose: y := alpha * A * x + y.
 * A is column-major with leading dimension lda, m rows, n columns.
 * Rows are processed in blocks of at most NBMAX; inside a block the
 * columns are consumed 8/4/2/1 at a time by the vector kernels above.
 * dummy1 is unused (kept for the generic GEMV kernel signature);
 * buffer is caller-provided scratch, used for y when inc_y != 1.
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
BLASLONG i;
FLOAT *a_ptr;
FLOAT *x_ptr;
FLOAT *y_ptr;
FLOAT *ap[4];
BLASLONG n1;
BLASLONG m1;
BLASLONG m2;
BLASLONG m3;
BLASLONG n2;
/* element offsets of 4 and 8 columns */
BLASLONG lda4 = lda << 2;
BLASLONG lda8 = lda << 3;
FLOAT xbuffer[8],*ybuffer;
if ( m < 1 ) return(0);
if ( n < 1 ) return(0);
ybuffer = buffer;
/* unit-stride x allows the 4x8 kernel, so split n into groups of 8;
   otherwise x must be gathered into xbuffer and we go 4 at a time */
if ( inc_x == 1 )
{
n1 = n >> 3 ;
n2 = n & 7 ;
}
else
{
n1 = n >> 2 ;
n2 = n & 3 ;
}
/* m3 = 0..3 leftover rows done scalar at the end;
   m1 = rows covered by full-size blocks; m2 = remaining vector rows */
m3 = m & 3 ;
m1 = m & -4 ;
m2 = (m & (NBMAX-1)) - m3 ;
y_ptr = y;
BLASLONG NB = NBMAX;
/* loop over row blocks; the final pass shrinks NB to m2 and exits */
while ( NB == NBMAX )
{
m1 -= NB;
if ( m1 < 0)
{
if ( m2 == 0 ) break;
NB = m2;
}
a_ptr = a;
x_ptr = x;
ap[0] = a_ptr;
ap[1] = a_ptr + lda;
ap[2] = ap[1] + lda;
ap[3] = ap[2] + lda;
/* strided y: accumulate into zeroed scratch, copied out below;
   NOTE(review): the literal 4 is sizeof(float) for this single-precision
   kernel -- NB*sizeof(FLOAT) would be clearer and safer */
if ( inc_y != 1 )
memset(ybuffer,0,NB*4);
else
ybuffer = y_ptr;
if ( inc_x == 1 )
{
/* contiguous x: 8 columns per call, then 4/2/1 tail kernels */
for( i = 0; i < n1 ; i++)
{
sgemv_kernel_4x8(NB,ap,x_ptr,ybuffer,lda4,&alpha);
ap[0] += lda8;
ap[1] += lda8;
ap[2] += lda8;
ap[3] += lda8;
a_ptr += lda8;
x_ptr += 8;
}
if ( n2 & 4 )
{
sgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha);
ap[0] += lda4;
ap[1] += lda4;
ap[2] += lda4;
ap[3] += lda4;
a_ptr += lda4;
x_ptr += 4;
}
if ( n2 & 2 )
{
sgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha);
a_ptr += lda*2;
x_ptr += 2;
}
if ( n2 & 1 )
{
sgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha);
a_ptr += lda;
x_ptr += 1;
}
}
else
{
/* strided x: gather 4 elements into xbuffer per 4x4 call */
for( i = 0; i < n1 ; i++)
{
xbuffer[0] = x_ptr[0];
x_ptr += inc_x;
xbuffer[1] = x_ptr[0];
x_ptr += inc_x;
xbuffer[2] = x_ptr[0];
x_ptr += inc_x;
xbuffer[3] = x_ptr[0];
x_ptr += inc_x;
sgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha);
ap[0] += lda4;
ap[1] += lda4;
ap[2] += lda4;
ap[3] += lda4;
a_ptr += lda4;
}
for( i = 0; i < n2 ; i++)
{
xbuffer[0] = x_ptr[0];
x_ptr += inc_x;
sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha);
a_ptr += lda;
}
}
/* advance to the next block of NB rows */
a += NB;
if ( inc_y != 1 )
{
add_y(NB,ybuffer,y_ptr,inc_y);
y_ptr += NB * inc_y;
}
else
y_ptr += NB ;
}
/* scalar handling of the 1..3 leftover rows */
if ( m3 == 0 ) return(0);
if ( m3 == 3 )
{
a_ptr = a;
x_ptr = x;
FLOAT temp0 = 0.0;
FLOAT temp1 = 0.0;
FLOAT temp2 = 0.0;
/* lda == 3 means the leftover rows are the whole matrix, packed
   contiguously: unroll 4 columns per iteration */
if ( lda == 3 && inc_x ==1 )
{
for( i = 0; i < ( n & -4 ); i+=4 )
{
temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1];
temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1];
temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1];
temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3];
temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3];
temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3];
a_ptr += 12;
x_ptr += 4;
}
for( ; i < n; i++ )
{
temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0];
temp2 += a_ptr[2] * x_ptr[0];
a_ptr += 3;
x_ptr ++;
}
}
else
{
for( i = 0; i < n; i++ )
{
temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0];
temp2 += a_ptr[2] * x_ptr[0];
a_ptr += lda;
x_ptr += inc_x;
}
}
y_ptr[0] += alpha * temp0;
y_ptr += inc_y;
y_ptr[0] += alpha * temp1;
y_ptr += inc_y;
y_ptr[0] += alpha * temp2;
return(0);
}
if ( m3 == 2 )
{
a_ptr = a;
x_ptr = x;
FLOAT temp0 = 0.0;
FLOAT temp1 = 0.0;
if ( lda == 2 && inc_x ==1 )
{
for( i = 0; i < (n & -4) ; i+=4 )
{
temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1];
temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1];
temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3];
temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3];
a_ptr += 8;
x_ptr += 4;
}
for( ; i < n; i++ )
{
temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0];
a_ptr += 2;
x_ptr ++;
}
}
else
{
for( i = 0; i < n; i++ )
{
temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0];
a_ptr += lda;
x_ptr += inc_x;
}
}
y_ptr[0] += alpha * temp0;
y_ptr += inc_y;
y_ptr[0] += alpha * temp1;
return(0);
}
if ( m3 == 1 )
{
a_ptr = a;
x_ptr = x;
FLOAT temp = 0.0;
/* lda == 1: single row stored contiguously -> plain dot product */
if ( lda == 1 && inc_x ==1 )
{
for( i = 0; i < (n & -4); i+=4 )
{
temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3];
}
for( ; i < n; i++ )
{
temp += a_ptr[i] * x_ptr[i];
}
}
else
{
for( i = 0; i < n; i++ )
{
temp += a_ptr[0] * x_ptr[0];
a_ptr += lda;
x_ptr += inc_x;
}
}
y_ptr[0] += alpha * temp;
return(0);
}
return(0);
}

513
kernel/power/sgemv_n_8.c Normal file
View File

@ -0,0 +1,513 @@
/***************************************************************************
Copyright (c) 2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/*** Note ***
Unused kernel.
This kernel works, but it was not competitive enough to be put into production.
It may be used and tested in the future, or serve as a skeleton for switching to inline assembly.
*/
#include "common.h"
#define NBMAX 4096
/*
 * Panel update for GEMV-N: y[0:n] += A[:,0:8] * (alpha * x[0:8]).
 * ap[0..3] address the first four columns; the other four are lda4
 * elements further along.  Two vectors (eight rows) per iteration,
 * so n is expected to be a multiple of 8 here.
 */
static void sgemv_kernel_8x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha)
{
    BLASLONG r;
    /* column pointers for the eight columns of this panel */
    __vector float *col0 = (__vector float*) ap[0];
    __vector float *col1 = (__vector float*) ap[1];
    __vector float *col2 = (__vector float*) ap[2];
    __vector float *col3 = (__vector float*) ap[3];
    __vector float *col4 = (__vector float*) (ap[0] + lda4);
    __vector float *col5 = (__vector float*) (ap[1] + lda4);
    __vector float *col6 = (__vector float*) (ap[2] + lda4);
    __vector float *col7 = (__vector float*) (ap[3] + lda4);
    __vector float *vy   = (__vector float*) y;
    /* broadcast alpha*x[k] across all four lanes */
    FLOAT s0 = xo[0] * *alpha;
    FLOAT s1 = xo[1] * *alpha;
    FLOAT s2 = xo[2] * *alpha;
    FLOAT s3 = xo[3] * *alpha;
    FLOAT s4 = xo[4] * *alpha;
    FLOAT s5 = xo[5] * *alpha;
    FLOAT s6 = xo[6] * *alpha;
    FLOAT s7 = xo[7] * *alpha;
    register __vector float vx0 = {s0, s0, s0, s0};
    register __vector float vx1 = {s1, s1, s1, s1};
    register __vector float vx2 = {s2, s2, s2, s2};
    register __vector float vx3 = {s3, s3, s3, s3};
    register __vector float vx4 = {s4, s4, s4, s4};
    register __vector float vx5 = {s5, s5, s5, s5};
    register __vector float vx6 = {s6, s6, s6, s6};
    register __vector float vx7 = {s7, s7, s7, s7};
    for (r = 0; r < n / 4; r += 2) {
        register __vector float acc0 = vy[r];
        register __vector float acc1 = vy[r + 1];
        acc0 += vx0 * col0[r] + vx1 * col1[r] + vx2 * col2[r] + vx3 * col3[r];
        acc0 += vx4 * col4[r] + vx5 * col5[r] + vx6 * col6[r] + vx7 * col7[r];
        acc1 += vx0 * col0[r + 1] + vx1 * col1[r + 1] + vx2 * col2[r + 1] + vx3 * col3[r + 1];
        acc1 += vx4 * col4[r + 1] + vx5 * col5[r + 1] + vx6 * col6[r + 1] + vx7 * col7[r + 1];
        vy[r]     = acc0;
        vy[r + 1] = acc1;
    }
}
/* Panel update: y[0:n] += A[:,0:4] * (alpha * x[0:4]), eight rows per step. */
static void sgemv_kernel_8x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
{
    BLASLONG r;
    FLOAT s0 = xo[0] * *alpha;
    FLOAT s1 = xo[1] * *alpha;
    FLOAT s2 = xo[2] * *alpha;
    FLOAT s3 = xo[3] * *alpha;
    __vector float vx0 = {s0, s0, s0, s0};
    __vector float vx1 = {s1, s1, s1, s1};
    __vector float vx2 = {s2, s2, s2, s2};
    __vector float vx3 = {s3, s3, s3, s3};
    __vector float *col0 = (__vector float*) ap[0];
    __vector float *col1 = (__vector float*) ap[1];
    __vector float *col2 = (__vector float*) ap[2];
    __vector float *col3 = (__vector float*) ap[3];
    __vector float *vy   = (__vector float*) y;
    for (r = 0; r < n / 4; r += 2) {
        register __vector float acc0 = vy[r];
        register __vector float acc1 = vy[r + 1];
        acc0 += vx0 * col0[r] + vx1 * col1[r] + vx2 * col2[r] + vx3 * col3[r];
        acc1 += vx0 * col0[r + 1] + vx1 * col1[r + 1] + vx2 * col2[r + 1] + vx3 * col3[r + 1];
        vy[r]     = acc0;
        vy[r + 1] = acc1;
    }
}
/* Panel update: y[0:n] += alpha * (x[0]*A(:,0) + x[1]*A(:,1)), 8 rows/step. */
static void sgemv_kernel_8x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
    BLASLONG r;
    FLOAT s0 = x[0] * *alpha;
    FLOAT s1 = x[1] * *alpha;
    __vector float vx0 = {s0, s0, s0, s0};
    __vector float vx1 = {s1, s1, s1, s1};
    __vector float *col0 = (__vector float*) ap[0];
    __vector float *col1 = (__vector float*) ap[1];
    __vector float *vy   = (__vector float*) y;
    for (r = 0; r < n / 4; r += 2) {
        vy[r]     += vx0 * col0[r] + vx1 * col1[r];
        vy[r + 1] += vx0 * col0[r + 1] + vx1 * col1[r + 1];
    }
}
/* Panel update for one column: y[0:n] += alpha * x[0] * A(:,0), 8 rows/step. */
static void sgemv_kernel_8x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
    BLASLONG r;
    FLOAT s0 = x[0] * *alpha;
    __vector float vx0 = {s0, s0, s0, s0};
    __vector float *col0 = (__vector float*) ap;
    __vector float *vy   = (__vector float*) y;
    for (r = 0; r < n / 4; r += 2) {
        vy[r]     += vx0 * col0[r];
        vy[r + 1] += vx0 * col0[r + 1];
    }
}
/* Scatter-add the contiguously buffered results back into strided y:
   dest[i*inc_dest] += src[i] for i in [0, n). */
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
{
    BLASLONG i;
    for (i = 0; i < n; i++) {
        dest[i * inc_dest] += src[i];
    }
}
/*
 * SGEMV, no-transpose, 8-rows-per-vector-step variant:
 * y := alpha * A * x + y, with column-major A (leading dimension lda).
 * Rows are blocked by NBMAX; columns are consumed 8/4/2/1 at a time.
 * The up-to-7 leftover rows (m3) are handled by scalar tails below.
 * dummy1 is unused (generic GEMV kernel signature); buffer is scratch
 * for y when inc_y != 1.  Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
BLASLONG i;
FLOAT *a_ptr;
FLOAT *x_ptr;
FLOAT *y_ptr;
FLOAT *ap[4];
BLASLONG n1;
BLASLONG m1;
BLASLONG m2;
BLASLONG m3;
BLASLONG n2;
/* element offsets of 4 and 8 columns */
BLASLONG lda4 = lda << 2;
BLASLONG lda8 = lda << 3;
FLOAT xbuffer[8],*ybuffer;
if ( m < 1 ) return(0);
if ( n < 1 ) return(0);
ybuffer = buffer;
/* unit-stride x allows the 8x8 kernel (groups of 8 columns);
   otherwise x is gathered and processed 4 columns at a time */
if ( inc_x == 1 )
{
n1 = n >> 3 ;
n2 = n & 7 ;
}
else
{
n1 = n >> 2 ;
n2 = n & 3 ;
}
/* m3 = 0..7 leftover rows; m1 = rows in full-size blocks;
   m2 = remaining vector rows for the final shrunken block */
m3 = m & 7 ;
m1 = m - m3;
m2 = (m & (NBMAX-1)) - m3 ;
y_ptr = y;
BLASLONG NB = NBMAX;
/* loop over row blocks; the last pass shrinks NB to m2 and exits */
while ( NB == NBMAX )
{
m1 -= NB;
if ( m1 < 0)
{
if ( m2 == 0 ) break;
NB = m2;
}
a_ptr = a;
x_ptr = x;
ap[0] = a_ptr;
ap[1] = a_ptr + lda;
ap[2] = ap[1] + lda;
ap[3] = ap[2] + lda;
/* strided y: accumulate into zeroed scratch, copied out below;
   NOTE(review): the literal 4 is sizeof(float) for single precision --
   NB*sizeof(FLOAT) would be clearer */
if ( inc_y != 1 )
memset(ybuffer,0,NB*4);
else
ybuffer = y_ptr;
if ( inc_x == 1 )
{
for( i = 0; i < n1 ; i++)
{
sgemv_kernel_8x8(NB,ap,x_ptr,ybuffer,lda4,&alpha);
ap[0] += lda8;
ap[1] += lda8;
ap[2] += lda8;
ap[3] += lda8;
a_ptr += lda8;
x_ptr += 8;
}
if ( n2 & 4 )
{
sgemv_kernel_8x4(NB,ap,x_ptr,ybuffer,&alpha);
ap[0] += lda4;
ap[1] += lda4;
ap[2] += lda4;
ap[3] += lda4;
a_ptr += lda4;
x_ptr += 4;
}
if ( n2 & 2 )
{
sgemv_kernel_8x2(NB,ap,x_ptr,ybuffer,&alpha);
a_ptr += lda*2;
x_ptr += 2;
}
if ( n2 & 1 )
{
sgemv_kernel_8x1(NB,a_ptr,x_ptr,ybuffer,&alpha);
a_ptr += lda;
x_ptr += 1;
}
}
else
{
/* strided x: gather 4 elements into xbuffer per 8x4 call */
for( i = 0; i < n1 ; i++)
{
xbuffer[0] = x_ptr[0];
x_ptr += inc_x;
xbuffer[1] = x_ptr[0];
x_ptr += inc_x;
xbuffer[2] = x_ptr[0];
x_ptr += inc_x;
xbuffer[3] = x_ptr[0];
x_ptr += inc_x;
sgemv_kernel_8x4(NB,ap,xbuffer,ybuffer,&alpha);
ap[0] += lda4;
ap[1] += lda4;
ap[2] += lda4;
ap[3] += lda4;
a_ptr += lda4;
}
for( i = 0; i < n2 ; i++)
{
xbuffer[0] = x_ptr[0];
x_ptr += inc_x;
sgemv_kernel_8x1(NB,a_ptr,xbuffer,ybuffer,&alpha);
a_ptr += lda;
}
}
/* advance to the next block of NB rows */
a += NB;
if ( inc_y != 1 )
{
add_y(NB,ybuffer,y_ptr,inc_y);
y_ptr += NB * inc_y;
}
else
y_ptr += NB ;
}
/* scalar tails: handle the leftover rows 4, then 2, then 1 at a time */
if ( m3 & 4 )
{
a_ptr = a;
x_ptr = x;
FLOAT temp0 = 0.0;
FLOAT temp1 = 0.0;
FLOAT temp2 = 0.0;
FLOAT temp3 = 0.0;
/* lda == 4: these four rows are the whole (contiguous) matrix */
if ( lda == 4 && inc_x ==1 )
{
for( i = 0; i < ( n & -4 ); i+=4 )
{
temp0 += a_ptr[0] * x_ptr[0] + a_ptr[4] * x_ptr[1];
temp1 += a_ptr[1] * x_ptr[0] + a_ptr[5] * x_ptr[1];
temp2 += a_ptr[2] * x_ptr[0] + a_ptr[6] * x_ptr[1];
temp3 += a_ptr[3] * x_ptr[0] + a_ptr[7] * x_ptr[1];
temp0 += a_ptr[8] * x_ptr[2] + a_ptr[12] * x_ptr[3];
temp1 += a_ptr[9] * x_ptr[2] + a_ptr[13] * x_ptr[3];
temp2 += a_ptr[10] * x_ptr[2] + a_ptr[14] * x_ptr[3];
temp3 += a_ptr[11] * x_ptr[2] + a_ptr[15] * x_ptr[3];
a_ptr += 16;
x_ptr += 4;
}
for( ; i < n; i++ )
{
temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0];
temp2 += a_ptr[2] * x_ptr[0];
temp3 += a_ptr[3] * x_ptr[0] ;
a_ptr +=4;
x_ptr ++;
}
}
else
{
for( i = 0; i < n; i++ )
{
temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0];
temp2 += a_ptr[2] * x_ptr[0];
temp3 += a_ptr[3] * x_ptr[0];
a_ptr += lda;
x_ptr += inc_x;
}
}
y_ptr[0] += alpha * temp0;
y_ptr += inc_y;
y_ptr[0] += alpha * temp1;
y_ptr += inc_y;
y_ptr[0] += alpha * temp2;
y_ptr += inc_y;
y_ptr[0] += alpha * temp3;
y_ptr += inc_y;
a += 4;
}
if ( m3 & 2 )
{
a_ptr = a;
x_ptr = x;
FLOAT temp0 = 0.0;
FLOAT temp1 = 0.0;
if ( lda == 2 && inc_x ==1 )
{
for( i = 0; i < (n & -4) ; i+=4 )
{
temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1];
temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1];
temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3];
temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3];
a_ptr += 8;
x_ptr += 4;
}
for( ; i < n; i++ )
{
temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0];
a_ptr += 2;
x_ptr ++;
}
}
else
{
for( i = 0; i < n; i++ )
{
temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0];
a_ptr += lda;
x_ptr += inc_x;
}
}
y_ptr[0] += alpha * temp0;
y_ptr += inc_y;
y_ptr[0] += alpha * temp1;
y_ptr += inc_y;
a += 2;
}
if ( m3 & 1 )
{
a_ptr = a;
x_ptr = x;
FLOAT temp = 0.0;
/* lda == 1: single row stored contiguously -> plain dot product */
if ( lda == 1 && inc_x ==1 )
{
for( i = 0; i < (n & -4); i+=4 )
{
temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3];
}
for( ; i < n; i++ )
{
temp += a_ptr[i] * x_ptr[i];
}
}
else
{
for( i = 0; i < n; i++ )
{
temp += a_ptr[0] * x_ptr[0];
a_ptr += lda;
x_ptr += inc_x;
}
}
y_ptr[0] += alpha * temp;
}
return(0);
}

480
kernel/power/sgemv_t.c Normal file
View File

@ -0,0 +1,480 @@
/***************************************************************************
Copyright (c) 2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#define NBMAX 2048
#include <altivec.h>
/*
 * GEMV-T panel: y[j] += alpha * dot(A(:,j), x) for the eight columns
 * starting at ap (column stride lda).  Each column gets its own vector
 * accumulator, reduced to a scalar at the end; n is a multiple of 4.
 */
static void sgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) {
    BLASLONG i, j;
    __vector float *col[8];
    __vector float acc[8];
    __vector float zero = {0, 0, 0, 0};
    __vector float *vx = (__vector float*) x;
    for (j = 0; j < 8; j++) {
        col[j] = (__vector float*) (ap + j * lda);
        acc[j] = zero;
    }
    for (i = 0; i < n / 4; i++) {
        for (j = 0; j < 8; j++) {
            acc[j] += vx[i] * col[j][i];
        }
    }
    /* horizontal add of each accumulator, scaled by alpha */
    for (j = 0; j < 8; j++) {
        y[j] += alpha * (acc[j][0] + acc[j][1] + acc[j][2] + acc[j][3]);
    }
}
/* GEMV-T panel: y[j] += alpha * dot(A(:,j), x) for four columns. */
static void sgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) {
    BLASLONG i, j;
    __vector float *col[4];
    __vector float acc[4];
    __vector float zero = {0, 0, 0, 0};
    __vector float *vx = (__vector float*) x;
    for (j = 0; j < 4; j++) {
        col[j] = (__vector float*) (ap + j * lda);
        acc[j] = zero;
    }
    for (i = 0; i < n / 4; i++) {
        for (j = 0; j < 4; j++) {
            acc[j] += vx[i] * col[j][i];
        }
    }
    for (j = 0; j < 4; j++) {
        y[j] += alpha * (acc[j][0] + acc[j][1] + acc[j][2] + acc[j][3]);
    }
}
/*
 * Dot two consecutive columns of A (column stride lda) against x and add
 * the alpha-scaled results to y[0] and y[inc_y].
 * n is assumed to be a multiple of 4.
 */
static void sgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha, BLASLONG inc_y) {
    __vector float *col0 = (__vector float*) ap;
    __vector float *col1 = (__vector float*) (ap + lda);
    __vector float *vx   = (__vector float*) x;
    __vector float acc0 = {0,0,0,0};
    __vector float acc1 = {0,0,0,0};
    BLASLONG k, vecs = n / 4;
    for (k = 0; k < vecs; k++) {
        register __vector float xv = vx[k];
        acc0 += xv * col0[k];
        acc1 += xv * col1[k];
    }
    y[0]     += alpha * (acc0[0] + acc0[1] + acc0[2] + acc0[3]);
    y[inc_y] += alpha * (acc1[0] + acc1[1] + acc1[2] + acc1[3]);
}
/*
 * Dot a single column of A against x and add the alpha-scaled result to
 * y[0].  n is assumed to be a multiple of 4.
 */
static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) {
    __vector float *col = (__vector float*) ap;
    __vector float *vx  = (__vector float*) x;
    __vector float acc = {0,0,0,0};
    BLASLONG k, vecs = n / 4;
    for (k = 0; k < vecs; k++) {
        acc += vx[k] * col[k];
    }
    y[0] += alpha * (acc[0] + acc[1] + acc[2] + acc[3]);
}
/* Gather n elements of a strided vector (stride inc_src) into a packed
 * destination buffer so the vector kernels can load it contiguously. */
static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) {
    BLASLONG k;
    for (k = 0; k < n; k++) {
        dest[k] = src[k * inc_src];
    }
}
/*
 * Transposed single-precision GEMV driver: y += alpha * op(A) * x, where
 * each of the n output elements accumulates the dot product of one lda-strided
 * column of A (m entries) with x — matching the sgemv_kernel_4x* helpers above.
 * a:      matrix data, column stride lda
 * x:      input vector, m entries with stride inc_x
 * y:      output vector, n entries with stride inc_y (updated in place)
 * buffer: caller-provided scratch used to densify x when inc_x != 1
 * dummy1 is unused.  Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) {
BLASLONG i;
BLASLONG j;
FLOAT *a_ptr;
FLOAT *x_ptr;
FLOAT *y_ptr;
BLASLONG n1;
BLASLONG m1;
BLASLONG m2;
BLASLONG m3;
BLASLONG n2;
FLOAT ybuffer[8], *xbuffer;
/* quick exit for degenerate sizes */
if (m < 1) return (0);
if (n < 1) return (0);
xbuffer = buffer;
/* n1 = number of 8-column panels, n2 = leftover columns (< 8) */
n1 = n >> 3;
n2 = n & 7;
/* m3 = row remainder (< 4) handled by scalar code at the end;
   m1/m2 split the multiple-of-4 row count into NBMAX-sized strips,
   so every strip length NB is a multiple of 4 as the kernels require */
m3 = m & 3;
m1 = m - m3;
m2 = (m & (NBMAX - 1)) - m3;
BLASLONG NB = NBMAX;
/* walk the rows in strips of NB; the last (shorter) strip sets
   NB = m2, which also terminates the loop after it is processed */
while (NB == NBMAX) {
m1 -= NB;
if (m1 < 0) {
if (m2 == 0) break;
NB = m2;
}
y_ptr = y;
a_ptr = a;
x_ptr = x;
/* the vector kernels need contiguous x: gather it when strided */
if (inc_x != 1)
copy_x(NB, x_ptr, xbuffer, inc_x);
else
xbuffer = x_ptr;
BLASLONG lda8 = lda << 3; /* element offset spanning 8 columns */
if (inc_y == 1) {
/* contiguous y: the 8-column kernel updates y directly */
for (i = 0; i < n1; i++) {
sgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, y_ptr, alpha);
y_ptr += 8;
a_ptr += lda8;
}
} else {
/* strided y: accumulate into ybuffer, then scatter with inc_y */
for (i = 0; i < n1; i++) {
ybuffer[0] = 0;
ybuffer[1] = 0;
ybuffer[2] = 0;
ybuffer[3] = 0;
ybuffer[4] = 0;
ybuffer[5] = 0;
ybuffer[6] = 0;
ybuffer[7] = 0;
sgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, ybuffer, alpha);
*y_ptr += ybuffer[0];
y_ptr += inc_y;
*y_ptr += ybuffer[1];
y_ptr += inc_y;
*y_ptr += ybuffer[2];
y_ptr += inc_y;
*y_ptr += ybuffer[3];
y_ptr += inc_y;
*y_ptr += ybuffer[4];
y_ptr += inc_y;
*y_ptr += ybuffer[5];
y_ptr += inc_y;
*y_ptr += ybuffer[6];
y_ptr += inc_y;
*y_ptr += ybuffer[7];
y_ptr += inc_y;
a_ptr += lda8;
}
}
/* column remainder: 4-, 2- and 1-column kernels as flagged by n2 */
if (n2 & 4) {
ybuffer[0] = 0;
ybuffer[1] = 0;
ybuffer[2] = 0;
ybuffer[3] = 0;
sgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha);
a_ptr += lda<<2;
*y_ptr += ybuffer[0];
y_ptr += inc_y;
*y_ptr += ybuffer[1];
y_ptr += inc_y;
*y_ptr += ybuffer[2];
y_ptr += inc_y;
*y_ptr += ybuffer[3];
y_ptr += inc_y;
}
if (n2 & 2) {
sgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha, inc_y);
a_ptr += lda << 1;
y_ptr += 2 * inc_y;
}
if (n2 & 1) {
sgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha);
a_ptr += lda;
y_ptr += inc_y;
}
/* advance to the next strip of rows */
a += NB;
x += NB * inc_x;
}
if (m3 == 0) return (0);
/* scalar tail: the final m3 (1..3) rows.  xtemp* hold the remaining
   alpha-scaled x values; each branch offers a fast path for fully
   packed A (lda == m3, inc_y == 1), a 4-way unrolled path for unit
   inc_y with general lda, and a generic strided fallback */
x_ptr = x;
a_ptr = a;
if (m3 == 3) {
FLOAT xtemp0 = *x_ptr * alpha;
x_ptr += inc_x;
FLOAT xtemp1 = *x_ptr * alpha;
x_ptr += inc_x;
FLOAT xtemp2 = *x_ptr * alpha;
FLOAT *aj = a_ptr;
y_ptr = y;
if (lda == 3 && inc_y == 1) {
for (j = 0; j < (n & -4); j += 4) {
y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2;
y_ptr[j + 1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2;
y_ptr[j + 2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2;
y_ptr[j + 3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2;
aj += 12;
}
for (; j < n; j++) {
y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2;
aj += 3;
}
} else {
if (inc_y == 1) {
BLASLONG register lda2 = lda << 1;
BLASLONG register lda4 = lda << 2;
BLASLONG register lda3 = lda2 + lda;
for (j = 0; j < (n & -4); j += 4) {
y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2;
y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1 + *(aj + lda + 2) * xtemp2;
y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1 + *(aj + lda2 + 2) * xtemp2;
y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1 + *(aj + lda3 + 2) * xtemp2;
aj += lda4;
}
for (; j < n; j++) {
y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2;
aj += lda;
}
} else {
for (j = 0; j < n; j++) {
*y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2;
y_ptr += inc_y;
aj += lda;
}
}
}
return (0);
}
if (m3 == 2) {
FLOAT xtemp0 = *x_ptr * alpha;
x_ptr += inc_x;
FLOAT xtemp1 = *x_ptr * alpha;
FLOAT *aj = a_ptr;
y_ptr = y;
if (lda == 2 && inc_y == 1) {
for (j = 0; j < (n & -4); j += 4) {
y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1;
y_ptr[j + 1] += aj[2] * xtemp0 + aj[3] * xtemp1;
y_ptr[j + 2] += aj[4] * xtemp0 + aj[5] * xtemp1;
y_ptr[j + 3] += aj[6] * xtemp0 + aj[7] * xtemp1;
aj += 8;
}
for (; j < n; j++) {
y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1;
aj += 2;
}
} else {
if (inc_y == 1) {
BLASLONG register lda2 = lda << 1;
BLASLONG register lda4 = lda << 2;
BLASLONG register lda3 = lda2 + lda;
for (j = 0; j < (n & -4); j += 4) {
y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1;
y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1;
y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1;
y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1;
aj += lda4;
}
for (; j < n; j++) {
y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1;
aj += lda;
}
} else {
for (j = 0; j < n; j++) {
*y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1;
y_ptr += inc_y;
aj += lda;
}
}
}
return (0);
}
/* m3 == 1: a single remaining row */
FLOAT xtemp = *x_ptr * alpha;
FLOAT *aj = a_ptr;
y_ptr = y;
if (lda == 1 && inc_y == 1) {
for (j = 0; j < (n & -4); j += 4) {
y_ptr[j] += aj[j] * xtemp;
y_ptr[j + 1] += aj[j + 1] * xtemp;
y_ptr[j + 2] += aj[j + 2] * xtemp;
y_ptr[j + 3] += aj[j + 3] * xtemp;
}
for (; j < n; j++) {
y_ptr[j] += aj[j] * xtemp;
}
} else {
if (inc_y == 1) {
BLASLONG register lda2 = lda << 1;
BLASLONG register lda4 = lda << 2;
BLASLONG register lda3 = lda2 + lda;
for (j = 0; j < (n & -4); j += 4) {
y_ptr[j] += *aj * xtemp;
y_ptr[j + 1] += *(aj + lda) * xtemp;
y_ptr[j + 2] += *(aj + lda2) * xtemp;
y_ptr[j + 3] += *(aj + lda3) * xtemp;
aj += lda4;
}
for (; j < n; j++) {
y_ptr[j] += *aj * xtemp;
aj += lda;
}
} else {
for (j = 0; j < n; j++) {
*y_ptr += *aj * xtemp;
y_ptr += inc_y;
aj += lda;
}
}
}
return (0);
}

508
kernel/power/sgemv_t_8.c Normal file
View File

@ -0,0 +1,508 @@
/***************************************************************************
Copyright (c) 2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/****Note***
Unused kernel.
This kernel works, but it was not competitive enough to be added to production.
It could be used and tested in the future, or could serve as a base for switching to inline assembly.
*/
#include "common.h"
#include <stdio.h>
#define NBMAX 4096
#include <altivec.h>
/*
 * Dot eight consecutive columns of A (column stride lda, n rows each)
 * against x and add the alpha-scaled results to y[0..7].
 * The loop consumes two 4-float vectors per iteration, so n must be a
 * multiple of 8 (the caller rounds the strip length accordingly).
 */
static void sgemv_kernel_8x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) {
BLASLONG i;
FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7;
__vector float *va0, *va1, *va2, *va3, *va4, *va5, *va6, *va7, *v_x;
/* one vector accumulator per column */
register __vector float temp0 = {0,0,0,0};
register __vector float temp1 = {0,0,0,0};
register __vector float temp2 = {0,0,0,0};
register __vector float temp3 = {0,0,0,0};
register __vector float temp4 = {0,0,0,0};
register __vector float temp5 = {0,0,0,0};
register __vector float temp6 = {0,0,0,0};
register __vector float temp7 = {0,0,0,0};
/* column start pointers */
a0 = ap;
a1 = ap + lda;
a2 = a1 + lda;
a3 = a2 + lda;
a4 = a3 + lda;
a5 = a4 + lda;
a6 = a5 + lda;
a7 = a6 + lda;
va0 = (__vector float*) a0;
va1 = (__vector float*) a1;
va2 = (__vector float*) a2;
va3 = (__vector float*) a3;
va4 = (__vector float*) a4;
va5 = (__vector float*) a5;
va6 = (__vector float*) a6;
va7 = (__vector float*) a7;
v_x = (__vector float*) x;
/* unrolled by two vectors (8 floats) per iteration; every load is
   given a named temporary first so the compiler can schedule the
   loads ahead of the multiply-adds */
for (i = 0; i < n/4; i +=2) {
register __vector float vx1=v_x[i] ;
register __vector float vx2=v_x[i+1] ;
register __vector float va0_1=va0[i] ;
register __vector float va0_2=va0[i+1] ;
register __vector float va1_1=va1[i] ;
register __vector float va1_2=va1[i+1] ;
register __vector float va2_1=va2[i] ;
register __vector float va2_2=va2[i+1] ;
register __vector float va3_1=va3[i] ;
register __vector float va3_2=va3[i+1] ;
register __vector float va4_1=va4[i] ;
register __vector float va4_2=va4[i+1] ;
register __vector float va5_1=va5[i] ;
register __vector float va5_2=va5[i+1] ;
register __vector float va6_1=va6[i] ;
register __vector float va6_2=va6[i+1] ;
register __vector float va7_1=va7[i] ;
register __vector float va7_2=va7[i+1] ;
temp0 += vx1* va0_1 + vx2 * va0_2;
temp1 += vx1* va1_1 + vx2 * va1_2;
temp2 += vx1* va2_1 + vx2 * va2_2;
temp3 += vx1* va3_1 + vx2 * va3_2;
temp4 += vx1* va4_1 + vx2 * va4_2;
temp5 += vx1* va5_1 + vx2 * va5_2;
temp6 += vx1* va6_1 + vx2 * va6_2;
temp7 += vx1* va7_1 + vx2 * va7_2;
}
/* horizontal reduction of each accumulator, scaled by alpha */
y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]);
y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]);
y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]);
y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]);
y[4] += alpha * (temp4[0] + temp4[1]+temp4[2] + temp4[3]);
y[5] += alpha * (temp5[0] + temp5[1]+temp5[2] + temp5[3]);
y[6] += alpha * (temp6[0] + temp6[1]+temp6[2] + temp6[3]);
y[7] += alpha * (temp7[0] + temp7[1]+temp7[2] + temp7[3]);
}
/*
 * Dot four consecutive columns of A (column stride lda) against x and add
 * the alpha-scaled results to y[0..3].  The loop is unrolled by two
 * vectors per iteration, so n must be a multiple of 8.
 */
static void sgemv_kernel_8x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) {
    __vector float *col0 = (__vector float*) ap;
    __vector float *col1 = (__vector float*) (ap + lda);
    __vector float *col2 = (__vector float*) (ap + 2 * lda);
    __vector float *col3 = (__vector float*) (ap + 3 * lda);
    __vector float *vx   = (__vector float*) x;
    register __vector float acc0 = {0,0,0,0};
    register __vector float acc1 = {0,0,0,0};
    register __vector float acc2 = {0,0,0,0};
    register __vector float acc3 = {0,0,0,0};
    BLASLONG k, vecs = n / 4;
    for (k = 0; k < vecs; k += 2) {
        register __vector float x0 = vx[k];
        register __vector float x1 = vx[k + 1];
        acc0 += x0 * col0[k] + x1 * col0[k + 1];
        acc1 += x0 * col1[k] + x1 * col1[k + 1];
        acc2 += x0 * col2[k] + x1 * col2[k + 1];
        acc3 += x0 * col3[k] + x1 * col3[k + 1];
    }
    /* horizontal reduction of each accumulator, scaled by alpha */
    y[0] += alpha * (acc0[0] + acc0[1] + acc0[2] + acc0[3]);
    y[1] += alpha * (acc1[0] + acc1[1] + acc1[2] + acc1[3]);
    y[2] += alpha * (acc2[0] + acc2[1] + acc2[2] + acc2[3]);
    y[3] += alpha * (acc3[0] + acc3[1] + acc3[2] + acc3[3]);
}
/*
 * Dot two consecutive columns of A (column stride lda) against x and add
 * the alpha-scaled results to y[0] and y[inc_y].  Unrolled by two
 * vectors per iteration, so n must be a multiple of 8.
 */
static void sgemv_kernel_8x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha, BLASLONG inc_y) {
    __vector float *col0 = (__vector float*) ap;
    __vector float *col1 = (__vector float*) (ap + lda);
    __vector float *vx   = (__vector float*) x;
    __vector float acc0 = {0,0,0,0};
    __vector float acc1 = {0,0,0,0};
    BLASLONG k, vecs = n / 4;
    for (k = 0; k < vecs; k += 2) {
        register __vector float x0 = vx[k];
        register __vector float x1 = vx[k + 1];
        acc0 += x0 * col0[k] + x1 * col0[k + 1];
        acc1 += x0 * col1[k] + x1 * col1[k + 1];
    }
    y[0]     += alpha * (acc0[0] + acc0[1] + acc0[2] + acc0[3]);
    y[inc_y] += alpha * (acc1[0] + acc1[1] + acc1[2] + acc1[3]);
}
/*
 * Dot a single column of A against x and add the alpha-scaled result to
 * y[0].  Unrolled by two vectors per iteration, so n must be a multiple
 * of 8.
 */
static void sgemv_kernel_8x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) {
    __vector float *col = (__vector float*) ap;
    __vector float *vx  = (__vector float*) x;
    __vector float acc = {0,0,0,0};
    BLASLONG k, vecs = n / 4;
    for (k = 0; k < vecs; k += 2) {
        acc += vx[k] * col[k] + vx[k + 1] * col[k + 1];
    }
    y[0] += alpha * (acc[0] + acc[1] + acc[2] + acc[3]);
}
/* Gather n elements of a strided vector (stride inc_src) into a packed
 * destination buffer so the vector kernels can load it contiguously. */
static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) {
    FLOAT *end = dest + n;
    while (dest < end) {
        *dest++ = *src;
        src += inc_src;
    }
}
/*
 * Transposed single-precision GEMV driver (8-row-strip variant of the
 * kernel above; see the file header note — this version is not used in
 * production): y += alpha * op(A) * x, where each of the n output
 * elements accumulates the dot product of one lda-strided column of A
 * (m entries) with x, matching the sgemv_kernel_8x* helpers above.
 * a:      matrix data, column stride lda
 * x:      input vector, m entries with stride inc_x
 * y:      output vector, n entries with stride inc_y (updated in place)
 * buffer: caller-provided scratch used to densify x when inc_x != 1
 * dummy1 is unused.  Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) {
BLASLONG i;
BLASLONG j;
FLOAT *a_ptr;
FLOAT *x_ptr;
FLOAT *y_ptr;
BLASLONG n1;
BLASLONG m1;
BLASLONG m2;
BLASLONG m3;
BLASLONG n2;
FLOAT ybuffer[8], *xbuffer;
/* quick exit for degenerate sizes */
if (m < 1) return (0);
if (n < 1) return (0);
xbuffer = buffer;
/* n1 = number of 8-column panels, n2 = leftover columns (< 8) */
n1 = n >> 3;
n2 = n & 7;
/* m3 = row remainder (< 8) handled by scalar code at the end;
   m1/m2 split the multiple-of-8 row count into NBMAX-sized strips,
   so every strip length NB is a multiple of 8 as the unrolled
   kernels require */
m3 = m & 7;
m1 = m - m3;
m2 = (m & (NBMAX - 1)) - m3;
BLASLONG NB = NBMAX;
/* walk the rows in strips of NB; the last (shorter) strip sets
   NB = m2, which also terminates the loop after it is processed */
while (NB == NBMAX) {
m1 -= NB;
if (m1 < 0) {
if (m2 == 0) break;
NB = m2;
}
y_ptr = y;
a_ptr = a;
x_ptr = x;
/* the vector kernels need contiguous x: gather it when strided */
if (inc_x != 1)
copy_x(NB, x_ptr, xbuffer, inc_x);
else
xbuffer = x_ptr;
BLASLONG lda8 = lda << 3; /* element offset spanning 8 columns */
if (inc_y == 1) {
/* contiguous y: the 8-column kernel updates y directly */
for (i = 0; i < n1; i++) {
sgemv_kernel_8x8(NB, lda, a_ptr, xbuffer, y_ptr, alpha);
y_ptr += 8;
a_ptr += lda8;
}
} else {
/* strided y: accumulate into ybuffer, then scatter with inc_y */
for (i = 0; i < n1; i++) {
ybuffer[0] = 0;
ybuffer[1] = 0;
ybuffer[2] = 0;
ybuffer[3] = 0;
ybuffer[4] = 0;
ybuffer[5] = 0;
ybuffer[6] = 0;
ybuffer[7] = 0;
sgemv_kernel_8x8(NB, lda, a_ptr, xbuffer, ybuffer, alpha);
*y_ptr += ybuffer[0];
y_ptr += inc_y;
*y_ptr += ybuffer[1];
y_ptr += inc_y;
*y_ptr += ybuffer[2];
y_ptr += inc_y;
*y_ptr += ybuffer[3];
y_ptr += inc_y;
*y_ptr += ybuffer[4];
y_ptr += inc_y;
*y_ptr += ybuffer[5];
y_ptr += inc_y;
*y_ptr += ybuffer[6];
y_ptr += inc_y;
*y_ptr += ybuffer[7];
y_ptr += inc_y;
a_ptr += lda8;
}
}
/* column remainder: 4-, 2- and 1-column kernels as flagged by n2 */
if (n2 & 4) {
ybuffer[0] = 0;
ybuffer[1] = 0;
ybuffer[2] = 0;
ybuffer[3] = 0;
sgemv_kernel_8x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha);
a_ptr += lda<<2;
*y_ptr += ybuffer[0];
y_ptr += inc_y;
*y_ptr += ybuffer[1];
y_ptr += inc_y;
*y_ptr += ybuffer[2];
y_ptr += inc_y;
*y_ptr += ybuffer[3];
y_ptr += inc_y;
}
if (n2 & 2) {
sgemv_kernel_8x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha, inc_y);
a_ptr += lda << 1;
y_ptr += 2 * inc_y;
}
if (n2 & 1) {
sgemv_kernel_8x1(NB, a_ptr, xbuffer, y_ptr, alpha);
a_ptr += lda;
y_ptr += inc_y;
}
/* advance to the next strip of rows */
a += NB;
x += NB * inc_x;
}
if (m3 == 0) return (0);
/* scalar tail: the final m3 (1..7) rows, consumed 4 + 2 + 1 at a
   time.  xtemp* hold alpha-scaled x values; each branch offers a
   fast path for fully packed A with unit inc_y, a 4-way unrolled
   path for unit inc_y with general lda, and a generic strided
   fallback.  x_ptr/a_ptr advance between the branches so they
   compose for any m3 */
x_ptr = x;
a_ptr = a;
if (m3 & 4) {
FLOAT xtemp0 = *x_ptr * alpha;
x_ptr += inc_x;
FLOAT xtemp1 = *x_ptr * alpha;
x_ptr += inc_x;
FLOAT xtemp2 = *x_ptr * alpha;
x_ptr += inc_x;
FLOAT xtemp3 = *x_ptr * alpha;
x_ptr += inc_x;
FLOAT *aj = a_ptr;
y_ptr = y;
if (lda == 4 && inc_y == 1) {
for (j = 0; j < (n & -4); j += 4) {
y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2 + aj[3] * xtemp3;
y_ptr[j + 1] += aj[4] * xtemp0 + aj[5] * xtemp1 + aj[6] * xtemp2 + aj[7] * xtemp3;
y_ptr[j + 2] += aj[8] * xtemp0 + aj[9] * xtemp1 + aj[10] * xtemp2 + aj[11] * xtemp3;
y_ptr[j + 3] += aj[12] * xtemp0 + aj[13] * xtemp1 + aj[14] * xtemp2 + aj[15] * xtemp3;
aj += 16;
}
for (; j < n; j++) {
y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2 + aj[3] * xtemp3;
aj += 4;
}
} else if (inc_y == 1) {
BLASLONG register lda2 = lda << 1;
BLASLONG register lda4 = lda << 2;
BLASLONG register lda3 = lda2 + lda;
for (j = 0; j < (n & -4); j += 4) {
y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2 + *(aj + 3) * xtemp3;
y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1 + *(aj + lda + 2) * xtemp2 + *(aj + lda +3) * xtemp3;
y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1 + *(aj + lda2 + 2) * xtemp2 + *(aj + lda2 +3) * xtemp3;
y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1 + *(aj + lda3 + 2) * xtemp2 + *(aj + lda3+3) * xtemp3;
aj += lda4;
}
for (; j < n; j++) {
y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2+*(aj + 3) * xtemp3;
aj += lda;
}
} else {
for (j = 0; j < n; j++) {
*y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2+ *(aj + 3) * xtemp3;
y_ptr += inc_y;
aj += lda;
}
}
if (m3==4) return (0);
a_ptr += 4;
}
if (m3 & 2 ) {
FLOAT xtemp0 = *x_ptr * alpha;
x_ptr += inc_x;
FLOAT xtemp1 = *x_ptr * alpha;
x_ptr += inc_x;
FLOAT *aj = a_ptr;
y_ptr = y;
if (lda == 2 && inc_y == 1) {
for (j = 0; j < (n & -4); j += 4) {
y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1;
y_ptr[j + 1] += aj[2] * xtemp0 + aj[3] * xtemp1;
y_ptr[j + 2] += aj[4] * xtemp0 + aj[5] * xtemp1;
y_ptr[j + 3] += aj[6] * xtemp0 + aj[7] * xtemp1;
aj += 8;
}
for (; j < n; j++) {
y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1;
aj += 2;
}
} else {
if (inc_y == 1) {
BLASLONG register lda2 = lda << 1;
BLASLONG register lda4 = lda << 2;
BLASLONG register lda3 = lda2 + lda;
for (j = 0; j < (n & -4); j += 4) {
y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1;
y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1;
y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1;
y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1;
aj += lda4;
}
for (; j < n; j++) {
y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1;
aj += lda;
}
} else {
for (j = 0; j < n; j++) {
*y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1;
y_ptr += inc_y;
aj += lda;
}
}
}
if (m3==2) return (0);
a_ptr += 2;
}
if (m3 & 1) {
FLOAT xtemp = *x_ptr * alpha;
x_ptr += inc_x;
FLOAT *aj = a_ptr;
y_ptr = y;
if (lda == 1 && inc_y == 1) {
for (j = 0; j < (n & -4); j += 4) {
y_ptr[j] += aj[j] * xtemp;
y_ptr[j + 1] += aj[j + 1] * xtemp;
y_ptr[j + 2] += aj[j + 2] * xtemp;
y_ptr[j + 3] += aj[j + 3] * xtemp;
}
for (; j < n; j++) {
y_ptr[j] += aj[j] * xtemp;
}
} else {
if (inc_y == 1) {
BLASLONG register lda2 = lda << 1;
BLASLONG register lda4 = lda << 2;
BLASLONG register lda3 = lda2 + lda;
for (j = 0; j < (n & -4); j += 4) {
y_ptr[j] += *aj * xtemp;
y_ptr[j + 1] += *(aj + lda) * xtemp;
y_ptr[j + 2] += *(aj + lda2) * xtemp;
y_ptr[j + 3] += *(aj + lda3) * xtemp;
aj += lda4;
}
for (; j < n; j++) {
y_ptr[j] += *aj * xtemp;
aj += lda;
}
} else {
for (j = 0; j < n; j++) {
*y_ptr += *aj * xtemp;
y_ptr += inc_y;
aj += lda;
}
}
}
a_ptr += 1;
}
return (0);
}

View File

@ -389,20 +389,14 @@ static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) {
register __vector double va0_2 = vptr_a0[i + 2];
register __vector double va0_3 = vptr_a0[i + 3];
vy_0 += va0*vx0_r;
vy_1 += va0_1*vx0_r;
vy_2 += va0_2*vx0_r;
vy_3 += va0_3*vx0_r;
va0 = vec_xxpermdi(va0, va0, 2);
va0_1 = vec_xxpermdi(va0_1, va0_1, 2);
va0_2 = vec_xxpermdi(va0_2, va0_2, 2);
va0_3 = vec_xxpermdi(va0_3, va0_3, 2);
vy_0 += va0*vx0_i;
vy_1 += va0_1*vx0_i;
vy_2 += va0_2*vx0_i;
vy_3 += va0_3*vx0_i;
register __vector double va0x = vec_xxpermdi(va0, va0, 2);
register __vector double va0x_1 = vec_xxpermdi(va0_1, va0_1, 2);
register __vector double va0x_2 = vec_xxpermdi(va0_2, va0_2, 2);
register __vector double va0x_3 = vec_xxpermdi(va0_3, va0_3, 2);
vy_0 += va0*vx0_r + va0x*vx0_i;
vy_1 += va0_1*vx0_r + va0x_1*vx0_i;
vy_2 += va0_2*vx0_r + va0x_2*vx0_i;
vy_3 += va0_3*vx0_r + va0x_3*vx0_i;
vy[i] = vy_0;
vy[i + 1] = vy_1;

View File

@ -59,11 +59,7 @@ static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA
i = 0;
n = n << 1;
while (i < n) {
// __builtin_prefetch(&x[i]);
// __builtin_prefetch(&a0[i]);
// __builtin_prefetch(&a1[i]);
// __builtin_prefetch(&a2[i]);
// __builtin_prefetch(&a3[i]);
register __vector double vx_0 = *(__vector double*) (&x[i]);
register __vector double vx_1 = *(__vector double*) (&x[i + 2]);
register __vector double vx_2 = *(__vector double*) (&x[i + 4]);

View File

@ -81,7 +81,7 @@ static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x)
"vfmaxdb %%v16,%%v16,%%v17,8 \n\t"
"vfmaxdb %%v0,%%v0,%%16,8 \n\t"
"vfmaxdb %%v0,%%v0,%%v16,8 \n\t"
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"

View File

@ -81,7 +81,7 @@ static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x)
"vfmindb %%v16,%%v16,%%v17,8 \n\t"
"vfmindb %%v0,%%v0,%%16,8 \n\t"
"vfmindb %%v0,%%v0,%%v16,8 \n\t"
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"

View File

@ -74,7 +74,7 @@ static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x)
"vfmaxdb %%v16,%%v16,%%v17,0 \n\t"
"vfmaxdb %%v0,%%v0,%%16,0 \n\t"
"vfmaxdb %%v0,%%v0,%%v16,0 \n\t"
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"

View File

@ -74,7 +74,7 @@ static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x)
"vfmindb %%v16,%%v16,%%v17,0 \n\t"
"vfmindb %%v0,%%v0,%%16,0 \n\t"
"vfmindb %%v0,%%v0,%%v16,0 \n\t"
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"

View File

@ -132,7 +132,7 @@ double CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y)
while(i < n)
{
dot += y[i] * x[i] ;
dot += (double) y[i] * (double) x[i] ;
i++ ;
}
@ -146,7 +146,8 @@ double CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y)
while(i < n1)
{
dot += y[iy] * x[ix] + y[iy+inc_y] * x[ix+inc_x];
dot += (double) y[iy] * (double) x[ix];
dot += (double) y[iy+inc_y] * (double) x[ix+inc_x];
ix += inc_x*2 ;
iy += inc_y*2 ;
i+=2 ;
@ -156,7 +157,7 @@ double CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y)
while(i < n)
{
dot += y[iy] * x[ix] ;
dot += (double) y[iy] * (double) x[ix] ;
ix += inc_x ;
iy += inc_y ;
i++ ;

View File

@ -81,7 +81,7 @@ static FLOAT samax_kernel_64(BLASLONG n, FLOAT *x)
"vfmaxsb %%v16,%%v16,%%v17,8 \n\t"
"vfmaxsb %%v0,%%v0,%%16,8 \n\t"
"vfmaxsb %%v0,%%v0,%%v16,8 \n\t"
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"

View File

@ -81,7 +81,7 @@ static FLOAT samin_kernel_64(BLASLONG n, FLOAT *x)
"vfminsb %%v16,%%v16,%%v17,8 \n\t"
"vfminsb %%v0,%%v0,%%16,8 \n\t"
"vfminsb %%v0,%%v0,%%v16,8 \n\t"
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"

View File

@ -158,32 +158,24 @@ static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
"brctg %%r0,2b \n\t"
"3: \n\t"
"vrepf %%v4,%%v0,1 \n\t"
"aebr %%f0,%%f4 \n\t"
"vrepf %%v4,%%v0,2 \n\t"
"aebr %%f0,%%f4 \n\t"
"vrepf %%v4,%%v0,3 \n\t"
"veslg %%v4,%%v0,32 \n\t"
"vfasb %%v0,%%v0,%%v4 \n\t"
"vrepg %%v4,%%v0,1 \n\t"
"aebr %%f0,%%f4 \n\t"
"ste %%f0,0(%6) \n\t"
"vrepf %%v4,%%v1,1 \n\t"
"aebr %%f1,%%f4 \n\t"
"vrepf %%v4,%%v1,2 \n\t"
"aebr %%f1,%%f4 \n\t"
"vrepf %%v4,%%v1,3 \n\t"
"veslg %%v4,%%v1,32 \n\t"
"vfasb %%v1,%%v1,%%v4 \n\t"
"vrepg %%v4,%%v1,1 \n\t"
"aebr %%f1,%%f4 \n\t"
"ste %%f1,4(%6) \n\t"
"vrepf %%v4,%%v2,1 \n\t"
"aebr %%f2,%%f4 \n\t"
"vrepf %%v4,%%v2,2 \n\t"
"aebr %%f2,%%f4 \n\t"
"vrepf %%v4,%%v2,3 \n\t"
"veslg %%v4,%%v2,32 \n\t"
"vfasb %%v2,%%v2,%%v4 \n\t"
"vrepg %%v4,%%v2,1 \n\t"
"aebr %%f2,%%f4 \n\t"
"ste %%f2,8(%6) \n\t"
"vrepf %%v4,%%v3,1 \n\t"
"aebr %%f3,%%f4 \n\t"
"vrepf %%v4,%%v3,2 \n\t"
"aebr %%f3,%%f4 \n\t"
"vrepf %%v4,%%v3,3 \n\t"
"veslg %%v4,%%v3,32 \n\t"
"vfasb %%v3,%%v3,%%v4 \n\t"
"vrepg %%v4,%%v3,1 \n\t"
"aebr %%f3,%%f4 \n\t"
"ste %%f3,12(%6) "
:
@ -281,18 +273,14 @@ static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
"brctg %%r0,2b \n\t"
"3: \n\t"
"vrepf %%v2,%%v0,1 \n\t"
"aebr %%f0,%%f2 \n\t"
"vrepf %%v2,%%v0,2 \n\t"
"aebr %%f0,%%f2 \n\t"
"vrepf %%v2,%%v0,3 \n\t"
"veslg %%v2,%%v0,32 \n\t"
"vfasb %%v0,%%v0,%%v2 \n\t"
"vrepg %%v2,%%v0,1 \n\t"
"aebr %%f0,%%f2 \n\t"
"ste %%f0,0(%4) \n\t"
"vrepf %%v2,%%v1,1 \n\t"
"aebr %%f1,%%f2 \n\t"
"vrepf %%v2,%%v1,2 \n\t"
"aebr %%f1,%%f2 \n\t"
"vrepf %%v2,%%v1,3 \n\t"
"veslg %%v2,%%v1,32 \n\t"
"vfasb %%v1,%%v1,%%v2 \n\t"
"vrepg %%v2,%%v1,1 \n\t"
"aebr %%f1,%%f2 \n\t"
"ste %%f1,4(%4) "
:
@ -370,11 +358,9 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y)
"brctg %%r0,2b \n\t"
"3: \n\t"
"vrepf %%v1,%%v0,1 \n\t"
"aebr %%f0,%%f1 \n\t"
"vrepf %%v1,%%v0,2 \n\t"
"aebr %%f0,%%f1 \n\t"
"vrepf %%v1,%%v0,3 \n\t"
"veslg %%v1,%%v0,32 \n\t"
"vfasb %%v0,%%v0,%%v1 \n\t"
"vrepg %%v1,%%v0,1 \n\t"
"aebr %%f0,%%f1 \n\t"
"ste %%f0,0(%3) "
:
@ -823,5 +809,3 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
return(0);
}

View File

@ -74,7 +74,7 @@ static FLOAT smax_kernel_64(BLASLONG n, FLOAT *x)
"vfmaxsb %%v16,%%v16,%%v17,0 \n\t"
"vfmaxsb %%v0,%%v0,%%16,0 \n\t"
"vfmaxsb %%v0,%%v0,%%v16,0 \n\t"
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"

View File

@ -74,7 +74,7 @@ static FLOAT smin_kernel_64(BLASLONG n, FLOAT *x)
"vfminsb %%v16,%%v16,%%v17,0 \n\t"
"vfminsb %%v0,%%v0,%%16,0 \n\t"
"vfminsb %%v0,%%v0,%%v16,0 \n\t"
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"

40
param.h
View File

@ -2915,6 +2915,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
#if defined(Z14)
#define SNUMOPT 2
#define DNUMOPT 2
#define GEMM_DEFAULT_OFFSET_A 0
#define GEMM_DEFAULT_OFFSET_B 0
#define GEMM_DEFAULT_ALIGN 0x03fffUL
#define SGEMM_DEFAULT_UNROLL_M 8
#define SGEMM_DEFAULT_UNROLL_N 4
#define DGEMM_DEFAULT_UNROLL_M 8
#define DGEMM_DEFAULT_UNROLL_N 4
#define CGEMM_DEFAULT_UNROLL_M 4
#define CGEMM_DEFAULT_UNROLL_N 4
#define ZGEMM_DEFAULT_UNROLL_M 4
#define ZGEMM_DEFAULT_UNROLL_N 4
#define SGEMM_DEFAULT_P 456
#define DGEMM_DEFAULT_P 320
#define CGEMM_DEFAULT_P 480
#define ZGEMM_DEFAULT_P 224
#define SGEMM_DEFAULT_Q 488
#define DGEMM_DEFAULT_Q 384
#define CGEMM_DEFAULT_Q 128
#define ZGEMM_DEFAULT_Q 352
#define SGEMM_DEFAULT_R 8192
#define DGEMM_DEFAULT_R 4096
#define CGEMM_DEFAULT_R 4096
#define ZGEMM_DEFAULT_R 2048
#define SYMV_P 16
#endif
#ifdef GENERIC

View File

@ -37,4 +37,3 @@ clean:
-rm -f *.o $(UTESTBIN)
libs: