diff --git a/kernel/power/dgemv_n_microk_power10.c b/kernel/power/dgemv_n_microk_power10.c
index 4be8a5f9b..e47de2cb5 100644
--- a/kernel/power/dgemv_n_microk_power10.c
+++ b/kernel/power/dgemv_n_microk_power10.c
@@ -25,14 +25,6 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
-/**************************************************************************************
-* 2016/03/30 Werner Saar (wernsaar@googlemail.com)
-*	BLASTEST	: OK
-*	CTEST		: OK
-*	TEST		: OK
-*	LAPACK-TEST	: OK
-**************************************************************************************/
-
 #define HAVE_KERNEL_4x4 1
 
 static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y, double alpha)
@@ -266,3 +258,145 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y
 	  "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47"
 	);
 }
+static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y, double alpha)
+{
+
+	double *a0;
+	double *a1;
+	double *a2;
+	double *a3;
+	double *a4;
+	double *a5;
+	double *a6;
+	double *a7;
+	long tmp;
+	__asm__
+	(
+	"lxvp 34, 0( %15) \n\t"	// x0, x1
+	"lxvp 38, 32( %15) \n\t"	// x4, x5
+
+	XXSPLTD_S(58,%x14,0)	// alpha, alpha
+	"sldi %10, %17, 3 \n\t"	// lda * sizeof (double)
+	"xvmuldp 34, 34, 58 \n\t"	// x0 * alpha, x1 * alpha
+	"xvmuldp 35, 35, 58 \n\t"	// x2 * alpha, x3 * alpha
+	"xvmuldp 38, 38, 58 \n\t"	// x4 * alpha, x5 * alpha
+	"xvmuldp 39, 39, 58 \n\t"	// x6 * alpha, x7 * alpha
+
+	"li %11, 32 \n\t"
+
+	"add %4, %3, %10 \n\t"	// a0 = ap, a1 = a0 + lda
+	"add %10, %10, %10 \n\t"	// 2 * lda
+	XXSPLTD_S(32,34,1)	// x0 * alpha, x0 * alpha
+	XXSPLTD_S(33,34,0)	// x1 * alpha, x1 * alpha
+	XXSPLTD_S(34,35,1)	// x2 * alpha, x2 * alpha
+	XXSPLTD_S(35,35,0)	// x3 * alpha, x3 * alpha
+	XXSPLTD_S(48,39,1)	// x6 * alpha, x6 * alpha
+	XXSPLTD_S(49,39,0)	// x7 * alpha, x7 * alpha
+	XXSPLTD_S(39,38,0)	// x5 * alpha, x5 * alpha
+	XXSPLTD_S(38,38,1)	// x4 * alpha, x4 * alpha
+
+	"add %5, %3, %10 \n\t"	// a2 = a0 + 2 * lda
+	"add %6, %4, %10 \n\t"	// a3 = a1 + 2 * lda
+	"add %7, %5, %10 \n\t"	// a4 = a2 + 2 * lda
+	"add %8, %6, %10 \n\t"	// a5 = a3 + 2 * lda
+	"add %9, %7, %10 \n\t"	// a6 = a4 + 2 * lda
+	"add %10, %8, %10 \n\t"	// a7 = a5 + 2 * lda
+
+	"lxvp 40, 0( %3) \n\t"	// a0[0], a0[1]
+	"lxvp 42, 0( %4) \n\t"	// a1[0], a1[1]
+	"lxvp 44, 0( %5) \n\t"	// a2[0], a2[1]
+	"lxvp 46, 0( %6) \n\t"	// a3[0], a3[1]
+	"lxvp 50, 0( %7) \n\t"	// a4[0]
+	"lxvp 52, 0( %8) \n\t"	// a5[0]
+	"lxvp 54, 0( %9) \n\t"	// a6[0]
+	"lxvp 56, 0( %10) \n\t"	// a7[0]
+
+
+	"addic. %1, %1, -4 \n\t"
+	"ble two%= \n\t"
+
+	".align 5 \n"
+	"one%=: \n\t"
+
+	"lxvp 36, 0( %2) \n\t"	// y0, y1
+
+	"xvmaddadp 36, 40, 34 \n\t"
+	"xvmaddadp 37, 41, 34 \n\t"
+	"lxvpx 40, %3, %11 \n\t"	// a0[0], a0[1]
+	"xvmaddadp 36, 42, 35 \n\t"
+	"xvmaddadp 37, 43, 35 \n\t"
+	"lxvpx 42, %4, %11 \n\t"	// a1[0], a1[1]
+	"xvmaddadp 36, 44, 32 \n\t"
+	"xvmaddadp 37, 45, 32 \n\t"
+	"lxvpx 44, %5, %11 \n\t"	// a2[0], a2[1]
+	"xvmaddadp 36, 46, 33 \n\t"
+	"xvmaddadp 37, 47, 33 \n\t"
+	"lxvpx 46, %6, %11 \n\t"	// a3[0], a3[1]
+	"xvmaddadp 36, 50, 48 \n\t"
+	"xvmaddadp 37, 51, 48 \n\t"
+	"lxvpx 50, %7, %11 \n\t"	// a4[0]
+	"xvmaddadp 36, 52, 49 \n\t"
+	"xvmaddadp 37, 53, 49 \n\t"
+	"lxvpx 52, %8, %11 \n\t"	// a5[0]
+	"xvmaddadp 36, 54, 38 \n\t"
+	"xvmaddadp 37, 55, 38 \n\t"
+	"lxvpx 54, %9, %11 \n\t"	// a6[0]
+	"xvmaddadp 36, 56, 39 \n\t"
+	"xvmaddadp 37, 57, 39 \n\t"
+	"lxvpx 56, %10, %11 \n\t"	// a7[0]
+	"addi %11, %11, 32 \n\t"
+
+	"stxvp 36, 0( %2) \n\t"	// y0, y1
+	"addi %2, %2, 32 \n\t"
+
+	"addic. %1, %1, -4 \n\t"
+	"bgt one%= \n"
+
+	"two%=: \n\t"
+
+	"lxvp 36, 0( %2) \n\t"	// y0, y1
+	"xvmaddadp 36, 40, 34 \n\t"
+	"xvmaddadp 37, 41, 34 \n\t"
+	"xvmaddadp 36, 42, 35 \n\t"
+	"xvmaddadp 37, 43, 35 \n\t"
+	"xvmaddadp 36, 44, 32 \n\t"
+	"xvmaddadp 37, 45, 32 \n\t"
+	"xvmaddadp 36, 46, 33 \n\t"
+	"xvmaddadp 37, 47, 33 \n\t"
+	"xvmaddadp 36, 50, 48 \n\t"
+	"xvmaddadp 37, 51, 48 \n\t"
+	"xvmaddadp 36, 52, 49 \n\t"
+	"xvmaddadp 37, 53, 49 \n\t"
+	"xvmaddadp 36, 54, 38 \n\t"
+	"xvmaddadp 37, 55, 38 \n\t"
+	"xvmaddadp 36, 56, 39 \n\t"
+	"xvmaddadp 37, 57, 39 \n\t"
+	"stxvp 36, 0( %2) \n\t"	// y0, y1
+
+	:
+	  "+m" (*y),
+	  "+r" (n),	// 1
+	  "+b" (y),	// 2
+	  "=b" (a0),	// 3
+	  "=b" (a1),	// 4
+	  "=&b" (a2),	// 5
+	  "=&b" (a3),	// 6
+	  "=&b" (a4),	// 7
+	  "=&b" (a5),	// 8
+	  "=&b" (a6),	// 9
+	  "=&b" (a7),	// 10
+	  "=b" (tmp)
+	:
+	  "m" (*x),
+	  "m" (*ap),
+	  "d" (alpha),	// 14
+	  "r" (x),	// 15
+	  "3" (ap),	// 16
+	  "4" (lda)	// 17
+	:
+	  "cr0",
+	  "vs32","vs33","vs34","vs35","vs36","vs37",
+	  "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", "vs48",
+	  "vs49","vs50","vs51","vs52","vs53","vs54","vs55","vs56", "vs57", "vs58"
+	);
+}
diff --git a/kernel/power/dgemv_n_power10.c b/kernel/power/dgemv_n_power10.c
index ad5f1ba0d..aba15ab4e 100644
--- a/kernel/power/dgemv_n_power10.c
+++ b/kernel/power/dgemv_n_power10.c
@@ -26,165 +26,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
 #include "common.h"
-#include <altivec.h>
-
-typedef __vector unsigned char vec_t;
-typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
-typedef __vector_pair __attribute__((aligned(8))) vecp_t;
 
 #include "dgemv_n_microk_power10.c"
 
-#define MMA(X, APTR, ACC) \
-  rX = (vec_t *) & X; \
-  rowA = *((vecp_t*)((void*)&APTR)); \
-  __builtin_mma_xvf64gerpp (ACC, rowA, rX[0]);
-
-#define SAVE(ACC, Z) \
-  rowC = (v4sf_t *) &y[Z]; \
-  __builtin_mma_disassemble_acc ((void *)result, ACC); \
-  result[0][1] = result[1][0]; \
-  result[2][1] = result[3][0]; \
-  rowC[0] += valpha * result[0]; \
-  rowC[1] += valpha * result[2];
-
-void
-dgemv_kernel_4x128 (BLASLONG n, FLOAT * a_ptr, BLASLONG lda, FLOAT * xo,
-                    FLOAT * y, FLOAT alpha)
-{
-  BLASLONG i, j, tmp;
-  FLOAT *a0 = a_ptr;
-  FLOAT *x1 = xo;
-  vector double valpha = { alpha, alpha };
-  v4sf_t *rowC;
-  __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
-  v4sf_t result[4];
-  vecp_t rowA;
-  vec_t *rX;
-  tmp = (n / 32) * 32;
-  for (i = 0; i < tmp; i += 32)
-    {
-      xo = x1;
-      a0 = a_ptr;
-      __builtin_mma_xxsetaccz (&acc0);
-      __builtin_mma_xxsetaccz (&acc1);
-      __builtin_mma_xxsetaccz (&acc2);
-      __builtin_mma_xxsetaccz (&acc3);
-      __builtin_mma_xxsetaccz (&acc4);
-      __builtin_mma_xxsetaccz (&acc5);
-      __builtin_mma_xxsetaccz (&acc6);
-      __builtin_mma_xxsetaccz (&acc7);
-      for (j = 0; j < 32; j++)
-        {
-          __builtin_prefetch (xo+j);
-          __builtin_prefetch (a0+i+j+lda);
-          MMA (xo[j], a0[i + 0 + j * lda], &acc0);
-          MMA (xo[j], a0[i + 4 + j * lda], &acc1);
-          MMA (xo[j], a0[i + 8 + j * lda], &acc2);
-          MMA (xo[j], a0[i + 12 + j * lda], &acc3);
-          MMA (xo[j], a0[i + 16 + j * lda], &acc4);
-          MMA (xo[j], a0[i + 20 + j * lda], &acc5);
-          MMA (xo[j], a0[i + 24 + j * lda], &acc6);
-          MMA (xo[j], a0[i + 28 + j * lda], &acc7);
-        }
-      xo += 32;
-      a0 += lda << 5;
-      for (j = 0; j < 32; j++)
-        {
-          __builtin_prefetch (xo+j);
-          __builtin_prefetch (a0+i+j+lda);
-          MMA (xo[j], a0[i + 0 + j * lda], &acc0);
-          MMA (xo[j], a0[i + 4 + j * lda], &acc1);
-          MMA (xo[j], a0[i + 8 + j * lda], &acc2);
-          MMA (xo[j], a0[i + 12 + j * lda], &acc3);
-          MMA (xo[j], a0[i + 16 + j * lda], &acc4);
-          MMA (xo[j], a0[i + 20 + j * lda], &acc5);
-          MMA (xo[j], a0[i + 24 + j * lda], &acc6);
-          MMA (xo[j], a0[i + 28 + j * lda], &acc7);
-        }
-      xo += 32;
-      a0 += lda << 5;
-      for (j = 0; j < 32; j++)
-        {
-          __builtin_prefetch (xo+j);
-          __builtin_prefetch (a0+i+j+lda);
-          MMA (xo[j], a0[i + 0 + j * lda], &acc0);
-          MMA (xo[j], a0[i + 4 + j * lda], &acc1);
-          MMA (xo[j], a0[i + 8 + j * lda], &acc2);
-          MMA (xo[j], a0[i + 12 + j * lda], &acc3);
-          MMA (xo[j], a0[i + 16 + j * lda], &acc4);
-          MMA (xo[j], a0[i + 20 + j * lda], &acc5);
-          MMA (xo[j], a0[i + 24 + j * lda], &acc6);
-          MMA (xo[j], a0[i + 28 + j * lda], &acc7);
-        }
-      xo += 32;
-      a0 += lda << 5;
-      for (j = 0; j < 32; j++)
-        {
-          __builtin_prefetch (xo+j);
-          __builtin_prefetch (a0+i+j+lda);
-          MMA (xo[j], a0[i + 0 + j * lda], &acc0);
-          MMA (xo[j], a0[i + 4 + j * lda], &acc1);
-          MMA (xo[j], a0[i + 8 + j * lda], &acc2);
-          MMA (xo[j], a0[i + 12 + j * lda], &acc3);
-          MMA (xo[j], a0[i + 16 + j * lda], &acc4);
-          MMA (xo[j], a0[i + 20 + j * lda], &acc5);
-          MMA (xo[j], a0[i + 24 + j * lda], &acc6);
-          MMA (xo[j], a0[i + 28 + j * lda], &acc7);
-        }
-      xo += 32;
-      a0 += lda << 5;
-      SAVE (&acc0, i + 0);
-      SAVE (&acc1, i + 4);
-      SAVE (&acc2, i + 8);
-      SAVE (&acc3, i + 12);
-      SAVE (&acc4, i + 16);
-      SAVE (&acc5, i + 20);
-      SAVE (&acc6, i + 24);
-      SAVE (&acc7, i + 28);
-
-    }
-  for (i = tmp; i < n; i += 4)
-    {
-      xo = x1;
-      a0 = a_ptr;
-      __builtin_mma_xxsetaccz (&acc0);
-      for (j = 0; j < 32; j++)
-        {
-          __builtin_prefetch (xo+j);
-          __builtin_prefetch (a0+i+j+lda);
-          MMA (xo[j], a0[i + j * lda], &acc0);
-        }
-      xo += 32;
-      a0 += lda << 5;
-      for (j = 0; j < 32; j++)
-        {
-          __builtin_prefetch (xo+j);
-          __builtin_prefetch (a0+i+j+lda);
-          MMA (xo[j], a0[i + j * lda], &acc0);
-        }
-      xo += 32;
-      a0 += lda << 5;
-      for (j = 0; j < 32; j++)
-        {
-          __builtin_prefetch (xo+j);
-          __builtin_prefetch (a0+i+j+lda);
-          MMA (xo[j], a0[i + j * lda], &acc0);
-        }
-      xo += 32;
-      a0 += lda << 5;
-      for (j = 0; j < 32; j++)
-        {
-          __builtin_prefetch (xo+j);
-          __builtin_prefetch (a0+i+j+lda);
-          MMA (xo[j], a0[i + j * lda], &acc0);
-        }
-      xo += 32;
-      a0 += lda << 5;
-      SAVE (&acc0, i);
-    }
-}
-
-
 #define NBMAX 4096
 
 #ifndef HAVE_KERNEL_4x4
@@ -281,13 +125,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
 	FLOAT *a_ptr;
 	FLOAT *x_ptr;
 	FLOAT *y_ptr;
-	BLASLONG n1;
 	BLASLONG m1;
 	BLASLONG m2;
 	BLASLONG m3;
 	BLASLONG n2;
 	BLASLONG lda4 = lda << 2;
-	BLASLONG lda128 = lda << 7;
+	BLASLONG lda8 = lda << 3;
 	FLOAT xbuffer[8] __attribute__ ((aligned (16)));
 	FLOAT *ybuffer;
 
@@ -296,9 +139,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
 	if ( n < 1 ) return(0);
 
 	ybuffer = buffer;
-	BLASLONG n128 = n >> 7;
-	n1 = (n - (n128 * 128)) >> 2 ;
-	n2 = (n - (n128 * 128)) & 3 ;
+	BLASLONG n8 = n >> 3;
+	n2 = n & 3 ;
 
 	m3 = m & 3 ;
 	m1 = m & -4 ;
@@ -329,14 +171,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
 	if ( inc_x == 1 )
 	{
 
-		for( i = 0; i < n128 ; i++)
+		for( i = 0; i < n8 ; i++)
 		{
-			dgemv_kernel_4x128(NB,a_ptr,lda,x_ptr,ybuffer,alpha);
-			a_ptr += lda128;
-			x_ptr += 128;
+			dgemv_kernel_4x8(NB,a_ptr,lda,x_ptr,ybuffer,alpha);
+			a_ptr += lda8;
+			x_ptr += 8;
 		}
 
-		for( i = 0; i < n1 ; i++)
+		if( n & 4 )
 		{
 			dgemv_kernel_4x4(NB,a_ptr,lda,x_ptr,ybuffer,alpha);
 			a_ptr += lda4;
@@ -363,20 +205,19 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
 	}
 	else
 	{
-		for( i = 0; i < n128 ; i++)
+		for( i = 0; i < n8 ; i++)
 		{
-			FLOAT xbuffer[128] __attribute__ ((aligned (16)));
 			BLASLONG j;
-			for ( j = 0; j < 128 ; j++)
+			for ( j = 0; j < 8 ; j++)
 			{
 				xbuffer[j] = x_ptr[0];
 				x_ptr += inc_x;
 			}
-			dgemv_kernel_4x128(NB,a_ptr,lda,xbuffer,ybuffer,alpha);
-			a_ptr += lda128;
+			dgemv_kernel_4x8(NB,a_ptr,lda,xbuffer,ybuffer,alpha);
+			a_ptr += lda8;
 		}
 
-		for( i = 0; i < n1 ; i++)
+		if( n & 4 )
 		{
 			xbuffer[0] = x_ptr[0];
 			x_ptr += inc_x;