Merge pull request #2012 from maamountki/z14

[ZARCH] Many improvements
This commit is contained in:
Martin Kroeker 2019-02-13 20:15:56 +01:00 committed by GitHub
commit 76bb74fcd4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
67 changed files with 13503 additions and 14618 deletions

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -28,27 +28,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
#define CABS1(x,i) (fabsf(x[i]) + fabsf(x[i + 1]))
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))
static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x)
{
static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x) {
FLOAT amax;
__asm__ volatile (
"vlef %%v0,0(%2),0 \n\t"
"vlef %%v16,4(%2),0 \n\t"
"vlef %%v0,8(%2),1 \n\t"
"vlef %%v16,12(%2),1 \n\t"
"vlef %%v0,16(%2),2 \n\t"
"vlef %%v16,20(%2),2 \n\t"
"vlef %%v0,24(%2),3 \n\t"
"vlef %%v16,28(%2),3 \n\t"
__asm__("vlef %%v0,0(%[x]),0\n\t"
"vlef %%v16,4(%[x]),0\n\t"
"vlef %%v0,8(%[x]),1\n\t"
"vlef %%v16,12(%[x]),1\n\t"
"vlef %%v0,16(%[x]),2\n\t"
"vlef %%v16,20(%[x]),2\n\t"
"vlef %%v0,24(%[x]),3\n\t"
"vlef %%v16,28(%[x]),3\n\t"
"vflpsb %%v0,%%v0\n\t"
"vflpsb %%v16,%%v16\n\t"
"vfasb %%v0,%%v0,%%v16\n\t"
@ -68,51 +60,42 @@ static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x)
"vleib %%v1,25,13\n\t"
"vleib %%v1,26,14\n\t"
"vleib %%v1,27,15\n\t"
"srlg %%r0,%1,5 \n\t"
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%2) \n\t"
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v2,16(%%r1,%2) \n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v2,16(%%r1,%[x])\n\t"
"vpkg %%v17,%%v16,%%v2\n\t"
"vperm %%v16,%%v16,%%v2,%%v1\n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v2,48(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v2,48(%%r1,%[x])\n\t"
"vpkg %%v19,%%v18,%%v2\n\t"
"vperm %%v18,%%v18,%%v2,%%v1\n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v2,80(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v2,80(%%r1,%[x])\n\t"
"vpkg %%v21,%%v20,%%v2\n\t"
"vperm %%v20,%%v20,%%v2,%%v1\n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v2,112(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v2,112(%%r1,%[x])\n\t"
"vpkg %%v23,%%v22,%%v2\n\t"
"vperm %%v22,%%v22,%%v2,%%v1\n\t"
"vl %%v24,128(%%r1,%2) \n\t"
"vl %%v2,144(%%r1,%2) \n\t"
"vl %%v24,128(%%r1,%[x])\n\t"
"vl %%v2,144(%%r1,%[x])\n\t"
"vpkg %%v25,%%v24,%%v2\n\t"
"vperm %%v24,%%v24,%%v2,%%v1\n\t"
"vl %%v26,160(%%r1,%2) \n\t"
"vl %%v2,176(%%r1,%2) \n\t"
"vl %%v26,160(%%r1,%[x])\n\t"
"vl %%v2,176(%%r1,%[x])\n\t"
"vpkg %%v27,%%v26,%%v2\n\t"
"vperm %%v26,%%v26,%%v2,%%v1\n\t"
"vl %%v28,192(%%r1,%2) \n\t"
"vl %%v2,208(%%r1,%2) \n\t"
"vl %%v28,192(%%r1,%[x])\n\t"
"vl %%v2,208(%%r1,%[x])\n\t"
"vpkg %%v29,%%v28,%%v2\n\t"
"vperm %%v28,%%v28,%%v2,%%v1\n\t"
"vl %%v30,224(%%r1,%2) \n\t"
"vl %%v2,240(%%r1,%2) \n\t"
"vl %%v30,224(%%r1,%[x])\n\t"
"vl %%v2,240(%%r1,%[x])\n\t"
"vpkg %%v31,%%v30,%%v2\n\t"
"vperm %%v30,%%v30,%%v2,%%v1\n\t"
"vflpsb %%v16,%%v16\n\t"
"vflpsb %%v17,%%v17\n\t"
"vflpsb %%v18,%%v18\n\t"
@ -129,7 +112,6 @@ static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x)
"vflpsb %%v29,%%v29\n\t"
"vflpsb %%v30,%%v30\n\t"
"vflpsb %%v31,%%v31\n\t"
"vfasb %%v16,%%v16,%%v17\n\t"
"vfasb %%v18,%%v18,%%v19\n\t"
"vfasb %%v20,%%v20,%%v21\n\t"
@ -138,32 +120,26 @@ static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x)
"vfasb %%v26,%%v26,%%v27\n\t"
"vfasb %%v28,%%v28,%%v29\n\t"
"vfasb %%v30,%%v30,%%v31\n\t"
"vfmaxsb %%v16,%%v16,%%v24,0\n\t"
"vfmaxsb %%v18,%%v18,%%v26,0\n\t"
"vfmaxsb %%v20,%%v20,%%v28,0\n\t"
"vfmaxsb %%v22,%%v22,%%v30,0\n\t"
"vfmaxsb %%v16,%%v16,%%v20,0\n\t"
"vfmaxsb %%v18,%%v18,%%v22,0\n\t"
"vfmaxsb %%v16,%%v16,%%v18,0\n\t"
"vfmaxsb %%v0,%%v0,%%v16,0\n\t"
"agfi %%r1, 256\n\t"
"brctg %%r0, 0b \n\t"
"brctg %[n], 0b\n\t"
"veslg %%v16,%%v0,32\n\t"
"vfmaxsb %%v0,%%v0,%%v16,0\n\t"
"vrepf %%v16,%%v0,2\n\t"
"wfmaxsb %%v0,%%v0,%%v16,0\n\t"
"ler %0,%%f0 "
:"=f"(amax)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
"ler %[amax],%%f0"
: [amax] "=f"(amax),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
return amax;
}
@ -174,7 +150,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
FLOAT maxf = 0.0;
BLASLONG inc_x2;
if (n <= 0 || inc_x <= 0) return (maxf);
if (n <= 0 || inc_x <= 0)
return (maxf);
if (inc_x == 1) {
@ -184,9 +161,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
maxf = camax_kernel_32(n1, x);
ix = n1 * 2;
i = n1;
}
else
{
} else {
maxf = CABS1(x, 0);
ix += 2;
i++;
@ -228,7 +203,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
}
while (i < n) {
if (CABS1(x, ix) > maxf) {
maxf = CABS1(x, ix);

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -28,27 +28,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
#define CABS1(x,i) (fabsf(x[i]) + fabsf(x[i + 1]))
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))
static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x)
{
static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x) {
FLOAT amin;
__asm__ volatile (
"vlef %%v0,0(%2),0 \n\t"
"vlef %%v16,4(%2),0 \n\t"
"vlef %%v0,8(%2),1 \n\t"
"vlef %%v16,12(%2),1 \n\t"
"vlef %%v0,16(%2),2 \n\t"
"vlef %%v16,20(%2),2 \n\t"
"vlef %%v0,24(%2),3 \n\t"
"vlef %%v16,28(%2),3 \n\t"
__asm__("vlef %%v0,0(%[x]),0\n\t"
"vlef %%v16,4(%[x]),0\n\t"
"vlef %%v0,8(%[x]),1\n\t"
"vlef %%v16,12(%[x]),1\n\t"
"vlef %%v0,16(%[x]),2\n\t"
"vlef %%v16,20(%[x]),2\n\t"
"vlef %%v0,24(%[x]),3\n\t"
"vlef %%v16,28(%[x]),3\n\t"
"vflpsb %%v0,%%v0\n\t"
"vflpsb %%v16,%%v16\n\t"
"vfasb %%v0,%%v0,%%v16\n\t"
@ -68,51 +60,42 @@ static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x)
"vleib %%v1,25,13\n\t"
"vleib %%v1,26,14\n\t"
"vleib %%v1,27,15\n\t"
"srlg %%r0,%1,5 \n\t"
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%2) \n\t"
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v2,16(%%r1,%2) \n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v2,16(%%r1,%[x])\n\t"
"vpkg %%v17,%%v16,%%v2\n\t"
"vperm %%v16,%%v16,%%v2,%%v1\n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v2,48(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v2,48(%%r1,%[x])\n\t"
"vpkg %%v19,%%v18,%%v2\n\t"
"vperm %%v18,%%v18,%%v2,%%v1\n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v2,80(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v2,80(%%r1,%[x])\n\t"
"vpkg %%v21,%%v20,%%v2\n\t"
"vperm %%v20,%%v20,%%v2,%%v1\n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v2,112(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v2,112(%%r1,%[x])\n\t"
"vpkg %%v23,%%v22,%%v2\n\t"
"vperm %%v22,%%v22,%%v2,%%v1\n\t"
"vl %%v24,128(%%r1,%2) \n\t"
"vl %%v2,144(%%r1,%2) \n\t"
"vl %%v24,128(%%r1,%[x])\n\t"
"vl %%v2,144(%%r1,%[x])\n\t"
"vpkg %%v25,%%v24,%%v2\n\t"
"vperm %%v24,%%v24,%%v2,%%v1\n\t"
"vl %%v26,160(%%r1,%2) \n\t"
"vl %%v2,176(%%r1,%2) \n\t"
"vl %%v26,160(%%r1,%[x])\n\t"
"vl %%v2,176(%%r1,%[x])\n\t"
"vpkg %%v27,%%v26,%%v2\n\t"
"vperm %%v26,%%v26,%%v2,%%v1\n\t"
"vl %%v28,192(%%r1,%2) \n\t"
"vl %%v2,208(%%r1,%2) \n\t"
"vl %%v28,192(%%r1,%[x])\n\t"
"vl %%v2,208(%%r1,%[x])\n\t"
"vpkg %%v29,%%v28,%%v2\n\t"
"vperm %%v28,%%v28,%%v2,%%v1\n\t"
"vl %%v30,224(%%r1,%2) \n\t"
"vl %%v2,240(%%r1,%2) \n\t"
"vl %%v30,224(%%r1,%[x])\n\t"
"vl %%v2,240(%%r1,%[x])\n\t"
"vpkg %%v31,%%v30,%%v2\n\t"
"vperm %%v30,%%v30,%%v2,%%v1\n\t"
"vflpsb %%v16,%%v16\n\t"
"vflpsb %%v17,%%v17\n\t"
"vflpsb %%v18,%%v18\n\t"
@ -129,7 +112,6 @@ static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x)
"vflpsb %%v29,%%v29\n\t"
"vflpsb %%v30,%%v30\n\t"
"vflpsb %%v31,%%v31\n\t"
"vfasb %%v16,%%v16,%%v17\n\t"
"vfasb %%v18,%%v18,%%v19\n\t"
"vfasb %%v20,%%v20,%%v21\n\t"
@ -138,32 +120,26 @@ static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x)
"vfasb %%v26,%%v26,%%v27\n\t"
"vfasb %%v28,%%v28,%%v29\n\t"
"vfasb %%v30,%%v30,%%v31\n\t"
"vfminsb %%v16,%%v16,%%v24,0\n\t"
"vfminsb %%v18,%%v18,%%v26,0\n\t"
"vfminsb %%v20,%%v20,%%v28,0\n\t"
"vfminsb %%v22,%%v22,%%v30,0\n\t"
"vfminsb %%v16,%%v16,%%v20,0\n\t"
"vfminsb %%v18,%%v18,%%v22,0\n\t"
"vfminsb %%v16,%%v16,%%v18,0\n\t"
"vfminsb %%v0,%%v0,%%v16,0\n\t"
"agfi %%r1, 256\n\t"
"brctg %%r0, 0b \n\t"
"brctg %[n], 0b\n\t"
"veslg %%v16,%%v0,32\n\t"
"vfminsb %%v0,%%v0,%%v16,0\n\t"
"vrepf %%v16,%%v0,2\n\t"
"wfminsb %%v0,%%v0,%%v16,0\n\t"
"ler %0,%%f0 "
:"=f"(amin)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
"ler %[amin],%%f0"
: [amin] "=f"(amin),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
return amin;
}
@ -174,7 +150,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
FLOAT minf = 0.0;
BLASLONG inc_x2;
if (n <= 0 || inc_x <= 0) return (minf);
if (n <= 0 || inc_x <= 0)
return (minf);
if (inc_x == 1) {
@ -184,9 +161,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
minf = camin_kernel_32(n1, x);
ix = n1 * 2;
i = n1;
}
else
{
} else {
minf = CABS1(x, 0);
ix += 2;
i++;
@ -228,7 +203,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
}
while (i < n) {
if (CABS1(x, ix) < minf) {
minf = CABS1(x, ix);

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -28,34 +28,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
static FLOAT casum_kernel_32(BLASLONG n, FLOAT *x)
{
static FLOAT casum_kernel_32(BLASLONG n, FLOAT *x) {
FLOAT asum;
__asm__ (
"vzero %%v0 \n\t"
"vzero %%v1 \n\t"
"vzero %%v2 \n\t"
"vzero %%v3 \n\t"
"srlg %%r0,%1,5 \n\t"
__asm__("vzero %%v24\n\t"
"vzero %%v25\n\t"
"vzero %%v26\n\t"
"vzero %%v27\n\t"
"vzero %%v28\n\t"
"vzero %%v29\n\t"
"vzero %%v30\n\t"
"vzero %%v31\n\t"
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%2) \n\t"
"vl %%v16, 0(%%r1,%2) \n\t"
"vl %%v17, 16(%%r1,%2) \n\t"
"vl %%v18, 32(%%r1,%2) \n\t"
"vl %%v19, 48(%%r1,%2) \n\t"
"vl %%v20, 64(%%r1,%2) \n\t"
"vl %%v21, 80(%%r1,%2) \n\t"
"vl %%v22, 96(%%r1,%2) \n\t"
"vl %%v23, 112(%%r1,%2) \n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16, 0(%%r1,%[x])\n\t"
"vl %%v17, 16(%%r1,%[x])\n\t"
"vl %%v18, 32(%%r1,%[x])\n\t"
"vl %%v19, 48(%%r1,%[x])\n\t"
"vl %%v20, 64(%%r1,%[x])\n\t"
"vl %%v21, 80(%%r1,%[x])\n\t"
"vl %%v22, 96(%%r1,%[x])\n\t"
"vl %%v23, 112(%%r1,%[x])\n\t"
"vflpsb %%v16, %%v16\n\t"
"vflpsb %%v17, %%v17\n\t"
"vflpsb %%v18, %%v18\n\t"
@ -64,25 +61,22 @@ static FLOAT casum_kernel_32(BLASLONG n, FLOAT *x)
"vflpsb %%v21, %%v21\n\t"
"vflpsb %%v22, %%v22\n\t"
"vflpsb %%v23, %%v23\n\t"
"vfasb %%v0,%%v0,%%v16 \n\t"
"vfasb %%v1,%%v1,%%v17 \n\t"
"vfasb %%v2,%%v2,%%v18 \n\t"
"vfasb %%v3,%%v3,%%v19 \n\t"
"vfasb %%v0,%%v0,%%v20 \n\t"
"vfasb %%v1,%%v1,%%v21 \n\t"
"vfasb %%v2,%%v2,%%v22 \n\t"
"vfasb %%v3,%%v3,%%v23 \n\t"
"vl %%v16, 128(%%r1,%2) \n\t"
"vl %%v17, 144(%%r1,%2) \n\t"
"vl %%v18, 160(%%r1,%2) \n\t"
"vl %%v19, 176(%%r1,%2) \n\t"
"vl %%v20, 192(%%r1,%2) \n\t"
"vl %%v21, 208(%%r1,%2) \n\t"
"vl %%v22, 224(%%r1,%2) \n\t"
"vl %%v23, 240(%%r1,%2) \n\t"
"vfasb %%v24,%%v24,%%v16\n\t"
"vfasb %%v25,%%v25,%%v17\n\t"
"vfasb %%v26,%%v26,%%v18\n\t"
"vfasb %%v27,%%v27,%%v19\n\t"
"vfasb %%v28,%%v28,%%v20\n\t"
"vfasb %%v29,%%v29,%%v21\n\t"
"vfasb %%v30,%%v30,%%v22\n\t"
"vfasb %%v31,%%v31,%%v23\n\t"
"vl %%v16, 128(%%r1,%[x])\n\t"
"vl %%v17, 144(%%r1,%[x])\n\t"
"vl %%v18, 160(%%r1,%[x])\n\t"
"vl %%v19, 176(%%r1,%[x])\n\t"
"vl %%v20, 192(%%r1,%[x])\n\t"
"vl %%v21, 208(%%r1,%[x])\n\t"
"vl %%v22, 224(%%r1,%[x])\n\t"
"vl %%v23, 240(%%r1,%[x])\n\t"
"vflpsb %%v16, %%v16\n\t"
"vflpsb %%v17, %%v17\n\t"
"vflpsb %%v18, %%v18\n\t"
@ -91,70 +85,66 @@ static FLOAT casum_kernel_32(BLASLONG n, FLOAT *x)
"vflpsb %%v21, %%v21\n\t"
"vflpsb %%v22, %%v22\n\t"
"vflpsb %%v23, %%v23\n\t"
"vfasb %%v0,%%v0,%%v16 \n\t"
"vfasb %%v1,%%v1,%%v17 \n\t"
"vfasb %%v2,%%v2,%%v18 \n\t"
"vfasb %%v3,%%v3,%%v19 \n\t"
"vfasb %%v0,%%v0,%%v20 \n\t"
"vfasb %%v1,%%v1,%%v21 \n\t"
"vfasb %%v2,%%v2,%%v22 \n\t"
"vfasb %%v3,%%v3,%%v23 \n\t"
"vfasb %%v24,%%v24,%%v16\n\t"
"vfasb %%v25,%%v25,%%v17\n\t"
"vfasb %%v26,%%v26,%%v18\n\t"
"vfasb %%v27,%%v27,%%v19\n\t"
"vfasb %%v28,%%v28,%%v20\n\t"
"vfasb %%v29,%%v29,%%v21\n\t"
"vfasb %%v30,%%v30,%%v22\n\t"
"vfasb %%v31,%%v31,%%v23\n\t"
"agfi %%r1,256\n\t"
"brctg %%r0,0b \n\t"
"vfasb %%v0,%%v0,%%v1 \n\t"
"vfasb %%v0,%%v0,%%v2 \n\t"
"vfasb %%v0,%%v0,%%v3 \n\t"
"veslg %%v1,%%v0,32 \n\t"
"vfasb %%v0,%%v0,%%v1 \n\t"
"vrepf %%v1,%%v0,2 \n\t"
"aebr %%f0,%%f1 \n\t"
"ler %0,%%f0 "
:"=f"(asum)
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x)
:"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23"
);
"brctg %[n],0b\n\t"
"vfasb %%v24,%%v24,%%v25\n\t"
"vfasb %%v24,%%v24,%%v26\n\t"
"vfasb %%v24,%%v24,%%v27\n\t"
"vfasb %%v24,%%v24,%%v28\n\t"
"vfasb %%v24,%%v24,%%v29\n\t"
"vfasb %%v24,%%v24,%%v30\n\t"
"vfasb %%v24,%%v24,%%v31\n\t"
"veslg %%v25,%%v24,32\n\t"
"vfasb %%v24,%%v24,%%v25\n\t"
"vrepf %%v25,%%v24,2\n\t"
"vfasb %%v24,%%v24,%%v25\n\t"
"vstef %%v24,%[asum],0"
: [asum] "=Q"(asum),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
: "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
"v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
return asum;
}
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0;
BLASLONG ip = 0;
FLOAT sumf = 0.0;
BLASLONG n1;
BLASLONG inc_x2;
if (n <= 0 || inc_x <= 0) return(sumf);
if (n <= 0 || inc_x <= 0)
return (sumf);
if ( inc_x == 1 )
{
if (inc_x == 1) {
n1 = n & -32;
if ( n1 > 0 )
{
if (n1 > 0) {
sumf = casum_kernel_32(n1, x);
i = n1;
ip = 2 * n1;
}
while(i < n)
{
while (i < n) {
sumf += ABS(x[ip]) + ABS(x[ip + 1]);
i++;
ip += 2;
}
}
else
{
} else {
inc_x2 = 2 * inc_x;
while(i < n)
{
while (i < n) {
sumf += ABS(x[ip]) + ABS(x[ip + 1]);
ip += inc_x2;
i++;
@ -163,5 +153,3 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
}
return (sumf);
}

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -27,100 +27,95 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
/*
 * caxpy_kernel_16: vectorised inner kernel that updates y in place from
 * x and the two-element coefficient array alpha (alpha[0], alpha[1]).
 *
 * NOTE(review): this span appears to come from a diff view in which the
 * removed and the added rendering of the kernel are interleaved without
 * +/- markers (operands written %0..%3 vs. %[n]/%[x]/%[y]/%[alpha]); it is
 * not compilable as shown -- confirm against the real file before editing.
 *
 * What both renderings do, as far as the visible instructions show:
 *  - v0/v1 are built from alpha. Without CONJ, v0 holds alpha[0] in every
 *    lane and v1 holds alpha[1] with lanes 0 and 2 negated; with CONJ, v0
 *    holds alpha[0] with lanes 1 and 3 negated and v1 holds alpha[1] in
 *    every lane.
 *  - The loop count is n >> 4 and the byte offset r1 advances by 128 per
 *    iteration, i.e. 128 bytes of x and of y are consumed per pass
 *    (presumably 16 single-precision complex values -- TODO confirm FLOAT).
 *  - Each y vector becomes x*v0 + y, then rot32(x)*v1 is added on top,
 *    where rot32 is the 32-bit rotate of each 64-bit element (verllg),
 *    which presumably swaps the real/imaginary halves of each element.
 *  - The loop is bottom-tested (brctg), so the body always runs at least
 *    once; presumably the caller only passes a non-zero multiple of 16.
 *
 * Differences between the two renderings: the older one counts in r0,
 * works in two 64-byte halves using v16-v31, and declares a "memory"
 * clobber; the newer one counts down in the n operand itself ("+&r"),
 * loads all 128 bytes of x and y up front (additionally using v8-v15),
 * and describes the touched memory through "m"/"+m" operands instead of
 * a "memory" clobber.
 */
static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
__asm__ volatile(
static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) {
__asm__(
/* Set up the coefficient vectors v0 and v1 from alpha. */
#if !defined(CONJ)
"vlrepf %%v0,0(%3) \n\t"
"vlef %%v1,4(%3),0 \n\t"
"vlef %%v1,4(%3),2 \n\t"
"vlrepf %%v0,0(%[alpha])\n\t"
"vlef %%v1,4(%[alpha]),0\n\t"
"vlef %%v1,4(%[alpha]),2\n\t"
"vflcsb %%v1,%%v1\n\t"
"vlef %%v1,4(%3),1 \n\t"
"vlef %%v1,4(%3),3 \n\t"
"vlef %%v1,4(%[alpha]),1\n\t"
"vlef %%v1,4(%[alpha]),3\n\t"
#else
"vlef %%v0,0(%3),1 \n\t"
"vlef %%v0,0(%3),3 \n\t"
"vlef %%v0,0(%[alpha]),1\n\t"
"vlef %%v0,0(%[alpha]),3\n\t"
"vflcsb %%v0,%%v0\n\t"
"vlef %%v0,0(%3),0 \n\t"
"vlef %%v0,0(%3),2 \n\t"
"vlrepf %%v1,4(%3) \n\t"
"vlef %%v0,0(%[alpha]),0\n\t"
"vlef %%v0,0(%[alpha]),2\n\t"
"vlrepf %%v1,4(%[alpha])\n\t"
#endif
/* Loop count = n / 16; r1 is the running byte offset into x and y. */
"srlg %%r0,%0,4 \n\t"
"srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%1) \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"
"vl %%v16,0(%%r1,%1) \n\t"
"vl %%v17,16(%%r1,%1) \n\t"
"vl %%v18,32(%%r1,%1) \n\t"
"vl %%v19,48(%%r1,%1) \n\t"
"vl %%v20,0(%%r1,%2) \n\t"
"vl %%v21,16(%%r1,%2) \n\t"
"vl %%v22,32(%%r1,%2) \n\t"
"vl %%v23,48(%%r1,%2) \n\t"
"verllg %%v24,%%v16,32 \n\t"
"verllg %%v25,%%v17,32 \n\t"
"verllg %%v26,%%v18,32 \n\t"
"verllg %%v27,%%v19,32 \n\t"
"vfmasb %%v28,%%v16,%%v0,%%v20 \n\t"
"vfmasb %%v29,%%v17,%%v0,%%v21 \n\t"
"vfmasb %%v30,%%v18,%%v0,%%v22 \n\t"
"vfmasb %%v31,%%v19,%%v0,%%v23 \n\t"
"vfmasb %%v28,%%v24,%%v1,%%v28 \n\t"
"vfmasb %%v29,%%v25,%%v1,%%v29 \n\t"
"vfmasb %%v30,%%v26,%%v1,%%v30 \n\t"
"vfmasb %%v31,%%v27,%%v1,%%v31 \n\t"
"vst %%v28,0(%%r1,%2) \n\t"
"vst %%v29,16(%%r1,%2) \n\t"
"vst %%v30,32(%%r1,%2) \n\t"
"vst %%v31,48(%%r1,%2) \n\t"
"vl %%v16,64(%%r1,%1) \n\t"
"vl %%v17,80(%%r1,%1) \n\t"
"vl %%v18,96(%%r1,%1) \n\t"
"vl %%v19,112(%%r1,%1) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"verllg %%v24,%%v16,32 \n\t"
"verllg %%v25,%%v17,32 \n\t"
"verllg %%v26,%%v18,32 \n\t"
"verllg %%v27,%%v19,32 \n\t"
"vfmasb %%v28,%%v16,%%v0,%%v20 \n\t"
"vfmasb %%v29,%%v17,%%v0,%%v21 \n\t"
"vfmasb %%v30,%%v18,%%v0,%%v22 \n\t"
"vfmasb %%v31,%%v19,%%v0,%%v23 \n\t"
"vfmasb %%v28,%%v24,%%v1,%%v28 \n\t"
"vfmasb %%v29,%%v25,%%v1,%%v29 \n\t"
"vfmasb %%v30,%%v26,%%v1,%%v30 \n\t"
"vfmasb %%v31,%%v27,%%v1,%%v31 \n\t"
"vst %%v28,64(%%r1,%2) \n\t"
"vst %%v29,80(%%r1,%2) \n\t"
"vst %%v30,96(%%r1,%2) \n\t"
"vst %%v31,112(%%r1,%2) \n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"pfd 2, 1024(%%r1,%[y])\n\t"
"vl %%v8,0(%%r1,%[x])\n\t"
"vl %%v9,16(%%r1,%[x])\n\t"
"vl %%v10,32(%%r1,%[x])\n\t"
"vl %%v11,48(%%r1,%[x])\n\t"
"vl %%v12,0(%%r1,%[y])\n\t"
"vl %%v13,16(%%r1,%[y])\n\t"
"vl %%v14,32(%%r1,%[y])\n\t"
"vl %%v15,48(%%r1,%[y])\n\t"
"vl %%v16,64(%%r1,%[x])\n\t"
"vl %%v17,80(%%r1,%[x])\n\t"
"vl %%v18,96(%%r1,%[x])\n\t"
"vl %%v19,112(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[y])\n\t"
"vl %%v21,80(%%r1,%[y])\n\t"
"vl %%v22,96(%%r1,%[y])\n\t"
"vl %%v23,112(%%r1,%[y])\n\t"
"verllg %%v24,%%v8,32\n\t"
"verllg %%v25,%%v9,32\n\t"
"verllg %%v26,%%v10,32\n\t"
"verllg %%v27,%%v11,32\n\t"
"verllg %%v28,%%v16,32\n\t"
"verllg %%v29,%%v17,32\n\t"
"verllg %%v30,%%v18,32\n\t"
"verllg %%v31,%%v19,32\n\t"
"vfmasb %%v8,%%v8,%%v0,%%v12\n\t"
"vfmasb %%v9,%%v9,%%v0,%%v13\n\t"
"vfmasb %%v10,%%v10,%%v0,%%v14\n\t"
"vfmasb %%v11,%%v11,%%v0,%%v15\n\t"
"vfmasb %%v16,%%v16,%%v0,%%v20\n\t"
"vfmasb %%v17,%%v17,%%v0,%%v21\n\t"
"vfmasb %%v18,%%v18,%%v0,%%v22\n\t"
"vfmasb %%v19,%%v19,%%v0,%%v23\n\t"
"vfmasb %%v8,%%v24,%%v1,%%v8\n\t"
"vfmasb %%v9,%%v25,%%v1,%%v9\n\t"
"vfmasb %%v10,%%v26,%%v1,%%v10\n\t"
"vfmasb %%v11,%%v27,%%v1,%%v11\n\t"
"vfmasb %%v16,%%v28,%%v1,%%v16\n\t"
"vfmasb %%v17,%%v29,%%v1,%%v17\n\t"
"vfmasb %%v18,%%v30,%%v1,%%v18\n\t"
"vfmasb %%v19,%%v31,%%v1,%%v19\n\t"
"vst %%v8,0(%%r1,%[y])\n\t"
"vst %%v9,16(%%r1,%[y])\n\t"
"vst %%v10,32(%%r1,%[y])\n\t"
"vst %%v11,48(%%r1,%[y])\n\t"
"vst %%v16,64(%%r1,%[y])\n\t"
"vst %%v17,80(%%r1,%[y])\n\t"
"vst %%v18,96(%%r1,%[y])\n\t"
"vst %%v19,112(%%r1,%[y])\n\t"
"agfi %%r1,128\n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"ZQ"((const FLOAT (*)[2])alpha)
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
"brctg %[n],0b"
: "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n)
: [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x),
"m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha)
: "cc", "r1", "v0", "v1", "v8", "v9", "v10", "v11", "v12", "v13",
"v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
"v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
}
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) {
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
BLASLONG dummy2) {
BLASLONG i = 0;
BLASLONG ix = 0, iy = 0;
FLOAT da[2] __attribute__ ((aligned(16)));
if (n <= 0) return (0);
if (n <= 0)
return (0);
if ((inc_x == 1) && (inc_y == 1)) {
@ -147,7 +142,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
}
return (0);
}
inc_x *= 2;
@ -170,5 +164,3 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
return (0);
}

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -27,46 +27,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
/*
 * ccopy_kernel_32: bulk copy kernel that moves data from x to y in
 * 256-byte chunks.
 *
 * NOTE(review): this span appears to come from a diff view in which the
 * removed and the added rendering of the kernel are interleaved without
 * +/- markers; it is not compilable as shown -- confirm against the real
 * file before editing.
 *
 * Both renderings:
 *  - shift n right by 5, so the loop runs n / 32 times;
 *  - per iteration, prefetch 1024 bytes ahead of both pointers, copy 256
 *    bytes from x to y with a single MVC, then advance both pointers by
 *    256 (256 bytes per 32 elements, i.e. presumably 8-byte
 *    single-precision complex elements -- TODO confirm FLOAT);
 *  - use a bottom-tested loop (brctg), so the body always runs at least
 *    once. The visible caller only invokes the kernel with
 *    n1 = n & -32 when n1 > 0.
 *
 * Differences: the older rendering copies x, y and the count into
 * r1, r2, r0 and declares a "memory" clobber; the newer one advances the
 * x/y/n operands in place ("+&a" / "+&r", using LA for the pointer bump)
 * and describes the touched memory through "m"/"=m" operands sized
 * n * 2 FLOATs, with only "cc" left in the clobber list.
 */
static void ccopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
{
__asm__ volatile (
"lgr %%r1,%1 \n\t"
"lgr %%r2,%2 \n\t"
"srlg %%r0,%0,5 \n\t"
static void ccopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) {
__asm__("srlg %[n],%[n],5\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1) \n\t"
"pfd 2, 1024(%%r2) \n\t"
"mvc 0(256,%%r2),0(%%r1) \n\t"
"agfi %%r1,256 \n\t"
"agfi %%r2,256 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"a"((const FLOAT (*)[n * 2])x),"a"((FLOAT (*)[n * 2])y)
:"memory","cc","r0","r1","r2"
);
"pfd 1, 1024(%[x])\n\t"
"pfd 2, 1024(%[y])\n\t"
"mvc 0(256,%[y]),0(%[x])\n\t"
"la %[x],256(%[x])\n\t"
"la %[y],256(%[y])\n\t"
"brctg %[n],0b"
: "=m"(*(struct { FLOAT x[n * 2]; } *) y),[x] "+&a"(x),[y] "+&a"(y),
[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n * 2]; } *) x)
: "cc");
}
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
BLASLONG i = 0;
BLASLONG ix = 0, iy = 0;
if ( n <= 0 ) return(0);
if (n <= 0)
return (0);
if ( (inc_x == 1) && (inc_y == 1 ))
{
if ((inc_x == 1) && (inc_y == 1)) {
BLASLONG n1 = n & -32;
if ( n1 > 0 )
{
if (n1 > 0) {
ccopy_kernel_32(n1, x, y);
i = n1;
ix = n1 * 2;
iy = n1 * 2;
}
while(i < n)
{
while (i < n) {
y[iy] = x[iy];
y[iy + 1] = x[ix + 1];
ix += 2;
@ -75,16 +68,12 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
}
}
else
{
} else {
BLASLONG inc_x2 = 2 * inc_x;
BLASLONG inc_y2 = 2 * inc_y;
while(i < n)
{
while (i < n) {
y[iy] = x[ix];
y[iy + 1] = x[ix + 1];
ix += inc_x2;

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -27,10 +27,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
static void cdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
{
__asm__ volatile(
"vzero %%v24 \n\t"
static void cdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) {
__asm__("vzero %%v24\n\t"
"vzero %%v25\n\t"
"vzero %%v26\n\t"
"vzero %%v27\n\t"
@ -38,25 +36,23 @@ static void cdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
"vzero %%v29\n\t"
"vzero %%v30\n\t"
"vzero %%v31\n\t"
"srlg %%r0,%0,4 \n\t"
"srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%1) \n\t"
"pfd 1, 1024(%%r1,%2) \n\t"
"vl %%v16, 0(%%r1,%1) \n\t"
"vl %%v17, 16(%%r1,%1) \n\t"
"vl %%v18, 32(%%r1,%1) \n\t"
"vl %%v19, 48(%%r1,%1) \n\t"
"vl %%v0, 0(%%r1,%2) \n\t"
"vl %%v1, 16(%%r1,%2) \n\t"
"vl %%v2, 32(%%r1,%2) \n\t"
"vl %%v3, 48(%%r1,%2) \n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"pfd 1, 1024(%%r1,%[y])\n\t"
"vl %%v16, 0(%%r1,%[x])\n\t"
"vl %%v17, 16(%%r1,%[x])\n\t"
"vl %%v18, 32(%%r1,%[x])\n\t"
"vl %%v19, 48(%%r1,%[x])\n\t"
"vl %%v0, 0(%%r1,%[y])\n\t"
"vl %%v1, 16(%%r1,%[y])\n\t"
"vl %%v2, 32(%%r1,%[y])\n\t"
"vl %%v3, 48(%%r1,%[y])\n\t"
"verllg %%v20,%%v16,32\n\t"
"verllg %%v21,%%v17,32\n\t"
"verllg %%v22,%%v18,32\n\t"
"verllg %%v23,%%v19,32\n\t"
"vfmasb %%v24,%%v16,%%v0,%%v24\n\t"
"vfmasb %%v25,%%v20,%%v0,%%v25\n\t"
"vfmasb %%v26,%%v17,%%v1,%%v26\n\t"
@ -65,20 +61,18 @@ static void cdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
"vfmasb %%v29,%%v22,%%v2,%%v29\n\t"
"vfmasb %%v30,%%v19,%%v3,%%v30\n\t"
"vfmasb %%v31,%%v23,%%v3,%%v31\n\t"
"vl %%v16, 64(%%r1,%1) \n\t"
"vl %%v17, 80(%%r1,%1) \n\t"
"vl %%v18, 96(%%r1,%1) \n\t"
"vl %%v19, 112(%%r1,%1) \n\t"
"vl %%v0, 64(%%r1,%2) \n\t"
"vl %%v1, 80(%%r1,%2) \n\t"
"vl %%v2, 96(%%r1,%2) \n\t"
"vl %%v3, 112(%%r1,%2) \n\t"
"vl %%v16, 64(%%r1,%[x])\n\t"
"vl %%v17, 80(%%r1,%[x])\n\t"
"vl %%v18, 96(%%r1,%[x])\n\t"
"vl %%v19, 112(%%r1,%[x])\n\t"
"vl %%v0, 64(%%r1,%[y])\n\t"
"vl %%v1, 80(%%r1,%[y])\n\t"
"vl %%v2, 96(%%r1,%[y])\n\t"
"vl %%v3, 112(%%r1,%[y])\n\t"
"verllg %%v20,%%v16,32\n\t"
"verllg %%v21,%%v17,32\n\t"
"verllg %%v22,%%v18,32\n\t"
"verllg %%v23,%%v19,32\n\t"
"vfmasb %%v24,%%v16,%%v0,%%v24\n\t"
"vfmasb %%v25,%%v20,%%v0,%%v25\n\t"
"vfmasb %%v26,%%v17,%%v1,%%v26\n\t"
@ -87,9 +81,8 @@ static void cdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
"vfmasb %%v29,%%v22,%%v2,%%v29\n\t"
"vfmasb %%v30,%%v19,%%v3,%%v30\n\t"
"vfmasb %%v31,%%v23,%%v3,%%v31\n\t"
"agfi %%r1,128\n\t"
"brctg %%r0,0b \n\t"
"brctg %[n],0b\n\t"
"vfasb %%v24,%%v24,%%v26\n\t"
"vfasb %%v24,%%v24,%%v28\n\t"
"vfasb %%v24,%%v24,%%v30\n\t"
@ -100,21 +93,25 @@ static void cdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
"vfasb %%v25,%%v25,%%v31\n\t"
"vrepg %%v27,%%v25,1\n\t"
"vfasb %%v25,%%v25,%%v27\n\t"
"vstef %%v24,0(%3),0 \n\t"
"vstef %%v24,4(%3),1 \n\t"
"vstef %%v25,8(%3),1 \n\t"
"vstef %%v25,12(%3),0 "
:
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((const FLOAT (*)[n * 2])y),"ZQ"((FLOAT (*)[4])d)
:"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
"vstef %%v24,0(%[d]),0\n\t"
"vstef %%v24,4(%[d]),1\n\t"
"vstef %%v25,8(%[d]),1\n\t"
"vstef %%v25,12(%[d]),0"
: "=m"(*(struct { FLOAT x[4]; } *) d),[n] "+&r"(n)
: [d] "a"(d), "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x),
"m"(*(const struct { FLOAT x[n * 2]; } *) y),[y] "a"(y)
: "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
}
OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y,
BLASLONG inc_y) {
BLASLONG i;
BLASLONG ix, iy;
OPENBLAS_COMPLEX_FLOAT result;
FLOAT dot[4] __attribute__ ((aligned(16))) = {0.0, 0.0, 0.0, 0.0};
FLOAT dot[4] __attribute__ ((aligned(16))) = {
0.0, 0.0, 0.0, 0.0};
if (n <= 0) {
CREAL(result) = 0.0;
@ -145,7 +142,6 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
}
} else {
i = 0;
ix = 0;
@ -178,5 +174,3 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
return (result);
}

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
Copyright (c) 2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -25,304 +25,347 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdlib.h>
#include <stdio.h>
#include "common.h"
#define NBMAX 2048
static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
__asm__ volatile (
"vlrepg %%v16,0(%5) \n\t"
"vlrepg %%v17,8(%5) \n\t"
"vlrepg %%v18,16(%5) \n\t"
"vlrepg %%v19,24(%5) \n\t"
static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) {
register FLOAT *ap0 = ap[0];
register FLOAT *ap1 = ap[1];
register FLOAT *ap2 = ap[2];
register FLOAT *ap3 = ap[3];
__asm__("vlrepg %%v16,0(%[x])\n\t"
"vlrepg %%v17,8(%[x])\n\t"
"vlrepg %%v18,16(%[x])\n\t"
"vlrepg %%v19,24(%[x])\n\t"
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
"vlef %%v20,4(%5),0 \n\t"
"vlef %%v20,4(%5),2 \n\t"
"vlef %%v20,4(%[x]),0\n\t"
"vlef %%v20,4(%[x]),2\n\t"
"vflcsb %%v20,%%v20\n\t"
"vlef %%v20,0(%5),1 \n\t"
"vlef %%v20,0(%5),3 \n\t"
"vlef %%v21,12(%5),0 \n\t"
"vlef %%v21,12(%5),2 \n\t"
"vlef %%v20,0(%[x]),1\n\t"
"vlef %%v20,0(%[x]),3\n\t"
"vlef %%v21,12(%[x]),0\n\t"
"vlef %%v21,12(%[x]),2\n\t"
"vflcsb %%v21,%%v21\n\t"
"vlef %%v21,8(%5),1 \n\t"
"vlef %%v21,8(%5),3 \n\t"
"vlef %%v22,20(%5),0 \n\t"
"vlef %%v22,20(%5),2 \n\t"
"vlef %%v21,8(%[x]),1\n\t"
"vlef %%v21,8(%[x]),3\n\t"
"vlef %%v22,20(%[x]),0\n\t"
"vlef %%v22,20(%[x]),2\n\t"
"vflcsb %%v22,%%v22\n\t"
"vlef %%v22,16(%5),1 \n\t"
"vlef %%v22,16(%5),3 \n\t"
"vlef %%v23,28(%5),0 \n\t"
"vlef %%v23,28(%5),2 \n\t"
"vlef %%v22,16(%[x]),1\n\t"
"vlef %%v22,16(%[x]),3\n\t"
"vlef %%v23,28(%[x]),0\n\t"
"vlef %%v23,28(%[x]),2\n\t"
"vflcsb %%v23,%%v23\n\t"
"vlef %%v23,24(%5),1 \n\t"
"vlef %%v23,24(%5),3 \n\t"
"vlef %%v23,24(%[x]),1\n\t"
"vlef %%v23,24(%[x]),3\n\t"
#else
"vlef %%v20,0(%5),1 \n\t"
"vlef %%v20,0(%5),3 \n\t"
"vlef %%v20,0(%[x]),1\n\t"
"vlef %%v20,0(%[x]),3\n\t"
"vflcsb %%v20,%%v20\n\t"
"vlef %%v20,4(%5),0 \n\t"
"vlef %%v20,4(%5),2 \n\t"
"vlef %%v21,8(%5),1 \n\t"
"vlef %%v21,8(%5),3 \n\t"
"vlef %%v20,4(%[x]),0\n\t"
"vlef %%v20,4(%[x]),2\n\t"
"vlef %%v21,8(%[x]),1\n\t"
"vlef %%v21,8(%[x]),3\n\t"
"vflcsb %%v21,%%v21\n\t"
"vlef %%v21,12(%5),0 \n\t"
"vlef %%v21,12(%5),2 \n\t"
"vlef %%v22,16(%5),1 \n\t"
"vlef %%v22,16(%5),3 \n\t"
"vlef %%v21,12(%[x]),0\n\t"
"vlef %%v21,12(%[x]),2\n\t"
"vlef %%v22,16(%[x]),1\n\t"
"vlef %%v22,16(%[x]),3\n\t"
"vflcsb %%v22,%%v22\n\t"
"vlef %%v22,20(%5),0 \n\t"
"vlef %%v22,20(%5),2 \n\t"
"vlef %%v23,24(%5),1 \n\t"
"vlef %%v23,24(%5),3 \n\t"
"vlef %%v22,20(%[x]),0\n\t"
"vlef %%v22,20(%[x]),2\n\t"
"vlef %%v23,24(%[x]),1\n\t"
"vlef %%v23,24(%[x]),3\n\t"
"vflcsb %%v23,%%v23\n\t"
"vlef %%v23,28(%5),0 \n\t"
"vlef %%v23,28(%5),2 \n\t"
"vlef %%v23,28(%[x]),0\n\t"
"vlef %%v23,28(%[x]),2\n\t"
#endif
"vleib %%v1,0,0\n\t"
"vleib %%v1,1,1\n\t"
"vleib %%v1,2,2\n\t"
"vleib %%v1,3,3\n\t"
"vleib %%v1,0,4\n\t"
"vleib %%v1,1,5\n\t"
"vleib %%v1,2,6\n\t"
"vleib %%v1,3,7\n\t"
"vleib %%v1,8,8\n\t"
"vleib %%v1,9,9\n\t"
"vleib %%v1,10,10\n\t"
"vleib %%v1,11,11\n\t"
"vleib %%v1,8,12\n\t"
"vleib %%v1,9,13\n\t"
"vleib %%v1,10,14\n\t"
"vleib %%v1,11,15\n\t"
"vleib %%v2,4,0\n\t"
"vleib %%v2,5,1\n\t"
"vleib %%v2,6,2\n\t"
"vleib %%v2,7,3\n\t"
"vleib %%v2,4,4\n\t"
"vleib %%v2,5,5\n\t"
"vleib %%v2,6,6\n\t"
"vleib %%v2,7,7\n\t"
"vleib %%v2,12,8\n\t"
"vleib %%v2,13,9\n\t"
"vleib %%v2,14,10\n\t"
"vleib %%v2,15,11\n\t"
"vleib %%v2,12,12\n\t"
"vleib %%v2,13,13\n\t"
"vleib %%v2,14,14\n\t"
"vleib %%v2,15,15\n\t"
"xgr %%r1,%%r1\n\t"
"srlg %%r0,%0,1 \n\t"
"srlg %[n],%[n],1\n\t"
"0:\n\t"
"pfd 1,1024(%%r1,%1) \n\t"
"pfd 1,1024(%%r1,%2) \n\t"
"pfd 1,1024(%%r1,%3) \n\t"
"pfd 1,1024(%%r1,%4) \n\t"
"pfd 2,1024(%%r1,%6) \n\t"
"vlef %%v24,0(%%r1,%1),0 \n\t"
"vlef %%v24,0(%%r1,%1),1 \n\t"
"vlef %%v24,8(%%r1,%1),2 \n\t"
"vlef %%v24,8(%%r1,%1),3 \n\t"
"vlef %%v25,4(%%r1,%1),0 \n\t"
"vlef %%v25,4(%%r1,%1),1 \n\t"
"vlef %%v25,12(%%r1,%1),2 \n\t"
"vlef %%v25,12(%%r1,%1),3 \n\t"
"vlef %%v26,0(%%r1,%2),0 \n\t"
"vlef %%v26,0(%%r1,%2),1 \n\t"
"vlef %%v26,8(%%r1,%2),2 \n\t"
"vlef %%v26,8(%%r1,%2),3 \n\t"
"vlef %%v27,4(%%r1,%2),0 \n\t"
"vlef %%v27,4(%%r1,%2),1 \n\t"
"vlef %%v27,12(%%r1,%2),2 \n\t"
"vlef %%v27,12(%%r1,%2),3 \n\t"
"vl %%v0,0(%%r1,%6) \n\t"
"pfd 1,1024(%%r1,%[ap0])\n\t"
"pfd 1,1024(%%r1,%[ap1])\n\t"
"pfd 1,1024(%%r1,%[ap2])\n\t"
"pfd 1,1024(%%r1,%[ap3])\n\t"
"pfd 2,1024(%%r1,%[y])\n\t"
"vl %%v24,0(%%r1,%[ap0])\n\t"
"vperm %%v25,%%v24,%%v24,%%v2\n\t"
"vperm %%v24,%%v24,%%v24,%%v1\n\t"
"vl %%v26,0(%%r1,%[ap1])\n\t"
"vperm %%v27,%%v26,%%v26,%%v2\n\t"
"vperm %%v26,%%v26,%%v26,%%v1\n\t"
"vl %%v0,0(%%r1,%[y])\n\t"
"vfmasb %%v0,%%v24,%%v16,%%v0\n\t"
"vfmasb %%v0,%%v25,%%v20,%%v0\n\t"
"vfmasb %%v0,%%v26,%%v17,%%v0\n\t"
"vfmasb %%v0,%%v27,%%v21,%%v0\n\t"
"vlef %%v28,0(%%r1,%3),0 \n\t"
"vlef %%v28,0(%%r1,%3),1 \n\t"
"vlef %%v28,8(%%r1,%3),2 \n\t"
"vlef %%v28,8(%%r1,%3),3 \n\t"
"vlef %%v29,4(%%r1,%3),0 \n\t"
"vlef %%v29,4(%%r1,%3),1 \n\t"
"vlef %%v29,12(%%r1,%3),2 \n\t"
"vlef %%v29,12(%%r1,%3),3 \n\t"
"vlef %%v30,0(%%r1,%4),0 \n\t"
"vlef %%v30,0(%%r1,%4),1 \n\t"
"vlef %%v30,8(%%r1,%4),2 \n\t"
"vlef %%v30,8(%%r1,%4),3 \n\t"
"vlef %%v31,4(%%r1,%4),0 \n\t"
"vlef %%v31,4(%%r1,%4),1 \n\t"
"vlef %%v31,12(%%r1,%4),2 \n\t"
"vlef %%v31,12(%%r1,%4),3 \n\t"
"vl %%v28,0(%%r1,%[ap2])\n\t"
"vperm %%v29,%%v28,%%v28,%%v2\n\t"
"vperm %%v28,%%v28,%%v28,%%v1\n\t"
"vl %%v30,0(%%r1,%[ap3])\n\t"
"vperm %%v31,%%v30,%%v30,%%v2\n\t"
"vperm %%v30,%%v30,%%v30,%%v1\n\t"
"vfmasb %%v0,%%v28,%%v18,%%v0\n\t"
"vfmasb %%v0,%%v29,%%v22,%%v0\n\t"
"vfmasb %%v0,%%v30,%%v19,%%v0\n\t"
"vfmasb %%v0,%%v31,%%v23,%%v0\n\t"
"vst %%v0,0(%%r1,%6) \n\t"
"vst %%v0,0(%%r1,%[y])\n\t"
"agfi %%r1,16\n\t"
"brctg %%r0,0b \n\t"
:
:"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZR"((const FLOAT (*)[n * 2])ap[2]),"ZR"((const FLOAT (*)[n * 2])ap[3]),"ZQ"((const FLOAT (*)[8])x),"ZR"((FLOAT (*)[n * 2])y)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
"brctg %[n],0b\n\t"
: "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n)
: [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0),
"m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1),
"m"(*(const struct { FLOAT x[n * 2]; } *) ap2),[ap2] "a"(ap2),
"m"(*(const struct { FLOAT x[n * 2]; } *) ap3),[ap3] "a"(ap3),
"m"(*(const struct { FLOAT x[8]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
}
/*
 * cgemv_kernel_4x2: single-precision complex update
 *     y += A0 * x[0] + A1 * x[1]
 * where A0 = ap[0] and A1 = ap[1] are columns of interleaved (re, im) floats
 * and x supplies two complex multipliers (4 floats).
 *
 * NOTE(review): this span carries two renderings of the same kernel. Lines
 * using positional operands (%0..%4) with %r0 as loop counter are the earlier
 * form; lines using named operands (%[x], %[ap0], %[ap1], %[y], %[n]) are the
 * replacement.
 *
 * The loop count is n >> 1 and each iteration handles 16 bytes (two complex
 * elements) of each column and of y. brctg tests the count only after the
 * body has run, so this presumably relies on the caller passing n >= 2 --
 * TODO confirm.
 */
static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
__asm__ volatile (
"vlrepg %%v16,0(%3) \n\t"
"vlrepg %%v17,8(%3) \n\t"
static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) {
register FLOAT *ap0 = ap[0];
register FLOAT *ap1 = ap[1];
/* v16 / v17: the 8-byte (re, im) pair of x[0] / x[1], replicated into both
   doubleword halves. */
__asm__("vlrepg %%v16,0(%[x])\n\t"
"vlrepg %%v17,8(%[x])\n\t"
/* v18 / v19: swapped-and-signed copies of x[0] / x[1], one pair per half.
   First branch yields (-im, re); the other branch yields (im, -re).
   vflcsb negates the whole register, so elements loaded after it stay
   positive. */
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
"vlef %%v18,4(%3),0 \n\t"
"vlef %%v18,4(%3),2 \n\t"
"vlef %%v18,4(%[x]),0\n\t"
"vlef %%v18,4(%[x]),2\n\t"
"vflcsb %%v18,%%v18\n\t"
"vlef %%v18,0(%3),1 \n\t"
"vlef %%v18,0(%3),3 \n\t"
"vlef %%v19,12(%3),0 \n\t"
"vlef %%v19,12(%3),2 \n\t"
"vlef %%v18,0(%[x]),1\n\t"
"vlef %%v18,0(%[x]),3\n\t"
"vlef %%v19,12(%[x]),0\n\t"
"vlef %%v19,12(%[x]),2\n\t"
"vflcsb %%v19,%%v19\n\t"
"vlef %%v19,8(%3),1 \n\t"
"vlef %%v19,8(%3),3 \n\t"
"vlef %%v19,8(%[x]),1\n\t"
"vlef %%v19,8(%[x]),3\n\t"
#else
"vlef %%v18,0(%3),1 \n\t"
"vlef %%v18,0(%3),3 \n\t"
"vlef %%v18,0(%[x]),1\n\t"
"vlef %%v18,0(%[x]),3\n\t"
"vflcsb %%v18,%%v18\n\t"
"vlef %%v18,4(%3),0 \n\t"
"vlef %%v18,4(%3),2 \n\t"
"vlef %%v19,8(%3),1 \n\t"
"vlef %%v19,8(%3),3 \n\t"
"vlef %%v18,4(%[x]),0\n\t"
"vlef %%v18,4(%[x]),2\n\t"
"vlef %%v19,8(%[x]),1\n\t"
"vlef %%v19,8(%[x]),3\n\t"
"vflcsb %%v19,%%v19\n\t"
"vlef %%v19,12(%3),0 \n\t"
"vlef %%v19,12(%3),2 \n\t"
"vlef %%v19,12(%[x]),0\n\t"
"vlef %%v19,12(%[x]),2\n\t"
#endif
/* v1: byte-select mask 0-3,0-3,8-11,8-11, i.e. words 0,0,2,2 of a vector
   (the re part of each complex element, duplicated). */
"vleib %%v1,0,0\n\t"
"vleib %%v1,1,1\n\t"
"vleib %%v1,2,2\n\t"
"vleib %%v1,3,3\n\t"
"vleib %%v1,0,4\n\t"
"vleib %%v1,1,5\n\t"
"vleib %%v1,2,6\n\t"
"vleib %%v1,3,7\n\t"
"vleib %%v1,8,8\n\t"
"vleib %%v1,9,9\n\t"
"vleib %%v1,10,10\n\t"
"vleib %%v1,11,11\n\t"
"vleib %%v1,8,12\n\t"
"vleib %%v1,9,13\n\t"
"vleib %%v1,10,14\n\t"
"vleib %%v1,11,15\n\t"
/* v2: byte-select mask 4-7,4-7,12-15,12-15, i.e. words 1,1,3,3
   (the im part of each complex element, duplicated). */
"vleib %%v2,4,0\n\t"
"vleib %%v2,5,1\n\t"
"vleib %%v2,6,2\n\t"
"vleib %%v2,7,3\n\t"
"vleib %%v2,4,4\n\t"
"vleib %%v2,5,5\n\t"
"vleib %%v2,6,6\n\t"
"vleib %%v2,7,7\n\t"
"vleib %%v2,12,8\n\t"
"vleib %%v2,13,9\n\t"
"vleib %%v2,14,10\n\t"
"vleib %%v2,15,11\n\t"
"vleib %%v2,12,12\n\t"
"vleib %%v2,13,13\n\t"
"vleib %%v2,14,14\n\t"
"vleib %%v2,15,15\n\t"
/* r1 = running byte offset (starts at 0); loop count = n >> 1. */
"xgr %%r1,%%r1\n\t"
"srlg %%r0,%0,1 \n\t"
"srlg %[n],%[n],1\n\t"
"0:\n\t"
"pfd 1,1024(%%r1,%1) \n\t"
"pfd 1,1024(%%r1,%2) \n\t"
"pfd 2,1024(%%r1,%4) \n\t"
/* Earlier form: build the duplicated re / im vectors with element loads. */
"vlef %%v20,0(%%r1,%1),0 \n\t"
"vlef %%v20,0(%%r1,%1),1 \n\t"
"vlef %%v20,8(%%r1,%1),2 \n\t"
"vlef %%v20,8(%%r1,%1),3 \n\t"
"vlef %%v21,4(%%r1,%1),0 \n\t"
"vlef %%v21,4(%%r1,%1),1 \n\t"
"vlef %%v21,12(%%r1,%1),2 \n\t"
"vlef %%v21,12(%%r1,%1),3 \n\t"
"vlef %%v22,0(%%r1,%2),0 \n\t"
"vlef %%v22,0(%%r1,%2),1 \n\t"
"vlef %%v22,8(%%r1,%2),2 \n\t"
"vlef %%v22,8(%%r1,%2),3 \n\t"
"vlef %%v23,4(%%r1,%2),0 \n\t"
"vlef %%v23,4(%%r1,%2),1 \n\t"
"vlef %%v23,12(%%r1,%2),2 \n\t"
"vlef %%v23,12(%%r1,%2),3 \n\t"
"vl %%v0,0(%%r1,%4) \n\t"
/* Replacement form: prefetch hints 1024 bytes ahead, then one 16-byte load
   per column split by vperm into v21/v23 (im duplicated, mask v2) and
   v20/v22 (re duplicated, mask v1). The im copy is taken first because the
   re copy overwrites the loaded register. */
"pfd 1,1024(%%r1,%[ap0])\n\t"
"pfd 1,1024(%%r1,%[ap1])\n\t"
"pfd 2,1024(%%r1,%[y])\n\t"
"vl %%v20,0(%%r1,%[ap0])\n\t"
"vperm %%v21,%%v20,%%v20,%%v2\n\t"
"vperm %%v20,%%v20,%%v20,%%v1\n\t"
"vl %%v22,0(%%r1,%[ap1])\n\t"
"vperm %%v23,%%v22,%%v22,%%v2\n\t"
"vperm %%v22,%%v22,%%v22,%%v1\n\t"
"vl %%v0,0(%%r1,%[y])\n\t"
/* v0 (two elements of y) += re(a)*(x.re, x.im) + im(a)*(signed swap of x),
   for column 0 with x[0] and column 1 with x[1]. */
"vfmasb %%v0,%%v20,%%v16,%%v0\n\t"
"vfmasb %%v0,%%v21,%%v18,%%v0\n\t"
"vfmasb %%v0,%%v22,%%v17,%%v0\n\t"
"vfmasb %%v0,%%v23,%%v19,%%v0\n\t"
"vst %%v0,0(%%r1,%4) \n\t"
"vst %%v0,0(%%r1,%[y])\n\t"
/* Advance by 16 bytes, decrement the count and loop while it is non-zero. */
"agfi %%r1,16\n\t"
"brctg %%r0,0b \n\t"
:
:"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZQ"((const FLOAT (*)[4])x),"ZR"((FLOAT (*)[n * 2])y)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23"
);
"brctg %[n],0b\n\t"
/* Replacement operand list: y is a read-write memory operand of n * 2 floats
   and n is a read-write early-clobber register (it is shifted and counted
   down in place); the inputs pair an "m" operand describing the bytes read
   with an "a" operand carrying the address. */
: "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n)
: [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0),
"m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1),
"m"(*(const struct { FLOAT x[4]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20",
"v21", "v22", "v23");
}
/*
 * cgemv_kernel_4x1: single-precision complex update y += A * x[0], where ap
 * is one column of interleaved (re, im) floats and x supplies one complex
 * multiplier (2 floats).
 *
 * NOTE(review): this span carries two renderings of the same kernel. Lines
 * using positional operands (%0..%3) with %r0 as loop counter are the earlier
 * form; lines using named operands (%[x], %[ap], %[y], %[n]) are the
 * replacement.
 *
 * The loop count is n >> 1 and each iteration handles 16 bytes (two complex
 * elements). brctg tests the count only after the body has run, so this
 * presumably relies on the caller passing n >= 2 -- TODO confirm.
 */
static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
{
__asm__ volatile (
"vlrepg %%v16,0(%2) \n\t"
static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) {
/* v16: the 8-byte (re, im) pair of x[0], replicated into both halves. */
__asm__("vlrepg %%v16,0(%[x])\n\t"
/* v17: swapped-and-signed copy of x[0] in both halves. First branch yields
   (-im, re); the other branch yields (im, -re). */
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
"vlef %%v17,4(%2),0 \n\t"
"vlef %%v17,4(%2),2 \n\t"
"vlef %%v17,4(%[x]),0\n\t"
"vlef %%v17,4(%[x]),2\n\t"
"vflcsb %%v17,%%v17\n\t"
"vlef %%v17,0(%2),1 \n\t"
"vlef %%v17,0(%2),3 \n\t"
"vlef %%v17,0(%[x]),1\n\t"
"vlef %%v17,0(%[x]),3\n\t"
#else
"vlef %%v17,0(%2),1 \n\t"
"vlef %%v17,0(%2),3 \n\t"
"vlef %%v17,0(%[x]),1\n\t"
"vlef %%v17,0(%[x]),3\n\t"
"vflcsb %%v17,%%v17\n\t"
"vlef %%v17,4(%2),0 \n\t"
"vlef %%v17,4(%2),2 \n\t"
"vlef %%v17,4(%[x]),0\n\t"
"vlef %%v17,4(%[x]),2\n\t"
#endif
/* v1: byte-select mask 0-3,0-3,8-11,8-11 (re part of each element, duplicated). */
"vleib %%v1,0,0\n\t"
"vleib %%v1,1,1\n\t"
"vleib %%v1,2,2\n\t"
"vleib %%v1,3,3\n\t"
"vleib %%v1,0,4\n\t"
"vleib %%v1,1,5\n\t"
"vleib %%v1,2,6\n\t"
"vleib %%v1,3,7\n\t"
"vleib %%v1,8,8\n\t"
"vleib %%v1,9,9\n\t"
"vleib %%v1,10,10\n\t"
"vleib %%v1,11,11\n\t"
"vleib %%v1,8,12\n\t"
"vleib %%v1,9,13\n\t"
"vleib %%v1,10,14\n\t"
"vleib %%v1,11,15\n\t"
/* v2: byte-select mask 4-7,4-7,12-15,12-15 (im part of each element, duplicated). */
"vleib %%v2,4,0\n\t"
"vleib %%v2,5,1\n\t"
"vleib %%v2,6,2\n\t"
"vleib %%v2,7,3\n\t"
"vleib %%v2,4,4\n\t"
"vleib %%v2,5,5\n\t"
"vleib %%v2,6,6\n\t"
"vleib %%v2,7,7\n\t"
"vleib %%v2,12,8\n\t"
"vleib %%v2,13,9\n\t"
"vleib %%v2,14,10\n\t"
"vleib %%v2,15,11\n\t"
"vleib %%v2,12,12\n\t"
"vleib %%v2,13,13\n\t"
"vleib %%v2,14,14\n\t"
"vleib %%v2,15,15\n\t"
/* r1 = running byte offset (starts at 0); loop count = n >> 1. */
"xgr %%r1,%%r1\n\t"
"srlg %%r0,%0,1 \n\t"
"srlg %[n],%[n],1\n\t"
"0:\n\t"
"pfd 1,1024(%%r1,%1) \n\t"
"pfd 2,1024(%%r1,%3) \n\t"
/* Earlier form: build the duplicated re / im vectors with element loads. */
"vlef %%v18,0(%%r1,%1),0 \n\t"
"vlef %%v18,0(%%r1,%1),1 \n\t"
"vlef %%v18,8(%%r1,%1),2 \n\t"
"vlef %%v18,8(%%r1,%1),3 \n\t"
"vlef %%v19,4(%%r1,%1),0 \n\t"
"vlef %%v19,4(%%r1,%1),1 \n\t"
"vlef %%v19,12(%%r1,%1),2 \n\t"
"vlef %%v19,12(%%r1,%1),3 \n\t"
"vl %%v0,0(%%r1,%3) \n\t"
/* Replacement form: one 16-byte load split by vperm into v19 (im duplicated)
   and v18 (re duplicated); the im copy is taken first because the re copy
   overwrites the loaded register. */
"pfd 1,1024(%%r1,%[ap])\n\t"
"pfd 2,1024(%%r1,%[y])\n\t"
"vl %%v18,0(%%r1,%[ap])\n\t"
"vperm %%v19,%%v18,%%v18,%%v2\n\t"
"vperm %%v18,%%v18,%%v18,%%v1\n\t"
"vl %%v0,0(%%r1,%[y])\n\t"
/* v0 (two elements of y) += re(a)*(x.re, x.im) + im(a)*(signed swap of x). */
"vfmasb %%v0,%%v18,%%v16,%%v0\n\t"
"vfmasb %%v0,%%v19,%%v17,%%v0\n\t"
"vst %%v0,0(%%r1,%3) \n\t"
"vst %%v0,0(%%r1,%[y])\n\t"
/* Advance by 16 bytes, decrement the count and loop while it is non-zero. */
"agfi %%r1,16\n\t"
"brctg %%r0,0b \n\t"
:
:"r"(n),"ZR"((const FLOAT (*)[n * 2])ap),"ZQ"((const FLOAT (*)[2])x),"ZR"((FLOAT (*)[n * 2])y)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19"
);
"brctg %[n],0b\n\t"
/* Replacement operand list: y is a read-write memory operand of n * 2 floats
   and n is a read-write early-clobber register; each input pairs an "m"
   operand describing the bytes read with an "a" operand carrying the address. */
: "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n)
: [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap),[ap] "a"(ap),
"m"(*(const struct { FLOAT x[2]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19");
}
/*
 * add_y_4: vectorised dest += alpha * src over interleaved (re, im) floats,
 * with alpha = (alpha_r, alpha_i) read from memory operands.
 *
 * NOTE(review): this span carries two renderings of the same routine. Lines
 * using positional operands (%0..%4) with %r0 as loop counter are the earlier
 * form; lines using named operands (%[src], %[dest], %[alpha_r], %[alpha_i],
 * %[n]) are the replacement.
 *
 * The loop count is n >> 2 and each iteration handles 32 bytes (four complex
 * elements). brctg tests the count only after the body has run, so this
 * presumably relies on the caller passing n >= 4 -- TODO confirm.
 */
static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, FLOAT alpha_i)
{
__asm__ volatile (
static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r,
FLOAT alpha_i) {
__asm__(
/* Without XCONJ: v0 = alpha_r in every word, v1 = (-alpha_i, alpha_i) pairs.
   With XCONJ:    v0 = (alpha_r, -alpha_r) pairs, v1 = alpha_i in every word.
   vflcsb negates the whole register, so elements loaded after it stay
   positive. */
#if !defined(XCONJ)
"vlrepf %%v0,%3 \n\t"
"vlef %%v1,%4,0 \n\t"
"vlef %%v1,%4,2 \n\t"
"vlrepf %%v0,%[alpha_r]\n\t"
"vlef %%v1,%[alpha_i],0\n\t"
"vlef %%v1,%[alpha_i],2\n\t"
"vflcsb %%v1,%%v1\n\t"
"vlef %%v1,%4,1 \n\t"
"vlef %%v1,%4,3 \n\t"
"vlef %%v1,%[alpha_i],1\n\t"
"vlef %%v1,%[alpha_i],3\n\t"
#else
"vlef %%v0,%3,1 \n\t"
"vlef %%v0,%3,3 \n\t"
"vlef %%v0,%[alpha_r],1\n\t"
"vlef %%v0,%[alpha_r],3\n\t"
"vflcsb %%v0,%%v0\n\t"
"vlef %%v0,%3,0 \n\t"
"vlef %%v0,%3,2 \n\t"
"vlrepf %%v1,%4 \n\t"
"vlef %%v0,%[alpha_r],0\n\t"
"vlef %%v0,%[alpha_r],2\n\t"
"vlrepf %%v1,%[alpha_i]\n\t"
#endif
/* r1 = running byte offset (starts at 0); loop count = n >> 2. */
"xgr %%r1,%%r1\n\t"
"srlg %%r0,%0,2 \n\t"
"srlg %[n],%[n],2\n\t"
"0:\n\t"
"pfd 1,1024(%%r1,%1) \n\t"
"pfd 2,1024(%%r1,%2) \n\t"
"vl %%v16,0(%%r1,%1) \n\t"
"vl %%v17,16(%%r1,%1) \n\t"
"vl %%v18,0(%%r1,%2) \n\t"
"vl %%v19,16(%%r1,%2) \n\t"
/* Replacement form of the prefetch hints and loads: v16/v17 = 32 bytes of
   src, v18/v19 = 32 bytes of dest. */
"pfd 1,1024(%%r1,%[src])\n\t"
"pfd 2,1024(%%r1,%[dest])\n\t"
"vl %%v16,0(%%r1,%[src])\n\t"
"vl %%v17,16(%%r1,%[src])\n\t"
"vl %%v18,0(%%r1,%[dest])\n\t"
"vl %%v19,16(%%r1,%[dest])\n\t"
/* Rotating each doubleword by 32 bits swaps re and im within every element. */
"verllg %%v20,%%v16,32\n\t"
"verllg %%v21,%%v17,32\n\t"
/* result = dest + src * v0 + swapped(src) * v1 */
"vfmasb %%v22,%%v16,%%v0,%%v18\n\t"
"vfmasb %%v23,%%v17,%%v0,%%v19\n\t"
"vfmasb %%v22,%%v20,%%v1,%%v22\n\t"
"vfmasb %%v23,%%v21,%%v1,%%v23\n\t"
"vst %%v22,0(%%r1,%2) \n\t"
"vst %%v23,16(%%r1,%2) \n\t"
"vst %%v22,0(%%r1,%[dest])\n\t"
"vst %%v23,16(%%r1,%[dest])\n\t"
/* Advance by 32 bytes, decrement the count and loop while it is non-zero. */
"agfi %%r1,32\n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((const FLOAT (*)[n * 2])src),"ZR"((FLOAT (*)[n * 2])dest),"m"(alpha_r),"m"(alpha_i)
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23"
);
"brctg %[n],0b"
/* Replacement operand list: dest is a read-write memory operand of n * 2
   floats, n is a read-write early-clobber register, and alpha_r / alpha_i
   are passed as "Q" memory operands so the vector loads can address them. */
: "+m"(*(struct { FLOAT x[n * 2]; } *) dest),[n] "+&r"(n)
: [dest] "a"(dest), "m"(*(const struct { FLOAT x[n * 2]; } *) src),
[src] "a"(src),[alpha_r] "Q"(alpha_r),[alpha_i] "Q"(alpha_i)
: "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
"v22", "v23");
}
/*
 * add_y: accumulate alpha * src into dest, where alpha = (alpha_r, alpha_i)
 * and src holds interleaved (re, im) floats.
 *
 * When inc_dest != 2 a scalar loop computes each product into temp_r /
 * temp_i; the visible tail hands the work to add_y_4().
 *
 * NOTE(review): this span shows the signature and several control lines in
 * both their earlier and reformatted spelling, and the hunk header inside the
 * body hides the rest of the scalar loop (including how dest is updated and
 * how the branch ends) -- confirm against the full file.
 */
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i)
{
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,
FLOAT alpha_r, FLOAT alpha_i) {
BLASLONG i;
if ( inc_dest != 2 )
{
if (inc_dest != 2) {
FLOAT temp_r;
FLOAT temp_i;
for ( i=0; i<n; i++ )
{
for (i = 0; i < n; i++) {
#if !defined(XCONJ)
/* Plain complex product alpha * src[i]. */
temp_r = alpha_r * src[0] - alpha_i * src[1];
temp_i = alpha_r * src[1] + alpha_i * src[0];
@ -343,8 +386,9 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT
/* Vectorised path; presumably reached only when inc_dest == 2 -- TODO confirm. */
add_y_4(n, src, dest, alpha_r, alpha_i);
}
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y,
BLASLONG inc_y, FLOAT *buffer) {
BLASLONG i;
FLOAT *a_ptr;
FLOAT *x_ptr;
@ -358,8 +402,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
BLASLONG lda4;
FLOAT xbuffer[8], *ybuffer;
if ( m < 1 ) return(0);
if ( n < 1 ) return(0);
if (m < 1)
return (0);
if (n < 1)
return (0);
ybuffer = buffer;
@ -379,13 +425,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
BLASLONG NB = NBMAX;
while ( NB == NBMAX )
{
while (NB == NBMAX) {
m1 -= NB;
if ( m1 < 0)
{
if ( m2 == 0 ) break;
if (m1 < 0) {
if (m2 == 0)
break;
NB = m2;
}
@ -398,11 +443,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
//zero_y(NB,ybuffer);
memset(ybuffer, 0, NB * 8);
if ( inc_x == 2 )
{
if (inc_x == 2) {
for( i = 0; i < n1 ; i++)
{
for (i = 0; i < n1; i++) {
cgemv_kernel_4x4(NB, ap, x_ptr, ybuffer);
ap[0] += lda4;
ap[1] += lda4;
@ -412,27 +455,22 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
x_ptr += 8;
}
if ( n2 & 2 )
{
if (n2 & 2) {
cgemv_kernel_4x2(NB, ap, x_ptr, ybuffer);
x_ptr += 4;
a_ptr += 2 * lda;
}
if ( n2 & 1 )
{
if (n2 & 1) {
cgemv_kernel_4x1(NB, a_ptr, x_ptr, ybuffer);
/* x_ptr += 2;
a_ptr += lda; */
}
}
else
{
} else {
for( i = 0; i < n1 ; i++)
{
for (i = 0; i < n1; i++) {
xbuffer[0] = x_ptr[0];
xbuffer[1] = x_ptr[1];
@ -455,8 +493,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
a_ptr += lda4;
}
for( i = 0; i < n2 ; i++)
{
for (i = 0; i < n2; i++) {
xbuffer[0] = x_ptr[0];
xbuffer[1] = x_ptr[1];
x_ptr += inc_x;
@ -472,21 +509,18 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
y_ptr += NB * inc_y;
}
if ( m3 == 0 ) return(0);
if (m3 == 0)
return (0);
if ( m3 == 1 )
{
if (m3 == 1) {
a_ptr = a;
x_ptr = x;
FLOAT temp_r = 0.0;
FLOAT temp_i = 0.0;
if ( lda == 2 && inc_x == 2 )
{
if (lda == 2 && inc_x == 2) {
for( i=0 ; i < (n & -2); i+=2 )
{
for (i = 0; i < (n & -2); i += 2) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
@ -503,10 +537,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
x_ptr += 4;
}
for( ; i < n; i++ )
{
for (; i < n; i++) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
@ -519,13 +550,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
x_ptr += 2;
}
} else {
}
else
{
for( i = 0; i < n; i++ )
{
for (i = 0; i < n; i++) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
@ -549,8 +576,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
return (0);
}
if ( m3 == 2 )
{
if (m3 == 2) {
a_ptr = a;
x_ptr = x;
FLOAT temp_r0 = 0.0;
@ -558,11 +584,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
FLOAT temp_r1 = 0.0;
FLOAT temp_i1 = 0.0;
if ( lda == 4 && inc_x == 2 )
{
if (lda == 4 && inc_x == 2) {
for( i = 0; i < (n & -2); i+=2 )
{
for (i = 0; i < (n & -2); i += 2) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
@ -592,9 +616,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
x_ptr += 4;
}
for( ; i < n; i++ )
{
for (; i < n; i++) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
@ -611,13 +633,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
x_ptr += 2;
}
} else {
}
else
{
for( i=0 ; i < n; i++ )
{
for (i = 0; i < n; i++) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
@ -634,7 +652,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
x_ptr += inc_x;
}
}
#if !defined(XCONJ)
y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
@ -652,9 +669,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
return (0);
}
if ( m3 == 3 )
{
if (m3 == 3) {
a_ptr = a;
x_ptr = x;
FLOAT temp_r0 = 0.0;
@ -664,11 +679,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
FLOAT temp_r2 = 0.0;
FLOAT temp_i2 = 0.0;
if ( lda == 6 && inc_x == 2 )
{
if (lda == 6 && inc_x == 2) {
for( i=0 ; i < n; i++ )
{
for (i = 0; i < n; i++) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
@ -689,13 +702,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
x_ptr += 2;
}
} else {
}
else
{
for( i = 0; i < n; i++ )
{
for (i = 0; i < n; i++) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
Copyright (c) 2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -29,84 +29,101 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define NBMAX 2048
/*
 * cgemv_kernel_4x4 (transposed kernel): for each of the four columns
 * ap[0..3], accumulate the single-precision complex dot product of the
 * column with x, then add alpha * (that sum) into the matching complex
 * element of y (8 floats in total).
 *
 * NOTE(review): this span carries two renderings of the same kernel. Lines
 * using positional operands (%0..%7) with %r0 as loop counter are the earlier
 * form; lines using named operands (%[x], %[ap0]..%[ap3], %[y], %[alpha],
 * %[n]) are the replacement. The hunk header inside the body hides part of
 * the post-loop reduction.
 *
 * The loop count is n >> 1 and each iteration handles 16 bytes (two complex
 * elements) of x and of each column. brctg tests the count only after the
 * body has run, so this presumably relies on the caller passing n >= 2 --
 * TODO confirm.
 */
static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
__asm__ volatile (
"vzero %%v16 \n\t"
static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y,
FLOAT *alpha) {
register FLOAT *ap0 = ap[0];
register FLOAT *ap1 = ap[1];
register FLOAT *ap2 = ap[2];
register FLOAT *ap3 = ap[3];
/* Clear the accumulators. In the replacement form v16..v19 collect the
   re(a) * x products and v20..v23 the im(a) * (signed swap of x) products,
   one register pair per column. */
__asm__("vzero %%v16\n\t"
"vzero %%v17\n\t"
"vzero %%v18\n\t"
"vzero %%v19\n\t"
"vzero %%v20\n\t"
"vzero %%v21\n\t"
"vzero %%v22\n\t"
"vzero %%v23\n\t"
/* v2: byte-select mask 0-3,0-3,8-11,8-11 (re part of each element, duplicated). */
"vleib %%v2,0,0\n\t"
"vleib %%v2,1,1\n\t"
"vleib %%v2,2,2\n\t"
"vleib %%v2,3,3\n\t"
"vleib %%v2,0,4\n\t"
"vleib %%v2,1,5\n\t"
"vleib %%v2,2,6\n\t"
"vleib %%v2,3,7\n\t"
"vleib %%v2,8,8\n\t"
"vleib %%v2,9,9\n\t"
"vleib %%v2,10,10\n\t"
"vleib %%v2,11,11\n\t"
"vleib %%v2,8,12\n\t"
"vleib %%v2,9,13\n\t"
"vleib %%v2,10,14\n\t"
"vleib %%v2,11,15\n\t"
/* v3: byte-select mask 4-7,4-7,12-15,12-15 (im part of each element, duplicated). */
"vleib %%v3,4,0\n\t"
"vleib %%v3,5,1\n\t"
"vleib %%v3,6,2\n\t"
"vleib %%v3,7,3\n\t"
"vleib %%v3,4,4\n\t"
"vleib %%v3,5,5\n\t"
"vleib %%v3,6,6\n\t"
"vleib %%v3,7,7\n\t"
"vleib %%v3,12,8\n\t"
"vleib %%v3,13,9\n\t"
"vleib %%v3,14,10\n\t"
"vleib %%v3,15,11\n\t"
"vleib %%v3,12,12\n\t"
"vleib %%v3,13,13\n\t"
"vleib %%v3,14,14\n\t"
"vleib %%v3,15,15\n\t"
/* r1 = running byte offset (starts at 0); loop count = n >> 1. */
"xgr %%r1,%%r1\n\t"
"srlg %%r0,%0,1 \n\t"
"srlg %[n],%[n],1\n\t"
"0:\n\t"
"pfd 1,1024(%%r1,%1) \n\t"
"pfd 1,1024(%%r1,%2) \n\t"
"pfd 1,1024(%%r1,%3) \n\t"
"pfd 1,1024(%%r1,%4) \n\t"
"pfd 1,1024(%%r1,%5) \n\t"
"vl %%v20,0(%%r1,%5) \n\t"
"pfd 1,1024(%%r1,%[ap0])\n\t"
"pfd 1,1024(%%r1,%[ap1])\n\t"
"pfd 1,1024(%%r1,%[ap2])\n\t"
"pfd 1,1024(%%r1,%[ap3])\n\t"
"pfd 1,1024(%%r1,%[x])\n\t"
/* Two complex elements of x as stored: (re0, im0, re1, im1). */
"vl %%v0,0(%%r1,%[x])\n\t"
/* Signed swap of the same two x elements (v21 in the earlier form, v1 in the
   replacement): first branch yields (-im, re) per element, the other branch
   yields (im, -re). */
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
"vlef %%v21,4(%%r1,%5),0 \n\t"
"vlef %%v21,12(%%r1,%5),2 \n\t"
"vflcsb %%v21,%%v21 \n\t"
"vlef %%v21,0(%%r1,%5),1 \n\t"
"vlef %%v21,8(%%r1,%5),3 \n\t"
"vlef %%v1,4(%%r1,%[x]),0\n\t"
"vlef %%v1,12(%%r1,%[x]),2\n\t"
"vflcsb %%v1,%%v1\n\t"
"vlef %%v1,0(%%r1,%[x]),1\n\t"
"vlef %%v1,8(%%r1,%[x]),3\n\t"
#else
"vlef %%v21,0(%%r1,%5),1 \n\t"
"vlef %%v21,8(%%r1,%5),3 \n\t"
"vflcsb %%v21,%%v21 \n\t"
"vlef %%v21,4(%%r1,%5),0 \n\t"
"vlef %%v21,12(%%r1,%5),2 \n\t"
"vlef %%v1,0(%%r1,%[x]),1\n\t"
"vlef %%v1,8(%%r1,%[x]),3\n\t"
"vflcsb %%v1,%%v1\n\t"
"vlef %%v1,4(%%r1,%[x]),0\n\t"
"vlef %%v1,12(%%r1,%[x]),2\n\t"
#endif
/* Earlier form: duplicated re / im vectors built with element loads, both
   products chained into a single accumulator per column. */
"vlef %%v22,0(%%r1,%1),0 \n\t"
"vlef %%v22,0(%%r1,%1),1 \n\t"
"vlef %%v22,8(%%r1,%1),2 \n\t"
"vlef %%v22,8(%%r1,%1),3 \n\t"
"vlef %%v23,4(%%r1,%1),0 \n\t"
"vlef %%v23,4(%%r1,%1),1 \n\t"
"vlef %%v23,12(%%r1,%1),2 \n\t"
"vlef %%v23,12(%%r1,%1),3 \n\t"
"vlef %%v24,0(%%r1,%2),0 \n\t"
"vlef %%v24,0(%%r1,%2),1 \n\t"
"vlef %%v24,8(%%r1,%2),2 \n\t"
"vlef %%v24,8(%%r1,%2),3 \n\t"
"vlef %%v25,4(%%r1,%2),0 \n\t"
"vlef %%v25,4(%%r1,%2),1 \n\t"
"vlef %%v25,12(%%r1,%2),2 \n\t"
"vlef %%v25,12(%%r1,%2),3 \n\t"
"vfmasb %%v16,%%v22,%%v20,%%v16 \n\t"
"vfmasb %%v16,%%v23,%%v21,%%v16 \n\t"
"vfmasb %%v17,%%v24,%%v20,%%v17 \n\t"
"vfmasb %%v17,%%v25,%%v21,%%v17 \n\t"
"vlef %%v26,0(%%r1,%3),0 \n\t"
"vlef %%v26,0(%%r1,%3),1 \n\t"
"vlef %%v26,8(%%r1,%3),2 \n\t"
"vlef %%v26,8(%%r1,%3),3 \n\t"
"vlef %%v27,4(%%r1,%3),0 \n\t"
"vlef %%v27,4(%%r1,%3),1 \n\t"
"vlef %%v27,12(%%r1,%3),2 \n\t"
"vlef %%v27,12(%%r1,%3),3 \n\t"
"vlef %%v28,0(%%r1,%4),0 \n\t"
"vlef %%v28,0(%%r1,%4),1 \n\t"
"vlef %%v28,8(%%r1,%4),2 \n\t"
"vlef %%v28,8(%%r1,%4),3 \n\t"
"vlef %%v29,4(%%r1,%4),0 \n\t"
"vlef %%v29,4(%%r1,%4),1 \n\t"
"vlef %%v29,12(%%r1,%4),2 \n\t"
"vlef %%v29,12(%%r1,%4),3 \n\t"
"vfmasb %%v18,%%v26,%%v20,%%v18 \n\t"
"vfmasb %%v18,%%v27,%%v21,%%v18 \n\t"
"vfmasb %%v19,%%v28,%%v20,%%v19 \n\t"
"vfmasb %%v19,%%v29,%%v21,%%v19 \n\t"
/* Replacement form: one 16-byte load per column, split by vperm into the
   im-duplicated copy (mask v3) and the re-duplicated copy (mask v2); the im
   copy is taken first because the re copy overwrites the loaded register. */
"vl %%v24,0(%%r1,%[ap0])\n\t"
"vperm %%v25,%%v24,%%v24,%%v3\n\t"
"vperm %%v24,%%v24,%%v24,%%v2\n\t"
"vl %%v26,0(%%r1,%[ap1])\n\t"
"vperm %%v27,%%v26,%%v26,%%v3\n\t"
"vperm %%v26,%%v26,%%v26,%%v2\n\t"
"vl %%v28,0(%%r1,%[ap2])\n\t"
"vperm %%v29,%%v28,%%v28,%%v3\n\t"
"vperm %%v28,%%v28,%%v28,%%v2\n\t"
"vl %%v30,0(%%r1,%[ap3])\n\t"
"vperm %%v31,%%v30,%%v30,%%v3\n\t"
"vperm %%v30,%%v30,%%v30,%%v2\n\t"
/* Eight independent multiply-adds: two separate accumulators per column. */
"vfmasb %%v16,%%v24,%%v0,%%v16\n\t"
"vfmasb %%v20,%%v25,%%v1,%%v20\n\t"
"vfmasb %%v17,%%v26,%%v0,%%v17\n\t"
"vfmasb %%v21,%%v27,%%v1,%%v21\n\t"
"vfmasb %%v18,%%v28,%%v0,%%v18\n\t"
"vfmasb %%v22,%%v29,%%v1,%%v22\n\t"
"vfmasb %%v19,%%v30,%%v0,%%v19\n\t"
"vfmasb %%v23,%%v31,%%v1,%%v23\n\t"
/* Advance by 16 bytes, decrement the count and loop while it is non-zero. */
"agfi %%r1,16\n\t"
"brctg %%r0,0b \n\t"
"brctg %[n],0b\n\t"
/* Fold each column's two accumulators together. */
"vfasb %%v16,%%v16,%%v20\n\t"
"vfasb %%v17,%%v17,%%v21\n\t"
"vfasb %%v18,%%v18,%%v22\n\t"
"vfasb %%v19,%%v19,%%v23\n\t"
/* Horizontal step: replicate the second doubleword of each sum. The rest of
   this reduction is hidden by the hunk header below. */
"vrepg %%v20,%%v16,1\n\t"
"vrepg %%v21,%%v17,1\n\t"
"vrepg %%v22,%%v18,1\n\t"
@ -120,86 +137,115 @@ static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *
/* v18 / v19: v16 / v17 with re and im swapped inside every element. */
"verllg %%v18,%%v16,32\n\t"
"verllg %%v19,%%v17,32\n\t"
/* Build the alpha multipliers from alpha[0] (re) and alpha[1] (im).
   Without XCONJ: v20 = re in every word, v21 = (-im, im) pairs.
   With XCONJ:    v20 = (re, -re) pairs, v21 = im in every word. */
#if !defined(XCONJ)
"vlrepf %%v20,0(%7) \n\t"
"vlef %%v21,4(%7),0 \n\t"
"vlef %%v21,4(%7),2 \n\t"
"vlrepf %%v20,0(%[alpha])\n\t"
"vlef %%v21,4(%[alpha]),0\n\t"
"vlef %%v21,4(%[alpha]),2\n\t"
"vflcsb %%v21,%%v21\n\t"
"vlef %%v21,4(%7),1 \n\t"
"vlef %%v21,4(%7),3 \n\t"
"vlef %%v21,4(%[alpha]),1\n\t"
"vlef %%v21,4(%[alpha]),3\n\t"
#else
"vlef %%v20,0(%7),1 \n\t"
"vlef %%v20,0(%7),3 \n\t"
"vlef %%v20,0(%[alpha]),1\n\t"
"vlef %%v20,0(%[alpha]),3\n\t"
"vflcsb %%v20,%%v20\n\t"
"vlef %%v20,0(%7),0 \n\t"
"vlef %%v20,0(%7),2 \n\t"
"vlrepf %%v21,4(%7) \n\t"
"vlef %%v20,0(%[alpha]),0\n\t"
"vlef %%v20,0(%[alpha]),2\n\t"
"vlrepf %%v21,4(%[alpha])\n\t"
#endif
"vl %%v22,0(%6) \n\t"
"vl %%v23,16(%6) \n\t"
/* y (32 bytes) += sums * v20 + swapped sums * v21, then store back. */
"vl %%v22,0(%[y])\n\t"
"vl %%v23,16(%[y])\n\t"
"vfmasb %%v22,%%v16,%%v20,%%v22\n\t"
"vfmasb %%v22,%%v18,%%v21,%%v22\n\t"
"vfmasb %%v23,%%v17,%%v20,%%v23\n\t"
"vfmasb %%v23,%%v19,%%v21,%%v23\n\t"
"vst %%v22,0(%6) \n\t"
"vst %%v23,16(%6) "
:
:"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZR"((const FLOAT (*)[n * 2])ap[2]),"ZR"((const FLOAT (*)[n * 2])ap[3]),"ZR"((const FLOAT (*)[n * 2])x),"ZQ"((FLOAT (*)[8])y),"ZQ"((const FLOAT (*)[2])alpha)
:"memory","cc","r0","r1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29"
);
"vst %%v22,0(%[y])\n\t"
"vst %%v23,16(%[y])"
/* Replacement operand list: y is a read-write memory operand of 8 floats,
   n is a read-write early-clobber register, and every input pairs an "m"
   operand describing the bytes read with an "a" operand carrying the address. */
: "+m"(*(struct { FLOAT x[8]; } *) y),[n] "+&r"(n)
: [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0),
"m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1),
"m"(*(const struct { FLOAT x[n * 2]; } *) ap2),[ap2] "a"(ap2),
"m"(*(const struct { FLOAT x[n * 2]; } *) ap3),[ap3] "a"(ap3),
"m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x),
"m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha)
: "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
}
/*
 * cgemv_kernel_4x2 (transposed kernel): for each of the two columns ap[0]
 * and ap[1], accumulate the single-precision complex dot product of the
 * column with x, then add alpha * (that sum) into the matching complex
 * element of y (4 floats in total).
 *
 * NOTE(review): this span carries two renderings of the same kernel. Lines
 * using positional operands (%0..%5) with %r0 as loop counter are the earlier
 * form; lines using named operands (%[x], %[ap0], %[ap1], %[y], %[alpha],
 * %[n]) are the replacement. The hunk header inside the body hides part of
 * the post-loop reduction.
 *
 * The loop count is n >> 1 and each iteration handles 16 bytes (two complex
 * elements) of x and of each column. brctg tests the count only after the
 * body has run, so this presumably relies on the caller passing n >= 2 --
 * TODO confirm.
 */
static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
__asm__ volatile (
"vzero %%v16 \n\t"
static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y,
FLOAT *alpha) {
register FLOAT *ap0 = ap[0];
register FLOAT *ap1 = ap[1];
/* Clear the accumulators. In the replacement form v16/v17 collect the
   re(a) * x products and v18/v19 the im(a) * (signed swap of x) products. */
__asm__("vzero %%v16\n\t"
"vzero %%v17\n\t"
"vzero %%v18\n\t"
"vzero %%v19\n\t"
/* v2: byte-select mask 0-3,0-3,8-11,8-11 (re part of each element, duplicated). */
"vleib %%v2,0,0\n\t"
"vleib %%v2,1,1\n\t"
"vleib %%v2,2,2\n\t"
"vleib %%v2,3,3\n\t"
"vleib %%v2,0,4\n\t"
"vleib %%v2,1,5\n\t"
"vleib %%v2,2,6\n\t"
"vleib %%v2,3,7\n\t"
"vleib %%v2,8,8\n\t"
"vleib %%v2,9,9\n\t"
"vleib %%v2,10,10\n\t"
"vleib %%v2,11,11\n\t"
"vleib %%v2,8,12\n\t"
"vleib %%v2,9,13\n\t"
"vleib %%v2,10,14\n\t"
"vleib %%v2,11,15\n\t"
/* v3: byte-select mask 4-7,4-7,12-15,12-15 (im part of each element, duplicated). */
"vleib %%v3,4,0\n\t"
"vleib %%v3,5,1\n\t"
"vleib %%v3,6,2\n\t"
"vleib %%v3,7,3\n\t"
"vleib %%v3,4,4\n\t"
"vleib %%v3,5,5\n\t"
"vleib %%v3,6,6\n\t"
"vleib %%v3,7,7\n\t"
"vleib %%v3,12,8\n\t"
"vleib %%v3,13,9\n\t"
"vleib %%v3,14,10\n\t"
"vleib %%v3,15,11\n\t"
"vleib %%v3,12,12\n\t"
"vleib %%v3,13,13\n\t"
"vleib %%v3,14,14\n\t"
"vleib %%v3,15,15\n\t"
/* r1 = running byte offset (starts at 0); loop count = n >> 1. */
"xgr %%r1,%%r1\n\t"
"srlg %%r0,%0,1 \n\t"
"srlg %[n],%[n],1\n\t"
"0:\n\t"
"pfd 1,1024(%%r1,%1) \n\t"
"pfd 1,1024(%%r1,%2) \n\t"
"pfd 1,1024(%%r1,%3) \n\t"
"vl %%v18,0(%%r1,%3) \n\t"
"pfd 1,1024(%%r1,%[ap0])\n\t"
"pfd 1,1024(%%r1,%[ap1])\n\t"
"pfd 1,1024(%%r1,%[x])\n\t"
/* Two complex elements of x as stored: (re0, im0, re1, im1). */
"vl %%v0,0(%%r1,%[x])\n\t"
/* Signed swap of the same two x elements (v19 in the earlier form, v1 in the
   replacement): first branch yields (-im, re) per element, the other branch
   yields (im, -re). */
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
"vlef %%v19,4(%%r1,%3),0 \n\t"
"vlef %%v19,12(%%r1,%3),2 \n\t"
"vflcsb %%v19,%%v19 \n\t"
"vlef %%v19,0(%%r1,%3),1 \n\t"
"vlef %%v19,8(%%r1,%3),3 \n\t"
"vlef %%v1,4(%%r1,%[x]),0\n\t"
"vlef %%v1,12(%%r1,%[x]),2\n\t"
"vflcsb %%v1,%%v1\n\t"
"vlef %%v1,0(%%r1,%[x]),1\n\t"
"vlef %%v1,8(%%r1,%[x]),3\n\t"
#else
"vlef %%v19,0(%%r1,%3),1 \n\t"
"vlef %%v19,8(%%r1,%3),3 \n\t"
"vflcsb %%v19,%%v19 \n\t"
"vlef %%v19,4(%%r1,%3),0 \n\t"
"vlef %%v19,12(%%r1,%3),2 \n\t"
"vlef %%v1,0(%%r1,%[x]),1\n\t"
"vlef %%v1,8(%%r1,%[x]),3\n\t"
"vflcsb %%v1,%%v1\n\t"
"vlef %%v1,4(%%r1,%[x]),0\n\t"
"vlef %%v1,12(%%r1,%[x]),2\n\t"
#endif
/* Earlier form: duplicated re / im vectors built with element loads, both
   products chained into a single accumulator per column. */
"vlef %%v20,0(%%r1,%1),0 \n\t"
"vlef %%v20,0(%%r1,%1),1 \n\t"
"vlef %%v20,8(%%r1,%1),2 \n\t"
"vlef %%v20,8(%%r1,%1),3 \n\t"
"vlef %%v21,4(%%r1,%1),0 \n\t"
"vlef %%v21,4(%%r1,%1),1 \n\t"
"vlef %%v21,12(%%r1,%1),2 \n\t"
"vlef %%v21,12(%%r1,%1),3 \n\t"
"vlef %%v22,0(%%r1,%2),0 \n\t"
"vlef %%v22,0(%%r1,%2),1 \n\t"
"vlef %%v22,8(%%r1,%2),2 \n\t"
"vlef %%v22,8(%%r1,%2),3 \n\t"
"vlef %%v23,4(%%r1,%2),0 \n\t"
"vlef %%v23,4(%%r1,%2),1 \n\t"
"vlef %%v23,12(%%r1,%2),2 \n\t"
"vlef %%v23,12(%%r1,%2),3 \n\t"
"vfmasb %%v16,%%v20,%%v18,%%v16 \n\t"
"vfmasb %%v16,%%v21,%%v19,%%v16 \n\t"
"vfmasb %%v17,%%v22,%%v18,%%v17 \n\t"
"vfmasb %%v17,%%v23,%%v19,%%v17 \n\t"
/* Replacement form: one 16-byte load per column, split by vperm into the
   im-duplicated copy (mask v3) and the re-duplicated copy (mask v2); the im
   copy is taken first because the re copy overwrites the loaded register. */
"vl %%v20,0(%%r1,%[ap0])\n\t"
"vperm %%v21,%%v20,%%v20,%%v3\n\t"
"vperm %%v20,%%v20,%%v20,%%v2\n\t"
"vl %%v22,0(%%r1,%[ap1])\n\t"
"vperm %%v23,%%v22,%%v22,%%v3\n\t"
"vperm %%v22,%%v22,%%v22,%%v2\n\t"
/* Four independent multiply-adds: two separate accumulators per column. */
"vfmasb %%v16,%%v20,%%v0,%%v16\n\t"
"vfmasb %%v18,%%v21,%%v1,%%v18\n\t"
"vfmasb %%v17,%%v22,%%v0,%%v17\n\t"
"vfmasb %%v19,%%v23,%%v1,%%v19\n\t"
/* Advance by 16 bytes, decrement the count and loop while it is non-zero. */
"agfi %%r1,16\n\t"
"brctg %%r0,0b \n\t"
"brctg %[n],0b\n\t"
/* Fold each column's two accumulators together. */
"vfasb %%v16,%%v16,%%v18\n\t"
"vfasb %%v17,%%v17,%%v19\n\t"
/* Horizontal step: add the second doubleword of each sum onto the first.
   The rest of this reduction is hidden by the hunk header below. */
"vrepg %%v18,%%v16,1\n\t"
"vrepg %%v19,%%v17,1\n\t"
"vfasb %%v16,%%v16,%%v18\n\t"
@ -207,99 +253,124 @@ static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *
/* Merge the high doublewords of v16 and v17 into v16 (one complex sum per
   column), then v17 = the same with re and im swapped in each element. */
"vmrhg %%v16,%%v16,%%v17\n\t"
"verllg %%v17,%%v16,32\n\t"
/* Build the alpha multipliers from alpha[0] (re) and alpha[1] (im).
   Without XCONJ: v18 = re in every word, v19 = (-im, im) pairs.
   With XCONJ:    v18 = (re, -re) pairs, v19 = im in every word. */
#if !defined(XCONJ)
"vlrepf %%v18,0(%5) \n\t"
"vlef %%v19,4(%5),0 \n\t"
"vlef %%v19,4(%5),2 \n\t"
"vlrepf %%v18,0(%[alpha])\n\t"
"vlef %%v19,4(%[alpha]),0\n\t"
"vlef %%v19,4(%[alpha]),2\n\t"
"vflcsb %%v19,%%v19\n\t"
"vlef %%v19,4(%5),1 \n\t"
"vlef %%v19,4(%5),3 \n\t"
"vlef %%v19,4(%[alpha]),1\n\t"
"vlef %%v19,4(%[alpha]),3\n\t"
#else
"vlef %%v18,0(%5),1 \n\t"
"vlef %%v18,0(%5),3 \n\t"
"vlef %%v18,0(%[alpha]),1\n\t"
"vlef %%v18,0(%[alpha]),3\n\t"
"vflcsb %%v18,%%v18\n\t"
"vlef %%v18,0(%5),0 \n\t"
"vlef %%v18,0(%5),2 \n\t"
"vlrepf %%v19,4(%5) \n\t"
"vlef %%v18,0(%[alpha]),0\n\t"
"vlef %%v18,0(%[alpha]),2\n\t"
"vlrepf %%v19,4(%[alpha])\n\t"
#endif
"vl %%v20,0(%4) \n\t"
/* y (16 bytes) += sums * v18 + swapped sums * v19, then store back. */
"vl %%v20,0(%[y])\n\t"
"vfmasb %%v20,%%v16,%%v18,%%v20\n\t"
"vfmasb %%v20,%%v17,%%v19,%%v20\n\t"
"vst %%v20,0(%4) "
:
:"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZR"((const FLOAT (*)[n * 2])x),"ZQ"((FLOAT (*)[4])y),"ZQ"((const FLOAT (*)[2])alpha)
:"memory","cc","r0","r1","v16","v17","v18","v19","v20","v21","v22","v23"
);
"vst %%v20,0(%[y])"
/* Replacement operand list: y is a read-write memory operand of 4 floats,
   n is a read-write early-clobber register, and every input pairs an "m"
   operand describing the bytes read with an "a" operand carrying the address. */
: "+m"(*(struct { FLOAT x[4]; } *) y),[n] "+&r"(n)
: [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0),
"m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1),
"m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x),
"m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha)
: "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20",
"v21", "v22", "v23");
}
/*
 * Single-column complex dot-product step of the transposed GEMV:
 *
 *     y[0..1] += alpha * sum_{i<n} op(ap[i]) * x[i]
 *
 * n      number of complex elements to consume from ap and x (the memory
 *        operands below are declared as n * 2 FLOATs each).  The loop runs
 *        n >> 1 times and handles two complex elements (16 bytes) per trip.
 *        NOTE(review): brctg decrements before testing, so a trip count of
 *        zero (n < 2) would wrap instead of skipping the loop - callers are
 *        presumably expected to pass an even n >= 2; confirm at call sites.
 * ap     one column of the matrix, interleaved (re, im) pairs, read only.
 * x      the vector, interleaved (re, im) pairs, read only.
 * y      one complex result (2 FLOATs), updated in place.
 * alpha  complex scale factor (2 FLOATs), read only.
 *
 * The CONJ / XCONJ macros select which operand is sign-flipped, i.e. which
 * conjugated variant of the product is accumulated.
 *
 * The vector instructions used are the short-BFP (single precision) forms,
 * so FLOAT is presumably float in this translation unit.
 */
static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y,
                             FLOAT *alpha) {
  __asm__(/* v16 / v17: two independent partial-sum accumulators. */
          "vzero %%v16\n\t"
          "vzero %%v17\n\t"
          /* v2: byte-permute mask duplicating words 0 and 2 of its source
             (the real parts of the two loaded matrix elements). */
          "vleib %%v2,0,0\n\t"
          "vleib %%v2,1,1\n\t"
          "vleib %%v2,2,2\n\t"
          "vleib %%v2,3,3\n\t"
          "vleib %%v2,0,4\n\t"
          "vleib %%v2,1,5\n\t"
          "vleib %%v2,2,6\n\t"
          "vleib %%v2,3,7\n\t"
          "vleib %%v2,8,8\n\t"
          "vleib %%v2,9,9\n\t"
          "vleib %%v2,10,10\n\t"
          "vleib %%v2,11,11\n\t"
          "vleib %%v2,8,12\n\t"
          "vleib %%v2,9,13\n\t"
          "vleib %%v2,10,14\n\t"
          "vleib %%v2,11,15\n\t"
          /* v3: byte-permute mask duplicating words 1 and 3 of its source
             (the imaginary parts of the two loaded matrix elements). */
          "vleib %%v3,4,0\n\t"
          "vleib %%v3,5,1\n\t"
          "vleib %%v3,6,2\n\t"
          "vleib %%v3,7,3\n\t"
          "vleib %%v3,4,4\n\t"
          "vleib %%v3,5,5\n\t"
          "vleib %%v3,6,6\n\t"
          "vleib %%v3,7,7\n\t"
          "vleib %%v3,12,8\n\t"
          "vleib %%v3,13,9\n\t"
          "vleib %%v3,14,10\n\t"
          "vleib %%v3,15,11\n\t"
          "vleib %%v3,12,12\n\t"
          "vleib %%v3,13,13\n\t"
          "vleib %%v3,14,14\n\t"
          "vleib %%v3,15,15\n\t"
          /* r1 = running byte offset; n becomes the trip count (n / 2). */
          "xgr %%r1,%%r1\n\t"
          "srlg %[n],%[n],1\n\t"
          "0:\n\t"
          "pfd 1,1024(%%r1,%[ap])\n\t"
          "pfd 1,1024(%%r1,%[x])\n\t"
          /* v0 = (xr0, xi0, xr1, xi1) */
          "vl %%v0,0(%%r1,%[x])\n\t"
          /* v1 = x with re/im swapped and one half negated.  The negate
             (vflcsb) sits between the two pairs of element loads so that
             only the pair loaded first ends up sign-flipped. */
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
          /* v1 = (-xi0, xr0, -xi1, xr1) */
          "vlef %%v1,4(%%r1,%[x]),0\n\t"
          "vlef %%v1,12(%%r1,%[x]),2\n\t"
          "vflcsb %%v1,%%v1\n\t"
          "vlef %%v1,0(%%r1,%[x]),1\n\t"
          "vlef %%v1,8(%%r1,%[x]),3\n\t"
#else
          /* v1 = (xi0, -xr0, xi1, -xr1) */
          "vlef %%v1,0(%%r1,%[x]),1\n\t"
          "vlef %%v1,8(%%r1,%[x]),3\n\t"
          "vflcsb %%v1,%%v1\n\t"
          "vlef %%v1,4(%%r1,%[x]),0\n\t"
          "vlef %%v1,12(%%r1,%[x]),2\n\t"
#endif
          /* v18 = (ar0, ar0, ar1, ar1), v19 = (ai0, ai0, ai1, ai1) */
          "vl %%v18,0(%%r1,%[ap])\n\t"
          "vperm %%v19,%%v18,%%v18,%%v3\n\t"
          "vperm %%v18,%%v18,%%v18,%%v2\n\t"
          /* v16 += a_re * x ; v17 += a_im * (swapped, signed x) */
          "vfmasb %%v16,%%v18,%%v0,%%v16\n\t"
          "vfmasb %%v17,%%v19,%%v1,%%v17\n\t"
          "agfi %%r1,16\n\t"
          "brctg %[n],0b\n\t"
          /* Reduce: merge both accumulators, then fold the upper
             doubleword (second complex lane) onto the lower one. */
          "vfasb %%v16,%%v16,%%v17\n\t"
          "vrepg %%v17,%%v16,1\n\t"
          "vfasb %%v16,%%v16,%%v17\n\t"
          /* v17 = v16 with re/im swapped inside each doubleword. */
          "verllg %%v17,%%v16,32\n\t"
          /* Build the alpha multipliers; only words 0 and 1 matter because
             only doubleword 0 of the result is stored back to y. */
#if !defined(XCONJ)
          /* v18 = (alpha_r, alpha_r, ..), v19 = (-alpha_i, alpha_i, ..) */
          "vlrepf %%v18,0(%[alpha])\n\t"
          "vlef %%v19,4(%[alpha]),0\n\t"
          "vflcsb %%v19,%%v19\n\t"
          "vlef %%v19,4(%[alpha]),1\n\t"
#else
          /* v18 = (alpha_r, -alpha_r, ..), v19 = (alpha_i, alpha_i, ..) */
          "vlef %%v18,0(%[alpha]),1\n\t"
          "vflcsb %%v18,%%v18\n\t"
          "vlef %%v18,0(%[alpha]),0\n\t"
          "vlrepf %%v19,4(%[alpha])\n\t"
#endif
          /* y[0..1] += sum * v18 + swapped(sum) * v19 */
          "vleg %%v0,0(%[y]),0\n\t"
          "vfmasb %%v0,%%v16,%%v18,%%v0\n\t"
          "vfmasb %%v0,%%v17,%%v19,%%v0\n\t"
          "vsteg %%v0,0(%[y]),0"
          /* The "m" operands exist only to tell the compiler exactly which
             memory is read / written, so no blanket "memory" clobber is
             needed.  Pointers use the "a" (address register) constraint so
             they are never allocated to r0, which is not usable as a base
             register.  n is early-clobbered because it is overwritten
             before the inputs are last used. */
          : "+m"(*(struct { FLOAT x[2]; } *) y),[n] "+&r"(n)
          : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap),[ap] "a"(ap),
            "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x),
            "m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha)
          : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
}
static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src)
{
static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) {
BLASLONG i;
for ( i=0; i<n; i++ )
{
for (i = 0; i < n; i++) {
*dest = *src;
*(dest + 1) = *(src + 1);
dest += 2;
@ -307,8 +378,9 @@ static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src)
}
}
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y,
BLASLONG inc_y, FLOAT *buffer) {
BLASLONG i;
BLASLONG j;
FLOAT *a_ptr;
@ -324,8 +396,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
FLOAT ybuffer[8], *xbuffer;
FLOAT alpha[2];
if ( m < 1 ) return(0);
if ( n < 1 ) return(0);
if (m < 1)
return (0);
if (n < 1)
return (0);
inc_x <<= 1;
inc_y <<= 1;
@ -346,13 +420,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
BLASLONG NB = NBMAX;
while ( NB == NBMAX )
{
while (NB == NBMAX) {
m1 -= NB;
if ( m1 < 0)
{
if ( m2 == 0 ) break;
if (m1 < 0) {
if (m2 == 0)
break;
NB = m2;
}
@ -368,11 +441,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
else
xbuffer = x_ptr;
if ( inc_y == 2 )
{
if (inc_y == 2) {
for( i = 0; i < n1 ; i++)
{
for (i = 0; i < n1; i++) {
cgemv_kernel_4x4(NB, ap, xbuffer, y_ptr, alpha);
ap[0] += lda4;
ap[1] += lda4;
@ -383,28 +454,23 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
}
if ( n2 & 2 )
{
if (n2 & 2) {
cgemv_kernel_4x2(NB, ap, xbuffer, y_ptr, alpha);
a_ptr += lda * 2;
y_ptr += 4;
}
if ( n2 & 1 )
{
if (n2 & 1) {
cgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha);
/* a_ptr += lda;
y_ptr += 2; */
}
}
else
{
} else {
for( i = 0; i < n1 ; i++)
{
for (i = 0; i < n1; i++) {
memset(ybuffer, 0, sizeof(ybuffer));
cgemv_kernel_4x4(NB, ap, xbuffer, ybuffer, alpha);
ap[0] += lda4;
@ -428,8 +494,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
}
for( i = 0; i < n2 ; i++)
{
for (i = 0; i < n2; i++) {
memset(ybuffer, 0, sizeof(ybuffer));
cgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, alpha);
a_ptr += lda;
@ -444,17 +509,15 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
x += NB * inc_x;
}
if ( m3 == 0 ) return(0);
if (m3 == 0)
return (0);
x_ptr = x;
j = 0;
a_ptr = a;
y_ptr = y;
if ( m3 == 3 )
{
if (m3 == 3) {
FLOAT temp_r;
FLOAT temp_i;
@ -466,8 +529,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
x_ptr += inc_x;
FLOAT x4 = x_ptr[0];
FLOAT x5 = x_ptr[1];
while ( j < n)
{
while (j < n) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
@ -500,9 +562,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
return (0);
}
if ( m3 == 2 )
{
if (m3 == 2) {
FLOAT temp_r;
FLOAT temp_i;
@ -516,8 +576,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
FLOAT ar = alpha[0];
FLOAT ai = alpha[1];
while ( j < ( n & -2 ))
{
while (j < (n & -2)) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
@ -560,9 +619,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
j += 2;
}
while ( j < n)
{
while (j < n) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
@ -592,9 +649,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
return (0);
}
if ( m3 == 1 )
{
if (m3 == 1) {
FLOAT temp_r;
FLOAT temp_i;
@ -605,8 +660,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
FLOAT ar = alpha[0];
FLOAT ai = alpha[1];
while ( j < ( n & -2 ))
{
while (j < (n & -2)) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
@ -641,8 +695,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
j += 2;
}
while ( j < n)
{
while (j < n) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -27,25 +27,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
static void crot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
{
__asm__ (
"vlrepf %%v0,%3 \n\t"
"vlrepf %%v1,%4 \n\t"
"srlg %%r0,%0,5 \n\t"
static void crot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) {
__asm__("vlrepf %%v0,%[c]\n\t"
"vlrepf %%v1,%[s]\n\t"
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 2, 1024(%%r1,%1) \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"
"vl %%v24, 0(%%r1,%1) \n\t"
"vl %%v25, 16(%%r1,%1) \n\t"
"vl %%v26, 32(%%r1,%1) \n\t"
"vl %%v27, 48(%%r1,%1) \n\t"
"vl %%v16, 0(%%r1,%2) \n\t"
"vl %%v17, 16(%%r1,%2) \n\t"
"vl %%v18, 32(%%r1,%2) \n\t"
"vl %%v19, 48(%%r1,%2) \n\t"
"pfd 2, 1024(%%r1,%[x])\n\t"
"pfd 2, 1024(%%r1,%[y])\n\t"
"vl %%v24, 0(%%r1,%[x])\n\t"
"vl %%v25, 16(%%r1,%[x])\n\t"
"vl %%v26, 32(%%r1,%[x])\n\t"
"vl %%v27, 48(%%r1,%[x])\n\t"
"vl %%v16, 0(%%r1,%[y])\n\t"
"vl %%v17, 16(%%r1,%[y])\n\t"
"vl %%v18, 32(%%r1,%[y])\n\t"
"vl %%v19, 48(%%r1,%[y])\n\t"
"vfmsb %%v28,%%v24,%%v0\n\t"
"vfmsb %%v29,%%v25,%%v0\n\t"
"vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
@ -63,25 +60,22 @@ static void crot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
"vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmasb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vst %%v28, 0(%%r1,%1) \n\t"
"vst %%v29, 16(%%r1,%1) \n\t"
"vst %%v30, 32(%%r1,%1) \n\t"
"vst %%v31, 48(%%r1,%1) \n\t"
"vst %%v20, 0(%%r1,%2) \n\t"
"vst %%v21, 16(%%r1,%2) \n\t"
"vst %%v22, 32(%%r1,%2) \n\t"
"vst %%v23, 48(%%r1,%2) \n\t"
"vl %%v24, 64(%%r1,%1) \n\t"
"vl %%v25, 80(%%r1,%1) \n\t"
"vl %%v26, 96(%%r1,%1) \n\t"
"vl %%v27, 112(%%r1,%1) \n\t"
"vl %%v16, 64(%%r1,%2) \n\t"
"vl %%v17, 80(%%r1,%2) \n\t"
"vl %%v18, 96(%%r1,%2) \n\t"
"vl %%v19, 112(%%r1,%2) \n\t"
"vst %%v28, 0(%%r1,%[x])\n\t"
"vst %%v29, 16(%%r1,%[x])\n\t"
"vst %%v30, 32(%%r1,%[x])\n\t"
"vst %%v31, 48(%%r1,%[x])\n\t"
"vst %%v20, 0(%%r1,%[y])\n\t"
"vst %%v21, 16(%%r1,%[y])\n\t"
"vst %%v22, 32(%%r1,%[y])\n\t"
"vst %%v23, 48(%%r1,%[y])\n\t"
"vl %%v24, 64(%%r1,%[x])\n\t"
"vl %%v25, 80(%%r1,%[x])\n\t"
"vl %%v26, 96(%%r1,%[x])\n\t"
"vl %%v27, 112(%%r1,%[x])\n\t"
"vl %%v16, 64(%%r1,%[y])\n\t"
"vl %%v17, 80(%%r1,%[y])\n\t"
"vl %%v18, 96(%%r1,%[y])\n\t"
"vl %%v19, 112(%%r1,%[y])\n\t"
"vfmsb %%v28,%%v24,%%v0\n\t"
"vfmsb %%v29,%%v25,%%v0\n\t"
"vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
@ -99,25 +93,22 @@ static void crot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
"vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmasb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vst %%v28, 64(%%r1,%1) \n\t"
"vst %%v29, 80(%%r1,%1) \n\t"
"vst %%v30, 96(%%r1,%1) \n\t"
"vst %%v31, 112(%%r1,%1) \n\t"
"vst %%v20, 64(%%r1,%2) \n\t"
"vst %%v21, 80(%%r1,%2) \n\t"
"vst %%v22, 96(%%r1,%2) \n\t"
"vst %%v23, 112(%%r1,%2) \n\t"
"vl %%v24, 128(%%r1,%1) \n\t"
"vl %%v25, 144(%%r1,%1) \n\t"
"vl %%v26, 160(%%r1,%1) \n\t"
"vl %%v27, 176(%%r1,%1) \n\t"
"vl %%v16, 128(%%r1,%2) \n\t"
"vl %%v17, 144(%%r1,%2) \n\t"
"vl %%v18, 160(%%r1,%2) \n\t"
"vl %%v19, 176(%%r1,%2) \n\t"
"vst %%v28, 64(%%r1,%[x])\n\t"
"vst %%v29, 80(%%r1,%[x])\n\t"
"vst %%v30, 96(%%r1,%[x])\n\t"
"vst %%v31, 112(%%r1,%[x])\n\t"
"vst %%v20, 64(%%r1,%[y])\n\t"
"vst %%v21, 80(%%r1,%[y])\n\t"
"vst %%v22, 96(%%r1,%[y])\n\t"
"vst %%v23, 112(%%r1,%[y])\n\t"
"vl %%v24, 128(%%r1,%[x])\n\t"
"vl %%v25, 144(%%r1,%[x])\n\t"
"vl %%v26, 160(%%r1,%[x])\n\t"
"vl %%v27, 176(%%r1,%[x])\n\t"
"vl %%v16, 128(%%r1,%[y])\n\t"
"vl %%v17, 144(%%r1,%[y])\n\t"
"vl %%v18, 160(%%r1,%[y])\n\t"
"vl %%v19, 176(%%r1,%[y])\n\t"
"vfmsb %%v28,%%v24,%%v0\n\t"
"vfmsb %%v29,%%v25,%%v0\n\t"
"vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
@ -135,25 +126,22 @@ static void crot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
"vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmasb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vst %%v28, 128(%%r1,%1) \n\t"
"vst %%v29, 144(%%r1,%1) \n\t"
"vst %%v30, 160(%%r1,%1) \n\t"
"vst %%v31, 176(%%r1,%1) \n\t"
"vst %%v20, 128(%%r1,%2) \n\t"
"vst %%v21, 144(%%r1,%2) \n\t"
"vst %%v22, 160(%%r1,%2) \n\t"
"vst %%v23, 176(%%r1,%2) \n\t"
"vl %%v24, 192(%%r1,%1) \n\t"
"vl %%v25, 208(%%r1,%1) \n\t"
"vl %%v26, 224(%%r1,%1) \n\t"
"vl %%v27, 240(%%r1,%1) \n\t"
"vl %%v16, 192(%%r1,%2) \n\t"
"vl %%v17, 208(%%r1,%2) \n\t"
"vl %%v18, 224(%%r1,%2) \n\t"
"vl %%v19, 240(%%r1,%2) \n\t"
"vst %%v28, 128(%%r1,%[x])\n\t"
"vst %%v29, 144(%%r1,%[x])\n\t"
"vst %%v30, 160(%%r1,%[x])\n\t"
"vst %%v31, 176(%%r1,%[x])\n\t"
"vst %%v20, 128(%%r1,%[y])\n\t"
"vst %%v21, 144(%%r1,%[y])\n\t"
"vst %%v22, 160(%%r1,%[y])\n\t"
"vst %%v23, 176(%%r1,%[y])\n\t"
"vl %%v24, 192(%%r1,%[x])\n\t"
"vl %%v25, 208(%%r1,%[x])\n\t"
"vl %%v26, 224(%%r1,%[x])\n\t"
"vl %%v27, 240(%%r1,%[x])\n\t"
"vl %%v16, 192(%%r1,%[y])\n\t"
"vl %%v17, 208(%%r1,%[y])\n\t"
"vl %%v18, 224(%%r1,%[y])\n\t"
"vl %%v19, 240(%%r1,%[y])\n\t"
"vfmsb %%v28,%%v24,%%v0\n\t"
"vfmsb %%v29,%%v25,%%v0\n\t"
"vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
@ -171,40 +159,39 @@ static void crot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
"vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmasb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vst %%v28, 192(%%r1,%1) \n\t"
"vst %%v29, 208(%%r1,%1) \n\t"
"vst %%v30, 224(%%r1,%1) \n\t"
"vst %%v31, 240(%%r1,%1) \n\t"
"vst %%v20, 192(%%r1,%2) \n\t"
"vst %%v21, 208(%%r1,%2) \n\t"
"vst %%v22, 224(%%r1,%2) \n\t"
"vst %%v23, 240(%%r1,%2) \n\t"
"vst %%v28, 192(%%r1,%[x])\n\t"
"vst %%v29, 208(%%r1,%[x])\n\t"
"vst %%v30, 224(%%r1,%[x])\n\t"
"vst %%v31, 240(%%r1,%[x])\n\t"
"vst %%v20, 192(%%r1,%[y])\n\t"
"vst %%v21, 208(%%r1,%[y])\n\t"
"vst %%v22, 224(%%r1,%[y])\n\t"
"vst %%v23, 240(%%r1,%[y])\n\t"
"agfi %%r1,256\n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"m"(*c),"m"(*s)
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
"brctg %[n],0b"
: "+m"(*(struct { FLOAT x[n * 2]; } *) x),
"+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n)
: [x] "a"(x),[y] "a"(y),[c] "Q"(*c),[s] "Q"(*s)
: "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
"v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
}
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
{
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
FLOAT c, FLOAT s) {
BLASLONG i = 0;
BLASLONG ix = 0, iy = 0;
FLOAT temp[2];
BLASLONG inc_x2;
BLASLONG inc_y2;
if ( n <= 0 ) return(0);
if (n <= 0)
return (0);
if ( (inc_x == 1) && (inc_y == 1) )
{
if ((inc_x == 1) && (inc_y == 1)) {
BLASLONG n1 = n & -32;
if ( n1 > 0 )
{
if (n1 > 0) {
FLOAT cosa, sina;
cosa = c;
sina = s;
@ -213,8 +200,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
ix = 2 * n1;
}
while(i < n)
{
while (i < n) {
temp[0] = c * x[ix] + s * y[ix];
temp[1] = c * x[ix + 1] + s * y[ix + 1];
y[ix] = c * y[ix] - s * x[ix];
@ -227,14 +213,10 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
}
}
else
{
} else {
inc_x2 = 2 * inc_x;
inc_y2 = 2 * inc_y;
while(i < n)
{
while (i < n) {
temp[0] = c * x[ix] + s * y[iy];
temp[1] = c * x[ix + 1] + s * y[iy + 1];
y[iy] = c * y[iy] - s * x[ix];
@ -252,5 +234,3 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
return (0);
}

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2013 - 2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -27,28 +27,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
static void cscal_kernel_16(BLASLONG n, FLOAT *alpha, FLOAT *x)
{
__asm__ volatile(
"vlrepf %%v0,0(%1) \n\t"
"vlef %%v1,4(%1),0 \n\t"
"vlef %%v1,4(%1),2 \n\t"
static void cscal_kernel_16(BLASLONG n, FLOAT *alpha, FLOAT *x) {
__asm__("vlrepf %%v0,0(%[alpha])\n\t"
"vlef %%v1,4(%[alpha]),0\n\t"
"vlef %%v1,4(%[alpha]),2\n\t"
"vflcsb %%v1,%%v1\n\t"
"vlef %%v1,4(%1),1 \n\t"
"vlef %%v1,4(%1),3 \n\t"
"srlg %%r0,%0,4 \n\t"
"vlef %%v1,4(%[alpha]),1\n\t"
"vlef %%v1,4(%[alpha]),3\n\t"
"srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 2, 1024(%%r1,%2) \n\t"
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"pfd 2, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"verllg %%v24,%%v16,32\n\t"
"verllg %%v25,%%v17,32\n\t"
"verllg %%v26,%%v18,32\n\t"
@ -57,7 +54,6 @@ static void cscal_kernel_16(BLASLONG n, FLOAT *alpha, FLOAT *x)
"verllg %%v29,%%v21,32\n\t"
"verllg %%v30,%%v22,32\n\t"
"verllg %%v31,%%v23,32\n\t"
"vfmsb %%v16,%%v16,%%v0\n\t"
"vfmsb %%v17,%%v17,%%v0\n\t"
"vfmsb %%v18,%%v18,%%v0\n\t"
@ -74,45 +70,42 @@ static void cscal_kernel_16(BLASLONG n, FLOAT *alpha, FLOAT *x)
"vfmasb %%v21,%%v29,%%v1,%%v21\n\t"
"vfmasb %%v22,%%v30,%%v1,%%v22\n\t"
"vfmasb %%v23,%%v31,%%v1,%%v23\n\t"
"vst %%v16,0(%%r1,%2) \n\t"
"vst %%v17,16(%%r1,%2) \n\t"
"vst %%v18,32(%%r1,%2) \n\t"
"vst %%v19,48(%%r1,%2) \n\t"
"vst %%v20,64(%%r1,%2) \n\t"
"vst %%v21,80(%%r1,%2) \n\t"
"vst %%v22,96(%%r1,%2) \n\t"
"vst %%v23,112(%%r1,%2) \n\t"
"vst %%v16,0(%%r1,%[x])\n\t"
"vst %%v17,16(%%r1,%[x])\n\t"
"vst %%v18,32(%%r1,%[x])\n\t"
"vst %%v19,48(%%r1,%[x])\n\t"
"vst %%v20,64(%%r1,%[x])\n\t"
"vst %%v21,80(%%r1,%[x])\n\t"
"vst %%v22,96(%%r1,%[x])\n\t"
"vst %%v23,112(%%r1,%[x])\n\t"
"agfi %%r1,128\n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x)
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
"brctg %[n],0b"
: "+m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n)
: [x] "a"(x), "m"(*(const struct { FLOAT x[2]; } *) alpha),
[alpha] "a"(alpha)
: "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
"v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
}
static void cscal_kernel_16_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x)
{
__asm__ volatile(
"vlef %%v0,4(%1),0 \n\t"
"vlef %%v0,4(%1),2 \n\t"
static void cscal_kernel_16_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) {
__asm__("vlef %%v0,4(%[alpha]),0\n\t"
"vlef %%v0,4(%[alpha]),2\n\t"
"vflcsb %%v0,%%v0\n\t"
"vlef %%v0,4(%1),1 \n\t"
"vlef %%v0,4(%1),3 \n\t"
"srlg %%r0,%0,4 \n\t"
"vlef %%v0,4(%[alpha]),1\n\t"
"vlef %%v0,4(%[alpha]),3\n\t"
"srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 2, 1024(%%r1,%2) \n\t"
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"pfd 2, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"verllg %%v16,%%v16,32\n\t"
"verllg %%v17,%%v17,32\n\t"
"verllg %%v18,%%v18,32\n\t"
@ -121,7 +114,6 @@ static void cscal_kernel_16_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x)
"verllg %%v21,%%v21,32\n\t"
"verllg %%v22,%%v22,32\n\t"
"verllg %%v23,%%v23,32\n\t"
"vfmsb %%v16,%%v16,%%v0\n\t"
"vfmsb %%v17,%%v17,%%v0\n\t"
"vfmsb %%v18,%%v18,%%v0\n\t"
@ -130,42 +122,37 @@ static void cscal_kernel_16_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x)
"vfmsb %%v21,%%v21,%%v0\n\t"
"vfmsb %%v22,%%v22,%%v0\n\t"
"vfmsb %%v23,%%v23,%%v0\n\t"
"vst %%v16,0(%%r1,%2) \n\t"
"vst %%v17,16(%%r1,%2) \n\t"
"vst %%v18,32(%%r1,%2) \n\t"
"vst %%v19,48(%%r1,%2) \n\t"
"vst %%v20,64(%%r1,%2) \n\t"
"vst %%v21,80(%%r1,%2) \n\t"
"vst %%v22,96(%%r1,%2) \n\t"
"vst %%v23,112(%%r1,%2) \n\t"
"vst %%v16,0(%%r1,%[x])\n\t"
"vst %%v17,16(%%r1,%[x])\n\t"
"vst %%v18,32(%%r1,%[x])\n\t"
"vst %%v19,48(%%r1,%[x])\n\t"
"vst %%v20,64(%%r1,%[x])\n\t"
"vst %%v21,80(%%r1,%[x])\n\t"
"vst %%v22,96(%%r1,%[x])\n\t"
"vst %%v23,112(%%r1,%[x])\n\t"
"agfi %%r1,128\n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23"
);
"brctg %[n],0b"
: "+m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n)
: [x] "a"(x), "m"(*(const struct { FLOAT x[2]; } *) alpha),
[alpha] "a"(alpha)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23");
}
static void cscal_kernel_16_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x)
{
__asm__ volatile(
"vlrepf %%v0,0(%1) \n\t"
"srlg %%r0,%0,4 \n\t"
static void cscal_kernel_16_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) {
__asm__("vlrepf %%v0,0(%[alpha])\n\t"
"srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 2, 1024(%%r1,%2) \n\t"
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"pfd 2, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vfmsb %%v16,%%v16,%%v0\n\t"
"vfmsb %%v17,%%v17,%%v0\n\t"
"vfmsb %%v18,%%v18,%%v0\n\t"
@ -174,55 +161,46 @@ static void cscal_kernel_16_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x)
"vfmsb %%v21,%%v21,%%v0\n\t"
"vfmsb %%v22,%%v22,%%v0\n\t"
"vfmsb %%v23,%%v23,%%v0\n\t"
"vst %%v16,0(%%r1,%2) \n\t"
"vst %%v17,16(%%r1,%2) \n\t"
"vst %%v18,32(%%r1,%2) \n\t"
"vst %%v19,48(%%r1,%2) \n\t"
"vst %%v20,64(%%r1,%2) \n\t"
"vst %%v21,80(%%r1,%2) \n\t"
"vst %%v22,96(%%r1,%2) \n\t"
"vst %%v23,112(%%r1,%2) \n\t"
"vst %%v16,0(%%r1,%[x])\n\t"
"vst %%v17,16(%%r1,%[x])\n\t"
"vst %%v18,32(%%r1,%[x])\n\t"
"vst %%v19,48(%%r1,%[x])\n\t"
"vst %%v20,64(%%r1,%[x])\n\t"
"vst %%v21,80(%%r1,%[x])\n\t"
"vst %%v22,96(%%r1,%[x])\n\t"
"vst %%v23,112(%%r1,%[x])\n\t"
"agfi %%r1,128\n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23"
);
"brctg %[n],0b"
: "+m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n)
: [x] "a"(x), "m"(*(const struct { FLOAT x[2]; } *) alpha),
[alpha] "a"(alpha)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23");
}
/*
 * Overwrite a contiguous complex vector with zeros (the alpha == 0 case of
 * the scale routine).
 *
 * n  number of complex elements to clear (the output memory operand is
 *    declared as n * 2 FLOATs).  The loop runs n >> 4 times and stores
 *    128 bytes per trip, i.e. 16 complex elements when FLOAT is 4 bytes.
 *    NOTE(review): brctg decrements before testing, so a trip count of zero
 *    (n < 16) would wrap instead of skipping the loop - the caller visible
 *    in this file only invokes the kernel when (n & -16) > 0.
 * x  destination; written only, never read.
 */
static void cscal_kernel_16_zero(BLASLONG n, FLOAT *x) {
  __asm__(/* v0 = all-zero source vector for every store. */
          "vzero %%v0\n\t"
          /* n becomes the trip count; r1 = running byte offset. */
          "srlg %[n],%[n],4\n\t"
          "xgr %%r1,%%r1\n\t"
          "0:\n\t"
          "pfd 2, 1024(%%r1,%[x])\n\t"
          "vst %%v0,0(%%r1,%[x])\n\t"
          "vst %%v0,16(%%r1,%[x])\n\t"
          "vst %%v0,32(%%r1,%[x])\n\t"
          "vst %%v0,48(%%r1,%[x])\n\t"
          "vst %%v0,64(%%r1,%[x])\n\t"
          "vst %%v0,80(%%r1,%[x])\n\t"
          "vst %%v0,96(%%r1,%[x])\n\t"
          "vst %%v0,112(%%r1,%[x])\n\t"
          "agfi %%r1,128\n\t"
          "brctg %[n],0b"
          /* "=m" (write-only) describes the cleared region to the compiler,
             so no blanket "memory" clobber is required.  x uses the "a"
             (address register) constraint so it is never allocated to r0,
             which cannot serve as a base register. */
          : "=m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n)
          : [x] "a"(x)
          : "cc", "r1", "v0");
}
static void cscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_x)
{
static void cscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x,
BLASLONG inc_x) {
BLASLONG i;
BLASLONG inc_x2 = 2 * inc_x;
BLASLONG inc_x3 = inc_x2 + inc_x;
@ -230,8 +208,7 @@ static void cscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_
FLOAT da_r = alpha[0];
FLOAT da_i = alpha[1];
for (i = 0; i < n; i += 4)
{
for (i = 0; i < n; i += 4) {
t0 = da_r * x[0] - da_i * x[1];
t1 = da_r * x[inc_x] - da_i * x[inc_x + 1];
t2 = da_r * x[inc_x2] - da_i * x[inc_x2 + 1];
@ -251,7 +228,9 @@ static void cscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_
}
}
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) {
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
BLASLONG dummy2) {
BLASLONG i = 0, j = 0;
FLOAT temp0;
FLOAT temp1;
@ -311,13 +290,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
}
}
} else {
if (da_i == 0.0) {
BLASLONG n1 = n & -2;
@ -372,7 +348,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
return (0);
}
BLASLONG n1 = n & -16;
if (n1 > 0) {
@ -384,8 +359,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
cscal_kernel_16_zero(n1, x);
else
cscal_kernel_16_zero_r(n1, alpha, x);
else
if (da_i == 0)
else if (da_i == 0)
cscal_kernel_16_zero_i(n1, alpha, x);
else
cscal_kernel_16(n1, alpha, x);
@ -394,7 +368,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
j = n1;
}
if (da_r == 0.0) {
if (da_i == 0.0) {

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -27,114 +27,108 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
/*
 * Exchange the contents of two contiguous complex vectors.
 *
 * n  number of complex elements to swap (both memory operands are declared
 *    as n * 2 FLOATs).  The loop runs n >> 5 times and moves 256 bytes of
 *    each vector per trip, i.e. 32 complex elements when FLOAT is 4 bytes.
 *    NOTE(review): brctg decrements before testing, so a trip count of zero
 *    (n < 32) would wrap instead of skipping the loop - the caller visible
 *    in this file only invokes the kernel when (n & -32) > 0.
 * x  first vector, read and written.
 * y  second vector, read and written.
 *
 * Per trip the whole 256-byte x block is parked in v16-v31, the y block is
 * copied into x in two 128-byte halves through v0-v7, and finally the
 * parked x block is written to y.
 */
static void cswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) {
  __asm__(/* n becomes the trip count; r1 = running byte offset. */
          "srlg %[n],%[n],5\n\t"
          "xgr %%r1,%%r1\n\t"
          "0:\n\t"
          "pfd 2, 1024(%%r1,%[x])\n\t"
          "pfd 2, 1024(%%r1,%[y])\n\t"
          /* Park 256 bytes of x in v16-v31. */
          "vl %%v16, 0(%%r1,%[x])\n\t"
          "vl %%v17, 16(%%r1,%[x])\n\t"
          "vl %%v18, 32(%%r1,%[x])\n\t"
          "vl %%v19, 48(%%r1,%[x])\n\t"
          "vl %%v20, 64(%%r1,%[x])\n\t"
          "vl %%v21, 80(%%r1,%[x])\n\t"
          "vl %%v22, 96(%%r1,%[x])\n\t"
          "vl %%v23, 112(%%r1,%[x])\n\t"
          "vl %%v24, 128(%%r1,%[x])\n\t"
          "vl %%v25, 144(%%r1,%[x])\n\t"
          "vl %%v26, 160(%%r1,%[x])\n\t"
          "vl %%v27, 176(%%r1,%[x])\n\t"
          "vl %%v28, 192(%%r1,%[x])\n\t"
          "vl %%v29, 208(%%r1,%[x])\n\t"
          "vl %%v30, 224(%%r1,%[x])\n\t"
          "vl %%v31, 240(%%r1,%[x])\n\t"
          /* First 128 bytes: y -> x via v0-v7. */
          "vl %%v0, 0(%%r1,%[y])\n\t"
          "vl %%v1, 16(%%r1,%[y])\n\t"
          "vl %%v2, 32(%%r1,%[y])\n\t"
          "vl %%v3, 48(%%r1,%[y])\n\t"
          "vl %%v4, 64(%%r1,%[y])\n\t"
          "vl %%v5, 80(%%r1,%[y])\n\t"
          "vl %%v6, 96(%%r1,%[y])\n\t"
          "vl %%v7, 112(%%r1,%[y])\n\t"
          "vst %%v0, 0(%%r1,%[x])\n\t"
          "vst %%v1, 16(%%r1,%[x])\n\t"
          "vst %%v2, 32(%%r1,%[x])\n\t"
          "vst %%v3, 48(%%r1,%[x])\n\t"
          "vst %%v4, 64(%%r1,%[x])\n\t"
          "vst %%v5, 80(%%r1,%[x])\n\t"
          "vst %%v6, 96(%%r1,%[x])\n\t"
          "vst %%v7, 112(%%r1,%[x])\n\t"
          /* Second 128 bytes: y -> x via v0-v7. */
          "vl %%v0, 128(%%r1,%[y])\n\t"
          "vl %%v1, 144(%%r1,%[y])\n\t"
          "vl %%v2, 160(%%r1,%[y])\n\t"
          "vl %%v3, 176(%%r1,%[y])\n\t"
          "vl %%v4, 192(%%r1,%[y])\n\t"
          "vl %%v5, 208(%%r1,%[y])\n\t"
          "vl %%v6, 224(%%r1,%[y])\n\t"
          "vl %%v7, 240(%%r1,%[y])\n\t"
          "vst %%v0, 128(%%r1,%[x])\n\t"
          "vst %%v1, 144(%%r1,%[x])\n\t"
          "vst %%v2, 160(%%r1,%[x])\n\t"
          "vst %%v3, 176(%%r1,%[x])\n\t"
          "vst %%v4, 192(%%r1,%[x])\n\t"
          "vst %%v5, 208(%%r1,%[x])\n\t"
          "vst %%v6, 224(%%r1,%[x])\n\t"
          "vst %%v7, 240(%%r1,%[x])\n\t"
          /* Parked x block -> y. */
          "vst %%v16, 0(%%r1,%[y])\n\t"
          "vst %%v17, 16(%%r1,%[y])\n\t"
          "vst %%v18, 32(%%r1,%[y])\n\t"
          "vst %%v19, 48(%%r1,%[y])\n\t"
          "vst %%v20, 64(%%r1,%[y])\n\t"
          "vst %%v21, 80(%%r1,%[y])\n\t"
          "vst %%v22, 96(%%r1,%[y])\n\t"
          "vst %%v23, 112(%%r1,%[y])\n\t"
          "vst %%v24, 128(%%r1,%[y])\n\t"
          "vst %%v25, 144(%%r1,%[y])\n\t"
          "vst %%v26, 160(%%r1,%[y])\n\t"
          "vst %%v27, 176(%%r1,%[y])\n\t"
          "vst %%v28, 192(%%r1,%[y])\n\t"
          "vst %%v29, 208(%%r1,%[y])\n\t"
          "vst %%v30, 224(%%r1,%[y])\n\t"
          "vst %%v31, 240(%%r1,%[y])\n\t"
          "agfi %%r1,256\n\t"
          "brctg %[n],0b"
          /* Both vectors are read-write ("+m"), which tells the compiler
             exactly what memory changes without a blanket "memory" clobber.
             Pointers use the "a" (address register) constraint so they are
             never allocated to r0, which cannot serve as a base register. */
          : "+m"(*(struct { FLOAT x[n * 2]; } *) x),
            "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n)
          : [x] "a"(x),[y] "a"(y)
          : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
            "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
            "v27", "v28", "v29", "v30", "v31");
}
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3,
FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
FLOAT *dummy, BLASLONG dummy2) {
BLASLONG i = 0;
BLASLONG ix = 0, iy = 0;
FLOAT temp[2];
BLASLONG inc_x2, inc_y2;
if ( n <= 0 ) return(0);
if (n <= 0)
return (0);
if ( (inc_x == 1) && (inc_y == 1 ))
{
if ((inc_x == 1) && (inc_y == 1)) {
BLASLONG n1 = n & -32;
if ( n1 > 0 )
{
if (n1 > 0) {
cswap_kernel_32(n1, x, y);
i = n1;
ix = 2 * n1;
iy = 2 * n1;
}
while(i < n)
{
while (i < n) {
temp[0] = x[ix];
temp[1] = x[ix + 1];
@ -147,19 +141,14 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm
iy += 2;
i++;
}
}
else
{
} else {
inc_x2 = 2 * inc_x;
inc_y2 = 2 * inc_y;
while(i < n)
{
while (i < n) {
temp[0] = x[ix];
temp[1] = x[ix + 1];
@ -177,7 +166,4 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm
}
return (0);
}

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -28,40 +28,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x)
{
static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) {
FLOAT amax;
__asm__ volatile (
"vl %%v0,0(%2) \n\t"
"srlg %%r0,%1,5 \n\t"
__asm__("vl %%v0,0(%[x])\n\t"
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%2) \n\t"
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"vl %%v24,128(%%r1,%2) \n\t"
"vl %%v25,144(%%r1,%2) \n\t"
"vl %%v26,160(%%r1,%2) \n\t"
"vl %%v27,176(%%r1,%2) \n\t"
"vl %%v28,192(%%r1,%2) \n\t"
"vl %%v29,208(%%r1,%2) \n\t"
"vl %%v30,224(%%r1,%2) \n\t"
"vl %%v31,240(%%r1,%2) \n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v24,128(%%r1,%[x])\n\t"
"vl %%v25,144(%%r1,%[x])\n\t"
"vl %%v26,160(%%r1,%[x])\n\t"
"vl %%v27,176(%%r1,%[x])\n\t"
"vl %%v28,192(%%r1,%[x])\n\t"
"vl %%v29,208(%%r1,%[x])\n\t"
"vl %%v30,224(%%r1,%[x])\n\t"
"vl %%v31,240(%%r1,%[x])\n\t"
"vfmaxdb %%v16,%%v16,%%v24,8\n\t"
"vfmaxdb %%v17,%%v17,%%v25,8\n\t"
"vfmaxdb %%v18,%%v18,%%v26,8\n\t"
@ -70,29 +62,23 @@ static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x)
"vfmaxdb %%v21,%%v21,%%v29,8\n\t"
"vfmaxdb %%v22,%%v22,%%v30,8\n\t"
"vfmaxdb %%v23,%%v23,%%v31,8\n\t"
"vfmaxdb %%v16,%%v16,%%v20,8\n\t"
"vfmaxdb %%v17,%%v17,%%v21,8\n\t"
"vfmaxdb %%v18,%%v18,%%v22,8\n\t"
"vfmaxdb %%v19,%%v19,%%v23,8\n\t"
"vfmaxdb %%v16,%%v16,%%v18,8\n\t"
"vfmaxdb %%v17,%%v17,%%v19,8\n\t"
"vfmaxdb %%v16,%%v16,%%v17,8\n\t"
"vfmaxdb %%v0,%%v0,%%v16,8\n\t"
"agfi %%r1, 256\n\t"
"brctg %%r0, 0b \n\t"
"brctg %[n], 0b\n\t"
"vrepg %%v16,%%v0,1\n\t"
"wfmaxdb %%v0,%%v0,%%v16,8\n\t"
"lpdr %0,%%f0 "
:"=f"(amax)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
"lpdr %[amax],%%f0"
: [amax] "=f"(amax),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
return amax;
}
@ -102,7 +88,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG j = 0;
FLOAT maxf = 0.0;
if (n <= 0 || inc_x <= 0) return (maxf);
if (n <= 0 || inc_x <= 0)
return (maxf);
if (inc_x == 1) {
@ -112,9 +99,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
maxf = damax_kernel_32(n1, x);
i = n1;
}
else
{
} else {
maxf = ABS(x[0]);
i++;
}
@ -153,7 +138,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
}
while (j < n) {
if (ABS(x[i]) > maxf) {
maxf = ABS(x[i]);

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -28,32 +28,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x)
{
static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) {
FLOAT amax;
__asm__ volatile (
"vl %%v0,0(%2) \n\t"
__asm__("vl %%v0,0(%[x])\n\t"
"vflpdb %%v0,%%v0\n\t"
"srlg %%r0,%1,5 \n\t"
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%2) \n\t"
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
@ -62,7 +55,6 @@ static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x)
"vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t"
"vfchdb %%v24,%%v16,%%v17\n\t"
"vfchdb %%v25,%%v18,%%v19\n\t"
"vfchdb %%v26,%%v20,%%v21\n\t"
@ -71,26 +63,22 @@ static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x)
"vsel %%v25,%%v18,%%v19,%%v25\n\t"
"vsel %%v26,%%v20,%%v21,%%v26\n\t"
"vsel %%v27,%%v22,%%v23,%%v27\n\t"
"vfchdb %%v28,%%v24,%%v25\n\t"
"vfchdb %%v29,%%v26,%%v27\n\t"
"vsel %%v28,%%v24,%%v25,%%v28\n\t"
"vsel %%v29,%%v26,%%v27,%%v29\n\t"
"vfchdb %%v30,%%v28,%%v29\n\t"
"vsel %%v30,%%v28,%%v29,%%v30\n\t"
"vfchdb %%v31,%%v30,%%v0\n\t"
"vsel %%v0,%%v30,%%v0,%%v31\n\t"
"vl %%v16,128(%%r1,%2) \n\t"
"vl %%v17,144(%%r1,%2) \n\t"
"vl %%v18,160(%%r1,%2) \n\t"
"vl %%v19,176(%%r1,%2) \n\t"
"vl %%v20,192(%%r1,%2) \n\t"
"vl %%v21,208(%%r1,%2) \n\t"
"vl %%v22,224(%%r1,%2) \n\t"
"vl %%v23,240(%%r1,%2) \n\t"
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v17,144(%%r1,%[x])\n\t"
"vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v19,176(%%r1,%[x])\n\t"
"vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v21,208(%%r1,%[x])\n\t"
"vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v23,240(%%r1,%[x])\n\t"
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
@ -99,7 +87,6 @@ static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x)
"vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t"
"vfchdb %%v24,%%v16,%%v17\n\t"
"vfchdb %%v25,%%v18,%%v19\n\t"
"vfchdb %%v26,%%v20,%%v21\n\t"
@ -108,29 +95,24 @@ static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x)
"vsel %%v25,%%v18,%%v19,%%v25\n\t"
"vsel %%v26,%%v20,%%v21,%%v26\n\t"
"vsel %%v27,%%v22,%%v23,%%v27\n\t"
"vfchdb %%v28,%%v24,%%v25\n\t"
"vfchdb %%v29,%%v26,%%v27\n\t"
"vsel %%v28,%%v24,%%v25,%%v28\n\t"
"vsel %%v29,%%v26,%%v27,%%v29\n\t"
"vfchdb %%v30,%%v28,%%v29\n\t"
"vsel %%v30,%%v28,%%v29,%%v30\n\t"
"vfchdb %%v31,%%v30,%%v0\n\t"
"vsel %%v0,%%v30,%%v0,%%v31\n\t"
"agfi %%r1, 256\n\t"
"brctg %%r0, 0b \n\t"
"brctg %[n], 0b\n\t"
"vrepg %%v16,%%v0,1\n\t"
"wfchdb %%v17,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v17\n\t"
"ldr %0,%%f0 "
:"=f"(amax)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
"ldr %[amax],%%f0"
: [amax] "=f"(amax),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
return amax;
}
@ -140,7 +122,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG j = 0;
FLOAT maxf = 0.0;
if (n <= 0 || inc_x <= 0) return (maxf);
if (n <= 0 || inc_x <= 0)
return (maxf);
if (inc_x == 1) {
@ -150,9 +133,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
maxf = damax_kernel_32(n1, x);
i = n1;
}
else
{
} else {
maxf = ABS(x[0]);
i++;
}
@ -191,7 +172,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
}
while (j < n) {
if (ABS(x[i]) > maxf) {
maxf = ABS(x[i]);

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -28,40 +28,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x)
{
static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) {
FLOAT amin;
__asm__ volatile (
"vl %%v0,0(%2) \n\t"
"srlg %%r0,%1,5 \n\t"
__asm__("vl %%v0,0(%[x])\n\t"
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%2) \n\t"
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"vl %%v24,128(%%r1,%2) \n\t"
"vl %%v25,144(%%r1,%2) \n\t"
"vl %%v26,160(%%r1,%2) \n\t"
"vl %%v27,176(%%r1,%2) \n\t"
"vl %%v28,192(%%r1,%2) \n\t"
"vl %%v29,208(%%r1,%2) \n\t"
"vl %%v30,224(%%r1,%2) \n\t"
"vl %%v31,240(%%r1,%2) \n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v24,128(%%r1,%[x])\n\t"
"vl %%v25,144(%%r1,%[x])\n\t"
"vl %%v26,160(%%r1,%[x])\n\t"
"vl %%v27,176(%%r1,%[x])\n\t"
"vl %%v28,192(%%r1,%[x])\n\t"
"vl %%v29,208(%%r1,%[x])\n\t"
"vl %%v30,224(%%r1,%[x])\n\t"
"vl %%v31,240(%%r1,%[x])\n\t"
"vfmindb %%v16,%%v16,%%v24,8\n\t"
"vfmindb %%v17,%%v17,%%v25,8\n\t"
"vfmindb %%v18,%%v18,%%v26,8\n\t"
@ -70,29 +62,23 @@ static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x)
"vfmindb %%v21,%%v21,%%v29,8\n\t"
"vfmindb %%v22,%%v22,%%v30,8\n\t"
"vfmindb %%v23,%%v23,%%v31,8\n\t"
"vfmindb %%v16,%%v16,%%v20,8\n\t"
"vfmindb %%v17,%%v17,%%v21,8\n\t"
"vfmindb %%v18,%%v18,%%v22,8\n\t"
"vfmindb %%v19,%%v19,%%v23,8\n\t"
"vfmindb %%v16,%%v16,%%v18,8\n\t"
"vfmindb %%v17,%%v17,%%v19,8\n\t"
"vfmindb %%v16,%%v16,%%v17,8\n\t"
"vfmindb %%v0,%%v0,%%v16,8\n\t"
"agfi %%r1, 256\n\t"
"brctg %%r0, 0b \n\t"
"brctg %[n], 0b\n\t"
"vrepg %%v16,%%v0,1\n\t"
"wfmindb %%v0,%%v0,%%v16,8\n\t"
"lpdr %0,%%f0 "
:"=f"(amin)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
"lpdr %[amin],%%f0"
: [amin] "=f"(amin),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
return amin;
}
@ -102,7 +88,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG j = 0;
FLOAT minf = 0.0;
if (n <= 0 || inc_x <= 0) return (minf);
if (n <= 0 || inc_x <= 0)
return (minf);
if (inc_x == 1) {
@ -112,9 +99,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
minf = damin_kernel_32(n1, x);
i = n1;
}
else
{
} else {
minf = ABS(x[0]);
i++;
}
@ -153,7 +138,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
}
while (j < n) {
if (ABS(x[i]) < minf) {
minf = ABS(x[i]);

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -28,32 +28,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x)
{
static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) {
FLOAT amin;
__asm__ volatile (
"vl %%v0,0(%2) \n\t"
__asm__("vl %%v0,0(%[x])\n\t"
"vflpdb %%v0,%%v0\n\t"
"srlg %%r0,%1,5 \n\t"
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%2) \n\t"
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
@ -62,7 +55,6 @@ static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x)
"vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t"
"vfchdb %%v24,%%v17,%%v16\n\t"
"vfchdb %%v25,%%v19,%%v18\n\t"
"vfchdb %%v26,%%v21,%%v20\n\t"
@ -71,26 +63,22 @@ static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x)
"vsel %%v25,%%v18,%%v19,%%v25\n\t"
"vsel %%v26,%%v20,%%v21,%%v26\n\t"
"vsel %%v27,%%v22,%%v23,%%v27\n\t"
"vfchdb %%v28,%%v25,%%v24\n\t"
"vfchdb %%v29,%%v27,%%v26\n\t"
"vsel %%v28,%%v24,%%v25,%%v28\n\t"
"vsel %%v29,%%v26,%%v27,%%v29\n\t"
"vfchdb %%v30,%%v29,%%v28\n\t"
"vsel %%v30,%%v28,%%v29,%%v30\n\t"
"vfchdb %%v31,%%v0,%%v30\n\t"
"vsel %%v0,%%v30,%%v0,%%v31\n\t"
"vl %%v16,128(%%r1,%2) \n\t"
"vl %%v17,144(%%r1,%2) \n\t"
"vl %%v18,160(%%r1,%2) \n\t"
"vl %%v19,176(%%r1,%2) \n\t"
"vl %%v20,192(%%r1,%2) \n\t"
"vl %%v21,208(%%r1,%2) \n\t"
"vl %%v22,224(%%r1,%2) \n\t"
"vl %%v23,240(%%r1,%2) \n\t"
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v17,144(%%r1,%[x])\n\t"
"vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v19,176(%%r1,%[x])\n\t"
"vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v21,208(%%r1,%[x])\n\t"
"vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v23,240(%%r1,%[x])\n\t"
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
@ -99,7 +87,6 @@ static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x)
"vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t"
"vfchdb %%v24,%%v17,%%v16\n\t"
"vfchdb %%v25,%%v19,%%v18\n\t"
"vfchdb %%v26,%%v21,%%v20\n\t"
@ -108,29 +95,24 @@ static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x)
"vsel %%v25,%%v18,%%v19,%%v25\n\t"
"vsel %%v26,%%v20,%%v21,%%v26\n\t"
"vsel %%v27,%%v22,%%v23,%%v27\n\t"
"vfchdb %%v28,%%v25,%%v24\n\t"
"vfchdb %%v29,%%v27,%%v26\n\t"
"vsel %%v28,%%v24,%%v25,%%v28\n\t"
"vsel %%v29,%%v26,%%v27,%%v29\n\t"
"vfchdb %%v30,%%v29,%%v28\n\t"
"vsel %%v30,%%v28,%%v29,%%v30\n\t"
"vfchdb %%v31,%%v0,%%v30\n\t"
"vsel %%v0,%%v30,%%v0,%%v31\n\t"
"agfi %%r1, 256\n\t"
"brctg %%r0, 0b \n\t"
"brctg %[n], 0b\n\t"
"vrepg %%v16,%%v0,1\n\t"
"wfchdb %%v17,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v17\n\t"
"ldr %0,%%f0 "
:"=f"(amin)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
"ldr %[amin],%%f0"
: [amin] "=f"(amin),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
return amin;
}
@ -140,7 +122,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG j = 0;
FLOAT minf = 0.0;
if (n <= 0 || inc_x <= 0) return (minf);
if (n <= 0 || inc_x <= 0)
return (minf);
if (inc_x == 1) {
@ -150,9 +133,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
minf = damin_kernel_32(n1, x);
i = n1;
}
else
{
} else {
minf = ABS(x[0]);
i++;
}
@ -191,7 +172,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
}
while (j < n) {
if (ABS(x[i]) < minf) {
minf = ABS(x[i]);

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -28,34 +28,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x)
{
static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) {
FLOAT asum;
__asm__ (
"vzero %%v0 \n\t"
"vzero %%v1 \n\t"
"vzero %%v2 \n\t"
"vzero %%v3 \n\t"
"srlg %%r0,%1,5 \n\t"
__asm__("vzero %%v24\n\t"
"vzero %%v25\n\t"
"vzero %%v26\n\t"
"vzero %%v27\n\t"
"vzero %%v28\n\t"
"vzero %%v29\n\t"
"vzero %%v30\n\t"
"vzero %%v31\n\t"
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%2) \n\t"
"vl %%v16, 0(%%r1,%2) \n\t"
"vl %%v17, 16(%%r1,%2) \n\t"
"vl %%v18, 32(%%r1,%2) \n\t"
"vl %%v19, 48(%%r1,%2) \n\t"
"vl %%v20, 64(%%r1,%2) \n\t"
"vl %%v21, 80(%%r1,%2) \n\t"
"vl %%v22, 96(%%r1,%2) \n\t"
"vl %%v23, 112(%%r1,%2) \n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16, 0(%%r1,%[x])\n\t"
"vl %%v17, 16(%%r1,%[x])\n\t"
"vl %%v18, 32(%%r1,%[x])\n\t"
"vl %%v19, 48(%%r1,%[x])\n\t"
"vl %%v20, 64(%%r1,%[x])\n\t"
"vl %%v21, 80(%%r1,%[x])\n\t"
"vl %%v22, 96(%%r1,%[x])\n\t"
"vl %%v23, 112(%%r1,%[x])\n\t"
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
@ -64,25 +61,22 @@ static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x)
"vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t"
"vfadb %%v0,%%v0,%%v16 \n\t"
"vfadb %%v1,%%v1,%%v17 \n\t"
"vfadb %%v2,%%v2,%%v18 \n\t"
"vfadb %%v3,%%v3,%%v19 \n\t"
"vfadb %%v0,%%v0,%%v20 \n\t"
"vfadb %%v1,%%v1,%%v21 \n\t"
"vfadb %%v2,%%v2,%%v22 \n\t"
"vfadb %%v3,%%v3,%%v23 \n\t"
"vl %%v16, 128(%%r1,%2) \n\t"
"vl %%v17, 144(%%r1,%2) \n\t"
"vl %%v18, 160(%%r1,%2) \n\t"
"vl %%v19, 176(%%r1,%2) \n\t"
"vl %%v20, 192(%%r1,%2) \n\t"
"vl %%v21, 208(%%r1,%2) \n\t"
"vl %%v22, 224(%%r1,%2) \n\t"
"vl %%v23, 240(%%r1,%2) \n\t"
"vfadb %%v24,%%v24,%%v16\n\t"
"vfadb %%v25,%%v25,%%v17\n\t"
"vfadb %%v26,%%v26,%%v18\n\t"
"vfadb %%v27,%%v27,%%v19\n\t"
"vfadb %%v28,%%v28,%%v20\n\t"
"vfadb %%v29,%%v29,%%v21\n\t"
"vfadb %%v30,%%v30,%%v22\n\t"
"vfadb %%v31,%%v31,%%v23\n\t"
"vl %%v16, 128(%%r1,%[x])\n\t"
"vl %%v17, 144(%%r1,%[x])\n\t"
"vl %%v18, 160(%%r1,%[x])\n\t"
"vl %%v19, 176(%%r1,%[x])\n\t"
"vl %%v20, 192(%%r1,%[x])\n\t"
"vl %%v21, 208(%%r1,%[x])\n\t"
"vl %%v22, 224(%%r1,%[x])\n\t"
"vl %%v23, 240(%%r1,%[x])\n\t"
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
@ -91,28 +85,30 @@ static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x)
"vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t"
"vfadb %%v0,%%v0,%%v16 \n\t"
"vfadb %%v1,%%v1,%%v17 \n\t"
"vfadb %%v2,%%v2,%%v18 \n\t"
"vfadb %%v3,%%v3,%%v19 \n\t"
"vfadb %%v0,%%v0,%%v20 \n\t"
"vfadb %%v1,%%v1,%%v21 \n\t"
"vfadb %%v2,%%v2,%%v22 \n\t"
"vfadb %%v3,%%v3,%%v23 \n\t"
"vfadb %%v24,%%v24,%%v16\n\t"
"vfadb %%v25,%%v25,%%v17\n\t"
"vfadb %%v26,%%v26,%%v18\n\t"
"vfadb %%v27,%%v27,%%v19\n\t"
"vfadb %%v28,%%v28,%%v20\n\t"
"vfadb %%v29,%%v29,%%v21\n\t"
"vfadb %%v30,%%v30,%%v22\n\t"
"vfadb %%v31,%%v31,%%v23\n\t"
"agfi %%r1,256\n\t"
"brctg %%r0,0b \n\t"
"vfadb %%v0,%%v0,%%v1 \n\t"
"vfadb %%v0,%%v0,%%v2 \n\t"
"vfadb %%v0,%%v0,%%v3 \n\t"
"vrepg %%v1,%%v0,1 \n\t"
"adbr %%f0,%%f1 \n\t"
"ldr %0,%%f0 "
:"=f"(asum)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23"
);
"brctg %[n],0b\n\t"
"vfadb %%v24,%%v24,%%v25\n\t"
"vfadb %%v24,%%v24,%%v26\n\t"
"vfadb %%v24,%%v24,%%v27\n\t"
"vfadb %%v24,%%v24,%%v28\n\t"
"vfadb %%v24,%%v24,%%v29\n\t"
"vfadb %%v24,%%v24,%%v30\n\t"
"vfadb %%v24,%%v24,%%v31\n\t"
"vrepg %%v25,%%v24,1\n\t"
"vfadb %%v24,%%v24,%%v25\n\t"
"vsteg %%v24,%[asum],0"
: [asum] "=Q"(asum),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
: "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
"v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
return asum;
}
@ -123,7 +119,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
FLOAT sumf = 0.0;
BLASLONG n1;
if (n <= 0 || inc_x <= 0) return sumf;
if (n <= 0 || inc_x <= 0)
return sumf;
if (inc_x == 1) {
@ -164,9 +161,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
j++;
}
}
return sumf;
}

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -27,107 +27,96 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
/*
 * daxpy_kernel_32 -- vector inner kernel for y[i] = alpha * x[i] + y[i].
 *
 * v0 holds *alpha (loaded with vlrepg); every "vfmadb vD,v0,vX,vY" below
 * multiplies a vector loaded from x by v0, adds the matching vector loaded
 * from y, and the result is stored back to y with vst.  r1 is a byte offset
 * that starts at 0 (xgr) and advances by 256 per iteration (agfi); the trip
 * count is n >> 5 (srlg ...,5), i.e. 32 elements per iteration assuming an
 * 8-byte FLOAT.
 *
 * NOTE(review): this text is a rendered commit diff (hunk "-27,107 +27,96")
 * whose +/- markers were lost, so the removed and the added version of the
 * function are interleaved and the span is not compilable as shown.  Lines
 * that use positional operands (%0..%3) and r0 as the counter appear to be
 * the removed version; lines that use named operands (%[n], %[x], %[y],
 * %[alpha]) appear to be the added version.  Lines without either marker
 * (e.g. the first vfmadb group, agfi) are presumably unchanged context.
 * Confirm the split against the real commit.
 * NOTE(review): brctg decrements before it tests, so the kernel presumably
 * requires n >= 32 -- verify the guard in the caller (not visible here).
 */
static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
__asm__ volatile(
"vlrepg %%v0,%3 \n\t"
"srlg %%r0,%0,5 \n\t"
/* added version of the header and prologue: n itself becomes the counter */
static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) {
__asm__("vlrepg %%v0,%[alpha]\n\t"
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
/* removed version: first-half loads (x -> v16-v19, y -> v20-v23) */
"pfd 1, 1024(%%r1,%1) \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"
"vl %%v16,0(%%r1,%1) \n\t"
"vl %%v17,16(%%r1,%1) \n\t"
"vl %%v18,32(%%r1,%1) \n\t"
"vl %%v19,48(%%r1,%1) \n\t"
"vl %%v20,0(%%r1,%2) \n\t"
"vl %%v21,16(%%r1,%2) \n\t"
"vl %%v22,32(%%r1,%2) \n\t"
"vl %%v23,48(%%r1,%2) \n\t"
/* added version: all sixteen loads for bytes 0..127 are issued up front */
"pfd 1, 1024(%%r1,%[x])\n\t"
"pfd 2, 1024(%%r1,%[y])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,0(%%r1,%[y])\n\t"
"vl %%v21,16(%%r1,%[y])\n\t"
"vl %%v22,32(%%r1,%[y])\n\t"
"vl %%v23,48(%%r1,%[y])\n\t"
"vl %%v24,64(%%r1,%[x])\n\t"
"vl %%v25,80(%%r1,%[x])\n\t"
"vl %%v26,96(%%r1,%[x])\n\t"
"vl %%v27,112(%%r1,%[x])\n\t"
"vl %%v28,64(%%r1,%[y])\n\t"
"vl %%v29,80(%%r1,%[y])\n\t"
"vl %%v30,96(%%r1,%[y])\n\t"
"vl %%v31,112(%%r1,%[y])\n\t"
"vfmadb %%v16,%%v0,%%v16,%%v20\n\t"
"vfmadb %%v17,%%v0,%%v17,%%v21\n\t"
"vfmadb %%v18,%%v0,%%v18,%%v22\n\t"
"vfmadb %%v19,%%v0,%%v19,%%v23\n\t"
/* removed version: results for bytes 64..127 land in v20-v23, then all
   eight result vectors are stored and bytes 128..191 are loaded */
"vl %%v24,64(%%r1,%1) \n\t"
"vl %%v25,80(%%r1,%1) \n\t"
"vl %%v26,96(%%r1,%1) \n\t"
"vl %%v27,112(%%r1,%1) \n\t"
"vl %%v28,64(%%r1,%2) \n\t"
"vl %%v29,80(%%r1,%2) \n\t"
"vl %%v30,96(%%r1,%2) \n\t"
"vl %%v31,112(%%r1,%2) \n\t"
"vfmadb %%v20,%%v0,%%v24,%%v28 \n\t"
"vfmadb %%v21,%%v0,%%v25,%%v29 \n\t"
"vfmadb %%v22,%%v0,%%v26,%%v30 \n\t"
"vfmadb %%v23,%%v0,%%v27,%%v31 \n\t"
"vst %%v16,0(%%r1,%2) \n\t"
"vst %%v17,16(%%r1,%2) \n\t"
"vst %%v18,32(%%r1,%2) \n\t"
"vst %%v19,48(%%r1,%2) \n\t"
"vst %%v20,64(%%r1,%2) \n\t"
"vst %%v21,80(%%r1,%2) \n\t"
"vst %%v22,96(%%r1,%2) \n\t"
"vst %%v23,112(%%r1,%2) \n\t"
"vl %%v16,128(%%r1,%1) \n\t"
"vl %%v17,144(%%r1,%1) \n\t"
"vl %%v18,160(%%r1,%1) \n\t"
"vl %%v19,176(%%r1,%1) \n\t"
"vl %%v20,128(%%r1,%2) \n\t"
"vl %%v21,144(%%r1,%2) \n\t"
"vl %%v22,160(%%r1,%2) \n\t"
"vl %%v23,176(%%r1,%2) \n\t"
/* added version: results for bytes 64..127 stay in v24-v27, then the
   stores for bytes 0..127 and the loads for bytes 128..255 follow */
"vfmadb %%v24,%%v0,%%v24,%%v28\n\t"
"vfmadb %%v25,%%v0,%%v25,%%v29\n\t"
"vfmadb %%v26,%%v0,%%v26,%%v30\n\t"
"vfmadb %%v27,%%v0,%%v27,%%v31\n\t"
"vst %%v16,0(%%r1,%[y])\n\t"
"vst %%v17,16(%%r1,%[y])\n\t"
"vst %%v18,32(%%r1,%[y])\n\t"
"vst %%v19,48(%%r1,%[y])\n\t"
"vst %%v24,64(%%r1,%[y])\n\t"
"vst %%v25,80(%%r1,%[y])\n\t"
"vst %%v26,96(%%r1,%[y])\n\t"
"vst %%v27,112(%%r1,%[y])\n\t"
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v17,144(%%r1,%[x])\n\t"
"vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v19,176(%%r1,%[x])\n\t"
"vl %%v20,128(%%r1,%[y])\n\t"
"vl %%v21,144(%%r1,%[y])\n\t"
"vl %%v22,160(%%r1,%[y])\n\t"
"vl %%v23,176(%%r1,%[y])\n\t"
"vl %%v24,192(%%r1,%[x])\n\t"
"vl %%v25,208(%%r1,%[x])\n\t"
"vl %%v26,224(%%r1,%[x])\n\t"
"vl %%v27,240(%%r1,%[x])\n\t"
"vl %%v28,192(%%r1,%[y])\n\t"
"vl %%v29,208(%%r1,%[y])\n\t"
"vl %%v30,224(%%r1,%[y])\n\t"
"vl %%v31,240(%%r1,%[y])\n\t"
"vfmadb %%v16,%%v0,%%v16,%%v20\n\t"
"vfmadb %%v17,%%v0,%%v17,%%v21\n\t"
"vfmadb %%v18,%%v0,%%v18,%%v22\n\t"
"vfmadb %%v19,%%v0,%%v19,%%v23\n\t"
/* removed version: second half (bytes 192..255), results in v20-v23 */
"vl %%v24,192(%%r1,%1) \n\t"
"vl %%v25,208(%%r1,%1) \n\t"
"vl %%v26,224(%%r1,%1) \n\t"
"vl %%v27,240(%%r1,%1) \n\t"
"vl %%v28,192(%%r1,%2) \n\t"
"vl %%v29,208(%%r1,%2) \n\t"
"vl %%v30,224(%%r1,%2) \n\t"
"vl %%v31,240(%%r1,%2) \n\t"
"vfmadb %%v20,%%v0,%%v24,%%v28 \n\t"
"vfmadb %%v21,%%v0,%%v25,%%v29 \n\t"
"vfmadb %%v22,%%v0,%%v26,%%v30 \n\t"
"vfmadb %%v23,%%v0,%%v27,%%v31 \n\t"
"vst %%v16,128(%%r1,%2) \n\t"
"vst %%v17,144(%%r1,%2) \n\t"
"vst %%v18,160(%%r1,%2) \n\t"
"vst %%v19,176(%%r1,%2) \n\t"
"vst %%v20,192(%%r1,%2) \n\t"
"vst %%v21,208(%%r1,%2) \n\t"
"vst %%v22,224(%%r1,%2) \n\t"
"vst %%v23,240(%%r1,%2) \n\t"
/* added version: second half, results in v24-v27 */
"vfmadb %%v24,%%v0,%%v24,%%v28\n\t"
"vfmadb %%v25,%%v0,%%v25,%%v29\n\t"
"vfmadb %%v26,%%v0,%%v26,%%v30\n\t"
"vfmadb %%v27,%%v0,%%v27,%%v31\n\t"
"vst %%v16,128(%%r1,%[y])\n\t"
"vst %%v17,144(%%r1,%[y])\n\t"
"vst %%v18,160(%%r1,%[y])\n\t"
"vst %%v19,176(%%r1,%[y])\n\t"
"vst %%v24,192(%%r1,%[y])\n\t"
"vst %%v25,208(%%r1,%[y])\n\t"
"vst %%v26,224(%%r1,%[y])\n\t"
"vst %%v27,240(%%r1,%[y])\n\t"
"agfi %%r1,256\n\t"
/* removed version: loop close and operand lists -- no outputs, x and y
   passed as address operands, blanket "memory" clobber, r0 clobbered */
"brctg %%r0,0b "
:
:"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y),"m"(*alpha)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
/* added version: y is declared as an n-element read-write memory operand
   and x as an n-element memory input, n is an in/out register because the
   loop counts it down, and the "memory" and "r0" clobbers are gone */
"brctg %[n],0b"
: "+m"(*(struct { FLOAT x[n]; } *) y),[n] "+&r"(n)
: [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),
[alpha] "Q"(*alpha)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
}
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x,
BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
BLASLONG dummy2) {
BLASLONG i = 0;
BLASLONG ix = 0, iy = 0;
if ( n <= 0 ) return 0 ;
if (n <= 0)
return 0;
if ( (inc_x == 1) && (inc_y == 1) )
{
if ((inc_x == 1) && (inc_y == 1)) {
BLASLONG n1 = n & -32;
@ -135,8 +124,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
daxpy_kernel_32(n1, x, y, &da);
i = n1;
while(i < n)
{
while (i < n) {
y[i] += da * x[i];
i++;
@ -144,13 +132,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
}
return 0;
}
BLASLONG n1 = n & -4;
while(i < n1)
{
while (i < n1) {
FLOAT m1 = da * x[ix];
FLOAT m2 = da * x[ix + inc_x];
@ -168,8 +154,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
}
while(i < n)
{
while (i < n) {
y[iy] += da * x[ix];
ix += inc_x;
@ -180,5 +165,3 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
return 0;
}

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -27,30 +27,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
/*
 * dcopy_kernel_32 -- copy from x to y in 256-byte chunks: each loop
 * iteration issues one "mvc 0(256,dst),0(src)" and then advances both
 * addresses by 256.  The trip count is n >> 5 (srlg ...,5), i.e. 32
 * elements per iteration assuming an 8-byte FLOAT.
 *
 * NOTE(review): this text is a rendered commit diff (hunk "-27,30 +27,26")
 * whose +/- markers were lost, so the removed and the added version of the
 * function are interleaved and the span is not compilable as shown.  Lines
 * that use positional operands (%0..%2) and the scratch registers r0-r2
 * appear to be the removed version; lines that use named operands (%[n],
 * %[x], %[y]) appear to be the added version.  Confirm the split against
 * the real commit.
 * NOTE(review): brctg decrements before it tests, so the kernel presumably
 * requires n >= 32 -- verify the guard in the caller.
 */
static void dcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
{
__asm__ volatile (
/* removed version: x and y are copied into r1/r2, the counter lives in r0 */
"lgr %%r1,%1 \n\t"
"lgr %%r2,%2 \n\t"
"srlg %%r0,%0,5 \n\t"
/* added version: the operands themselves serve as counter and cursors */
static void dcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) {
__asm__("srlg %[n],%[n],5\n\t"
"0:\n\t"
/* removed version of the loop body and operand lists */
"pfd 1, 1024(%%r1) \n\t"
"pfd 2, 1024(%%r2) \n\t"
"mvc 0(256,%%r2),0(%%r1) \n\t"
"agfi %%r1,256 \n\t"
"agfi %%r2,256 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"a"((const FLOAT (*)[n])x),"a"((FLOAT (*)[n])y)
:"memory","cc","r0","r1","r2"
);
/* added version of the loop body: addresses advance with "la" instead of
   "agfi" */
"pfd 1, 1024(%[x])\n\t"
"pfd 2, 1024(%[y])\n\t"
"mvc 0(256,%[y]),0(%[x])\n\t"
"la %[x],256(%[x])\n\t"
"la %[y],256(%[y])\n\t"
"brctg %[n],0b"
/* y is an n-element memory output and x an n-element memory input; x, y
   and n are in/out registers because the loop modifies them; only "cc"
   remains in the clobber list */
: "=m"(*(struct { FLOAT x[n]; } *) y),[x] "+&a"(x),[y] "+&a"(y),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n]; } *) x)
: "cc");
}
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
BLASLONG i = 0;
BLASLONG ix = 0, iy = 0;
if (n <= 0) return 0;
if (n <= 0)
return 0;
if ((inc_x == 1) && (inc_y == 1)) {
@ -66,7 +62,6 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
}
} else {
while (i < n) {
@ -81,5 +76,4 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
}
return 0;
}

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -27,68 +27,78 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
/*
 * ddot_kernel_16 -- vector inner kernel returning the dot product of x
 * and y.
 *
 * Each "vfmadb vA,vX,vY,vA" multiplies a vector loaded from x by the
 * matching vector loaded from y and adds the product into accumulator vA.
 * r1 is a byte offset that starts at 0 (xgr) and advances by 128 per
 * iteration (agfi); the trip count is n >> 4 (srlg ...,4), i.e. 16 elements
 * per iteration assuming an 8-byte FLOAT.  After the loop the two lanes of
 * v0 are added together (vrepg + adbr) and the sum is moved to `dot`.
 *
 * NOTE(review): this text is a rendered commit diff (hunk "-27,68 +27,78")
 * whose +/- markers were lost, so the removed and the added version of the
 * function are interleaved and the span is not compilable as shown.  Lines
 * that use positional operands (%0..%3) and r0 as the counter appear to be
 * the removed version; lines that use named operands (%[n], %[x], %[y],
 * %[dot]) appear to be the added version.  Lines with neither marker are
 * presumably unchanged context.  Confirm the split against the real commit.
 * NOTE(review): brctg decrements before it tests, so the kernel presumably
 * requires n >= 16 -- verify the guard in the caller (not visible here).
 */
static FLOAT ddot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
{
static FLOAT ddot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) {
FLOAT dot;
/* removed version: a single accumulator (v0), counter in r0 */
__asm__ volatile (
"vzero %%v0 \n\t"
"srlg %%r0,%1,4 \n\t"
/* added version: eight independent accumulators v0-v7, n is the counter */
__asm__("vzero %%v0\n\t"
"vzero %%v1\n\t"
"vzero %%v2\n\t"
"vzero %%v3\n\t"
"vzero %%v4\n\t"
"vzero %%v5\n\t"
"vzero %%v6\n\t"
"vzero %%v7\n\t"
"srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
/* removed version: loads from x, then the first load from y */
"pfd 1,1024(%%r1,%2) \n\t"
"pfd 1,1024(%%r1,%3) \n\t"
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"vl %%v24,0(%%r1,%3) \n\t"
/* added version: all loads from x (v16-v23) and y (v24-v31) are issued
   before any multiply-add */
"pfd 1,1024(%%r1,%[x])\n\t"
"pfd 1,1024(%%r1,%[y])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v24,0(%%r1,%[y])\n\t"
"vl %%v25,16(%%r1,%[y])\n\t"
"vl %%v26,32(%%r1,%[y])\n\t"
"vl %%v27,48(%%r1,%[y])\n\t"
"vl %%v28,64(%%r1,%[y])\n\t"
"vl %%v29,80(%%r1,%[y])\n\t"
"vl %%v30,96(%%r1,%[y])\n\t"
"vl %%v31,112(%%r1,%[y])\n\t"
"vfmadb %%v0,%%v16,%%v24,%%v0\n\t"
/* removed version: remaining y loads interleaved with multiply-adds that
   all accumulate into v0 */
"vl %%v25,16(%%r1,%3) \n\t"
"vfmadb %%v0,%%v17,%%v25,%%v0 \n\t"
"vl %%v26,32(%%r1,%3) \n\t"
"vfmadb %%v0,%%v18,%%v26,%%v0 \n\t"
"vl %%v27,48(%%r1,%3) \n\t"
"vfmadb %%v0,%%v19,%%v27,%%v0 \n\t"
"vl %%v28,64(%%r1,%3) \n\t"
"vfmadb %%v0,%%v20,%%v28,%%v0 \n\t"
"vl %%v29,80(%%r1,%3) \n\t"
"vfmadb %%v0,%%v21,%%v29,%%v0 \n\t"
"vl %%v30,96(%%r1,%3) \n\t"
"vfmadb %%v0,%%v22,%%v30,%%v0 \n\t"
"vl %%v31,112(%%r1,%3) \n\t"
"vfmadb %%v0,%%v23,%%v31,%%v0 \n\t"
/* added version: each multiply-add feeds its own accumulator v1-v7 */
"vfmadb %%v1,%%v17,%%v25,%%v1\n\t"
"vfmadb %%v2,%%v18,%%v26,%%v2\n\t"
"vfmadb %%v3,%%v19,%%v27,%%v3\n\t"
"vfmadb %%v4,%%v20,%%v28,%%v4\n\t"
"vfmadb %%v5,%%v21,%%v29,%%v5\n\t"
"vfmadb %%v6,%%v22,%%v30,%%v6\n\t"
"vfmadb %%v7,%%v23,%%v31,%%v7\n\t"
"agfi %%r1,128\n\t"
"brctg %%r0,0b \n\t"
"brctg %[n],0b\n\t"
/* added version: fold accumulators v1-v7 into v0 once the loop is done */
"vfadb %%v0,%%v0,%%v1\n\t"
"vfadb %%v0,%%v0,%%v2\n\t"
"vfadb %%v0,%%v0,%%v3\n\t"
"vfadb %%v0,%%v0,%%v4\n\t"
"vfadb %%v0,%%v0,%%v5\n\t"
"vfadb %%v0,%%v0,%%v6\n\t"
"vfadb %%v0,%%v0,%%v7\n\t"
/* horizontal add: copy element 1 of v0 into v1, then add it to f0 */
"vrepg %%v1,%%v0,1\n\t"
"adbr %%f0,%%f1\n\t"
/* removed version: result move and operand lists with a blanket "memory"
   clobber and r0 clobbered */
"ldr %0,%%f0 "
:"=f"(dot)
:"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((const FLOAT (*)[n])y)
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
/* added version: x and y are n-element memory inputs, n is an in/out
   register because the loop counts it down, and v2-v7 join the clobber
   list because they are now used as accumulators */
"ldr %[dot],%%f0"
: [dot] "=f"(dot),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),
"m"(*(const struct { FLOAT x[n]; } *) y),[y] "a"(y)
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");
return dot;
}
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
BLASLONG i = 0;
BLASLONG ix = 0, iy = 0;
FLOAT dot = 0.0;
if ( n <= 0 ) return(dot);
if (n <= 0)
return (dot);
if ( (inc_x == 1) && (inc_y == 1) )
{
if ((inc_x == 1) && (inc_y == 1)) {
BLASLONG n1 = n & -16;
@ -96,8 +106,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
dot = ddot_kernel_16(n1, x, y);
i = n1;
while(i < n)
{
while (i < n) {
dot += y[i] * x[i];
i++;
@ -105,7 +114,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
}
return (dot);
}
FLOAT temp1 = 0.0;
@ -113,8 +121,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
BLASLONG n1 = n & -4;
while(i < n1)
{
while (i < n1) {
FLOAT m1 = y[iy] * x[ix];
FLOAT m2 = y[iy + inc_y] * x[ix + inc_x];
@ -132,8 +139,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
}
while(i < n)
{
while (i < n) {
temp1 += y[iy] * x[ix];
ix += inc_x;
@ -145,5 +151,3 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
return (dot);
}

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2017, The OpenBLAS Project
Copyright (c) 2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -29,387 +29,349 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define NBMAX 2048
static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
{
__asm__ volatile (
"vlrepg %%v0,0(%5) \n\t"
"vlrepg %%v1,8(%5) \n\t"
"vlrepg %%v2,16(%5) \n\t"
"vlrepg %%v3,24(%5) \n\t"
"vlrepg %%v4,%7 \n\t"
static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y,
FLOAT *alpha) {
register FLOAT *ap0 = ap[0];
register FLOAT *ap1 = ap[1];
register FLOAT *ap2 = ap[2];
register FLOAT *ap3 = ap[3];
__asm__("vlrepg %%v0,0(%[x])\n\t"
"vlrepg %%v1,8(%[x])\n\t"
"vlrepg %%v2,16(%[x])\n\t"
"vlrepg %%v3,24(%[x])\n\t"
"vlrepg %%v4,%[alpha]\n\t"
"vfmdb %%v0,%%v0,%%v4\n\t"
"vfmdb %%v1,%%v1,%%v4\n\t"
"vfmdb %%v2,%%v2,%%v4\n\t"
"vfmdb %%v3,%%v3,%%v4\n\t"
"xgr %%r1,%%r1\n\t"
"lghi %%r0,-16\n\t"
"ngr %%r0,%0 \n\t"
"ngr %%r0,%[n]\n\t"
"ltgr %%r0,%%r0\n\t"
"jz 1f\n\t"
"srlg %%r0,%%r0,4\n\t"
"0:\n\t"
"pfd 1,1024(%%r1,%1) \n\t"
"pfd 1,1024(%%r1,%2) \n\t"
"pfd 1,1024(%%r1,%3) \n\t"
"pfd 1,1024(%%r1,%4) \n\t"
"pfd 2,1024(%%r1,%6) \n\t"
"vl %%v16,0(%%r1,%1) \n\t"
"vl %%v17,0(%%r1,%2) \n\t"
"vl %%v18,0(%%r1,%3) \n\t"
"vl %%v19,0(%%r1,%4) \n\t"
"vl %%v20,16(%%r1,%1) \n\t"
"vl %%v21,16(%%r1,%2) \n\t"
"vl %%v22,16(%%r1,%3) \n\t"
"vl %%v23,16(%%r1,%4) \n\t"
"vl %%v24,32(%%r1,%1) \n\t"
"vl %%v25,32(%%r1,%2) \n\t"
"vl %%v26,32(%%r1,%3) \n\t"
"vl %%v27,32(%%r1,%4) \n\t"
"vl %%v28,48(%%r1,%1) \n\t"
"vl %%v29,48(%%r1,%2) \n\t"
"vl %%v30,48(%%r1,%3) \n\t"
"vl %%v31,48(%%r1,%4) \n\t"
"vl %%v4,0(%%r1,%6) \n\t"
"pfd 1,1024(%%r1,%[ap0])\n\t"
"pfd 1,1024(%%r1,%[ap1])\n\t"
"pfd 1,1024(%%r1,%[ap2])\n\t"
"pfd 1,1024(%%r1,%[ap3])\n\t"
"pfd 2,1024(%%r1,%[y])\n\t"
"vl %%v16,0(%%r1,%[ap0])\n\t"
"vl %%v17,0(%%r1,%[ap1])\n\t"
"vl %%v18,0(%%r1,%[ap2])\n\t"
"vl %%v19,0(%%r1,%[ap3])\n\t"
"vl %%v20,16(%%r1,%[ap0])\n\t"
"vl %%v21,16(%%r1,%[ap1])\n\t"
"vl %%v22,16(%%r1,%[ap2])\n\t"
"vl %%v23,16(%%r1,%[ap3])\n\t"
"vl %%v24,32(%%r1,%[ap0])\n\t"
"vl %%v25,32(%%r1,%[ap1])\n\t"
"vl %%v26,32(%%r1,%[ap2])\n\t"
"vl %%v27,32(%%r1,%[ap3])\n\t"
"vl %%v28,48(%%r1,%[ap0])\n\t"
"vl %%v29,48(%%r1,%[ap1])\n\t"
"vl %%v30,48(%%r1,%[ap2])\n\t"
"vl %%v31,48(%%r1,%[ap3])\n\t"
"vl %%v4,0(%%r1,%[y])\n\t"
"vl %%v5,16(%%r1,%[y])\n\t"
"vl %%v6,32(%%r1,%[y])\n\t"
"vl %%v7,48(%%r1,%[y])\n\t"
"vfmadb %%v4,%%v16,%%v0,%%v4\n\t"
"vfmadb %%v5,%%v20,%%v0,%%v5\n\t"
"vfmadb %%v6,%%v24,%%v0,%%v6\n\t"
"vfmadb %%v7,%%v28,%%v0,%%v7\n\t"
"vfmadb %%v4,%%v17,%%v1,%%v4\n\t"
"vfmadb %%v5,%%v21,%%v1,%%v5\n\t"
"vfmadb %%v6,%%v25,%%v1,%%v6\n\t"
"vfmadb %%v7,%%v29,%%v1,%%v7\n\t"
"vfmadb %%v4,%%v18,%%v2,%%v4\n\t"
"vfmadb %%v5,%%v22,%%v2,%%v5\n\t"
"vfmadb %%v6,%%v26,%%v2,%%v6\n\t"
"vfmadb %%v7,%%v30,%%v2,%%v7\n\t"
"vfmadb %%v4,%%v19,%%v3,%%v4\n\t"
"vst %%v4,0(%%r1,%6) \n\t"
"vl %%v4,16(%%r1,%6) \n\t"
"vfmadb %%v4,%%v20,%%v0,%%v4 \n\t"
"vfmadb %%v4,%%v21,%%v1,%%v4 \n\t"
"vfmadb %%v4,%%v22,%%v2,%%v4 \n\t"
"vfmadb %%v4,%%v23,%%v3,%%v4 \n\t"
"vst %%v4,16(%%r1,%6) \n\t"
"vl %%v4,32(%%r1,%6) \n\t"
"vfmadb %%v4,%%v24,%%v0,%%v4 \n\t"
"vfmadb %%v4,%%v25,%%v1,%%v4 \n\t"
"vfmadb %%v4,%%v26,%%v2,%%v4 \n\t"
"vfmadb %%v4,%%v27,%%v3,%%v4 \n\t"
"vst %%v4,32(%%r1,%6) \n\t"
"vl %%v4,48(%%r1,%6) \n\t"
"vfmadb %%v4,%%v28,%%v0,%%v4 \n\t"
"vfmadb %%v4,%%v29,%%v1,%%v4 \n\t"
"vfmadb %%v4,%%v30,%%v2,%%v4 \n\t"
"vfmadb %%v4,%%v31,%%v3,%%v4 \n\t"
"vst %%v4,48(%%r1,%6) \n\t"
"vl %%v16,64(%%r1,%1) \n\t"
"vl %%v17,64(%%r1,%2) \n\t"
"vl %%v18,64(%%r1,%3) \n\t"
"vl %%v19,64(%%r1,%4) \n\t"
"vl %%v20,80(%%r1,%1) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,80(%%r1,%3) \n\t"
"vl %%v23,80(%%r1,%4) \n\t"
"vl %%v24,96(%%r1,%1) \n\t"
"vl %%v25,96(%%r1,%2) \n\t"
"vl %%v26,96(%%r1,%3) \n\t"
"vl %%v27,96(%%r1,%4) \n\t"
"vl %%v28,112(%%r1,%1) \n\t"
"vl %%v29,112(%%r1,%2) \n\t"
"vl %%v30,112(%%r1,%3) \n\t"
"vl %%v31,112(%%r1,%4) \n\t"
"vl %%v4,64(%%r1,%6) \n\t"
"vfmadb %%v5,%%v23,%%v3,%%v5\n\t"
"vfmadb %%v6,%%v27,%%v3,%%v6\n\t"
"vfmadb %%v7,%%v31,%%v3,%%v7\n\t"
"vst %%v4,0(%%r1,%[y])\n\t"
"vst %%v5,16(%%r1,%[y])\n\t"
"vst %%v6,32(%%r1,%[y])\n\t"
"vst %%v7,48(%%r1,%[y])\n\t"
"vl %%v16,64(%%r1,%[ap0])\n\t"
"vl %%v17,64(%%r1,%[ap1])\n\t"
"vl %%v18,64(%%r1,%[ap2])\n\t"
"vl %%v19,64(%%r1,%[ap3])\n\t"
"vl %%v20,80(%%r1,%[ap0])\n\t"
"vl %%v21,80(%%r1,%[ap1])\n\t"
"vl %%v22,80(%%r1,%[ap2])\n\t"
"vl %%v23,80(%%r1,%[ap3])\n\t"
"vl %%v24,96(%%r1,%[ap0])\n\t"
"vl %%v25,96(%%r1,%[ap1])\n\t"
"vl %%v26,96(%%r1,%[ap2])\n\t"
"vl %%v27,96(%%r1,%[ap3])\n\t"
"vl %%v28,112(%%r1,%[ap0])\n\t"
"vl %%v29,112(%%r1,%[ap1])\n\t"
"vl %%v30,112(%%r1,%[ap2])\n\t"
"vl %%v31,112(%%r1,%[ap3])\n\t"
"vl %%v4,64(%%r1,%[y])\n\t"
"vl %%v5,80(%%r1,%[y])\n\t"
"vl %%v6,96(%%r1,%[y])\n\t"
"vl %%v7,112(%%r1,%[y])\n\t"
"vfmadb %%v4,%%v16,%%v0,%%v4\n\t"
"vfmadb %%v5,%%v20,%%v0,%%v5\n\t"
"vfmadb %%v6,%%v24,%%v0,%%v6\n\t"
"vfmadb %%v7,%%v28,%%v0,%%v7\n\t"
"vfmadb %%v4,%%v17,%%v1,%%v4\n\t"
"vfmadb %%v5,%%v21,%%v1,%%v5\n\t"
"vfmadb %%v6,%%v25,%%v1,%%v6\n\t"
"vfmadb %%v7,%%v29,%%v1,%%v7\n\t"
"vfmadb %%v4,%%v18,%%v2,%%v4\n\t"
"vfmadb %%v5,%%v22,%%v2,%%v5\n\t"
"vfmadb %%v6,%%v26,%%v2,%%v6\n\t"
"vfmadb %%v7,%%v30,%%v2,%%v7\n\t"
"vfmadb %%v4,%%v19,%%v3,%%v4\n\t"
"vst %%v4,64(%%r1,%6) \n\t"
"vl %%v4,80(%%r1,%6) \n\t"
"vfmadb %%v4,%%v20,%%v0,%%v4 \n\t"
"vfmadb %%v4,%%v21,%%v1,%%v4 \n\t"
"vfmadb %%v4,%%v22,%%v2,%%v4 \n\t"
"vfmadb %%v4,%%v23,%%v3,%%v4 \n\t"
"vst %%v4,80(%%r1,%6) \n\t"
"vl %%v4,96(%%r1,%6) \n\t"
"vfmadb %%v4,%%v24,%%v0,%%v4 \n\t"
"vfmadb %%v4,%%v25,%%v1,%%v4 \n\t"
"vfmadb %%v4,%%v26,%%v2,%%v4 \n\t"
"vfmadb %%v4,%%v27,%%v3,%%v4 \n\t"
"vst %%v4,96(%%r1,%6) \n\t"
"vl %%v4,112(%%r1,%6) \n\t"
"vfmadb %%v4,%%v28,%%v0,%%v4 \n\t"
"vfmadb %%v4,%%v29,%%v1,%%v4 \n\t"
"vfmadb %%v4,%%v30,%%v2,%%v4 \n\t"
"vfmadb %%v4,%%v31,%%v3,%%v4 \n\t"
"vst %%v4,112(%%r1,%6) \n\t"
"vfmadb %%v5,%%v23,%%v3,%%v5\n\t"
"vfmadb %%v6,%%v27,%%v3,%%v6\n\t"
"vfmadb %%v7,%%v31,%%v3,%%v7\n\t"
"vst %%v4,64(%%r1,%[y])\n\t"
"vst %%v5,80(%%r1,%[y])\n\t"
"vst %%v6,96(%%r1,%[y])\n\t"
"vst %%v7,112(%%r1,%[y])\n\t"
"agfi %%r1,128\n\t"
"brctg %%r0,0b\n\t"
"1:\n\t"
"lghi %%r0,12\n\t"
"ngr %%r0,%0 \n\t"
"ngr %%r0,%[n]\n\t"
"ltgr %%r0,%%r0\n\t"
"jz 3f\n\t"
"srlg %%r0,%%r0,2\n\t"
"2:\n\t"
"vl %%v16,0(%%r1,%1) \n\t"
"vl %%v17,0(%%r1,%2) \n\t"
"vl %%v18,0(%%r1,%3) \n\t"
"vl %%v19,0(%%r1,%4) \n\t"
"vl %%v20,16(%%r1,%1) \n\t"
"vl %%v21,16(%%r1,%2) \n\t"
"vl %%v22,16(%%r1,%3) \n\t"
"vl %%v23,16(%%r1,%4) \n\t"
"vl %%v4,0(%%r1,%6) \n\t"
"vl %%v16,0(%%r1,%[ap0])\n\t"
"vl %%v17,0(%%r1,%[ap1])\n\t"
"vl %%v18,0(%%r1,%[ap2])\n\t"
"vl %%v19,0(%%r1,%[ap3])\n\t"
"vl %%v20,16(%%r1,%[ap0])\n\t"
"vl %%v21,16(%%r1,%[ap1])\n\t"
"vl %%v22,16(%%r1,%[ap2])\n\t"
"vl %%v23,16(%%r1,%[ap3])\n\t"
"vl %%v4,0(%%r1,%[y])\n\t"
"vl %%v5,16(%%r1,%[y])\n\t"
"vfmadb %%v4,%%v16,%%v0,%%v4\n\t"
"vfmadb %%v5,%%v20,%%v0,%%v5\n\t"
"vfmadb %%v4,%%v17,%%v1,%%v4\n\t"
"vfmadb %%v5,%%v21,%%v1,%%v5\n\t"
"vfmadb %%v4,%%v18,%%v2,%%v4\n\t"
"vfmadb %%v5,%%v22,%%v2,%%v5\n\t"
"vfmadb %%v4,%%v19,%%v3,%%v4\n\t"
"vst %%v4,0(%%r1,%6) \n\t"
"vl %%v4,16(%%r1,%6) \n\t"
"vfmadb %%v4,%%v20,%%v0,%%v4 \n\t"
"vfmadb %%v4,%%v21,%%v1,%%v4 \n\t"
"vfmadb %%v4,%%v22,%%v2,%%v4 \n\t"
"vfmadb %%v4,%%v23,%%v3,%%v4 \n\t"
"vst %%v4,16(%%r1,%6) \n\t"
"vfmadb %%v5,%%v23,%%v3,%%v5\n\t"
"vst %%v4,0(%%r1,%[y])\n\t"
"vst %%v5,16(%%r1,%[y])\n\t"
"agfi %%r1,32\n\t"
"brctg %%r0,2b\n\t"
"3:\n\t"
"nop"
:
:"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])ap[2]),"ZR"((const FLOAT (*)[n])ap[3]),"ZQ"((const FLOAT (*)[4])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
: "+m"(*(struct { FLOAT x[n]; } *) y)
: [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0),
"m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1),
"m"(*(const struct { FLOAT x[n]; } *) ap2),[ap2] "a"(ap2),
"m"(*(const struct { FLOAT x[n]; } *) ap3),[ap3] "a"(ap3),
"m"(*(const struct { FLOAT x[4]; } *) x),[x] "a"(x),[alpha] "Q"(*alpha),
[n] "r"(n)
: "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
"v26", "v27", "v28", "v29", "v30", "v31");
}
static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
{
__asm__ volatile (
"vlrepg %%v0,0(%3) \n\t"
"vlrepg %%v1,8(%3) \n\t"
"vlrepg %%v2,%5 \n\t"
static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y,
FLOAT *alpha) {
register FLOAT *ap0 = ap[0];
register FLOAT *ap1 = ap[1];
__asm__("vlrepg %%v0,0(%[x])\n\t"
"vlrepg %%v1,8(%[x])\n\t"
"vlrepg %%v2,%[alpha]\n\t"
"vfmdb %%v0,%%v0,%%v2\n\t"
"vfmdb %%v1,%%v1,%%v2\n\t"
"xgr %%r1,%%r1\n\t"
"lghi %%r0,-16\n\t"
"ngr %%r0,%0 \n\t"
"ngr %%r0,%[n]\n\t"
"ltgr %%r0,%%r0\n\t"
"jz 1f\n\t"
"srlg %%r0,%%r0,4\n\t"
"0:\n\t"
"pfd 1,1024(%%r1,%1) \n\t"
"pfd 1,1024(%%r1,%2) \n\t"
"pfd 2,1024(%%r1,%4) \n\t"
"vl %%v16,0(%%r1,%1) \n\t"
"vl %%v17,0(%%r1,%2) \n\t"
"vl %%v18,16(%%r1,%1) \n\t"
"vl %%v19,16(%%r1,%2) \n\t"
"vl %%v20,32(%%r1,%1) \n\t"
"vl %%v21,32(%%r1,%2) \n\t"
"vl %%v22,48(%%r1,%1) \n\t"
"vl %%v23,48(%%r1,%2) \n\t"
"vl %%v24,64(%%r1,%1) \n\t"
"vl %%v25,64(%%r1,%2) \n\t"
"vl %%v26,80(%%r1,%1) \n\t"
"vl %%v27,80(%%r1,%2) \n\t"
"vl %%v28,96(%%r1,%1) \n\t"
"vl %%v29,96(%%r1,%2) \n\t"
"vl %%v30,112(%%r1,%1) \n\t"
"vl %%v31,112(%%r1,%2) \n\t"
"vl %%v2,0(%%r1,%4) \n\t"
"pfd 1,1024(%%r1,%[ap0])\n\t"
"pfd 1,1024(%%r1,%[ap1])\n\t"
"pfd 2,1024(%%r1,%[y])\n\t"
"vl %%v16,0(%%r1,%[ap0])\n\t"
"vl %%v17,0(%%r1,%[ap1])\n\t"
"vl %%v18,16(%%r1,%[ap0])\n\t"
"vl %%v19,16(%%r1,%[ap1])\n\t"
"vl %%v20,32(%%r1,%[ap0])\n\t"
"vl %%v21,32(%%r1,%[ap1])\n\t"
"vl %%v22,48(%%r1,%[ap0])\n\t"
"vl %%v23,48(%%r1,%[ap1])\n\t"
"vl %%v24,64(%%r1,%[ap0])\n\t"
"vl %%v25,64(%%r1,%[ap1])\n\t"
"vl %%v26,80(%%r1,%[ap0])\n\t"
"vl %%v27,80(%%r1,%[ap1])\n\t"
"vl %%v28,96(%%r1,%[ap0])\n\t"
"vl %%v29,96(%%r1,%[ap1])\n\t"
"vl %%v30,112(%%r1,%[ap0])\n\t"
"vl %%v31,112(%%r1,%[ap1])\n\t"
"vl %%v2,0(%%r1,%[y])\n\t"
"vl %%v3,16(%%r1,%[y])\n\t"
"vl %%v4,32(%%r1,%[y])\n\t"
"vl %%v5,48(%%r1,%[y])\n\t"
"vl %%v6,64(%%r1,%[y])\n\t"
"vl %%v7,80(%%r1,%[y])\n\t"
"vl %%v8,96(%%r1,%[y])\n\t"
"vl %%v9,112(%%r1,%[y])\n\t"
"vfmadb %%v2,%%v16,%%v0,%%v2\n\t"
"vfmadb %%v3,%%v18,%%v0,%%v3\n\t"
"vfmadb %%v4,%%v20,%%v0,%%v4\n\t"
"vfmadb %%v5,%%v22,%%v0,%%v5\n\t"
"vfmadb %%v6,%%v24,%%v0,%%v6\n\t"
"vfmadb %%v7,%%v26,%%v0,%%v7\n\t"
"vfmadb %%v8,%%v28,%%v0,%%v8\n\t"
"vfmadb %%v9,%%v30,%%v0,%%v9\n\t"
"vfmadb %%v2,%%v17,%%v1,%%v2\n\t"
"vst %%v2,0(%%r1,%4) \n\t"
"vl %%v2,16(%%r1,%4) \n\t"
"vfmadb %%v2,%%v18,%%v0,%%v2 \n\t"
"vfmadb %%v2,%%v19,%%v1,%%v2 \n\t"
"vst %%v2,16(%%r1,%4) \n\t"
"vl %%v2,32(%%r1,%4) \n\t"
"vfmadb %%v2,%%v20,%%v0,%%v2 \n\t"
"vfmadb %%v2,%%v21,%%v1,%%v2 \n\t"
"vst %%v2,32(%%r1,%4) \n\t"
"vl %%v2,48(%%r1,%4) \n\t"
"vfmadb %%v2,%%v22,%%v0,%%v2 \n\t"
"vfmadb %%v2,%%v23,%%v1,%%v2 \n\t"
"vst %%v2,48(%%r1,%4) \n\t"
"vl %%v2,64(%%r1,%4) \n\t"
"vfmadb %%v2,%%v24,%%v0,%%v2 \n\t"
"vfmadb %%v2,%%v25,%%v1,%%v2 \n\t"
"vst %%v2,64(%%r1,%4) \n\t"
"vl %%v2,80(%%r1,%4) \n\t"
"vfmadb %%v2,%%v26,%%v0,%%v2 \n\t"
"vfmadb %%v2,%%v27,%%v1,%%v2 \n\t"
"vst %%v2,80(%%r1,%4) \n\t"
"vl %%v2,96(%%r1,%4) \n\t"
"vfmadb %%v2,%%v28,%%v0,%%v2 \n\t"
"vfmadb %%v2,%%v29,%%v1,%%v2 \n\t"
"vst %%v2,96(%%r1,%4) \n\t"
"vl %%v2,112(%%r1,%4) \n\t"
"vfmadb %%v2,%%v30,%%v0,%%v2 \n\t"
"vfmadb %%v2,%%v31,%%v1,%%v2 \n\t"
"vst %%v2,112(%%r1,%4) \n\t"
"vfmadb %%v3,%%v19,%%v1,%%v3\n\t"
"vfmadb %%v4,%%v21,%%v1,%%v4\n\t"
"vfmadb %%v5,%%v23,%%v1,%%v5\n\t"
"vfmadb %%v6,%%v25,%%v1,%%v6\n\t"
"vfmadb %%v7,%%v27,%%v1,%%v7\n\t"
"vfmadb %%v8,%%v29,%%v1,%%v8\n\t"
"vfmadb %%v9,%%v31,%%v1,%%v9\n\t"
"vst %%v2,0(%%r1,%[y])\n\t"
"vst %%v3,16(%%r1,%[y])\n\t"
"vst %%v4,32(%%r1,%[y])\n\t"
"vst %%v5,48(%%r1,%[y])\n\t"
"vst %%v6,64(%%r1,%[y])\n\t"
"vst %%v7,80(%%r1,%[y])\n\t"
"vst %%v8,96(%%r1,%[y])\n\t"
"vst %%v9,112(%%r1,%[y])\n\t"
"agfi %%r1,128\n\t"
"brctg %%r0,0b\n\t"
"1:\n\t"
"lghi %%r0,12\n\t"
"ngr %%r0,%0 \n\t"
"ngr %%r0,%[n]\n\t"
"ltgr %%r0,%%r0\n\t"
"jz 3f\n\t"
"srlg %%r0,%%r0,2\n\t"
"2:\n\t"
"vl %%v16,0(%%r1,%1) \n\t"
"vl %%v17,0(%%r1,%2) \n\t"
"vl %%v18,16(%%r1,%1) \n\t"
"vl %%v19,16(%%r1,%2) \n\t"
"vl %%v2,0(%%r1,%4) \n\t"
"vl %%v16,0(%%r1,%[ap0])\n\t"
"vl %%v17,0(%%r1,%[ap1])\n\t"
"vl %%v18,16(%%r1,%[ap0])\n\t"
"vl %%v19,16(%%r1,%[ap1])\n\t"
"vl %%v2,0(%%r1,%[y])\n\t"
"vl %%v3,16(%%r1,%[y])\n\t"
"vfmadb %%v2,%%v16,%%v0,%%v2\n\t"
"vfmadb %%v3,%%v18,%%v0,%%v3\n\t"
"vfmadb %%v2,%%v17,%%v1,%%v2\n\t"
"vst %%v2,0(%%r1,%4) \n\t"
"vl %%v2,16(%%r1,%4) \n\t"
"vfmadb %%v2,%%v18,%%v0,%%v2 \n\t"
"vfmadb %%v2,%%v19,%%v1,%%v2 \n\t"
"vst %%v2,16(%%r1,%4) \n\t"
"vfmadb %%v3,%%v19,%%v1,%%v3\n\t"
"vst %%v2,0(%%r1,%[y])\n\t"
"vst %%v3,16(%%r1,%[y])\n\t"
"agfi %%r1,32\n\t"
"brctg %%r0,2b\n\t"
"3:\n\t"
"nop"
:
:"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZQ"((const FLOAT (*)[2])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha)
:"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
: "+m"(*(struct { FLOAT x[n]; } *) y)
: [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0),
"m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1),
"m"(*(const struct { FLOAT x[2]; } *) x),[x] "a"(x),[alpha] "Q"(*alpha),
[n] "r"(n)
: "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
"v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
}
static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *xo, FLOAT *y, FLOAT *alpha)
{
__asm__ volatile (
"vlrepg %%v0,0(%2) \n\t"
"vlrepg %%v1,%4 \n\t"
"vfmdb %%v0,%%v0,%%v1 \n\t"
static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y,
FLOAT *alpha) {
__asm__("vlrepg %%v0,0(%[x])\n\t"
"vlrepg %%v16,%[alpha]\n\t"
"vfmdb %%v0,%%v0,%%v16\n\t"
"xgr %%r1,%%r1\n\t"
"lghi %%r0,-16\n\t"
"ngr %%r0,%0 \n\t"
"ngr %%r0,%[n]\n\t"
"ltgr %%r0,%%r0\n\t"
"jz 1f\n\t"
"srlg %%r0,%%r0,4\n\t"
"0:\n\t"
"pfd 1,1024(%%r1,%1) \n\t"
"pfd 2,1024(%%r1,%3) \n\t"
"vl %%v16,0(%%r1,%1) \n\t"
"vl %%v17,16(%%r1,%1) \n\t"
"vl %%v18,32(%%r1,%1) \n\t"
"vl %%v19,48(%%r1,%1) \n\t"
"vl %%v20,64(%%r1,%1) \n\t"
"vl %%v21,80(%%r1,%1) \n\t"
"vl %%v22,96(%%r1,%1) \n\t"
"vl %%v23,112(%%r1,%1) \n\t"
"vl %%v1,0(%%r1,%3) \n\t"
"vfmadb %%v1,%%v16,%%v0,%%v1 \n\t"
"vst %%v1,0(%%r1,%3) \n\t"
"vl %%v1,16(%%r1,%3) \n\t"
"vfmadb %%v1,%%v17,%%v0,%%v1 \n\t"
"vst %%v1,16(%%r1,%3) \n\t"
"vl %%v1,32(%%r1,%3) \n\t"
"vfmadb %%v1,%%v18,%%v0,%%v1 \n\t"
"vst %%v1,32(%%r1,%3) \n\t"
"vl %%v1,48(%%r1,%3) \n\t"
"vfmadb %%v1,%%v19,%%v0,%%v1 \n\t"
"vst %%v1,48(%%r1,%3) \n\t"
"vl %%v1,64(%%r1,%3) \n\t"
"vfmadb %%v1,%%v20,%%v0,%%v1 \n\t"
"vst %%v1,64(%%r1,%3) \n\t"
"vl %%v1,80(%%r1,%3) \n\t"
"vfmadb %%v1,%%v21,%%v0,%%v1 \n\t"
"vst %%v1,80(%%r1,%3) \n\t"
"vl %%v1,96(%%r1,%3) \n\t"
"vfmadb %%v1,%%v22,%%v0,%%v1 \n\t"
"vst %%v1,96(%%r1,%3) \n\t"
"vl %%v1,112(%%r1,%3) \n\t"
"vfmadb %%v1,%%v23,%%v0,%%v1 \n\t"
"vst %%v1,112(%%r1,%3) \n\t"
"pfd 1,1024(%%r1,%[a0])\n\t"
"pfd 2,1024(%%r1,%[y])\n\t"
"vl %%v16,0(%%r1,%[a0])\n\t"
"vl %%v17,16(%%r1,%[a0])\n\t"
"vl %%v18,32(%%r1,%[a0])\n\t"
"vl %%v19,48(%%r1,%[a0])\n\t"
"vl %%v20,64(%%r1,%[a0])\n\t"
"vl %%v21,80(%%r1,%[a0])\n\t"
"vl %%v22,96(%%r1,%[a0])\n\t"
"vl %%v23,112(%%r1,%[a0])\n\t"
"vl %%v24,0(%%r1,%[y])\n\t"
"vl %%v25,16(%%r1,%[y])\n\t"
"vl %%v26,32(%%r1,%[y])\n\t"
"vl %%v27,48(%%r1,%[y])\n\t"
"vl %%v28,64(%%r1,%[y])\n\t"
"vl %%v29,80(%%r1,%[y])\n\t"
"vl %%v30,96(%%r1,%[y])\n\t"
"vl %%v31,112(%%r1,%[y])\n\t"
"vfmadb %%v24,%%v16,%%v0,%%v24\n\t"
"vfmadb %%v25,%%v17,%%v0,%%v25\n\t"
"vfmadb %%v26,%%v18,%%v0,%%v26\n\t"
"vfmadb %%v27,%%v19,%%v0,%%v27\n\t"
"vfmadb %%v28,%%v20,%%v0,%%v28\n\t"
"vfmadb %%v29,%%v21,%%v0,%%v29\n\t"
"vfmadb %%v30,%%v22,%%v0,%%v30\n\t"
"vfmadb %%v31,%%v23,%%v0,%%v31\n\t"
"vst %%v24,0(%%r1,%[y])\n\t"
"vst %%v25,16(%%r1,%[y])\n\t"
"vst %%v26,32(%%r1,%[y])\n\t"
"vst %%v27,48(%%r1,%[y])\n\t"
"vst %%v28,64(%%r1,%[y])\n\t"
"vst %%v29,80(%%r1,%[y])\n\t"
"vst %%v30,96(%%r1,%[y])\n\t"
"vst %%v31,112(%%r1,%[y])\n\t"
"agfi %%r1,128\n\t"
"brctg %%r0,0b\n\t"
"1:\n\t"
"lghi %%r0,12\n\t"
"ngr %%r0,%0 \n\t"
"ngr %%r0,%[n]\n\t"
"ltgr %%r0,%%r0\n\t"
"jz 3f\n\t"
"srlg %%r0,%%r0,2\n\t"
"2:\n\t"
"vl %%v16,0(%%r1,%1) \n\t"
"vl %%v17,16(%%r1,%1) \n\t"
"vl %%v1,0(%%r1,%3) \n\t"
"vfmadb %%v1,%%v16,%%v0,%%v1 \n\t"
"vst %%v1,0(%%r1,%3) \n\t"
"vl %%v1,16(%%r1,%3) \n\t"
"vfmadb %%v1,%%v17,%%v0,%%v1 \n\t"
"vst %%v1,16(%%r1,%3) \n\t"
"vl %%v16,0(%%r1,%[a0])\n\t"
"vl %%v17,16(%%r1,%[a0])\n\t"
"vl %%v18,0(%%r1,%[y])\n\t"
"vl %%v19,16(%%r1,%[y])\n\t"
"vfmadb %%v18,%%v16,%%v0,%%v18\n\t"
"vfmadb %%v19,%%v17,%%v0,%%v19\n\t"
"vst %%v18,0(%%r1,%[y])\n\t"
"vst %%v19,16(%%r1,%[y])\n\t"
"agfi %%r1,32\n\t"
"brctg %%r0,2b\n\t"
"3:\n\t"
"nop"
:
:"r"(n),"ZR"((const FLOAT (*)[n])a0),"ZQ"((const FLOAT (*)[1])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha)
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
: "+m"(*(struct { FLOAT x[n]; } *) y)
: [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) a0),[a0] "a"(a0),
"m"(*(const FLOAT (*)[1]) x),[x] "a"(x),[alpha] "Q"(*alpha),
[n] "r"(n)
: "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21",
"v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
}
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
{
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) {
BLASLONG i;
for (i = 0; i < n; i++)
{
for (i = 0; i < n; i++) {
*dest += src[i];
dest += inc_dest;
}
}
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a,
BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
FLOAT *buffer) {
BLASLONG i;
FLOAT *a_ptr;
FLOAT *x_ptr;
@ -423,8 +385,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
BLASLONG lda4 = lda << 2;
FLOAT xbuffer[8], *ybuffer;
if ( m < 1 ) return(0);
if ( n < 1 ) return(0);
if (m < 1)
return (0);
if (n < 1)
return (0);
ybuffer = buffer;
@ -439,13 +403,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
BLASLONG NB = NBMAX;
while ( NB == NBMAX )
{
while (NB == NBMAX) {
m1 -= NB;
if ( m1 < 0)
{
if ( m2 == 0 ) break;
if (m1 < 0) {
if (m2 == 0)
break;
NB = m2;
}
@ -462,12 +425,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
else
ybuffer = y_ptr;
if ( inc_x == 1 )
{
if (inc_x == 1) {
for( i = 0; i < n1 ; i++)
{
for (i = 0; i < n1; i++) {
dgemv_kernel_4x4(NB, ap, x_ptr, ybuffer, &alpha);
ap[0] += lda4;
ap[1] += lda4;
@ -477,29 +437,22 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
x_ptr += 4;
}
if ( n2 & 2 )
{
if (n2 & 2) {
dgemv_kernel_4x2(NB, ap, x_ptr, ybuffer, &alpha);
a_ptr += lda * 2;
x_ptr += 2;
}
if ( n2 & 1 )
{
if (n2 & 1) {
dgemv_kernel_4x1(NB, a_ptr, x_ptr, ybuffer, &alpha);
/* a_ptr += lda;
x_ptr += 1; */
}
} else {
}
else
{
for( i = 0; i < n1 ; i++)
{
for (i = 0; i < n1; i++) {
xbuffer[0] = x_ptr[0];
x_ptr += inc_x;
xbuffer[1] = x_ptr[0];
@ -516,8 +469,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
a_ptr += lda4;
}
for( i = 0; i < n2 ; i++)
{
for (i = 0; i < n2; i++) {
xbuffer[0] = x_ptr[0];
x_ptr += inc_x;
dgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, &alpha);
@ -528,30 +480,26 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
}
a += NB;
if ( inc_y != 1 )
{
if (inc_y != 1) {
add_y(NB, ybuffer, y_ptr, inc_y);
y_ptr += NB * inc_y;
}
else
} else
y_ptr += NB;
}
if ( m3 == 0 ) return(0);
if (m3 == 0)
return (0);
if ( m3 == 3 )
{
if (m3 == 3) {
a_ptr = a;
x_ptr = x;
FLOAT temp0 = 0.0;
FLOAT temp1 = 0.0;
FLOAT temp2 = 0.0;
if ( lda == 3 && inc_x ==1 )
{
if (lda == 3 && inc_x == 1) {
for( i = 0; i < ( n & -4 ); i+=4 )
{
for (i = 0; i < (n & -4); i += 4) {
temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1];
temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1];
@ -565,8 +513,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
x_ptr += 4;
}
for( ; i < n; i++ )
{
for (; i < n; i++) {
temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0];
temp2 += a_ptr[2] * x_ptr[0];
@ -574,19 +521,15 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
x_ptr++;
}
}
else
{
} else {
for( i = 0; i < n; i++ )
{
for (i = 0; i < n; i++) {
temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0];
temp2 += a_ptr[2] * x_ptr[0];
a_ptr += lda;
x_ptr += inc_x;
}
}
@ -598,18 +541,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
return (0);
}
if ( m3 == 2 )
{
if (m3 == 2) {
a_ptr = a;
x_ptr = x;
FLOAT temp0 = 0.0;
FLOAT temp1 = 0.0;
if ( lda == 2 && inc_x ==1 )
{
if (lda == 2 && inc_x == 1) {
for( i = 0; i < (n & -4) ; i+=4 )
{
for (i = 0; i < (n & -4); i += 4) {
temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1];
temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1];
temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3];
@ -619,27 +558,21 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
}
for( ; i < n; i++ )
{
for (; i < n; i++) {
temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0];
a_ptr += 2;
x_ptr++;
}
}
else
{
} else {
for( i = 0; i < n; i++ )
{
for (i = 0; i < n; i++) {
temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0];
a_ptr += lda;
x_ptr += inc_x;
}
}
@ -649,31 +582,27 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
return (0);
}
if ( m3 == 1 )
{
if (m3 == 1) {
a_ptr = a;
x_ptr = x;
FLOAT temp = 0.0;
if ( lda == 1 && inc_x ==1 )
{
if (lda == 1 && inc_x == 1) {
for( i = 0; i < (n & -4); i+=4 )
{
temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3];
for (i = 0; i < (n & -4); i += 4) {
temp +=
a_ptr[i] * x_ptr[i] + a_ptr[i + 1] * x_ptr[i + 1] + a_ptr[i +
2] *
x_ptr[i + 2] + a_ptr[i + 3] * x_ptr[i + 3];
}
for( ; i < n; i++ )
{
for (; i < n; i++) {
temp += a_ptr[i] * x_ptr[i];
}
}
else
{
} else {
for( i = 0; i < n; i++ )
{
for (i = 0; i < n; i++) {
temp += a_ptr[0] * x_ptr[0];
a_ptr += lda;
x_ptr += inc_x;
@ -684,8 +613,5 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
return (0);
}
return (0);
}

File diff suppressed because it is too large Load Diff

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -27,34 +27,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x)
{
static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) {
FLOAT max;
__asm__ volatile (
"vl %%v0,0(%2) \n\t"
"srlg %%r0,%1,5 \n\t"
__asm__("vl %%v0,0(%[x])\n\t"
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%2) \n\t"
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"vl %%v24,128(%%r1,%2) \n\t"
"vl %%v25,144(%%r1,%2) \n\t"
"vl %%v26,160(%%r1,%2) \n\t"
"vl %%v27,176(%%r1,%2) \n\t"
"vl %%v28,192(%%r1,%2) \n\t"
"vl %%v29,208(%%r1,%2) \n\t"
"vl %%v30,224(%%r1,%2) \n\t"
"vl %%v31,240(%%r1,%2) \n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v24,128(%%r1,%[x])\n\t"
"vl %%v25,144(%%r1,%[x])\n\t"
"vl %%v26,160(%%r1,%[x])\n\t"
"vl %%v27,176(%%r1,%[x])\n\t"
"vl %%v28,192(%%r1,%[x])\n\t"
"vl %%v29,208(%%r1,%[x])\n\t"
"vl %%v30,224(%%r1,%[x])\n\t"
"vl %%v31,240(%%r1,%[x])\n\t"
"vfmaxdb %%v16,%%v16,%%v24,0\n\t"
"vfmaxdb %%v17,%%v17,%%v25,0\n\t"
"vfmaxdb %%v18,%%v18,%%v26,0\n\t"
@ -63,29 +59,23 @@ static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x)
"vfmaxdb %%v21,%%v21,%%v29,0\n\t"
"vfmaxdb %%v22,%%v22,%%v30,0\n\t"
"vfmaxdb %%v23,%%v23,%%v31,0\n\t"
"vfmaxdb %%v16,%%v16,%%v20,0\n\t"
"vfmaxdb %%v17,%%v17,%%v21,0\n\t"
"vfmaxdb %%v18,%%v18,%%v22,0\n\t"
"vfmaxdb %%v19,%%v19,%%v23,0\n\t"
"vfmaxdb %%v16,%%v16,%%v18,0\n\t"
"vfmaxdb %%v17,%%v17,%%v19,0\n\t"
"vfmaxdb %%v16,%%v16,%%v17,0\n\t"
"vfmaxdb %%v0,%%v0,%%v16,0\n\t"
"agfi %%r1, 256\n\t"
"brctg %%r0, 0b \n\t"
"brctg %[n], 0b\n\t"
"vrepg %%v16,%%v0,1\n\t"
"wfmaxdb %%v0,%%v0,%%v16,0\n\t"
"ldr %0,%%f0 "
:"=f"(max)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
"ldr %[max],%%f0"
: [max] "=f"(max),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
return max;
}
@ -95,7 +85,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG j = 0;
FLOAT maxf = 0.0;
if (n <= 0 || inc_x <= 0) return (maxf);
if (n <= 0 || inc_x <= 0)
return (maxf);
if (inc_x == 1) {
@ -105,9 +96,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
maxf = dmax_kernel_32(n1, x);
i = n1;
}
else
{
} else {
maxf = x[0];
i++;
}
@ -146,7 +135,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
}
while (j < n) {
if (x[i] > maxf) {
maxf = x[i];

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -27,26 +27,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x)
{
static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) {
FLOAT max;
__asm__ volatile (
"vl %%v0,0(%2) \n\t"
"srlg %%r0,%1,5 \n\t"
__asm__("vl %%v0,0(%[x])\n\t"
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%2) \n\t"
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vfchdb %%v24,%%v16,%%v17\n\t"
"vfchdb %%v25,%%v18,%%v19\n\t"
"vfchdb %%v26,%%v20,%%v21\n\t"
@ -55,27 +51,22 @@ static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x)
"vsel %%v25,%%v18,%%v19,%%v25\n\t"
"vsel %%v26,%%v20,%%v21,%%v26\n\t"
"vsel %%v27,%%v22,%%v23,%%v27\n\t"
"vfchdb %%v28,%%v24,%%v25\n\t"
"vfchdb %%v29,%%v26,%%v27\n\t"
"vsel %%v28,%%v24,%%v25,%%v28\n\t"
"vsel %%v29,%%v26,%%v27,%%v29\n\t"
"vfchdb %%v30,%%v28,%%v29\n\t"
"vsel %%v30,%%v28,%%v29,%%v30\n\t"
"vfchdb %%v31,%%v30,%%v0\n\t"
"vsel %%v0,%%v30,%%v0,%%v31\n\t"
"vl %%v16,128(%%r1,%2) \n\t"
"vl %%v17,144(%%r1,%2) \n\t"
"vl %%v18,160(%%r1,%2) \n\t"
"vl %%v19,176(%%r1,%2) \n\t"
"vl %%v20,192(%%r1,%2) \n\t"
"vl %%v21,208(%%r1,%2) \n\t"
"vl %%v22,224(%%r1,%2) \n\t"
"vl %%v23,240(%%r1,%2) \n\t"
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v17,144(%%r1,%[x])\n\t"
"vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v19,176(%%r1,%[x])\n\t"
"vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v21,208(%%r1,%[x])\n\t"
"vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v23,240(%%r1,%[x])\n\t"
"vfchdb %%v24,%%v16,%%v17\n\t"
"vfchdb %%v25,%%v18,%%v19\n\t"
"vfchdb %%v26,%%v20,%%v21\n\t"
@ -84,29 +75,24 @@ static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x)
"vsel %%v25,%%v18,%%v19,%%v25\n\t"
"vsel %%v26,%%v20,%%v21,%%v26\n\t"
"vsel %%v27,%%v22,%%v23,%%v27\n\t"
"vfchdb %%v28,%%v24,%%v25\n\t"
"vfchdb %%v29,%%v26,%%v27\n\t"
"vsel %%v28,%%v24,%%v25,%%v28\n\t"
"vsel %%v29,%%v26,%%v27,%%v29\n\t"
"vfchdb %%v30,%%v28,%%v29\n\t"
"vsel %%v30,%%v28,%%v29,%%v30\n\t"
"vfchdb %%v31,%%v30,%%v0\n\t"
"vsel %%v0,%%v30,%%v0,%%v31\n\t"
"agfi %%r1, 256\n\t"
"brctg %%r0, 0b \n\t"
"brctg %[n], 0b\n\t"
"vrepg %%v16,%%v0,1\n\t"
"wfchdb %%v17,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v17\n\t"
"ldr %0,%%f0 "
:"=f"(max)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
"ldr %[max],%%f0"
: [max] "=f"(max),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
return max;
}
@ -116,7 +102,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG j = 0;
FLOAT maxf = 0.0;
if (n <= 0 || inc_x <= 0) return (maxf);
if (n <= 0 || inc_x <= 0)
return (maxf);
if (inc_x == 1) {
@ -126,9 +113,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
maxf = dmax_kernel_32(n1, x);
i = n1;
}
else
{
} else {
maxf = x[0];
i++;
}
@ -167,7 +152,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
}
while (j < n) {
if (x[i] > maxf) {
maxf = x[i];

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -27,34 +27,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x)
{
static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) {
FLOAT min;
__asm__ volatile (
"vl %%v0,0(%2) \n\t"
"srlg %%r0,%1,5 \n\t"
__asm__("vl %%v0,0(%[x])\n\t"
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%2) \n\t"
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"vl %%v24,128(%%r1,%2) \n\t"
"vl %%v25,144(%%r1,%2) \n\t"
"vl %%v26,160(%%r1,%2) \n\t"
"vl %%v27,176(%%r1,%2) \n\t"
"vl %%v28,192(%%r1,%2) \n\t"
"vl %%v29,208(%%r1,%2) \n\t"
"vl %%v30,224(%%r1,%2) \n\t"
"vl %%v31,240(%%r1,%2) \n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v24,128(%%r1,%[x])\n\t"
"vl %%v25,144(%%r1,%[x])\n\t"
"vl %%v26,160(%%r1,%[x])\n\t"
"vl %%v27,176(%%r1,%[x])\n\t"
"vl %%v28,192(%%r1,%[x])\n\t"
"vl %%v29,208(%%r1,%[x])\n\t"
"vl %%v30,224(%%r1,%[x])\n\t"
"vl %%v31,240(%%r1,%[x])\n\t"
"vfmindb %%v16,%%v16,%%v24,0\n\t"
"vfmindb %%v17,%%v17,%%v25,0\n\t"
"vfmindb %%v18,%%v18,%%v26,0\n\t"
@ -63,29 +59,23 @@ static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x)
"vfmindb %%v21,%%v21,%%v29,0\n\t"
"vfmindb %%v22,%%v22,%%v30,0\n\t"
"vfmindb %%v23,%%v23,%%v31,0\n\t"
"vfmindb %%v16,%%v16,%%v20,0\n\t"
"vfmindb %%v17,%%v17,%%v21,0\n\t"
"vfmindb %%v18,%%v18,%%v22,0\n\t"
"vfmindb %%v19,%%v19,%%v23,0\n\t"
"vfmindb %%v16,%%v16,%%v18,0\n\t"
"vfmindb %%v17,%%v17,%%v19,0\n\t"
"vfmindb %%v16,%%v16,%%v17,0\n\t"
"vfmindb %%v0,%%v0,%%v16,0\n\t"
"agfi %%r1, 256\n\t"
"brctg %%r0, 0b \n\t"
"brctg %[n], 0b\n\t"
"vrepg %%v16,%%v0,1\n\t"
"wfmindb %%v0,%%v0,%%v16,0\n\t"
"ldr %0,%%f0 "
:"=f"(min)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
"ldr %[min],%%f0"
: [min] "=f"(min),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
return min;
}
@ -95,7 +85,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG j = 0;
FLOAT minf = 0.0;
if (n <= 0 || inc_x <= 0) return (minf);
if (n <= 0 || inc_x <= 0)
return (minf);
if (inc_x == 1) {
@ -105,9 +96,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
minf = dmin_kernel_32(n1, x);
i = n1;
}
else
{
} else {
minf = x[0];
i++;
}
@ -146,7 +135,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
}
while (j < n) {
if (x[i] < minf) {
minf = x[i];

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -27,26 +27,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x)
{
static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) {
FLOAT min;
__asm__ volatile (
"vl %%v0,0(%2) \n\t"
"srlg %%r0,%1,5 \n\t"
__asm__("vl %%v0,0(%[x])\n\t"
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%2) \n\t"
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vfchdb %%v24,%%v17,%%v16\n\t"
"vfchdb %%v25,%%v19,%%v18\n\t"
"vfchdb %%v26,%%v21,%%v20\n\t"
@ -55,27 +51,22 @@ static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x)
"vsel %%v25,%%v18,%%v19,%%v25\n\t"
"vsel %%v26,%%v20,%%v21,%%v26\n\t"
"vsel %%v27,%%v22,%%v23,%%v27\n\t"
"vfchdb %%v28,%%v25,%%v24\n\t"
"vfchdb %%v29,%%v27,%%v26\n\t"
"vsel %%v28,%%v24,%%v25,%%v28\n\t"
"vsel %%v29,%%v26,%%v27,%%v29\n\t"
"vfchdb %%v30,%%v29,%%v28\n\t"
"vsel %%v30,%%v28,%%v29,%%v30\n\t"
"vfchdb %%v31,%%v0,%%v30\n\t"
"vsel %%v0,%%v30,%%v0,%%v31\n\t"
"vl %%v16,128(%%r1,%2) \n\t"
"vl %%v17,144(%%r1,%2) \n\t"
"vl %%v18,160(%%r1,%2) \n\t"
"vl %%v19,176(%%r1,%2) \n\t"
"vl %%v20,192(%%r1,%2) \n\t"
"vl %%v21,208(%%r1,%2) \n\t"
"vl %%v22,224(%%r1,%2) \n\t"
"vl %%v23,240(%%r1,%2) \n\t"
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v17,144(%%r1,%[x])\n\t"
"vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v19,176(%%r1,%[x])\n\t"
"vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v21,208(%%r1,%[x])\n\t"
"vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v23,240(%%r1,%[x])\n\t"
"vfchdb %%v24,%%v17,%%v16\n\t"
"vfchdb %%v25,%%v19,%%v18\n\t"
"vfchdb %%v26,%%v21,%%v20\n\t"
@ -84,29 +75,24 @@ static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x)
"vsel %%v25,%%v18,%%v19,%%v25\n\t"
"vsel %%v26,%%v20,%%v21,%%v26\n\t"
"vsel %%v27,%%v22,%%v23,%%v27\n\t"
"vfchdb %%v28,%%v25,%%v24\n\t"
"vfchdb %%v29,%%v27,%%v26\n\t"
"vsel %%v28,%%v24,%%v25,%%v28\n\t"
"vsel %%v29,%%v26,%%v27,%%v29\n\t"
"vfchdb %%v30,%%v29,%%v28\n\t"
"vsel %%v30,%%v28,%%v29,%%v30\n\t"
"vfchdb %%v31,%%v0,%%v30\n\t"
"vsel %%v0,%%v30,%%v0,%%v31\n\t"
"agfi %%r1, 256\n\t"
"brctg %%r0, 0b \n\t"
"brctg %[n], 0b\n\t"
"vrepg %%v16,%%v0,1\n\t"
"wfchdb %%v17,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v17\n\t"
"ldr %0,%%f0 "
:"=f"(min)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
"ldr %[min],%%f0"
: [min] "=f"(min),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
return min;
}
@ -116,7 +102,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG j = 0;
FLOAT minf = 0.0;
if (n <= 0 || inc_x <= 0) return (minf);
if (n <= 0 || inc_x <= 0)
return (minf);
if (inc_x == 1) {
@ -126,9 +113,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
minf = dmin_kernel_32(n1, x);
i = n1;
}
else
{
} else {
minf = x[0];
i++;
}
@ -167,7 +152,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
}
while (j < n) {
if (x[i] < minf) {
minf = x[i];

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -27,25 +27,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
{
__asm__ (
"vlrepg %%v0,%3 \n\t"
"vlrepg %%v1,%4 \n\t"
"srlg %%r0,%0,5 \n\t"
static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) {
__asm__("vlrepg %%v0,%[c]\n\t"
"vlrepg %%v1,%[s]\n\t"
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 2, 1024(%%r1,%1) \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"
"vl %%v24, 0(%%r1,%1) \n\t"
"vl %%v25, 16(%%r1,%1) \n\t"
"vl %%v26, 32(%%r1,%1) \n\t"
"vl %%v27, 48(%%r1,%1) \n\t"
"vl %%v16, 0(%%r1,%2) \n\t"
"vl %%v17, 16(%%r1,%2) \n\t"
"vl %%v18, 32(%%r1,%2) \n\t"
"vl %%v19, 48(%%r1,%2) \n\t"
"pfd 2, 1024(%%r1,%[x])\n\t"
"pfd 2, 1024(%%r1,%[y])\n\t"
"vl %%v24, 0(%%r1,%[x])\n\t"
"vl %%v25, 16(%%r1,%[x])\n\t"
"vl %%v26, 32(%%r1,%[x])\n\t"
"vl %%v27, 48(%%r1,%[x])\n\t"
"vl %%v16, 0(%%r1,%[y])\n\t"
"vl %%v17, 16(%%r1,%[y])\n\t"
"vl %%v18, 32(%%r1,%[y])\n\t"
"vl %%v19, 48(%%r1,%[y])\n\t"
"vfmdb %%v28,%%v24,%%v0\n\t"
"vfmdb %%v29,%%v25,%%v0\n\t"
"vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
@ -63,25 +60,22 @@ static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
"vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vst %%v28, 0(%%r1,%1) \n\t"
"vst %%v29, 16(%%r1,%1) \n\t"
"vst %%v30, 32(%%r1,%1) \n\t"
"vst %%v31, 48(%%r1,%1) \n\t"
"vst %%v20, 0(%%r1,%2) \n\t"
"vst %%v21, 16(%%r1,%2) \n\t"
"vst %%v22, 32(%%r1,%2) \n\t"
"vst %%v23, 48(%%r1,%2) \n\t"
"vl %%v24, 64(%%r1,%1) \n\t"
"vl %%v25, 80(%%r1,%1) \n\t"
"vl %%v26, 96(%%r1,%1) \n\t"
"vl %%v27, 112(%%r1,%1) \n\t"
"vl %%v16, 64(%%r1,%2) \n\t"
"vl %%v17, 80(%%r1,%2) \n\t"
"vl %%v18, 96(%%r1,%2) \n\t"
"vl %%v19, 112(%%r1,%2) \n\t"
"vst %%v28, 0(%%r1,%[x])\n\t"
"vst %%v29, 16(%%r1,%[x])\n\t"
"vst %%v30, 32(%%r1,%[x])\n\t"
"vst %%v31, 48(%%r1,%[x])\n\t"
"vst %%v20, 0(%%r1,%[y])\n\t"
"vst %%v21, 16(%%r1,%[y])\n\t"
"vst %%v22, 32(%%r1,%[y])\n\t"
"vst %%v23, 48(%%r1,%[y])\n\t"
"vl %%v24, 64(%%r1,%[x])\n\t"
"vl %%v25, 80(%%r1,%[x])\n\t"
"vl %%v26, 96(%%r1,%[x])\n\t"
"vl %%v27, 112(%%r1,%[x])\n\t"
"vl %%v16, 64(%%r1,%[y])\n\t"
"vl %%v17, 80(%%r1,%[y])\n\t"
"vl %%v18, 96(%%r1,%[y])\n\t"
"vl %%v19, 112(%%r1,%[y])\n\t"
"vfmdb %%v28,%%v24,%%v0\n\t"
"vfmdb %%v29,%%v25,%%v0\n\t"
"vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
@ -99,25 +93,22 @@ static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
"vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vst %%v28, 64(%%r1,%1) \n\t"
"vst %%v29, 80(%%r1,%1) \n\t"
"vst %%v30, 96(%%r1,%1) \n\t"
"vst %%v31, 112(%%r1,%1) \n\t"
"vst %%v20, 64(%%r1,%2) \n\t"
"vst %%v21, 80(%%r1,%2) \n\t"
"vst %%v22, 96(%%r1,%2) \n\t"
"vst %%v23, 112(%%r1,%2) \n\t"
"vl %%v24, 128(%%r1,%1) \n\t"
"vl %%v25, 144(%%r1,%1) \n\t"
"vl %%v26, 160(%%r1,%1) \n\t"
"vl %%v27, 176(%%r1,%1) \n\t"
"vl %%v16, 128(%%r1,%2) \n\t"
"vl %%v17, 144(%%r1,%2) \n\t"
"vl %%v18, 160(%%r1,%2) \n\t"
"vl %%v19, 176(%%r1,%2) \n\t"
"vst %%v28, 64(%%r1,%[x])\n\t"
"vst %%v29, 80(%%r1,%[x])\n\t"
"vst %%v30, 96(%%r1,%[x])\n\t"
"vst %%v31, 112(%%r1,%[x])\n\t"
"vst %%v20, 64(%%r1,%[y])\n\t"
"vst %%v21, 80(%%r1,%[y])\n\t"
"vst %%v22, 96(%%r1,%[y])\n\t"
"vst %%v23, 112(%%r1,%[y])\n\t"
"vl %%v24, 128(%%r1,%[x])\n\t"
"vl %%v25, 144(%%r1,%[x])\n\t"
"vl %%v26, 160(%%r1,%[x])\n\t"
"vl %%v27, 176(%%r1,%[x])\n\t"
"vl %%v16, 128(%%r1,%[y])\n\t"
"vl %%v17, 144(%%r1,%[y])\n\t"
"vl %%v18, 160(%%r1,%[y])\n\t"
"vl %%v19, 176(%%r1,%[y])\n\t"
"vfmdb %%v28,%%v24,%%v0\n\t"
"vfmdb %%v29,%%v25,%%v0\n\t"
"vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
@ -135,25 +126,22 @@ static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
"vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vst %%v28, 128(%%r1,%1) \n\t"
"vst %%v29, 144(%%r1,%1) \n\t"
"vst %%v30, 160(%%r1,%1) \n\t"
"vst %%v31, 176(%%r1,%1) \n\t"
"vst %%v20, 128(%%r1,%2) \n\t"
"vst %%v21, 144(%%r1,%2) \n\t"
"vst %%v22, 160(%%r1,%2) \n\t"
"vst %%v23, 176(%%r1,%2) \n\t"
"vl %%v24, 192(%%r1,%1) \n\t"
"vl %%v25, 208(%%r1,%1) \n\t"
"vl %%v26, 224(%%r1,%1) \n\t"
"vl %%v27, 240(%%r1,%1) \n\t"
"vl %%v16, 192(%%r1,%2) \n\t"
"vl %%v17, 208(%%r1,%2) \n\t"
"vl %%v18, 224(%%r1,%2) \n\t"
"vl %%v19, 240(%%r1,%2) \n\t"
"vst %%v28, 128(%%r1,%[x])\n\t"
"vst %%v29, 144(%%r1,%[x])\n\t"
"vst %%v30, 160(%%r1,%[x])\n\t"
"vst %%v31, 176(%%r1,%[x])\n\t"
"vst %%v20, 128(%%r1,%[y])\n\t"
"vst %%v21, 144(%%r1,%[y])\n\t"
"vst %%v22, 160(%%r1,%[y])\n\t"
"vst %%v23, 176(%%r1,%[y])\n\t"
"vl %%v24, 192(%%r1,%[x])\n\t"
"vl %%v25, 208(%%r1,%[x])\n\t"
"vl %%v26, 224(%%r1,%[x])\n\t"
"vl %%v27, 240(%%r1,%[x])\n\t"
"vl %%v16, 192(%%r1,%[y])\n\t"
"vl %%v17, 208(%%r1,%[y])\n\t"
"vl %%v18, 224(%%r1,%[y])\n\t"
"vl %%v19, 240(%%r1,%[y])\n\t"
"vfmdb %%v28,%%v24,%%v0\n\t"
"vfmdb %%v29,%%v25,%%v0\n\t"
"vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
@ -171,39 +159,38 @@ static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
"vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vst %%v28, 192(%%r1,%1) \n\t"
"vst %%v29, 208(%%r1,%1) \n\t"
"vst %%v30, 224(%%r1,%1) \n\t"
"vst %%v31, 240(%%r1,%1) \n\t"
"vst %%v20, 192(%%r1,%2) \n\t"
"vst %%v21, 208(%%r1,%2) \n\t"
"vst %%v22, 224(%%r1,%2) \n\t"
"vst %%v23, 240(%%r1,%2) \n\t"
"vst %%v28, 192(%%r1,%[x])\n\t"
"vst %%v29, 208(%%r1,%[x])\n\t"
"vst %%v30, 224(%%r1,%[x])\n\t"
"vst %%v31, 240(%%r1,%[x])\n\t"
"vst %%v20, 192(%%r1,%[y])\n\t"
"vst %%v21, 208(%%r1,%[y])\n\t"
"vst %%v22, 224(%%r1,%[y])\n\t"
"vst %%v23, 240(%%r1,%[y])\n\t"
"agfi %%r1,256\n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y),"m"(*c),"m"(*s)
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
"brctg %[n],0b"
: "+m"(*(struct { FLOAT x[n]; } *) x), "+m"(*(struct { FLOAT x[n]; } *) y),
[n] "+&r"(n)
: [x] "a"(x),[y] "a"(y),[c] "Q"(*c),[s] "Q"(*s)
: "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
"v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
}
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
{
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
FLOAT c, FLOAT s) {
BLASLONG i = 0;
BLASLONG ix = 0, iy = 0;
FLOAT temp;
if ( n <= 0 ) return(0);
if (n <= 0)
return (0);
if ( (inc_x == 1) && (inc_y == 1) )
{
if ((inc_x == 1) && (inc_y == 1)) {
BLASLONG n1 = n & -32;
if ( n1 > 0 )
{
if (n1 > 0) {
FLOAT cosa, sina;
cosa = c;
sina = s;
@ -211,8 +198,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
i = n1;
}
while(i < n)
{
while (i < n) {
temp = c * x[i] + s * y[i];
y[i] = c * y[i] - s * x[i];
x[i] = temp;
@ -221,13 +207,9 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
}
} else {
}
else
{
while(i < n)
{
while (i < n) {
temp = c * x[ix] + s * y[iy];
y[iy] = c * y[iy] - s * x[ix];
x[ix] = temp;
@ -242,5 +224,3 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
return (0);
}

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -27,128 +27,106 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
static void dscal_kernel_16(BLASLONG n, FLOAT da, FLOAT *x)
{
__asm__ volatile (
"vlrepg %%v0,%1 \n\t"
"srlg %%r0,%0,4 \n\t"
static void dscal_kernel_16(BLASLONG n, FLOAT da, FLOAT *x) {
__asm__("vlrepg %%v0,%[da]\n\t"
"srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 2, 1024(%%r1,%2) \n\t"
"vl %%v24, 0(%%r1,%2) \n\t"
"pfd 2, 1024(%%r1,%[x])\n\t"
"vl %%v24,0(%%r1,%[x])\n\t"
"vfmdb %%v24,%%v24,%%v0\n\t"
"vst %%v24, 0(%%r1,%2) \n\t"
"vl %%v25, 16(%%r1,%2) \n\t"
"vst %%v24,0(%%r1,%[x])\n\t"
"vl %%v25,16(%%r1,%[x])\n\t"
"vfmdb %%v25,%%v25,%%v0\n\t"
"vst %%v25, 16(%%r1,%2) \n\t"
"vl %%v26, 32(%%r1,%2) \n\t"
"vst %%v25,16(%%r1,%[x])\n\t"
"vl %%v26,32(%%r1,%[x])\n\t"
"vfmdb %%v26,%%v26,%%v0\n\t"
"vst %%v26, 32(%%r1,%2) \n\t"
"vl %%v27, 48(%%r1,%2) \n\t"
"vst %%v26,32(%%r1,%[x])\n\t"
"vl %%v27,48(%%r1,%[x])\n\t"
"vfmdb %%v27,%%v27,%%v0\n\t"
"vst %%v27, 48(%%r1,%2) \n\t"
"vl %%v24, 64(%%r1,%2) \n\t"
"vfmdb %%v24,%%v24,%%v0 \n\t"
"vst %%v24, 64(%%r1,%2) \n\t"
"vl %%v25, 80(%%r1,%2) \n\t"
"vfmdb %%v25,%%v25,%%v0 \n\t"
"vst %%v25, 80(%%r1,%2) \n\t"
"vl %%v26, 96(%%r1,%2) \n\t"
"vfmdb %%v26,%%v26,%%v0 \n\t"
"vst %%v26, 96(%%r1,%2) \n\t"
"vl %%v27, 112(%%r1,%2) \n\t"
"vfmdb %%v27,%%v27,%%v0 \n\t"
"vst %%v27, 112(%%r1,%2) \n\t"
"vst %%v27,48(%%r1,%[x])\n\t"
"vl %%v28,64(%%r1,%[x])\n\t"
"vfmdb %%v28,%%v28,%%v0\n\t"
"vst %%v28,64(%%r1,%[x])\n\t"
"vl %%v29,80(%%r1,%[x])\n\t"
"vfmdb %%v29,%%v29,%%v0\n\t"
"vst %%v29,80(%%r1,%[x])\n\t"
"vl %%v30,96(%%r1,%[x])\n\t"
"vfmdb %%v30,%%v30,%%v0\n\t"
"vst %%v30,96(%%r1,%[x])\n\t"
"vl %%v31,112(%%r1,%[x])\n\t"
"vfmdb %%v31,%%v31,%%v0\n\t"
"vst %%v31,112(%%r1,%[x])\n\t"
"agfi %%r1,128\n\t"
"brctg %%r0,0b "
:
:"r"(n),"m"(da),"ZR"((FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v24","v25","v26","v27"
);
"brctg %[n],0b"
: "+m"(*(struct { FLOAT x[n]; } *) x),[n] "+&r"(n)
: [x] "a"(x),[da] "Q"(da)
: "cc", "r1", "v0", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
}
static void dscal_kernel_16_zero(BLASLONG n, FLOAT *x)
{
__asm__ volatile(
"vzero %%v24 \n\t"
"vzero %%v25 \n\t"
"vzero %%v26 \n\t"
"vzero %%v27 \n\t"
"srlg %%r0,%0,4 \n\t"
static void dscal_kernel_16_zero(BLASLONG n, FLOAT *x) {
__asm__("vzero %%v0\n\t"
"srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 2, 1024(%%r1,%1) \n\t"
"vst %%v24,0(%%r1,%1) \n\t"
"vst %%v25,16(%%r1,%1) \n\t"
"vst %%v26,32(%%r1,%1) \n\t"
"vst %%v27,48(%%r1,%1) \n\t"
"vst %%v24,64(%%r1,%1) \n\t"
"vst %%v25,80(%%r1,%1) \n\t"
"vst %%v26,96(%%r1,%1) \n\t"
"vst %%v27,112(%%r1,%1) \n\t"
"pfd 2, 1024(%%r1,%[x])\n\t"
"vst %%v0,0(%%r1,%[x])\n\t"
"vst %%v0,16(%%r1,%[x])\n\t"
"vst %%v0,32(%%r1,%[x])\n\t"
"vst %%v0,48(%%r1,%[x])\n\t"
"vst %%v0,64(%%r1,%[x])\n\t"
"vst %%v0,80(%%r1,%[x])\n\t"
"vst %%v0,96(%%r1,%[x])\n\t"
"vst %%v0,112(%%r1,%[x])\n\t"
"agfi %%r1,128\n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((FLOAT (*)[n])x)
:"memory","cc","r0","r1","v24","v25","v26","v27"
);
"brctg %[n],0b"
: "=m"(*(struct { FLOAT x[n]; } *) x),[n] "+&r"(n)
: [x] "a"(x)
: "cc", "r1", "v0");
}
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x,
BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
BLASLONG dummy2) {
BLASLONG i = 0, j = 0;
if (n <= 0 || inc_x <= 0)
return (0);
if (inc_x == 1) {
if ( inc_x == 1 )
{
if ( da == 0.0 )
{
if (da == 0.0) {
BLASLONG n1 = n & -16;
if ( n1 > 0 )
{
if (n1 > 0) {
dscal_kernel_16_zero(n1, x);
j = n1;
}
while(j < n)
{
while (j < n) {
x[j] = 0.0;
j++;
}
}
else
{
} else {
BLASLONG n1 = n & -16;
if ( n1 > 0 )
{
if (n1 > 0) {
dscal_kernel_16(n1, da, x);
j = n1;
}
while(j < n)
{
while (j < n) {
x[j] = da * x[j];
j++;
}
}
} else {
}
else
{
if ( da == 0.0 )
{
if (da == 0.0) {
BLASLONG n1 = n & -4;
@ -163,17 +141,14 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
j += 4;
}
while(j < n)
{
while (j < n) {
x[i] = 0.0;
i += inc_x;
j++;
}
}
else
{
} else {
BLASLONG n1 = n & -4;
while (j < n1) {
@ -188,8 +163,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
}
while(j < n)
{
while (j < n) {
x[i] = da * x[i];
i += inc_x;
@ -201,5 +175,3 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
return 0;
}

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2013-2018,The OpenBLAS Project
Copyright (c) 2013-2019,The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms,with or without
modification,are permitted provided that the following conditions are
@ -27,35 +27,38 @@ USE OF THIS SOFTWARE,EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
static double dsdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
{
static double dsdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) {
double dot;
__asm__ volatile (
"vzero %%v0 \n\t"
"srlg %%r0,%1,4 \n\t"
__asm__("vzero %%v0\n\t"
"vzero %%v1\n\t"
"vzero %%v2\n\t"
"vzero %%v3\n\t"
"vzero %%v4\n\t"
"vzero %%v5\n\t"
"vzero %%v6\n\t"
"vzero %%v7\n\t"
"srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1,1024(%%r1,%2) \n\t"
"pfd 1,1024(%%r1,%3) \n\t"
"vlef %%v16,0(%%r1,%2),0 \n\t"
"vlef %%v16,4(%%r1,%2),2 \n\t"
"vlef %%v17,8(%%r1,%2),0 \n\t"
"vlef %%v17,12(%%r1,%2),2 \n\t"
"vlef %%v18,16(%%r1,%2),0 \n\t"
"vlef %%v18,20(%%r1,%2),2 \n\t"
"vlef %%v19,24(%%r1,%2),0 \n\t"
"vlef %%v19,28(%%r1,%2),2 \n\t"
"vlef %%v20,32(%%r1,%2),0 \n\t"
"vlef %%v20,36(%%r1,%2),2 \n\t"
"vlef %%v21,40(%%r1,%2),0 \n\t"
"vlef %%v21,44(%%r1,%2),2 \n\t"
"vlef %%v22,48(%%r1,%2),0 \n\t"
"vlef %%v22,52(%%r1,%2),2 \n\t"
"vlef %%v23,56(%%r1,%2),0 \n\t"
"vlef %%v23,60(%%r1,%2),2 \n\t"
"pfd 1,1024(%%r1,%[x])\n\t"
"pfd 1,1024(%%r1,%[y])\n\t"
"vlef %%v16,0(%%r1,%[x]),0\n\t"
"vlef %%v16,4(%%r1,%[x]),2\n\t"
"vlef %%v17,8(%%r1,%[x]),0\n\t"
"vlef %%v17,12(%%r1,%[x]),2\n\t"
"vlef %%v18,16(%%r1,%[x]),0\n\t"
"vlef %%v18,20(%%r1,%[x]),2\n\t"
"vlef %%v19,24(%%r1,%[x]),0\n\t"
"vlef %%v19,28(%%r1,%[x]),2\n\t"
"vlef %%v20,32(%%r1,%[x]),0\n\t"
"vlef %%v20,36(%%r1,%[x]),2\n\t"
"vlef %%v21,40(%%r1,%[x]),0\n\t"
"vlef %%v21,44(%%r1,%[x]),2\n\t"
"vlef %%v22,48(%%r1,%[x]),0\n\t"
"vlef %%v22,52(%%r1,%[x]),2\n\t"
"vlef %%v23,56(%%r1,%[x]),0\n\t"
"vlef %%v23,60(%%r1,%[x]),2\n\t"
"vflls %%v16,%%v16\n\t"
"vflls %%v17,%%v17\n\t"
"vflls %%v18,%%v18\n\t"
@ -64,64 +67,70 @@ static double dsdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
"vflls %%v21,%%v21\n\t"
"vflls %%v22,%%v22\n\t"
"vflls %%v23,%%v23\n\t"
"vlef %%v24,0(%%r1,%3),0 \n\t"
"vlef %%v24,4(%%r1,%3),2 \n\t"
"vlef %%v24,0(%%r1,%[y]),0\n\t"
"vlef %%v24,4(%%r1,%[y]),2\n\t"
"vflls %%v24,%%v24\n\t"
"vfmadb %%v0,%%v16,%%v24,%%v0\n\t"
"vlef %%v25,8(%%r1,%3),0 \n\t"
"vlef %%v25,12(%%r1,%3),2 \n\t"
"vlef %%v25,8(%%r1,%[y]),0\n\t"
"vlef %%v25,12(%%r1,%[y]),2\n\t"
"vflls %%v25,%%v25\n\t"
"vfmadb %%v0,%%v17,%%v25,%%v0 \n\t"
"vlef %%v26,16(%%r1,%3),0 \n\t"
"vlef %%v26,20(%%r1,%3),2 \n\t"
"vfmadb %%v1,%%v17,%%v25,%%v1\n\t"
"vlef %%v26,16(%%r1,%[y]),0\n\t"
"vlef %%v26,20(%%r1,%[y]),2\n\t"
"vflls %%v26,%%v26\n\t"
"vfmadb %%v0,%%v18,%%v26,%%v0 \n\t"
"vlef %%v27,24(%%r1,%3),0 \n\t"
"vlef %%v27,28(%%r1,%3),2 \n\t"
"vfmadb %%v2,%%v18,%%v26,%%v2\n\t"
"vlef %%v27,24(%%r1,%[y]),0\n\t"
"vlef %%v27,28(%%r1,%[y]),2\n\t"
"vflls %%v27,%%v27\n\t"
"vfmadb %%v0,%%v19,%%v27,%%v0 \n\t"
"vlef %%v28,32(%%r1,%3),0 \n\t"
"vlef %%v28,36(%%r1,%3),2 \n\t"
"vfmadb %%v3,%%v19,%%v27,%%v3\n\t"
"vlef %%v28,32(%%r1,%[y]),0\n\t"
"vlef %%v28,36(%%r1,%[y]),2\n\t"
"vflls %%v28,%%v28\n\t"
"vfmadb %%v0,%%v20,%%v28,%%v0 \n\t"
"vlef %%v29,40(%%r1,%3),0 \n\t"
"vlef %%v29,44(%%r1,%3),2 \n\t"
"vfmadb %%v4,%%v20,%%v28,%%v4\n\t"
"vlef %%v29,40(%%r1,%[y]),0\n\t"
"vlef %%v29,44(%%r1,%[y]),2\n\t"
"vflls %%v29,%%v29\n\t"
"vfmadb %%v0,%%v21,%%v29,%%v0 \n\t"
"vlef %%v30,48(%%r1,%3),0 \n\t"
"vlef %%v30,52(%%r1,%3),2 \n\t"
"vfmadb %%v5,%%v21,%%v29,%%v5\n\t"
"vlef %%v30,48(%%r1,%[y]),0\n\t"
"vlef %%v30,52(%%r1,%[y]),2\n\t"
"vflls %%v30,%%v30\n\t"
"vfmadb %%v0,%%v22,%%v30,%%v0 \n\t"
"vlef %%v31,56(%%r1,%3),0 \n\t"
"vlef %%v31,60(%%r1,%3),2 \n\t"
"vfmadb %%v6,%%v22,%%v30,%%v6\n\t"
"vlef %%v31,56(%%r1,%[y]),0\n\t"
"vlef %%v31,60(%%r1,%[y]),2\n\t"
"vflls %%v31,%%v31\n\t"
"vfmadb %%v0,%%v23,%%v31,%%v0 \n\t"
"vfmadb %%v7,%%v23,%%v31,%%v7\n\t"
"agfi %%r1,64\n\t"
"brctg %%r0,0b \n\t"
"brctg %[n],0b\n\t"
"vfadb %%v0,%%v0,%%v1\n\t"
"vfadb %%v0,%%v0,%%v2\n\t"
"vfadb %%v0,%%v0,%%v3\n\t"
"vfadb %%v0,%%v0,%%v4\n\t"
"vfadb %%v0,%%v0,%%v5\n\t"
"vfadb %%v0,%%v0,%%v6\n\t"
"vfadb %%v0,%%v0,%%v7\n\t"
"vrepg %%v1,%%v0,1\n\t"
"adbr %%f0,%%f1\n\t"
"ldr %0,%%f0 "
:"=f"(dot)
:"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((const FLOAT (*)[n])y)
:"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
"ldr %[dot],%%f0"
: [dot] "=f"(dot),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),
"m"(*(const struct { FLOAT x[n]; } *) y),[y] "a"(y)
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");
return dot;
}
double CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y)
{
double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
BLASLONG i = 0;
BLASLONG ix = 0, iy = 0;
double dot = 0.0;
if ( n <= 0 ) return(dot);
if (n <= 0)
return (dot);
if ( (inc_x == 1) && (inc_y == 1) )
{
if ((inc_x == 1) && (inc_y == 1)) {
BLASLONG n1 = n & -16;
@ -129,8 +138,7 @@ double CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y)
dot = dsdot_kernel_16(n1, x, y);
i = n1;
while(i < n)
{
while (i < n) {
dot += (double) y[i] * (double) x[i];
i++;
@ -138,13 +146,11 @@ double CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y)
}
return (dot);
}
BLASLONG n1 = n & -2;
while(i < n1)
{
while (i < n1) {
dot += (double) y[iy] * (double) x[ix];
dot += (double) y[iy + inc_y] * (double) x[ix + inc_x];
@ -154,8 +160,7 @@ double CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y)
}
while(i < n)
{
while (i < n) {
dot += (double) y[iy] * (double) x[ix];
ix += inc_x;
@ -166,5 +171,3 @@ double CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y)
return (dot);
}

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -27,111 +27,105 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
{
__asm__ volatile(
"srlg %%r0,%0,5 \n\t"
static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) {
__asm__("srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 2, 1024(%%r1,%1) \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"
"vl %%v16, 0(%%r1,%1) \n\t"
"vl %%v17, 16(%%r1,%1) \n\t"
"vl %%v18, 32(%%r1,%1) \n\t"
"vl %%v19, 48(%%r1,%1) \n\t"
"vl %%v20, 64(%%r1,%1) \n\t"
"vl %%v21, 80(%%r1,%1) \n\t"
"vl %%v22, 96(%%r1,%1) \n\t"
"vl %%v23, 112(%%r1,%1) \n\t"
"vl %%v24, 128(%%r1,%1) \n\t"
"vl %%v25, 144(%%r1,%1) \n\t"
"vl %%v26, 160(%%r1,%1) \n\t"
"vl %%v27, 176(%%r1,%1) \n\t"
"vl %%v28, 192(%%r1,%1) \n\t"
"vl %%v29, 208(%%r1,%1) \n\t"
"vl %%v30, 224(%%r1,%1) \n\t"
"vl %%v31, 240(%%r1,%1) \n\t"
"vl %%v0, 0(%%r1,%2) \n\t"
"vl %%v1, 16(%%r1,%2) \n\t"
"vl %%v2, 32(%%r1,%2) \n\t"
"vl %%v3, 48(%%r1,%2) \n\t"
"vl %%v4, 64(%%r1,%2) \n\t"
"vl %%v5, 80(%%r1,%2) \n\t"
"vl %%v6, 96(%%r1,%2) \n\t"
"vl %%v7, 112(%%r1,%2) \n\t"
"vst %%v0, 0(%%r1,%1) \n\t"
"vst %%v1, 16(%%r1,%1) \n\t"
"vst %%v2, 32(%%r1,%1) \n\t"
"vst %%v3, 48(%%r1,%1) \n\t"
"vst %%v4, 64(%%r1,%1) \n\t"
"vst %%v5, 80(%%r1,%1) \n\t"
"vst %%v6, 96(%%r1,%1) \n\t"
"vst %%v7, 112(%%r1,%1) \n\t"
"vl %%v0, 128(%%r1,%2) \n\t"
"vl %%v1, 144(%%r1,%2) \n\t"
"vl %%v2, 160(%%r1,%2) \n\t"
"vl %%v3, 176(%%r1,%2) \n\t"
"vl %%v4, 192(%%r1,%2) \n\t"
"vl %%v5, 208(%%r1,%2) \n\t"
"vl %%v6, 224(%%r1,%2) \n\t"
"vl %%v7, 240(%%r1,%2) \n\t"
"vst %%v0, 128(%%r1,%1) \n\t"
"vst %%v1, 144(%%r1,%1) \n\t"
"vst %%v2, 160(%%r1,%1) \n\t"
"vst %%v3, 176(%%r1,%1) \n\t"
"vst %%v4, 192(%%r1,%1) \n\t"
"vst %%v5, 208(%%r1,%1) \n\t"
"vst %%v6, 224(%%r1,%1) \n\t"
"vst %%v7, 240(%%r1,%1) \n\t"
"vst %%v16, 0(%%r1,%2) \n\t"
"vst %%v17, 16(%%r1,%2) \n\t"
"vst %%v18, 32(%%r1,%2) \n\t"
"vst %%v19, 48(%%r1,%2) \n\t"
"vst %%v20, 64(%%r1,%2) \n\t"
"vst %%v21, 80(%%r1,%2) \n\t"
"vst %%v22, 96(%%r1,%2) \n\t"
"vst %%v23, 112(%%r1,%2) \n\t"
"vst %%v24, 128(%%r1,%2) \n\t"
"vst %%v25, 144(%%r1,%2) \n\t"
"vst %%v26, 160(%%r1,%2) \n\t"
"vst %%v27, 176(%%r1,%2) \n\t"
"vst %%v28, 192(%%r1,%2) \n\t"
"vst %%v29, 208(%%r1,%2) \n\t"
"vst %%v30, 224(%%r1,%2) \n\t"
"vst %%v31, 240(%%r1,%2) \n\t"
"pfd 2, 1024(%%r1,%[x])\n\t"
"pfd 2, 1024(%%r1,%[y])\n\t"
"vl %%v16, 0(%%r1,%[x])\n\t"
"vl %%v17, 16(%%r1,%[x])\n\t"
"vl %%v18, 32(%%r1,%[x])\n\t"
"vl %%v19, 48(%%r1,%[x])\n\t"
"vl %%v20, 64(%%r1,%[x])\n\t"
"vl %%v21, 80(%%r1,%[x])\n\t"
"vl %%v22, 96(%%r1,%[x])\n\t"
"vl %%v23, 112(%%r1,%[x])\n\t"
"vl %%v24, 128(%%r1,%[x])\n\t"
"vl %%v25, 144(%%r1,%[x])\n\t"
"vl %%v26, 160(%%r1,%[x])\n\t"
"vl %%v27, 176(%%r1,%[x])\n\t"
"vl %%v28, 192(%%r1,%[x])\n\t"
"vl %%v29, 208(%%r1,%[x])\n\t"
"vl %%v30, 224(%%r1,%[x])\n\t"
"vl %%v31, 240(%%r1,%[x])\n\t"
"vl %%v0, 0(%%r1,%[y])\n\t"
"vl %%v1, 16(%%r1,%[y])\n\t"
"vl %%v2, 32(%%r1,%[y])\n\t"
"vl %%v3, 48(%%r1,%[y])\n\t"
"vl %%v4, 64(%%r1,%[y])\n\t"
"vl %%v5, 80(%%r1,%[y])\n\t"
"vl %%v6, 96(%%r1,%[y])\n\t"
"vl %%v7, 112(%%r1,%[y])\n\t"
"vst %%v0, 0(%%r1,%[x])\n\t"
"vst %%v1, 16(%%r1,%[x])\n\t"
"vst %%v2, 32(%%r1,%[x])\n\t"
"vst %%v3, 48(%%r1,%[x])\n\t"
"vst %%v4, 64(%%r1,%[x])\n\t"
"vst %%v5, 80(%%r1,%[x])\n\t"
"vst %%v6, 96(%%r1,%[x])\n\t"
"vst %%v7, 112(%%r1,%[x])\n\t"
"vl %%v0, 128(%%r1,%[y])\n\t"
"vl %%v1, 144(%%r1,%[y])\n\t"
"vl %%v2, 160(%%r1,%[y])\n\t"
"vl %%v3, 176(%%r1,%[y])\n\t"
"vl %%v4, 192(%%r1,%[y])\n\t"
"vl %%v5, 208(%%r1,%[y])\n\t"
"vl %%v6, 224(%%r1,%[y])\n\t"
"vl %%v7, 240(%%r1,%[y])\n\t"
"vst %%v0, 128(%%r1,%[x])\n\t"
"vst %%v1, 144(%%r1,%[x])\n\t"
"vst %%v2, 160(%%r1,%[x])\n\t"
"vst %%v3, 176(%%r1,%[x])\n\t"
"vst %%v4, 192(%%r1,%[x])\n\t"
"vst %%v5, 208(%%r1,%[x])\n\t"
"vst %%v6, 224(%%r1,%[x])\n\t"
"vst %%v7, 240(%%r1,%[x])\n\t"
"vst %%v16, 0(%%r1,%[y])\n\t"
"vst %%v17, 16(%%r1,%[y])\n\t"
"vst %%v18, 32(%%r1,%[y])\n\t"
"vst %%v19, 48(%%r1,%[y])\n\t"
"vst %%v20, 64(%%r1,%[y])\n\t"
"vst %%v21, 80(%%r1,%[y])\n\t"
"vst %%v22, 96(%%r1,%[y])\n\t"
"vst %%v23, 112(%%r1,%[y])\n\t"
"vst %%v24, 128(%%r1,%[y])\n\t"
"vst %%v25, 144(%%r1,%[y])\n\t"
"vst %%v26, 160(%%r1,%[y])\n\t"
"vst %%v27, 176(%%r1,%[y])\n\t"
"vst %%v28, 192(%%r1,%[y])\n\t"
"vst %%v29, 208(%%r1,%[y])\n\t"
"vst %%v30, 224(%%r1,%[y])\n\t"
"vst %%v31, 240(%%r1,%[y])\n\t"
"agfi %%r1,256\n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
"brctg %[n],0b"
: "+m"(*(struct { FLOAT x[n]; } *) x), "+m"(*(struct { FLOAT x[n]; } *) y),
[n] "+&r"(n)
: [x] "a"(x),[y] "a"(y)
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");
}
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,
BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
BLASLONG dummy2) {
BLASLONG i = 0;
BLASLONG ix = 0, iy = 0;
FLOAT temp;
if ( n <= 0 ) return(0);
if (n <= 0)
return (0);
if ( (inc_x == 1) && (inc_y == 1 ))
{
if ((inc_x == 1) && (inc_y == 1)) {
BLASLONG n1 = n & -32;
if ( n1 > 0 )
{
if (n1 > 0) {
dswap_kernel_32(n1, x, y);
i = n1;
}
while(i < n)
{
while (i < n) {
temp = y[i];
y[i] = x[i];
x[i] = temp;
@ -139,13 +133,9 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,
}
} else {
}
else
{
while(i < n)
{
while (i < n) {
temp = y[iy];
y[iy] = x[ix];
x[ix] = temp;
@ -158,5 +148,4 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,
}
return (0);
}

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2017, The OpenBLAS Project
Copyright (c) 2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -28,26 +28,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))
#define CABS1(x,i) (fabsf(x[i]) + fabsf(x[i + 1]))
static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax)
{
static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) {
BLASLONG iamax;
__asm__ volatile (
"vlef %%v0,0(%3),0 \n\t"
"vlef %%v1,4(%3),0 \n\t"
"vlef %%v0,8(%3),1 \n\t"
"vlef %%v1,12(%3),1 \n\t"
"vlef %%v0,16(%3),2 \n\t"
"vlef %%v1,20(%3),2 \n\t"
"vlef %%v0,24(%3),3 \n\t"
"vlef %%v1,28(%3),3 \n\t"
__asm__("vlef %%v0,0(%[x]),0\n\t"
"vlef %%v1,4(%[x]),0\n\t"
"vlef %%v0,8(%[x]),1\n\t"
"vlef %%v1,12(%[x]),1\n\t"
"vlef %%v0,16(%[x]),2\n\t"
"vlef %%v1,20(%[x]),2\n\t"
"vlef %%v0,24(%[x]),3\n\t"
"vlef %%v1,28(%[x]),3\n\t"
"vflpsb %%v0,%%v0\n\t"
"vflpsb %%v1,%%v1\n\t"
"vfasb %%v0,%%v0,%%v1\n\t"
@ -89,31 +82,26 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax)
"vleif %%v27,13,1\n\t"
"vleif %%v27,14,2\n\t"
"vleif %%v27,15,3\n\t"
"srlg %%r0,%2,5 \n\t"
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%3) \n\t"
"vl %%v16,0(%%r1,%3) \n\t"
"vl %%v28,16(%%r1,%3) \n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v28,16(%%r1,%[x])\n\t"
"vpkg %%v17,%%v16,%%v28\n\t"
"vperm %%v16,%%v16,%%v28,%%v9\n\t"
"vl %%v18,32(%%r1,%3) \n\t"
"vl %%v29,48(%%r1,%3) \n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v29,48(%%r1,%[x])\n\t"
"vpkg %%v19,%%v18,%%v29\n\t"
"vperm %%v18,%%v18,%%v29,%%v9\n\t"
"vl %%v20,64(%%r1,%3) \n\t"
"vl %%v30,80(%%r1,%3) \n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v30,80(%%r1,%[x])\n\t"
"vpkg %%v21,%%v20,%%v30\n\t"
"vperm %%v20,%%v20,%%v30,%%v9\n\t"
"vl %%v22,96(%%r1,%3) \n\t"
"vl %%v31,112(%%r1,%3) \n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v31,112(%%r1,%[x])\n\t"
"vpkg %%v23,%%v22,%%v31\n\t"
"vperm %%v22,%%v22,%%v31,%%v9\n\t"
"vflpsb %%v16, %%v16\n\t"
"vflpsb %%v17, %%v17\n\t"
"vflpsb %%v18, %%v18\n\t"
@ -126,14 +114,12 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax)
"vfasb %%v17,%%v18,%%v19\n\t"
"vfasb %%v18,%%v20,%%v21\n\t"
"vfasb %%v19,%%v22,%%v23\n\t"
"vfchesb %%v5,%%v16,%%v17\n\t"
"vfchesb %%v6,%%v18,%%v19\n\t"
"vsel %%v16,%%v16,%%v17,%%v5\n\t"
"vsel %%v5,%%v24,%%v25,%%v5\n\t"
"vsel %%v17,%%v18,%%v19,%%v6\n\t"
"vsel %%v6,%%v26,%%v27,%%v6\n\t"
"vfchesb %%v18,%%v16,%%v17\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
@ -141,7 +127,6 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax)
"vesrlg %%v5,%%v5,32\n\t"
"vag %%v5,%%v5,%%v4\n\t"
"vag %%v6,%%v6,%%v4\n\t"
"vfchesb %%v7,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
"vsegf %%v8,%%v7\n\t"
@ -150,27 +135,22 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax)
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"vl %%v16,128(%%r1,%3) \n\t"
"vl %%v28,144(%%r1,%3) \n\t"
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v28,144(%%r1,%[x])\n\t"
"vpkg %%v17,%%v16,%%v28\n\t"
"vperm %%v16,%%v16,%%v28,%%v9\n\t"
"vl %%v18,160(%%r1,%3) \n\t"
"vl %%v29,176(%%r1,%3) \n\t"
"vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v29,176(%%r1,%[x])\n\t"
"vpkg %%v19,%%v18,%%v29\n\t"
"vperm %%v18,%%v18,%%v29,%%v9\n\t"
"vl %%v20,192(%%r1,%3) \n\t"
"vl %%v30,208(%%r1,%3) \n\t"
"vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v30,208(%%r1,%[x])\n\t"
"vpkg %%v21,%%v20,%%v30\n\t"
"vperm %%v20,%%v20,%%v30,%%v9\n\t"
"vl %%v22,224(%%r1,%3) \n\t"
"vl %%v31,240(%%r1,%3) \n\t"
"vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v31,240(%%r1,%[x])\n\t"
"vpkg %%v23,%%v22,%%v31\n\t"
"vperm %%v22,%%v22,%%v31,%%v9\n\t"
"vflpsb %%v16, %%v16\n\t"
"vflpsb %%v17, %%v17\n\t"
"vflpsb %%v18, %%v18\n\t"
@ -183,14 +163,12 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax)
"vfasb %%v17,%%v18,%%v19\n\t"
"vfasb %%v18,%%v20,%%v21\n\t"
"vfasb %%v19,%%v22,%%v23\n\t"
"vfchesb %%v5,%%v16,%%v17\n\t"
"vfchesb %%v6,%%v18,%%v19\n\t"
"vsel %%v16,%%v16,%%v17,%%v5\n\t"
"vsel %%v5,%%v24,%%v25,%%v5\n\t"
"vsel %%v17,%%v18,%%v19,%%v6\n\t"
"vsel %%v6,%%v26,%%v27,%%v6\n\t"
"vfchesb %%v18,%%v16,%%v17\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
@ -198,7 +176,6 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax)
"vesrlg %%v5,%%v5,32\n\t"
"vag %%v5,%%v5,%%v4\n\t"
"vag %%v6,%%v6,%%v4\n\t"
"vfchesb %%v7,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
"vsegf %%v8,%%v7\n\t"
@ -207,10 +184,8 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax)
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"agfi %%r1, 256\n\t"
"brctg %%r0, 0b \n\t"
"brctg %[n], 0b\n\t"
"veslg %%v3,%%v0,32\n\t"
"vfchsb %%v4,%%v0,%%v3\n\t"
"vchlg %%v5,%%v2,%%v1\n\t"
@ -221,14 +196,13 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax)
"vesrlg %%v4,%%v4,32\n\t"
"vsegf %%v4,%%v4\n\t"
"vsel %%v1,%%v1,%%v2,%%v4\n\t"
"vrepf %%v2,%%v0,2\n\t"
"vrepg %%v3,%%v1,1\n\t"
"wfcsb %%v2,%%v0\n\t"
"jne 1f\n\t"
"vstef %%v0,%1,0 \n\t"
"vstef %%v0,%[amax],0\n\t"
"vmnlg %%v0,%%v1,%%v3\n\t"
"vlgvg %0,%%v0,0 \n\t"
"vlgvg %[iamax],%%v0,0\n\t"
"j 2f\n\t"
"1:\n\t"
"wfchsb %%v4,%%v2,%%v0\n\t"
@ -236,27 +210,28 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax)
"vsegf %%v4,%%v4\n\t"
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
"ste %%f0,%1 \n\t"
"vlgvg %0,%%v1,0 \n\t"
"ste %%f0,%[amax]\n\t"
"vlgvg %[iamax],%%v1,0\n\t"
"2:\n\t"
"nop"
:"=r"(iamax),"=m"(*amax)
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
: [iamax] "=r"(iamax),[amax] "=Q"(*amax),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8",
"v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
"v25", "v26", "v27", "v28", "v29", "v30", "v31");
return iamax;
}
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0;
BLASLONG ix = 0;
FLOAT maxf = 0;
BLASLONG max = 0;
BLASLONG inc_x2;
if (n <= 0 || inc_x <= 0) return(max);
if (n <= 0 || inc_x <= 0)
return (max);
if (inc_x == 1) {
@ -266,18 +241,14 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
max = icamax_kernel_32(n1, x, &maxf);
ix = n1 * 2;
i = n1;
}
else
{
} else {
maxf = CABS1(x, 0);
ix += 2;
i++;
}
while(i < n)
{
if( CABS1(x,ix) > maxf )
{
while (i < n) {
if (CABS1(x, ix) > maxf) {
max = i;
maxf = CABS1(x, ix);
}
@ -291,13 +262,35 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
max = 0;
maxf = CABS1(x, 0);
inc_x2 = 2 * inc_x;
ix += inc_x2;
i++;
while(i < n)
{
if( CABS1(x,ix) > maxf )
{
BLASLONG n1 = n & -4;
while (i < n1) {
if (CABS1(x, ix) > maxf) {
max = i;
maxf = CABS1(x, ix);
}
if (CABS1(x, ix + inc_x2) > maxf) {
max = i + 1;
maxf = CABS1(x, ix + inc_x2);
}
if (CABS1(x, ix + 2 * inc_x2) > maxf) {
max = i + 2;
maxf = CABS1(x, ix + 2 * inc_x2);
}
if (CABS1(x, ix + 3 * inc_x2) > maxf) {
max = i + 3;
maxf = CABS1(x, ix + 3 * inc_x2);
}
ix += inc_x2 * 4;
i += 4;
}
while (i < n) {
if (CABS1(x, ix) > maxf) {
max = i;
maxf = CABS1(x, ix);
}
@ -307,5 +300,3 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
return (max + 1);
}
}

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2017, The OpenBLAS Project
Copyright (c) 2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -28,26 +28,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))
#define CABS1(x,i) (fabsf(x[i]) + fabsf(x[i + 1]))
static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin)
{
static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) {
BLASLONG iamin;
__asm__ volatile (
"vlef %%v0,0(%3),0 \n\t"
"vlef %%v1,4(%3),0 \n\t"
"vlef %%v0,8(%3),1 \n\t"
"vlef %%v1,12(%3),1 \n\t"
"vlef %%v0,16(%3),2 \n\t"
"vlef %%v1,20(%3),2 \n\t"
"vlef %%v0,24(%3),3 \n\t"
"vlef %%v1,28(%3),3 \n\t"
__asm__("vlef %%v0,0(%[x]),0\n\t"
"vlef %%v1,4(%[x]),0\n\t"
"vlef %%v0,8(%[x]),1\n\t"
"vlef %%v1,12(%[x]),1\n\t"
"vlef %%v0,16(%[x]),2\n\t"
"vlef %%v1,20(%[x]),2\n\t"
"vlef %%v0,24(%[x]),3\n\t"
"vlef %%v1,28(%[x]),3\n\t"
"vflpsb %%v0,%%v0\n\t"
"vflpsb %%v1,%%v1\n\t"
"vfasb %%v0,%%v0,%%v1\n\t"
@ -89,31 +82,26 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin)
"vleif %%v27,13,1\n\t"
"vleif %%v27,14,2\n\t"
"vleif %%v27,15,3\n\t"
"srlg %%r0,%2,5 \n\t"
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%3) \n\t"
"vl %%v16,0(%%r1,%3) \n\t"
"vl %%v28,16(%%r1,%3) \n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v28,16(%%r1,%[x])\n\t"
"vpkg %%v17,%%v16,%%v28\n\t"
"vperm %%v16,%%v16,%%v28,%%v9\n\t"
"vl %%v18,32(%%r1,%3) \n\t"
"vl %%v29,48(%%r1,%3) \n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v29,48(%%r1,%[x])\n\t"
"vpkg %%v19,%%v18,%%v29\n\t"
"vperm %%v18,%%v18,%%v29,%%v9\n\t"
"vl %%v20,64(%%r1,%3) \n\t"
"vl %%v30,80(%%r1,%3) \n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v30,80(%%r1,%[x])\n\t"
"vpkg %%v21,%%v20,%%v30\n\t"
"vperm %%v20,%%v20,%%v30,%%v9\n\t"
"vl %%v22,96(%%r1,%3) \n\t"
"vl %%v31,112(%%r1,%3) \n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v31,112(%%r1,%[x])\n\t"
"vpkg %%v23,%%v22,%%v31\n\t"
"vperm %%v22,%%v22,%%v31,%%v9\n\t"
"vflpsb %%v16, %%v16\n\t"
"vflpsb %%v17, %%v17\n\t"
"vflpsb %%v18, %%v18\n\t"
@ -126,14 +114,12 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin)
"vfasb %%v17,%%v18,%%v19\n\t"
"vfasb %%v18,%%v20,%%v21\n\t"
"vfasb %%v19,%%v22,%%v23\n\t"
"vfchesb %%v5,%%v17,%%v16\n\t"
"vfchesb %%v6,%%v19,%%v18\n\t"
"vsel %%v16,%%v16,%%v17,%%v5\n\t"
"vsel %%v5,%%v24,%%v25,%%v5\n\t"
"vsel %%v17,%%v18,%%v19,%%v6\n\t"
"vsel %%v6,%%v26,%%v27,%%v6\n\t"
"vfchesb %%v18,%%v17,%%v16\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
@ -141,7 +127,6 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin)
"vesrlg %%v5,%%v5,32\n\t"
"vag %%v5,%%v5,%%v4\n\t"
"vag %%v6,%%v6,%%v4\n\t"
"vfchesb %%v7,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
"vsegf %%v8,%%v7\n\t"
@ -150,27 +135,22 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin)
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"vl %%v16,128(%%r1,%3) \n\t"
"vl %%v28,144(%%r1,%3) \n\t"
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v28,144(%%r1,%[x])\n\t"
"vpkg %%v17,%%v16,%%v28\n\t"
"vperm %%v16,%%v16,%%v28,%%v9\n\t"
"vl %%v18,160(%%r1,%3) \n\t"
"vl %%v29,176(%%r1,%3) \n\t"
"vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v29,176(%%r1,%[x])\n\t"
"vpkg %%v19,%%v18,%%v29\n\t"
"vperm %%v18,%%v18,%%v29,%%v9\n\t"
"vl %%v20,192(%%r1,%3) \n\t"
"vl %%v30,208(%%r1,%3) \n\t"
"vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v30,208(%%r1,%[x])\n\t"
"vpkg %%v21,%%v20,%%v30\n\t"
"vperm %%v20,%%v20,%%v30,%%v9\n\t"
"vl %%v22,224(%%r1,%3) \n\t"
"vl %%v31,240(%%r1,%3) \n\t"
"vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v31,240(%%r1,%[x])\n\t"
"vpkg %%v23,%%v22,%%v31\n\t"
"vperm %%v22,%%v22,%%v31,%%v9\n\t"
"vflpsb %%v16, %%v16\n\t"
"vflpsb %%v17, %%v17\n\t"
"vflpsb %%v18, %%v18\n\t"
@ -183,14 +163,12 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin)
"vfasb %%v17,%%v18,%%v19\n\t"
"vfasb %%v18,%%v20,%%v21\n\t"
"vfasb %%v19,%%v22,%%v23\n\t"
"vfchesb %%v5,%%v17,%%v16\n\t"
"vfchesb %%v6,%%v19,%%v18\n\t"
"vsel %%v16,%%v16,%%v17,%%v5\n\t"
"vsel %%v5,%%v24,%%v25,%%v5\n\t"
"vsel %%v17,%%v18,%%v19,%%v6\n\t"
"vsel %%v6,%%v26,%%v27,%%v6\n\t"
"vfchesb %%v18,%%v17,%%v16\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
@ -198,7 +176,6 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin)
"vesrlg %%v5,%%v5,32\n\t"
"vag %%v5,%%v5,%%v4\n\t"
"vag %%v6,%%v6,%%v4\n\t"
"vfchesb %%v7,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
"vsegf %%v8,%%v7\n\t"
@ -207,10 +184,8 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin)
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"agfi %%r1, 256\n\t"
"brctg %%r0, 0b \n\t"
"brctg %[n], 0b\n\t"
"veslg %%v3,%%v0,32\n\t"
"vfchsb %%v4,%%v3,%%v0\n\t"
"vchlg %%v5,%%v2,%%v1\n\t"
@ -221,14 +196,13 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin)
"vesrlg %%v4,%%v4,32\n\t"
"vsegf %%v4,%%v4\n\t"
"vsel %%v1,%%v1,%%v2,%%v4\n\t"
"vrepf %%v2,%%v0,2\n\t"
"vrepg %%v3,%%v1,1\n\t"
"wfcsb %%v2,%%v0\n\t"
"jne 1f\n\t"
"vstef %%v0,%1,0 \n\t"
"vstef %%v0,%[amin],0\n\t"
"vmnlg %%v0,%%v1,%%v3\n\t"
"vlgvg %0,%%v0,0 \n\t"
"vlgvg %[iamin],%%v0,0\n\t"
"j 2f\n\t"
"1:\n\t"
"wfchsb %%v4,%%v0,%%v2\n\t"
@ -236,27 +210,28 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin)
"vsegf %%v4,%%v4\n\t"
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
"ste %%f0,%1 \n\t"
"vlgvg %0,%%v1,0 \n\t"
"ste %%f0,%[amin]\n\t"
"vlgvg %[iamin],%%v1,0\n\t"
"2:\n\t"
"nop"
:"=r"(iamin),"=m"(*amin)
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
: [iamin] "=r"(iamin),[amin] "=Q"(*amin),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8",
"v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
"v25", "v26", "v27", "v28", "v29", "v30", "v31");
return iamin;
}
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0;
BLASLONG ix = 0;
FLOAT minf = 0;
BLASLONG min = 0;
BLASLONG inc_x2;
if (n <= 0 || inc_x <= 0) return(min);
if (n <= 0 || inc_x <= 0)
return (min);
if (inc_x == 1) {
@ -266,18 +241,14 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
min = icamin_kernel_32(n1, x, &minf);
ix = n1 * 2;
i = n1;
}
else
{
} else {
minf = CABS1(x, 0);
ix += 2;
i++;
}
while(i < n)
{
if( CABS1(x,ix) < minf )
{
while (i < n) {
if (CABS1(x, ix) < minf) {
min = i;
minf = CABS1(x, ix);
}
@ -291,13 +262,35 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
min = 0;
minf = CABS1(x, 0);
inc_x2 = 2 * inc_x;
ix += inc_x2;
i++;
while(i < n)
{
if( CABS1(x,ix) < minf )
{
BLASLONG n1 = n & -4;
while (i < n1) {
if (CABS1(x, ix) < minf) {
min = i;
minf = CABS1(x, ix);
}
if (CABS1(x, ix + inc_x2) < minf) {
min = i + 1;
minf = CABS1(x, ix + inc_x2);
}
if (CABS1(x, ix + 2 * inc_x2) < minf) {
min = i + 2;
minf = CABS1(x, ix + 2 * inc_x2);
}
if (CABS1(x, ix + 3 * inc_x2) < minf) {
min = i + 3;
minf = CABS1(x, ix + 3 * inc_x2);
}
ix += inc_x2 * 4;
i += 4;
}
while (i < n) {
if (CABS1(x, ix) < minf) {
min = i;
minf = CABS1(x, ix);
}
@ -307,5 +300,3 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
return (min + 1);
}
}

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -28,18 +28,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax)
{
static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) {
BLASLONG iamax;
__asm__ volatile (
"vl %%v0,0(%3) \n\t"
__asm__("vl %%v0,0(%[x])\n\t"
"vflpdb %%v0,%%v0\n\t"
"vleig %%v1,0,0\n\t"
"vleig %%v1,1,1\n\t"
@ -61,19 +55,18 @@ static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax)
"vleig %%v30,13,1\n\t"
"vleig %%v31,14,0\n\t"
"vleig %%v31,15,1\n\t"
"srlg %%r0,%2,5 \n\t"
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%3) \n\t"
"vl %%v16,0(%%r1,%3) \n\t"
"vl %%v17,16(%%r1,%3) \n\t"
"vl %%v18,32(%%r1,%3) \n\t"
"vl %%v19,48(%%r1,%3) \n\t"
"vl %%v20,64(%%r1,%3) \n\t"
"vl %%v21,80(%%r1,%3) \n\t"
"vl %%v22,96(%%r1,%3) \n\t"
"vl %%v23,112(%%r1,%3) \n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
@ -82,7 +75,6 @@ static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax)
"vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t"
"vfchedb %%v4,%%v16,%%v17\n\t"
"vfchedb %%v5,%%v18,%%v19\n\t"
"vfchedb %%v6,%%v20,%%v21\n\t"
@ -95,32 +87,28 @@ static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax)
"vsel %%v6,%%v28,%%v29,%%v6\n\t"
"vsel %%v19,%%v22,%%v23,%%v7\n\t"
"vsel %%v7,%%v30,%%v31,%%v7\n\t"
"vfchedb %%v20,%%v16,%%v17\n\t"
"vfchedb %%v21,%%v18,%%v19\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v4,%%v4,%%v5,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v5,%%v6,%%v7,%%v21\n\t"
"vfchedb %%v18,%%v16,%%v17\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"vfchedb %%v5,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
"vag %%v3,%%v3,%%v2\n\t"
"vl %%v16,128(%%r1,%3) \n\t"
"vl %%v17,144(%%r1,%3) \n\t"
"vl %%v18,160(%%r1,%3) \n\t"
"vl %%v19,176(%%r1,%3) \n\t"
"vl %%v20,192(%%r1,%3) \n\t"
"vl %%v21,208(%%r1,%3) \n\t"
"vl %%v22,224(%%r1,%3) \n\t"
"vl %%v23,240(%%r1,%3) \n\t"
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v17,144(%%r1,%[x])\n\t"
"vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v19,176(%%r1,%[x])\n\t"
"vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v21,208(%%r1,%[x])\n\t"
"vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v23,240(%%r1,%[x])\n\t"
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
@ -129,7 +117,6 @@ static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax)
"vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t"
"vfchedb %%v4,%%v16,%%v17\n\t"
"vfchedb %%v5,%%v18,%%v19\n\t"
"vfchedb %%v6,%%v20,%%v21\n\t"
@ -142,47 +129,43 @@ static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax)
"vsel %%v6,%%v28,%%v29,%%v6\n\t"
"vsel %%v19,%%v22,%%v23,%%v7\n\t"
"vsel %%v7,%%v30,%%v31,%%v7\n\t"
"vfchedb %%v20,%%v16,%%v17\n\t"
"vfchedb %%v21,%%v18,%%v19\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v4,%%v4,%%v5,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v5,%%v6,%%v7,%%v21\n\t"
"vfchedb %%v18,%%v16,%%v17\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"vfchedb %%v5,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
"vag %%v3,%%v3,%%v2\n\t"
"agfi %%r1, 256\n\t"
"brctg %%r0, 0b \n\t"
"brctg %[n], 0b\n\t"
"vrepg %%v2,%%v0,1\n\t"
"vrepg %%v3,%%v1,1\n\t"
"wfcdb %%v2,%%v0\n\t"
"jne 1f\n\t"
"vsteg %%v0,%1,0 \n\t"
"vsteg %%v0,%[amax],0\n\t"
"vmnlg %%v0,%%v1,%%v3\n\t"
"vlgvg %0,%%v0,0 \n\t"
"vlgvg %[iamax],%%v0,0\n\t"
"j 2f\n\t"
"1:\n\t"
"wfchdb %%v4,%%v2,%%v0\n\t"
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
"std %%f0,%1 \n\t"
"vlgvg %0,%%v1,0 \n\t"
"std %%f0,%[amax]\n\t"
"vlgvg %[iamax],%%v1,0\n\t"
"2:\n\t"
"nop"
:"=r"(iamax),"=m"(*amax)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
: [iamax] "=r"(iamax),[amax] "=Q"(*amax),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");
return iamax;
}
@ -193,7 +176,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
FLOAT maxf = 0.0;
BLASLONG max = 0;
if (n <= 0 || inc_x <= 0) return (max);
if (n <= 0 || inc_x <= 0)
return (max);
if (inc_x == 1) {
@ -203,9 +187,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
max = idamax_kernel_32(n1, x, &maxf);
i = n1;
}
else
{
} else {
maxf = ABS(x[0]);
i++;
}
@ -250,7 +232,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
}
while (j < n) {
if (ABS(x[i]) > maxf) {
max = j;

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -28,18 +28,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin)
{
static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) {
BLASLONG iamin;
__asm__ volatile (
"vl %%v0,0(%3) \n\t"
__asm__("vl %%v0,0(%[x])\n\t"
"vflpdb %%v0,%%v0\n\t"
"vleig %%v1,0,0\n\t"
"vleig %%v1,1,1\n\t"
@ -61,19 +55,18 @@ static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin)
"vleig %%v30,13,1\n\t"
"vleig %%v31,14,0\n\t"
"vleig %%v31,15,1\n\t"
"srlg %%r0,%2,5 \n\t"
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%3) \n\t"
"vl %%v16,0(%%r1,%3) \n\t"
"vl %%v17,16(%%r1,%3) \n\t"
"vl %%v18,32(%%r1,%3) \n\t"
"vl %%v19,48(%%r1,%3) \n\t"
"vl %%v20,64(%%r1,%3) \n\t"
"vl %%v21,80(%%r1,%3) \n\t"
"vl %%v22,96(%%r1,%3) \n\t"
"vl %%v23,112(%%r1,%3) \n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
@ -82,7 +75,6 @@ static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin)
"vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t"
"vfchedb %%v4,%%v17,%%v16\n\t"
"vfchedb %%v5,%%v19,%%v18\n\t"
"vfchedb %%v6,%%v21,%%v20\n\t"
@ -95,32 +87,28 @@ static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin)
"vsel %%v6,%%v28,%%v29,%%v6\n\t"
"vsel %%v19,%%v22,%%v23,%%v7\n\t"
"vsel %%v7,%%v30,%%v31,%%v7\n\t"
"vfchedb %%v20,%%v17,%%v16\n\t"
"vfchedb %%v21,%%v19,%%v18\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v4,%%v4,%%v5,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v5,%%v6,%%v7,%%v21\n\t"
"vfchedb %%v18,%%v17,%%v16\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"vfchedb %%v5,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
"vag %%v3,%%v3,%%v2\n\t"
"vl %%v16,128(%%r1,%3) \n\t"
"vl %%v17,144(%%r1,%3) \n\t"
"vl %%v18,160(%%r1,%3) \n\t"
"vl %%v19,176(%%r1,%3) \n\t"
"vl %%v20,192(%%r1,%3) \n\t"
"vl %%v21,208(%%r1,%3) \n\t"
"vl %%v22,224(%%r1,%3) \n\t"
"vl %%v23,240(%%r1,%3) \n\t"
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v17,144(%%r1,%[x])\n\t"
"vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v19,176(%%r1,%[x])\n\t"
"vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v21,208(%%r1,%[x])\n\t"
"vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v23,240(%%r1,%[x])\n\t"
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
@ -129,7 +117,6 @@ static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin)
"vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t"
"vfchedb %%v4,%%v17,%%v16\n\t"
"vfchedb %%v5,%%v19,%%v18\n\t"
"vfchedb %%v6,%%v21,%%v20\n\t"
@ -142,47 +129,43 @@ static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin)
"vsel %%v6,%%v28,%%v29,%%v6\n\t"
"vsel %%v19,%%v22,%%v23,%%v7\n\t"
"vsel %%v7,%%v30,%%v31,%%v7\n\t"
"vfchedb %%v20,%%v17,%%v16\n\t"
"vfchedb %%v21,%%v19,%%v18\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v4,%%v4,%%v5,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v5,%%v6,%%v7,%%v21\n\t"
"vfchedb %%v18,%%v17,%%v16\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"vfchedb %%v5,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
"vag %%v3,%%v3,%%v2\n\t"
"agfi %%r1, 256\n\t"
"brctg %%r0, 0b \n\t"
"brctg %[n], 0b\n\t"
"vrepg %%v2,%%v0,1\n\t"
"vrepg %%v3,%%v1,1\n\t"
"wfcdb %%v2,%%v0\n\t"
"jne 1f\n\t"
"vsteg %%v0,%1,0 \n\t"
"vsteg %%v0,%[amin],0\n\t"
"vmnlg %%v0,%%v1,%%v3\n\t"
"vlgvg %0,%%v0,0 \n\t"
"vlgvg %[iamin],%%v0,0\n\t"
"j 2f\n\t"
"1:\n\t"
"wfchdb %%v4,%%v0,%%v2\n\t"
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
"std %%f0,%1 \n\t"
"vlgvg %0,%%v1,0 \n\t"
"std %%f0,%[amin]\n\t"
"vlgvg %[iamin],%%v1,0\n\t"
"2:\n\t"
"nop"
:"=r"(iamin),"=m"(*amin)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
: [iamin] "=r"(iamin),[amin] "=Q"(*amin),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");
return iamin;
}
@ -193,7 +176,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
FLOAT minf = 0.0;
BLASLONG min = 0;
if (n <= 0 || inc_x <= 0) return (min);
if (n <= 0 || inc_x <= 0)
return (min);
if (inc_x == 1) {
@ -203,9 +187,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
min = idamin_kernel_32(n1, x, &minf);
i = n1;
}
else
{
} else {
minf = ABS(x[0]);
i++;
}
@ -250,7 +232,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
}
while (j < n) {
if (ABS(x[i]) < minf) {
min = j;

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -27,12 +27,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max)
{
static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) {
BLASLONG imax;
__asm__ volatile (
"vl %%v0,0(%3) \n\t"
__asm__("vl %%v0,0(%[x])\n\t"
"vleig %%v1,0,0\n\t"
"vleig %%v1,1,1\n\t"
"vrepig %%v2,16\n\t"
@ -53,20 +51,18 @@ static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max)
"vleig %%v30,13,1\n\t"
"vleig %%v31,14,0\n\t"
"vleig %%v31,15,1\n\t"
"srlg %%r0,%2,5 \n\t"
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%3) \n\t"
"vl %%v16,0(%%r1,%3) \n\t"
"vl %%v17,16(%%r1,%3) \n\t"
"vl %%v18,32(%%r1,%3) \n\t"
"vl %%v19,48(%%r1,%3) \n\t"
"vl %%v20,64(%%r1,%3) \n\t"
"vl %%v21,80(%%r1,%3) \n\t"
"vl %%v22,96(%%r1,%3) \n\t"
"vl %%v23,112(%%r1,%3) \n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vfchedb %%v4,%%v16,%%v17\n\t"
"vfchedb %%v5,%%v18,%%v19\n\t"
"vfchedb %%v6,%%v20,%%v21\n\t"
@ -79,33 +75,28 @@ static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max)
"vsel %%v6,%%v28,%%v29,%%v6\n\t"
"vsel %%v19,%%v22,%%v23,%%v7\n\t"
"vsel %%v7,%%v30,%%v31,%%v7\n\t"
"vfchedb %%v20,%%v16,%%v17\n\t"
"vfchedb %%v21,%%v18,%%v19\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v4,%%v4,%%v5,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v5,%%v6,%%v7,%%v21\n\t"
"vfchedb %%v18,%%v16,%%v17\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"vfchedb %%v5,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
"vag %%v3,%%v3,%%v2\n\t"
"vl %%v16,128(%%r1,%3) \n\t"
"vl %%v17,144(%%r1,%3) \n\t"
"vl %%v18,160(%%r1,%3) \n\t"
"vl %%v19,176(%%r1,%3) \n\t"
"vl %%v20,192(%%r1,%3) \n\t"
"vl %%v21,208(%%r1,%3) \n\t"
"vl %%v22,224(%%r1,%3) \n\t"
"vl %%v23,240(%%r1,%3) \n\t"
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v17,144(%%r1,%[x])\n\t"
"vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v19,176(%%r1,%[x])\n\t"
"vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v21,208(%%r1,%[x])\n\t"
"vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v23,240(%%r1,%[x])\n\t"
"vfchedb %%v4,%%v16,%%v17\n\t"
"vfchedb %%v5,%%v18,%%v19\n\t"
"vfchedb %%v6,%%v20,%%v21\n\t"
@ -118,47 +109,43 @@ static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max)
"vsel %%v6,%%v28,%%v29,%%v6\n\t"
"vsel %%v19,%%v22,%%v23,%%v7\n\t"
"vsel %%v7,%%v30,%%v31,%%v7\n\t"
"vfchedb %%v20,%%v16,%%v17\n\t"
"vfchedb %%v21,%%v18,%%v19\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v4,%%v4,%%v5,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v5,%%v6,%%v7,%%v21\n\t"
"vfchedb %%v18,%%v16,%%v17\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"vfchedb %%v5,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
"vag %%v3,%%v3,%%v2\n\t"
"agfi %%r1, 256\n\t"
"brctg %%r0, 0b \n\t"
"brctg %[n], 0b\n\t"
"vrepg %%v2,%%v0,1\n\t"
"vrepg %%v3,%%v1,1\n\t"
"wfcdb %%v2,%%v0\n\t"
"jne 1f\n\t"
"vsteg %%v0,%1,0 \n\t"
"vsteg %%v0,%[max],0\n\t"
"vmnlg %%v0,%%v1,%%v3\n\t"
"vlgvg %0,%%v0,0 \n\t"
"vlgvg %[imax],%%v0,0\n\t"
"j 2f\n\t"
"1:\n\t"
"wfchdb %%v4,%%v2,%%v0\n\t"
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
"std %%f0,%1 \n\t"
"vlgvg %0,%%v1,0 \n\t"
"std %%f0,%[max]\n\t"
"vlgvg %[imax],%%v1,0\n\t"
"2:\n\t"
"nop"
:"=r"(imax),"=m"(*max)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
: [imax] "=r"(imax),[max] "=Q"(*max),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");
return imax;
}
@ -169,7 +156,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
FLOAT maxf = 0.0;
BLASLONG max = 0;
if (n <= 0 || inc_x <= 0) return (max);
if (n <= 0 || inc_x <= 0)
return (max);
if (inc_x == 1) {
@ -179,9 +167,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
max = idmax_kernel_32(n1, x, &maxf);
i = n1;
}
else
{
} else {
maxf = x[0];
i++;
}
@ -226,7 +212,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
}
while (j < n) {
if (x[i] > maxf) {
max = j;

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -27,12 +27,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min)
{
static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) {
BLASLONG imin;
__asm__ volatile (
"vl %%v0,0(%3) \n\t"
__asm__("vl %%v0,0(%[x])\n\t"
"vleig %%v1,0,0\n\t"
"vleig %%v1,1,1\n\t"
"vrepig %%v2,16\n\t"
@ -53,20 +51,18 @@ static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min)
"vleig %%v30,13,1\n\t"
"vleig %%v31,14,0\n\t"
"vleig %%v31,15,1\n\t"
"srlg %%r0,%2,5 \n\t"
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%3) \n\t"
"vl %%v16,0(%%r1,%3) \n\t"
"vl %%v17,16(%%r1,%3) \n\t"
"vl %%v18,32(%%r1,%3) \n\t"
"vl %%v19,48(%%r1,%3) \n\t"
"vl %%v20,64(%%r1,%3) \n\t"
"vl %%v21,80(%%r1,%3) \n\t"
"vl %%v22,96(%%r1,%3) \n\t"
"vl %%v23,112(%%r1,%3) \n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vfchedb %%v4,%%v17,%%v16\n\t"
"vfchedb %%v5,%%v19,%%v18\n\t"
"vfchedb %%v6,%%v21,%%v20\n\t"
@ -79,33 +75,28 @@ static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min)
"vsel %%v6,%%v28,%%v29,%%v6\n\t"
"vsel %%v19,%%v22,%%v23,%%v7\n\t"
"vsel %%v7,%%v30,%%v31,%%v7\n\t"
"vfchedb %%v20,%%v17,%%v16\n\t"
"vfchedb %%v21,%%v19,%%v18\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v4,%%v4,%%v5,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v5,%%v6,%%v7,%%v21\n\t"
"vfchedb %%v18,%%v17,%%v16\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"vfchedb %%v5,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
"vag %%v3,%%v3,%%v2\n\t"
"vl %%v16,128(%%r1,%3) \n\t"
"vl %%v17,144(%%r1,%3) \n\t"
"vl %%v18,160(%%r1,%3) \n\t"
"vl %%v19,176(%%r1,%3) \n\t"
"vl %%v20,192(%%r1,%3) \n\t"
"vl %%v21,208(%%r1,%3) \n\t"
"vl %%v22,224(%%r1,%3) \n\t"
"vl %%v23,240(%%r1,%3) \n\t"
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v17,144(%%r1,%[x])\n\t"
"vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v19,176(%%r1,%[x])\n\t"
"vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v21,208(%%r1,%[x])\n\t"
"vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v23,240(%%r1,%[x])\n\t"
"vfchedb %%v4,%%v17,%%v16\n\t"
"vfchedb %%v5,%%v19,%%v18\n\t"
"vfchedb %%v6,%%v21,%%v20\n\t"
@ -118,47 +109,43 @@ static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min)
"vsel %%v6,%%v28,%%v29,%%v6\n\t"
"vsel %%v19,%%v22,%%v23,%%v7\n\t"
"vsel %%v7,%%v30,%%v31,%%v7\n\t"
"vfchedb %%v20,%%v17,%%v16\n\t"
"vfchedb %%v21,%%v19,%%v18\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v4,%%v4,%%v5,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v5,%%v6,%%v7,%%v21\n\t"
"vfchedb %%v18,%%v17,%%v16\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"vfchedb %%v5,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
"vag %%v3,%%v3,%%v2\n\t"
"agfi %%r1, 256\n\t"
"brctg %%r0, 0b \n\t"
"brctg %[n], 0b\n\t"
"vrepg %%v2,%%v0,1\n\t"
"vrepg %%v3,%%v1,1\n\t"
"wfcdb %%v2,%%v0\n\t"
"jne 1f\n\t"
"vsteg %%v0,%1,0 \n\t"
"vsteg %%v0,%[min],0\n\t"
"vmnlg %%v0,%%v1,%%v3\n\t"
"vlgvg %0,%%v0,0 \n\t"
"vlgvg %[imin],%%v0,0\n\t"
"j 2f\n\t"
"1:\n\t"
"wfchdb %%v4,%%v0,%%v2\n\t"
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
"std %%f0,%1 \n\t"
"vlgvg %0,%%v1,0 \n\t"
"std %%f0,%[min]\n\t"
"vlgvg %[imin],%%v1,0\n\t"
"2:\n\t"
"nop"
:"=r"(imin),"=m"(*min)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
: [imin] "=r"(imin),[min] "=Q"(*min),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");
return imin;
}
@ -169,7 +156,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
FLOAT minf = 0.0;
BLASLONG min = 0;
if (n <= 0 || inc_x <= 0) return (min);
if (n <= 0 || inc_x <= 0)
return (min);
if (inc_x == 1) {
@ -179,9 +167,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
min = idmin_kernel_32(n1, x, &minf);
i = n1;
}
else
{
} else {
minf = x[0];
i++;
}
@ -226,7 +212,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
}
while (j < n) {
if (x[i] < minf) {
min = j;

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -28,18 +28,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax)
{
static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax) {
BLASLONG iamax;
__asm__ volatile (
"vl %%v0,0(%3) \n\t"
__asm__("vl %%v0,0(%[x])\n\t"
"vflpsb %%v0,%%v0\n\t"
"vleig %%v1,0,0\n\t"
"vleig %%v1,2,1\n\t"
@ -79,19 +73,18 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax)
"vleif %%v31,29,1\n\t"
"vleif %%v31,30,2\n\t"
"vleif %%v31,31,3\n\t"
"srlg %%r0,%2,6 \n\t"
"srlg %[n],%[n],6\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%3) \n\t"
"vl %%v16,0(%%r1,%3) \n\t"
"vl %%v17,16(%%r1,%3) \n\t"
"vl %%v18,32(%%r1,%3) \n\t"
"vl %%v19,48(%%r1,%3) \n\t"
"vl %%v20,64(%%r1,%3) \n\t"
"vl %%v21,80(%%r1,%3) \n\t"
"vl %%v22,96(%%r1,%3) \n\t"
"vl %%v23,112(%%r1,%3) \n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vflpsb %%v16, %%v16\n\t"
"vflpsb %%v17, %%v17\n\t"
"vflpsb %%v18, %%v18\n\t"
@ -100,7 +93,6 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax)
"vflpsb %%v21, %%v21\n\t"
"vflpsb %%v22, %%v22\n\t"
"vflpsb %%v23, %%v23\n\t"
"vfchesb %%v5,%%v16,%%v17\n\t"
"vfchesb %%v6,%%v18,%%v19\n\t"
"vfchesb %%v7,%%v20,%%v21\n\t"
@ -113,14 +105,12 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax)
"vsel %%v7,%%v28,%%v29,%%v7\n\t"
"vsel %%v19,%%v22,%%v23,%%v8\n\t"
"vsel %%v8,%%v30,%%v31,%%v8\n\t"
"vfchesb %%v20,%%v16,%%v17\n\t"
"vfchesb %%v21,%%v18,%%v19\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v5,%%v5,%%v6,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v6,%%v7,%%v8,%%v21\n\t"
"vfchesb %%v18,%%v16,%%v17\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
@ -128,7 +118,6 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax)
"vesrlg %%v5,%%v5,32\n\t"
"vag %%v5,%%v5,%%v4\n\t"
"vag %%v6,%%v6,%%v4\n\t"
"vfchesb %%v7,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
"vsegf %%v8,%%v7\n\t"
@ -137,15 +126,14 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax)
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"vl %%v16,128(%%r1,%3) \n\t"
"vl %%v17,144(%%r1,%3) \n\t"
"vl %%v18,160(%%r1,%3) \n\t"
"vl %%v19,176(%%r1,%3) \n\t"
"vl %%v20,192(%%r1,%3) \n\t"
"vl %%v21,208(%%r1,%3) \n\t"
"vl %%v22,224(%%r1,%3) \n\t"
"vl %%v23,240(%%r1,%3) \n\t"
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v17,144(%%r1,%[x])\n\t"
"vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v19,176(%%r1,%[x])\n\t"
"vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v21,208(%%r1,%[x])\n\t"
"vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v23,240(%%r1,%[x])\n\t"
"vflpsb %%v16, %%v16\n\t"
"vflpsb %%v17, %%v17\n\t"
"vflpsb %%v18, %%v18\n\t"
@ -154,7 +142,6 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax)
"vflpsb %%v21, %%v21\n\t"
"vflpsb %%v22, %%v22\n\t"
"vflpsb %%v23, %%v23\n\t"
"vfchesb %%v5,%%v16,%%v17\n\t"
"vfchesb %%v6,%%v18,%%v19\n\t"
"vfchesb %%v7,%%v20,%%v21\n\t"
@ -167,14 +154,12 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax)
"vsel %%v7,%%v28,%%v29,%%v7\n\t"
"vsel %%v19,%%v22,%%v23,%%v8\n\t"
"vsel %%v8,%%v30,%%v31,%%v8\n\t"
"vfchesb %%v20,%%v16,%%v17\n\t"
"vfchesb %%v21,%%v18,%%v19\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v5,%%v5,%%v6,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v6,%%v7,%%v8,%%v21\n\t"
"vfchesb %%v18,%%v16,%%v17\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
@ -182,7 +167,6 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax)
"vesrlg %%v5,%%v5,32\n\t"
"vag %%v5,%%v5,%%v4\n\t"
"vag %%v6,%%v6,%%v4\n\t"
"vfchesb %%v7,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
"vsegf %%v8,%%v7\n\t"
@ -191,10 +175,8 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax)
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"agfi %%r1, 256\n\t"
"brctg %%r0, 0b \n\t"
"brctg %[n], 0b\n\t"
"veslg %%v3,%%v0,32\n\t"
"vfchsb %%v4,%%v0,%%v3\n\t"
"vchlg %%v5,%%v2,%%v1\n\t"
@ -205,14 +187,13 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax)
"vesrlg %%v4,%%v4,32\n\t"
"vsegf %%v4,%%v4\n\t"
"vsel %%v1,%%v1,%%v2,%%v4\n\t"
"vrepf %%v2,%%v0,2\n\t"
"vrepg %%v3,%%v1,1\n\t"
"wfcsb %%v2,%%v0\n\t"
"jne 1f\n\t"
"vstef %%v0,%1,0 \n\t"
"vstef %%v0,%[amax],0\n\t"
"vmnlg %%v0,%%v1,%%v3\n\t"
"vlgvg %0,%%v0,0 \n\t"
"vlgvg %[iamax],%%v0,0\n\t"
"j 2f\n\t"
"1:\n\t"
"wfchsb %%v4,%%v2,%%v0\n\t"
@ -220,14 +201,15 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax)
"vsegf %%v4,%%v4\n\t"
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
"ste %%f0,%1 \n\t"
"vlgvg %0,%%v1,0 \n\t"
"ste %%f0,%[amax]\n\t"
"vlgvg %[iamax],%%v1,0\n\t"
"2:\n\t"
"nop"
:"=r"(iamax),"=m"(*amax)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v1","v2","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
: [iamax] "=r"(iamax),[amax] "=Q"(*amax),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");
return iamax;
}
@ -238,7 +220,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
FLOAT maxf = 0.0;
BLASLONG max = 0;
if (n <= 0 || inc_x <= 0) return (max);
if (n <= 0 || inc_x <= 0)
return (max);
if (inc_x == 1) {
@ -248,9 +231,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
max = isamax_kernel_64(n1, x, &maxf);
i = n1;
}
else
{
} else {
maxf = ABS(x[0]);
i++;
}
@ -295,7 +276,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
}
while (j < n) {
if (ABS(x[i]) > maxf) {
max = j;

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -28,18 +28,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin)
{
static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin) {
BLASLONG iamin;
__asm__ volatile (
"vl %%v0,0(%3) \n\t"
__asm__("vl %%v0,0(%[x])\n\t"
"vflpsb %%v0,%%v0\n\t"
"vleig %%v1,0,0\n\t"
"vleig %%v1,2,1\n\t"
@ -79,19 +73,18 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin)
"vleif %%v31,29,1\n\t"
"vleif %%v31,30,2\n\t"
"vleif %%v31,31,3\n\t"
"srlg %%r0,%2,6 \n\t"
"srlg %[n],%[n],6\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%3) \n\t"
"vl %%v16,0(%%r1,%3) \n\t"
"vl %%v17,16(%%r1,%3) \n\t"
"vl %%v18,32(%%r1,%3) \n\t"
"vl %%v19,48(%%r1,%3) \n\t"
"vl %%v20,64(%%r1,%3) \n\t"
"vl %%v21,80(%%r1,%3) \n\t"
"vl %%v22,96(%%r1,%3) \n\t"
"vl %%v23,112(%%r1,%3) \n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vflpsb %%v16, %%v16\n\t"
"vflpsb %%v17, %%v17\n\t"
"vflpsb %%v18, %%v18\n\t"
@ -100,7 +93,6 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin)
"vflpsb %%v21, %%v21\n\t"
"vflpsb %%v22, %%v22\n\t"
"vflpsb %%v23, %%v23\n\t"
"vfchesb %%v5,%%v17,%%v16\n\t"
"vfchesb %%v6,%%v19,%%v18\n\t"
"vfchesb %%v7,%%v21,%%v20\n\t"
@ -113,14 +105,12 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin)
"vsel %%v7,%%v28,%%v29,%%v7\n\t"
"vsel %%v19,%%v22,%%v23,%%v8\n\t"
"vsel %%v8,%%v30,%%v31,%%v8\n\t"
"vfchesb %%v20,%%v17,%%v16\n\t"
"vfchesb %%v21,%%v19,%%v18\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v5,%%v5,%%v6,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v6,%%v7,%%v8,%%v21\n\t"
"vfchesb %%v18,%%v17,%%v16\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
@ -128,7 +118,6 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin)
"vesrlg %%v5,%%v5,32\n\t"
"vag %%v5,%%v5,%%v4\n\t"
"vag %%v6,%%v6,%%v4\n\t"
"vfchesb %%v7,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
"vsegf %%v8,%%v7\n\t"
@ -137,15 +126,14 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin)
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"vl %%v16,128(%%r1,%3) \n\t"
"vl %%v17,144(%%r1,%3) \n\t"
"vl %%v18,160(%%r1,%3) \n\t"
"vl %%v19,176(%%r1,%3) \n\t"
"vl %%v20,192(%%r1,%3) \n\t"
"vl %%v21,208(%%r1,%3) \n\t"
"vl %%v22,224(%%r1,%3) \n\t"
"vl %%v23,240(%%r1,%3) \n\t"
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v17,144(%%r1,%[x])\n\t"
"vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v19,176(%%r1,%[x])\n\t"
"vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v21,208(%%r1,%[x])\n\t"
"vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v23,240(%%r1,%[x])\n\t"
"vflpsb %%v16, %%v16\n\t"
"vflpsb %%v17, %%v17\n\t"
"vflpsb %%v18, %%v18\n\t"
@ -154,7 +142,6 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin)
"vflpsb %%v21, %%v21\n\t"
"vflpsb %%v22, %%v22\n\t"
"vflpsb %%v23, %%v23\n\t"
"vfchesb %%v5,%%v17,%%v16\n\t"
"vfchesb %%v6,%%v19,%%v18\n\t"
"vfchesb %%v7,%%v21,%%v20\n\t"
@ -167,14 +154,12 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin)
"vsel %%v7,%%v28,%%v29,%%v7\n\t"
"vsel %%v19,%%v22,%%v23,%%v8\n\t"
"vsel %%v8,%%v30,%%v31,%%v8\n\t"
"vfchesb %%v20,%%v17,%%v16\n\t"
"vfchesb %%v21,%%v19,%%v18\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v5,%%v5,%%v6,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v6,%%v7,%%v8,%%v21\n\t"
"vfchesb %%v18,%%v17,%%v16\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
@ -182,7 +167,6 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin)
"vesrlg %%v5,%%v5,32\n\t"
"vag %%v5,%%v5,%%v4\n\t"
"vag %%v6,%%v6,%%v4\n\t"
"vfchesb %%v7,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
"vsegf %%v8,%%v7\n\t"
@ -191,10 +175,8 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin)
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"agfi %%r1, 256\n\t"
"brctg %%r0, 0b \n\t"
"brctg %[n], 0b\n\t"
"veslg %%v3,%%v0,32\n\t"
"vfchsb %%v4,%%v3,%%v0\n\t"
"vchlg %%v5,%%v2,%%v1\n\t"
@ -205,14 +187,13 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin)
"vesrlg %%v4,%%v4,32\n\t"
"vsegf %%v4,%%v4\n\t"
"vsel %%v1,%%v1,%%v2,%%v4\n\t"
"vrepf %%v2,%%v0,2\n\t"
"vrepg %%v3,%%v1,1\n\t"
"wfcsb %%v2,%%v0\n\t"
"jne 1f\n\t"
"vstef %%v0,%1,0 \n\t"
"vstef %%v0,%[amin],0\n\t"
"vmnlg %%v0,%%v1,%%v3\n\t"
"vlgvg %0,%%v0,0 \n\t"
"vlgvg %[iamin],%%v0,0\n\t"
"j 2f\n\t"
"1:\n\t"
"wfchsb %%v4,%%v0,%%v2\n\t"
@ -220,14 +201,15 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin)
"vsegf %%v4,%%v4\n\t"
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
"ste %%f0,%1 \n\t"
"vlgvg %0,%%v1,0 \n\t"
"ste %%f0,%[amin]\n\t"
"vlgvg %[iamin],%%v1,0\n\t"
"2:\n\t"
"nop"
:"=r"(iamin),"=m"(*amin)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v1","v2","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
: [iamin] "=r"(iamin),[amin] "=Q"(*amin),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");
return iamin;
}
@ -238,7 +220,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
FLOAT minf = 0.0;
BLASLONG min = 0;
if (n <= 0 || inc_x <= 0) return (min);
if (n <= 0 || inc_x <= 0)
return (min);
if (inc_x == 1) {
@ -248,9 +231,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
min = isamin_kernel_64(n1, x, &minf);
i = n1;
}
else
{
} else {
minf = ABS(x[0]);
i++;
}
@ -295,7 +276,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
}
while (j < n) {
if (ABS(x[i]) < minf) {
min = j;

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -27,12 +27,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max)
{
static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max) {
BLASLONG imax;
__asm__ volatile (
"vl %%v0,0(%3) \n\t"
__asm__("vl %%v0,0(%[x])\n\t"
"vleig %%v1,0,0\n\t"
"vleig %%v1,2,1\n\t"
"vleig %%v2,1,0\n\t"
@ -71,20 +69,18 @@ static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max)
"vleif %%v31,29,1\n\t"
"vleif %%v31,30,2\n\t"
"vleif %%v31,31,3\n\t"
"srlg %%r0,%2,6 \n\t"
"srlg %[n],%[n],6\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%3) \n\t"
"vl %%v16,0(%%r1,%3) \n\t"
"vl %%v17,16(%%r1,%3) \n\t"
"vl %%v18,32(%%r1,%3) \n\t"
"vl %%v19,48(%%r1,%3) \n\t"
"vl %%v20,64(%%r1,%3) \n\t"
"vl %%v21,80(%%r1,%3) \n\t"
"vl %%v22,96(%%r1,%3) \n\t"
"vl %%v23,112(%%r1,%3) \n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vfchesb %%v5,%%v16,%%v17\n\t"
"vfchesb %%v6,%%v18,%%v19\n\t"
"vfchesb %%v7,%%v20,%%v21\n\t"
@ -97,14 +93,12 @@ static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max)
"vsel %%v7,%%v28,%%v29,%%v7\n\t"
"vsel %%v19,%%v22,%%v23,%%v8\n\t"
"vsel %%v8,%%v30,%%v31,%%v8\n\t"
"vfchesb %%v20,%%v16,%%v17\n\t"
"vfchesb %%v21,%%v18,%%v19\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v5,%%v5,%%v6,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v6,%%v7,%%v8,%%v21\n\t"
"vfchesb %%v18,%%v16,%%v17\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
@ -112,7 +106,6 @@ static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max)
"vesrlg %%v5,%%v5,32\n\t"
"vag %%v5,%%v5,%%v4\n\t"
"vag %%v6,%%v6,%%v4\n\t"
"vfchesb %%v7,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
"vsegf %%v8,%%v7\n\t"
@ -121,16 +114,14 @@ static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max)
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"vl %%v16,128(%%r1,%3) \n\t"
"vl %%v17,144(%%r1,%3) \n\t"
"vl %%v18,160(%%r1,%3) \n\t"
"vl %%v19,176(%%r1,%3) \n\t"
"vl %%v20,192(%%r1,%3) \n\t"
"vl %%v21,208(%%r1,%3) \n\t"
"vl %%v22,224(%%r1,%3) \n\t"
"vl %%v23,240(%%r1,%3) \n\t"
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v17,144(%%r1,%[x])\n\t"
"vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v19,176(%%r1,%[x])\n\t"
"vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v21,208(%%r1,%[x])\n\t"
"vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v23,240(%%r1,%[x])\n\t"
"vfchesb %%v5,%%v16,%%v17\n\t"
"vfchesb %%v6,%%v18,%%v19\n\t"
"vfchesb %%v7,%%v20,%%v21\n\t"
@ -143,14 +134,12 @@ static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max)
"vsel %%v7,%%v28,%%v29,%%v7\n\t"
"vsel %%v19,%%v22,%%v23,%%v8\n\t"
"vsel %%v8,%%v30,%%v31,%%v8\n\t"
"vfchesb %%v20,%%v16,%%v17\n\t"
"vfchesb %%v21,%%v18,%%v19\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v5,%%v5,%%v6,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v6,%%v7,%%v8,%%v21\n\t"
"vfchesb %%v18,%%v16,%%v17\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
@ -158,7 +147,6 @@ static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max)
"vesrlg %%v5,%%v5,32\n\t"
"vag %%v5,%%v5,%%v4\n\t"
"vag %%v6,%%v6,%%v4\n\t"
"vfchesb %%v7,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
"vsegf %%v8,%%v7\n\t"
@ -167,10 +155,8 @@ static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max)
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"agfi %%r1, 256\n\t"
"brctg %%r0, 0b \n\t"
"brctg %[n], 0b\n\t"
"veslg %%v3,%%v0,32\n\t"
"vfchsb %%v4,%%v0,%%v3\n\t"
"vchlg %%v5,%%v2,%%v1\n\t"
@ -181,14 +167,13 @@ static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max)
"vesrlg %%v4,%%v4,32\n\t"
"vsegf %%v4,%%v4\n\t"
"vsel %%v1,%%v1,%%v2,%%v4\n\t"
"vrepf %%v2,%%v0,2\n\t"
"vrepg %%v3,%%v1,1\n\t"
"wfcsb %%v2,%%v0\n\t"
"jne 1f\n\t"
"vstef %%v0,%1,0 \n\t"
"vstef %%v0,%[max],0\n\t"
"vmnlg %%v0,%%v1,%%v3\n\t"
"vlgvg %0,%%v0,0 \n\t"
"vlgvg %[imax],%%v0,0\n\t"
"j 2f\n\t"
"1:\n\t"
"wfchsb %%v4,%%v2,%%v0\n\t"
@ -196,14 +181,15 @@ static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max)
"vsegf %%v4,%%v4\n\t"
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
"ste %%f0,%1 \n\t"
"vlgvg %0,%%v1,0 \n\t"
"ste %%f0,%[max]\n\t"
"vlgvg %[imax],%%v1,0\n\t"
"2:\n\t"
"nop"
:"=r"(imax),"=m"(*max)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v1","v2","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
: [imax] "=r"(imax),[max] "=Q"(*max),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");
return imax;
}
@ -214,7 +200,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
FLOAT maxf = 0.0;
BLASLONG max = 0;
if (n <= 0 || inc_x <= 0) return (max);
if (n <= 0 || inc_x <= 0)
return (max);
if (inc_x == 1) {
@ -224,9 +211,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
max = ismax_kernel_64(n1, x, &maxf);
i = n1;
}
else
{
} else {
maxf = x[0];
i++;
}
@ -271,7 +256,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
}
while (j < n) {
if (x[i] > maxf) {
max = j;

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -27,12 +27,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min)
{
static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min) {
BLASLONG imin;
__asm__ volatile (
"vl %%v0,0(%3) \n\t"
__asm__("vl %%v0,0(%[x])\n\t"
"vleig %%v1,0,0\n\t"
"vleig %%v1,2,1\n\t"
"vleig %%v2,1,0\n\t"
@ -71,20 +69,18 @@ static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min)
"vleif %%v31,29,1\n\t"
"vleif %%v31,30,2\n\t"
"vleif %%v31,31,3\n\t"
"srlg %%r0,%2,6 \n\t"
"srlg %[n],%[n],6\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%3) \n\t"
"vl %%v16,0(%%r1,%3) \n\t"
"vl %%v17,16(%%r1,%3) \n\t"
"vl %%v18,32(%%r1,%3) \n\t"
"vl %%v19,48(%%r1,%3) \n\t"
"vl %%v20,64(%%r1,%3) \n\t"
"vl %%v21,80(%%r1,%3) \n\t"
"vl %%v22,96(%%r1,%3) \n\t"
"vl %%v23,112(%%r1,%3) \n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vfchesb %%v5,%%v17,%%v16\n\t"
"vfchesb %%v6,%%v19,%%v18\n\t"
"vfchesb %%v7,%%v21,%%v20\n\t"
@ -97,14 +93,12 @@ static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min)
"vsel %%v7,%%v28,%%v29,%%v7\n\t"
"vsel %%v19,%%v22,%%v23,%%v8\n\t"
"vsel %%v8,%%v30,%%v31,%%v8\n\t"
"vfchesb %%v20,%%v17,%%v16\n\t"
"vfchesb %%v21,%%v19,%%v18\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v5,%%v5,%%v6,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v6,%%v7,%%v8,%%v21\n\t"
"vfchesb %%v18,%%v17,%%v16\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
@ -112,7 +106,6 @@ static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min)
"vesrlg %%v5,%%v5,32\n\t"
"vag %%v5,%%v5,%%v4\n\t"
"vag %%v6,%%v6,%%v4\n\t"
"vfchesb %%v7,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
"vsegf %%v8,%%v7\n\t"
@ -121,16 +114,14 @@ static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min)
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"vl %%v16,128(%%r1,%3) \n\t"
"vl %%v17,144(%%r1,%3) \n\t"
"vl %%v18,160(%%r1,%3) \n\t"
"vl %%v19,176(%%r1,%3) \n\t"
"vl %%v20,192(%%r1,%3) \n\t"
"vl %%v21,208(%%r1,%3) \n\t"
"vl %%v22,224(%%r1,%3) \n\t"
"vl %%v23,240(%%r1,%3) \n\t"
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v17,144(%%r1,%[x])\n\t"
"vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v19,176(%%r1,%[x])\n\t"
"vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v21,208(%%r1,%[x])\n\t"
"vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v23,240(%%r1,%[x])\n\t"
"vfchesb %%v5,%%v17,%%v16\n\t"
"vfchesb %%v6,%%v19,%%v18\n\t"
"vfchesb %%v7,%%v21,%%v20\n\t"
@ -143,14 +134,12 @@ static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min)
"vsel %%v7,%%v28,%%v29,%%v7\n\t"
"vsel %%v19,%%v22,%%v23,%%v8\n\t"
"vsel %%v8,%%v30,%%v31,%%v8\n\t"
"vfchesb %%v20,%%v17,%%v16\n\t"
"vfchesb %%v21,%%v19,%%v18\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v5,%%v5,%%v6,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v6,%%v7,%%v8,%%v21\n\t"
"vfchesb %%v18,%%v17,%%v16\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
@ -158,7 +147,6 @@ static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min)
"vesrlg %%v5,%%v5,32\n\t"
"vag %%v5,%%v5,%%v4\n\t"
"vag %%v6,%%v6,%%v4\n\t"
"vfchesb %%v7,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
"vsegf %%v8,%%v7\n\t"
@ -167,10 +155,8 @@ static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min)
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"agfi %%r1, 256\n\t"
"brctg %%r0, 0b \n\t"
"brctg %[n], 0b\n\t"
"veslg %%v3,%%v0,32\n\t"
"vfchsb %%v4,%%v3,%%v0\n\t"
"vchlg %%v5,%%v2,%%v1\n\t"
@ -181,14 +167,13 @@ static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min)
"vesrlg %%v4,%%v4,32\n\t"
"vsegf %%v4,%%v4\n\t"
"vsel %%v1,%%v1,%%v2,%%v4\n\t"
"vrepf %%v2,%%v0,2\n\t"
"vrepg %%v3,%%v1,1\n\t"
"wfcsb %%v2,%%v0\n\t"
"jne 1f\n\t"
"vstef %%v0,%1,0 \n\t"
"vstef %%v0,%[min],0\n\t"
"vmnlg %%v0,%%v1,%%v3\n\t"
"vlgvg %0,%%v0,0 \n\t"
"vlgvg %[imin],%%v0,0\n\t"
"j 2f\n\t"
"1:\n\t"
"wfchsb %%v4,%%v0,%%v2\n\t"
@ -196,14 +181,15 @@ static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min)
"vsegf %%v4,%%v4\n\t"
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
"ste %%f0,%1 \n\t"
"vlgvg %0,%%v1,0 \n\t"
"ste %%f0,%[min]\n\t"
"vlgvg %[imin],%%v1,0\n\t"
"2:\n\t"
"nop"
:"=r"(imin),"=m"(*min)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v1","v2","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
: [imin] "=r"(imin),[min] "=Q"(*min),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");
return imin;
}
@ -214,7 +200,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
FLOAT minf = 0.0;
BLASLONG min = 0;
if (n <= 0 || inc_x <= 0) return (min);
if (n <= 0 || inc_x <= 0)
return (min);
if (inc_x == 1) {
@ -224,9 +211,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
min = ismin_kernel_64(n1, x, &minf);
i = n1;
}
else
{
} else {
minf = x[0];
i++;
}
@ -271,7 +256,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
}
while (j < n) {
if (x[i] < minf) {
min = j;

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2017, The OpenBLAS Project
Copyright (c) 2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -28,22 +28,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))
#define CABS1(x,i) (fabs(x[i]) + fabs(x[i + 1]))
static BLASLONG izamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amax)
{
static BLASLONG izamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amax) {
BLASLONG iamax;
__asm__ volatile (
"vleg %%v0,0(%3),0 \n\t"
"vleg %%v1,8(%3),0 \n\t"
"vleg %%v0,16(%3),1 \n\t"
"vleg %%v1,24(%3),1 \n\t"
__asm__("vleg %%v0,0(%[x]),0\n\t"
"vleg %%v1,8(%[x]),0\n\t"
"vleg %%v0,16(%[x]),1\n\t"
"vleg %%v1,24(%[x]),1\n\t"
"vflpdb %%v0,%%v0\n\t"
"vflpdb %%v1,%%v1\n\t"
"vfadb %%v0,%%v0,%%v1\n\t"
@ -59,27 +52,26 @@ static BLASLONG izamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amax)
"vleig %%v26,5,1\n\t"
"vleig %%v27,6,0\n\t"
"vleig %%v27,7,1\n\t"
"srlg %%r0,%2,4 \n\t"
"srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%3) \n\t"
"vleg %%v16,0(%%r1,%3),0 \n\t"
"vleg %%v17,8(%%r1,%3),0 \n\t"
"vleg %%v16,16(%%r1,%3),1 \n\t"
"vleg %%v17,24(%%r1,%3),1 \n\t"
"vleg %%v18,32(%%r1,%3),0 \n\t"
"vleg %%v19,40(%%r1,%3),0 \n\t"
"vleg %%v18,48(%%r1,%3),1 \n\t"
"vleg %%v19,56(%%r1,%3),1 \n\t"
"vleg %%v20,64(%%r1,%3),0 \n\t"
"vleg %%v21,72(%%r1,%3),0 \n\t"
"vleg %%v20,80(%%r1,%3),1 \n\t"
"vleg %%v21,88(%%r1,%3),1 \n\t"
"vleg %%v22,96(%%r1,%3),0 \n\t"
"vleg %%v23,104(%%r1,%3),0 \n\t"
"vleg %%v22,112(%%r1,%3),1 \n\t"
"vleg %%v23,120(%%r1,%3),1 \n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vleg %%v16,0(%%r1,%[x]),0\n\t"
"vleg %%v17,8(%%r1,%[x]),0\n\t"
"vleg %%v16,16(%%r1,%[x]),1\n\t"
"vleg %%v17,24(%%r1,%[x]),1\n\t"
"vleg %%v18,32(%%r1,%[x]),0\n\t"
"vleg %%v19,40(%%r1,%[x]),0\n\t"
"vleg %%v18,48(%%r1,%[x]),1\n\t"
"vleg %%v19,56(%%r1,%[x]),1\n\t"
"vleg %%v20,64(%%r1,%[x]),0\n\t"
"vleg %%v21,72(%%r1,%[x]),0\n\t"
"vleg %%v20,80(%%r1,%[x]),1\n\t"
"vleg %%v21,88(%%r1,%[x]),1\n\t"
"vleg %%v22,96(%%r1,%[x]),0\n\t"
"vleg %%v23,104(%%r1,%[x]),0\n\t"
"vleg %%v22,112(%%r1,%[x]),1\n\t"
"vleg %%v23,120(%%r1,%[x]),1\n\t"
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
@ -92,40 +84,36 @@ static BLASLONG izamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amax)
"vfadb %%v17,%%v18,%%v19\n\t"
"vfadb %%v18,%%v20,%%v21\n\t"
"vfadb %%v19,%%v22,%%v23\n\t"
"vfchedb %%v4,%%v16,%%v17\n\t"
"vfchedb %%v5,%%v18,%%v19\n\t"
"vsel %%v16,%%v16,%%v17,%%v4\n\t"
"vsel %%v4,%%v24,%%v25,%%v4\n\t"
"vsel %%v17,%%v18,%%v19,%%v5\n\t"
"vsel %%v5,%%v26,%%v27,%%v5\n\t"
"vfchedb %%v18,%%v16,%%v17\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"vfchedb %%v5,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
"vag %%v3,%%v3,%%v2\n\t"
"vleg %%v16,128(%%r1,%3),0 \n\t"
"vleg %%v17,136(%%r1,%3),0 \n\t"
"vleg %%v16,144(%%r1,%3),1 \n\t"
"vleg %%v17,152(%%r1,%3),1 \n\t"
"vleg %%v18,160(%%r1,%3),0 \n\t"
"vleg %%v19,168(%%r1,%3),0 \n\t"
"vleg %%v18,176(%%r1,%3),1 \n\t"
"vleg %%v19,184(%%r1,%3),1 \n\t"
"vleg %%v20,192(%%r1,%3),0 \n\t"
"vleg %%v21,200(%%r1,%3),0 \n\t"
"vleg %%v20,208(%%r1,%3),1 \n\t"
"vleg %%v21,216(%%r1,%3),1 \n\t"
"vleg %%v22,224(%%r1,%3),0 \n\t"
"vleg %%v23,232(%%r1,%3),0 \n\t"
"vleg %%v22,240(%%r1,%3),1 \n\t"
"vleg %%v23,248(%%r1,%3),1 \n\t"
"vleg %%v16,128(%%r1,%[x]),0\n\t"
"vleg %%v17,136(%%r1,%[x]),0\n\t"
"vleg %%v16,144(%%r1,%[x]),1\n\t"
"vleg %%v17,152(%%r1,%[x]),1\n\t"
"vleg %%v18,160(%%r1,%[x]),0\n\t"
"vleg %%v19,168(%%r1,%[x]),0\n\t"
"vleg %%v18,176(%%r1,%[x]),1\n\t"
"vleg %%v19,184(%%r1,%[x]),1\n\t"
"vleg %%v20,192(%%r1,%[x]),0\n\t"
"vleg %%v21,200(%%r1,%[x]),0\n\t"
"vleg %%v20,208(%%r1,%[x]),1\n\t"
"vleg %%v21,216(%%r1,%[x]),1\n\t"
"vleg %%v22,224(%%r1,%[x]),0\n\t"
"vleg %%v23,232(%%r1,%[x]),0\n\t"
"vleg %%v22,240(%%r1,%[x]),1\n\t"
"vleg %%v23,248(%%r1,%[x]),1\n\t"
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
@ -138,60 +126,55 @@ static BLASLONG izamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amax)
"vfadb %%v17,%%v18,%%v19\n\t"
"vfadb %%v18,%%v20,%%v21\n\t"
"vfadb %%v19,%%v22,%%v23\n\t"
"vfchedb %%v4,%%v16,%%v17\n\t"
"vfchedb %%v5,%%v18,%%v19\n\t"
"vsel %%v16,%%v16,%%v17,%%v4\n\t"
"vsel %%v4,%%v24,%%v25,%%v4\n\t"
"vsel %%v17,%%v18,%%v19,%%v5\n\t"
"vsel %%v5,%%v26,%%v27,%%v5\n\t"
"vfchedb %%v18,%%v16,%%v17\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"vfchedb %%v5,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
"vag %%v3,%%v3,%%v2\n\t"
"agfi %%r1, 256\n\t"
"brctg %%r0, 0b \n\t"
"brctg %[n], 0b\n\t"
"vrepg %%v2,%%v0,1\n\t"
"vrepg %%v3,%%v1,1\n\t"
"wfcdb %%v2,%%v0\n\t"
"jne 1f\n\t"
"vsteg %%v0,%1,0 \n\t"
"vsteg %%v0,%[amax],0\n\t"
"vmnlg %%v0,%%v1,%%v3\n\t"
"vlgvg %0,%%v0,0 \n\t"
"vlgvg %[iamax],%%v0,0\n\t"
"j 2f\n\t"
"1:\n\t"
"wfchdb %%v4,%%v2,%%v0\n\t"
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
"std %%f0,%1 \n\t"
"vlgvg %0,%%v1,0 \n\t"
"std %%f0,%[amax]\n\t"
"vlgvg %[iamax],%%v1,0\n\t"
"2:\n\t"
"nop"
:"=r"(iamax),"=m"(*amax)
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27"
);
: [iamax] "=r"(iamax),[amax] "=Q"(*amax),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18",
"v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27");
return iamax;
}
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0;
BLASLONG ix = 0;
FLOAT maxf = 0;
BLASLONG max = 0;
BLASLONG inc_x2;
if (n <= 0 || inc_x <= 0) return(max);
if (n <= 0 || inc_x <= 0)
return (max);
if (inc_x == 1) {
@ -201,18 +184,14 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
max = izamax_kernel_16(n1, x, &maxf);
ix = n1 * 2;
i = n1;
}
else
{
} else {
maxf = CABS1(x, 0);
ix += 2;
i++;
}
while(i < n)
{
if( CABS1(x,ix) > maxf )
{
while (i < n) {
if (CABS1(x, ix) > maxf) {
max = i;
maxf = CABS1(x, ix);
}
@ -226,13 +205,35 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
max = 0;
maxf = CABS1(x, 0);
inc_x2 = 2 * inc_x;
ix += inc_x2;
i++;
while(i < n)
{
if( CABS1(x,ix) > maxf )
{
BLASLONG n1 = n & -4;
while (i < n1) {
if (CABS1(x, ix) > maxf) {
max = i;
maxf = CABS1(x, ix);
}
if (CABS1(x, ix + inc_x2) > maxf) {
max = i + 1;
maxf = CABS1(x, ix + inc_x2);
}
if (CABS1(x, ix + 2 * inc_x2) > maxf) {
max = i + 2;
maxf = CABS1(x, ix + 2 * inc_x2);
}
if (CABS1(x, ix + 3 * inc_x2) > maxf) {
max = i + 3;
maxf = CABS1(x, ix + 3 * inc_x2);
}
ix += inc_x2 * 4;
i += 4;
}
while (i < n) {
if (CABS1(x, ix) > maxf) {
max = i;
maxf = CABS1(x, ix);
}
@ -242,5 +243,3 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
return (max + 1);
}
}

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2017, The OpenBLAS Project
Copyright (c) 2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -28,22 +28,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))
#define CABS1(x,i) (fabs(x[i]) + fabs(x[i + 1]))
static BLASLONG izamin_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amin)
{
static BLASLONG izamin_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amin) {
BLASLONG iamin;
__asm__ volatile (
"vleg %%v0,0(%3),0 \n\t"
"vleg %%v1,8(%3),0 \n\t"
"vleg %%v0,16(%3),1 \n\t"
"vleg %%v1,24(%3),1 \n\t"
__asm__("vleg %%v0,0(%[x]),0\n\t"
"vleg %%v1,8(%[x]),0\n\t"
"vleg %%v0,16(%[x]),1\n\t"
"vleg %%v1,24(%[x]),1\n\t"
"vflpdb %%v0,%%v0\n\t"
"vflpdb %%v1,%%v1\n\t"
"vfadb %%v0,%%v0,%%v1\n\t"
@ -59,27 +52,26 @@ static BLASLONG izamin_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amin)
"vleig %%v26,5,1\n\t"
"vleig %%v27,6,0\n\t"
"vleig %%v27,7,1\n\t"
"srlg %%r0,%2,4 \n\t"
"srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%3) \n\t"
"vleg %%v16,0(%%r1,%3),0 \n\t"
"vleg %%v17,8(%%r1,%3),0 \n\t"
"vleg %%v16,16(%%r1,%3),1 \n\t"
"vleg %%v17,24(%%r1,%3),1 \n\t"
"vleg %%v18,32(%%r1,%3),0 \n\t"
"vleg %%v19,40(%%r1,%3),0 \n\t"
"vleg %%v18,48(%%r1,%3),1 \n\t"
"vleg %%v19,56(%%r1,%3),1 \n\t"
"vleg %%v20,64(%%r1,%3),0 \n\t"
"vleg %%v21,72(%%r1,%3),0 \n\t"
"vleg %%v20,80(%%r1,%3),1 \n\t"
"vleg %%v21,88(%%r1,%3),1 \n\t"
"vleg %%v22,96(%%r1,%3),0 \n\t"
"vleg %%v23,104(%%r1,%3),0 \n\t"
"vleg %%v22,112(%%r1,%3),1 \n\t"
"vleg %%v23,120(%%r1,%3),1 \n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vleg %%v16,0(%%r1,%[x]),0\n\t"
"vleg %%v17,8(%%r1,%[x]),0\n\t"
"vleg %%v16,16(%%r1,%[x]),1\n\t"
"vleg %%v17,24(%%r1,%[x]),1\n\t"
"vleg %%v18,32(%%r1,%[x]),0\n\t"
"vleg %%v19,40(%%r1,%[x]),0\n\t"
"vleg %%v18,48(%%r1,%[x]),1\n\t"
"vleg %%v19,56(%%r1,%[x]),1\n\t"
"vleg %%v20,64(%%r1,%[x]),0\n\t"
"vleg %%v21,72(%%r1,%[x]),0\n\t"
"vleg %%v20,80(%%r1,%[x]),1\n\t"
"vleg %%v21,88(%%r1,%[x]),1\n\t"
"vleg %%v22,96(%%r1,%[x]),0\n\t"
"vleg %%v23,104(%%r1,%[x]),0\n\t"
"vleg %%v22,112(%%r1,%[x]),1\n\t"
"vleg %%v23,120(%%r1,%[x]),1\n\t"
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
@ -92,40 +84,36 @@ static BLASLONG izamin_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amin)
"vfadb %%v17,%%v18,%%v19\n\t"
"vfadb %%v18,%%v20,%%v21\n\t"
"vfadb %%v19,%%v22,%%v23\n\t"
"vfchedb %%v4,%%v17,%%v16\n\t"
"vfchedb %%v5,%%v19,%%v18\n\t"
"vsel %%v16,%%v16,%%v17,%%v4\n\t"
"vsel %%v4,%%v24,%%v25,%%v4\n\t"
"vsel %%v17,%%v18,%%v19,%%v5\n\t"
"vsel %%v5,%%v26,%%v27,%%v5\n\t"
"vfchedb %%v18,%%v17,%%v16\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"vfchedb %%v5,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
"vag %%v3,%%v3,%%v2\n\t"
"vleg %%v16,128(%%r1,%3),0 \n\t"
"vleg %%v17,136(%%r1,%3),0 \n\t"
"vleg %%v16,144(%%r1,%3),1 \n\t"
"vleg %%v17,152(%%r1,%3),1 \n\t"
"vleg %%v18,160(%%r1,%3),0 \n\t"
"vleg %%v19,168(%%r1,%3),0 \n\t"
"vleg %%v18,176(%%r1,%3),1 \n\t"
"vleg %%v19,184(%%r1,%3),1 \n\t"
"vleg %%v20,192(%%r1,%3),0 \n\t"
"vleg %%v21,200(%%r1,%3),0 \n\t"
"vleg %%v20,208(%%r1,%3),1 \n\t"
"vleg %%v21,216(%%r1,%3),1 \n\t"
"vleg %%v22,224(%%r1,%3),0 \n\t"
"vleg %%v23,232(%%r1,%3),0 \n\t"
"vleg %%v22,240(%%r1,%3),1 \n\t"
"vleg %%v23,248(%%r1,%3),1 \n\t"
"vleg %%v16,128(%%r1,%[x]),0\n\t"
"vleg %%v17,136(%%r1,%[x]),0\n\t"
"vleg %%v16,144(%%r1,%[x]),1\n\t"
"vleg %%v17,152(%%r1,%[x]),1\n\t"
"vleg %%v18,160(%%r1,%[x]),0\n\t"
"vleg %%v19,168(%%r1,%[x]),0\n\t"
"vleg %%v18,176(%%r1,%[x]),1\n\t"
"vleg %%v19,184(%%r1,%[x]),1\n\t"
"vleg %%v20,192(%%r1,%[x]),0\n\t"
"vleg %%v21,200(%%r1,%[x]),0\n\t"
"vleg %%v20,208(%%r1,%[x]),1\n\t"
"vleg %%v21,216(%%r1,%[x]),1\n\t"
"vleg %%v22,224(%%r1,%[x]),0\n\t"
"vleg %%v23,232(%%r1,%[x]),0\n\t"
"vleg %%v22,240(%%r1,%[x]),1\n\t"
"vleg %%v23,248(%%r1,%[x]),1\n\t"
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
@ -138,60 +126,55 @@ static BLASLONG izamin_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amin)
"vfadb %%v17,%%v18,%%v19\n\t"
"vfadb %%v18,%%v20,%%v21\n\t"
"vfadb %%v19,%%v22,%%v23\n\t"
"vfchedb %%v4,%%v17,%%v16\n\t"
"vfchedb %%v5,%%v19,%%v18\n\t"
"vsel %%v16,%%v16,%%v17,%%v4\n\t"
"vsel %%v4,%%v24,%%v25,%%v4\n\t"
"vsel %%v17,%%v18,%%v19,%%v5\n\t"
"vsel %%v5,%%v26,%%v27,%%v5\n\t"
"vfchedb %%v18,%%v17,%%v16\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"vfchedb %%v5,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
"vag %%v3,%%v3,%%v2\n\t"
"agfi %%r1, 256\n\t"
"brctg %%r0, 0b \n\t"
"brctg %[n], 0b\n\t"
"vrepg %%v2,%%v0,1\n\t"
"vrepg %%v3,%%v1,1\n\t"
"wfcdb %%v2,%%v0\n\t"
"jne 1f\n\t"
"vsteg %%v0,%1,0 \n\t"
"vsteg %%v0,%[amin],0\n\t"
"vmnlg %%v0,%%v1,%%v3\n\t"
"vlgvg %0,%%v0,0 \n\t"
"vlgvg %[iamin],%%v0,0\n\t"
"j 2f\n\t"
"1:\n\t"
"wfchdb %%v4,%%v0,%%v2\n\t"
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
"std %%f0,%1 \n\t"
"vlgvg %0,%%v1,0 \n\t"
"std %%f0,%[amin]\n\t"
"vlgvg %[iamin],%%v1,0\n\t"
"2:\n\t"
"nop"
:"=r"(iamin),"=m"(*amin)
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27"
);
: [iamin] "=r"(iamin),[amin] "=Q"(*amin),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18",
"v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27");
return iamin;
}
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0;
BLASLONG ix = 0;
FLOAT minf = 0;
BLASLONG min = 0;
BLASLONG inc_x2;
if (n <= 0 || inc_x <= 0) return(min);
if (n <= 0 || inc_x <= 0)
return (min);
if (inc_x == 1) {
@ -201,18 +184,14 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
min = izamin_kernel_16(n1, x, &minf);
ix = n1 * 2;
i = n1;
}
else
{
} else {
minf = CABS1(x, 0);
ix += 2;
i++;
}
while(i < n)
{
if( CABS1(x,ix) < minf )
{
while (i < n) {
if (CABS1(x, ix) < minf) {
min = i;
minf = CABS1(x, ix);
}
@ -226,13 +205,35 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
min = 0;
minf = CABS1(x, 0);
inc_x2 = 2 * inc_x;
ix += inc_x2;
i++;
while(i < n)
{
if( CABS1(x,ix) < minf )
{
BLASLONG n1 = n & -4;
while (i < n1) {
if (CABS1(x, ix) < minf) {
min = i;
minf = CABS1(x, ix);
}
if (CABS1(x, ix + inc_x2) < minf) {
min = i + 1;
minf = CABS1(x, ix + inc_x2);
}
if (CABS1(x, ix + 2 * inc_x2) < minf) {
min = i + 2;
minf = CABS1(x, ix + 2 * inc_x2);
}
if (CABS1(x, ix + 3 * inc_x2) < minf) {
min = i + 3;
minf = CABS1(x, ix + 3 * inc_x2);
}
ix += inc_x2 * 4;
i += 4;
}
while (i < n) {
if (CABS1(x, ix) < minf) {
min = i;
minf = CABS1(x, ix);
}
@ -242,5 +243,3 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
return (min + 1);
}
}

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -28,40 +28,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
static FLOAT samax_kernel_64(BLASLONG n, FLOAT *x)
{
static FLOAT samax_kernel_64(BLASLONG n, FLOAT *x) {
FLOAT amax;
__asm__ volatile (
"vl %%v0,0(%2) \n\t"
"srlg %%r0,%1,6 \n\t"
__asm__("vl %%v0,0(%[x])\n\t"
"srlg %[n],%[n],6\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%2) \n\t"
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"vl %%v24,128(%%r1,%2) \n\t"
"vl %%v25,144(%%r1,%2) \n\t"
"vl %%v26,160(%%r1,%2) \n\t"
"vl %%v27,176(%%r1,%2) \n\t"
"vl %%v28,192(%%r1,%2) \n\t"
"vl %%v29,208(%%r1,%2) \n\t"
"vl %%v30,224(%%r1,%2) \n\t"
"vl %%v31,240(%%r1,%2) \n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v24,128(%%r1,%[x])\n\t"
"vl %%v25,144(%%r1,%[x])\n\t"
"vl %%v26,160(%%r1,%[x])\n\t"
"vl %%v27,176(%%r1,%[x])\n\t"
"vl %%v28,192(%%r1,%[x])\n\t"
"vl %%v29,208(%%r1,%[x])\n\t"
"vl %%v30,224(%%r1,%[x])\n\t"
"vl %%v31,240(%%r1,%[x])\n\t"
"vfmaxsb %%v16,%%v16,%%v24,8\n\t"
"vfmaxsb %%v17,%%v17,%%v25,8\n\t"
"vfmaxsb %%v18,%%v18,%%v26,8\n\t"
@ -70,32 +62,25 @@ static FLOAT samax_kernel_64(BLASLONG n, FLOAT *x)
"vfmaxsb %%v21,%%v21,%%v29,8\n\t"
"vfmaxsb %%v22,%%v22,%%v30,8\n\t"
"vfmaxsb %%v23,%%v23,%%v31,8\n\t"
"vfmaxsb %%v16,%%v16,%%v20,8\n\t"
"vfmaxsb %%v17,%%v17,%%v21,8\n\t"
"vfmaxsb %%v18,%%v18,%%v22,8\n\t"
"vfmaxsb %%v19,%%v19,%%v23,8\n\t"
"vfmaxsb %%v16,%%v16,%%v18,8\n\t"
"vfmaxsb %%v17,%%v17,%%v19,8\n\t"
"vfmaxsb %%v16,%%v16,%%v17,8\n\t"
"vfmaxsb %%v0,%%v0,%%v16,8\n\t"
"agfi %%r1, 256\n\t"
"brctg %%r0, 0b \n\t"
"brctg %[n], 0b\n\t"
"veslg %%v16,%%v0,32\n\t"
"vfmaxsb %%v0,%%v0,%%v16,8\n\t"
"vrepf %%v16,%%v0,2\n\t"
"wfmaxsb %%v0,%%v0,%%v16,8\n\t"
"lper %0,%%f0 "
:"=f"(amax)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
"lper %[amax],%%f0"
: [amax] "=f"(amax),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
return amax;
}
@ -105,7 +90,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG j = 0;
FLOAT maxf = 0.0;
if (n <= 0 || inc_x <= 0) return (maxf);
if (n <= 0 || inc_x <= 0)
return (maxf);
if (inc_x == 1) {
@ -115,9 +101,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
maxf = samax_kernel_64(n1, x);
i = n1;
}
else
{
} else {
maxf = ABS(x[0]);
i++;
}
@ -156,7 +140,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
}
while (j < n) {
if (ABS(x[i]) > maxf) {
maxf = ABS(x[i]);

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -28,40 +28,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
static FLOAT samin_kernel_64(BLASLONG n, FLOAT *x)
{
static FLOAT samin_kernel_64(BLASLONG n, FLOAT *x) {
FLOAT amin;
__asm__ volatile (
"vl %%v0,0(%2) \n\t"
"srlg %%r0,%1,6 \n\t"
__asm__("vl %%v0,0(%[x])\n\t"
"srlg %[n],%[n],6\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%2) \n\t"
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"vl %%v24,128(%%r1,%2) \n\t"
"vl %%v25,144(%%r1,%2) \n\t"
"vl %%v26,160(%%r1,%2) \n\t"
"vl %%v27,176(%%r1,%2) \n\t"
"vl %%v28,192(%%r1,%2) \n\t"
"vl %%v29,208(%%r1,%2) \n\t"
"vl %%v30,224(%%r1,%2) \n\t"
"vl %%v31,240(%%r1,%2) \n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v24,128(%%r1,%[x])\n\t"
"vl %%v25,144(%%r1,%[x])\n\t"
"vl %%v26,160(%%r1,%[x])\n\t"
"vl %%v27,176(%%r1,%[x])\n\t"
"vl %%v28,192(%%r1,%[x])\n\t"
"vl %%v29,208(%%r1,%[x])\n\t"
"vl %%v30,224(%%r1,%[x])\n\t"
"vl %%v31,240(%%r1,%[x])\n\t"
"vfminsb %%v16,%%v16,%%v24,8\n\t"
"vfminsb %%v17,%%v17,%%v25,8\n\t"
"vfminsb %%v18,%%v18,%%v26,8\n\t"
@ -70,32 +62,25 @@ static FLOAT samin_kernel_64(BLASLONG n, FLOAT *x)
"vfminsb %%v21,%%v21,%%v29,8\n\t"
"vfminsb %%v22,%%v22,%%v30,8\n\t"
"vfminsb %%v23,%%v23,%%v31,8\n\t"
"vfminsb %%v16,%%v16,%%v20,8\n\t"
"vfminsb %%v17,%%v17,%%v21,8\n\t"
"vfminsb %%v18,%%v18,%%v22,8\n\t"
"vfminsb %%v19,%%v19,%%v23,8\n\t"
"vfminsb %%v16,%%v16,%%v18,8\n\t"
"vfminsb %%v17,%%v17,%%v19,8\n\t"
"vfminsb %%v16,%%v16,%%v17,8\n\t"
"vfminsb %%v0,%%v0,%%v16,8\n\t"
"agfi %%r1, 256\n\t"
"brctg %%r0, 0b \n\t"
"brctg %[n], 0b\n\t"
"veslg %%v16,%%v0,32\n\t"
"vfminsb %%v0,%%v0,%%v16,8\n\t"
"vrepf %%v16,%%v0,2\n\t"
"wfminsb %%v0,%%v0,%%v16,8\n\t"
"lper %0,%%f0 "
:"=f"(amin)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
"lper %[amin],%%f0"
: [amin] "=f"(amin),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
return amin;
}
@ -105,7 +90,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG j = 0;
FLOAT minf = 0.0;
if (n <= 0 || inc_x <= 0) return (minf);
if (n <= 0 || inc_x <= 0)
return (minf);
if (inc_x == 1) {
@ -115,9 +101,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
minf = samin_kernel_64(n1, x);
i = n1;
}
else
{
} else {
minf = ABS(x[0]);
i++;
}
@ -156,7 +140,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
}
while (j < n) {
if (ABS(x[i]) < minf) {
minf = ABS(x[i]);

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -28,34 +28,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
static FLOAT sasum_kernel_64(BLASLONG n, FLOAT *x)
{
static FLOAT sasum_kernel_64(BLASLONG n, FLOAT *x) {
FLOAT asum;
__asm__ (
"vzero %%v0 \n\t"
"vzero %%v1 \n\t"
"vzero %%v2 \n\t"
"vzero %%v3 \n\t"
"srlg %%r0,%1,6 \n\t"
__asm__("vzero %%v24\n\t"
"vzero %%v25\n\t"
"vzero %%v26\n\t"
"vzero %%v27\n\t"
"vzero %%v28\n\t"
"vzero %%v29\n\t"
"vzero %%v30\n\t"
"vzero %%v31\n\t"
"srlg %[n],%[n],6\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%2) \n\t"
"vl %%v16, 0(%%r1,%2) \n\t"
"vl %%v17, 16(%%r1,%2) \n\t"
"vl %%v18, 32(%%r1,%2) \n\t"
"vl %%v19, 48(%%r1,%2) \n\t"
"vl %%v20, 64(%%r1,%2) \n\t"
"vl %%v21, 80(%%r1,%2) \n\t"
"vl %%v22, 96(%%r1,%2) \n\t"
"vl %%v23, 112(%%r1,%2) \n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16, 0(%%r1,%[x])\n\t"
"vl %%v17, 16(%%r1,%[x])\n\t"
"vl %%v18, 32(%%r1,%[x])\n\t"
"vl %%v19, 48(%%r1,%[x])\n\t"
"vl %%v20, 64(%%r1,%[x])\n\t"
"vl %%v21, 80(%%r1,%[x])\n\t"
"vl %%v22, 96(%%r1,%[x])\n\t"
"vl %%v23, 112(%%r1,%[x])\n\t"
"vflpsb %%v16, %%v16\n\t"
"vflpsb %%v17, %%v17\n\t"
"vflpsb %%v18, %%v18\n\t"
@ -64,25 +61,22 @@ static FLOAT sasum_kernel_64(BLASLONG n, FLOAT *x)
"vflpsb %%v21, %%v21\n\t"
"vflpsb %%v22, %%v22\n\t"
"vflpsb %%v23, %%v23\n\t"
"vfasb %%v0,%%v0,%%v16 \n\t"
"vfasb %%v1,%%v1,%%v17 \n\t"
"vfasb %%v2,%%v2,%%v18 \n\t"
"vfasb %%v3,%%v3,%%v19 \n\t"
"vfasb %%v0,%%v0,%%v20 \n\t"
"vfasb %%v1,%%v1,%%v21 \n\t"
"vfasb %%v2,%%v2,%%v22 \n\t"
"vfasb %%v3,%%v3,%%v23 \n\t"
"vl %%v16, 128(%%r1,%2) \n\t"
"vl %%v17, 144(%%r1,%2) \n\t"
"vl %%v18, 160(%%r1,%2) \n\t"
"vl %%v19, 176(%%r1,%2) \n\t"
"vl %%v20, 192(%%r1,%2) \n\t"
"vl %%v21, 208(%%r1,%2) \n\t"
"vl %%v22, 224(%%r1,%2) \n\t"
"vl %%v23, 240(%%r1,%2) \n\t"
"vfasb %%v24,%%v24,%%v16\n\t"
"vfasb %%v25,%%v25,%%v17\n\t"
"vfasb %%v26,%%v26,%%v18\n\t"
"vfasb %%v27,%%v27,%%v19\n\t"
"vfasb %%v28,%%v28,%%v20\n\t"
"vfasb %%v29,%%v29,%%v21\n\t"
"vfasb %%v30,%%v30,%%v22\n\t"
"vfasb %%v31,%%v31,%%v23\n\t"
"vl %%v16, 128(%%r1,%[x])\n\t"
"vl %%v17, 144(%%r1,%[x])\n\t"
"vl %%v18, 160(%%r1,%[x])\n\t"
"vl %%v19, 176(%%r1,%[x])\n\t"
"vl %%v20, 192(%%r1,%[x])\n\t"
"vl %%v21, 208(%%r1,%[x])\n\t"
"vl %%v22, 224(%%r1,%[x])\n\t"
"vl %%v23, 240(%%r1,%[x])\n\t"
"vflpsb %%v16, %%v16\n\t"
"vflpsb %%v17, %%v17\n\t"
"vflpsb %%v18, %%v18\n\t"
@ -91,30 +85,32 @@ static FLOAT sasum_kernel_64(BLASLONG n, FLOAT *x)
"vflpsb %%v21, %%v21\n\t"
"vflpsb %%v22, %%v22\n\t"
"vflpsb %%v23, %%v23\n\t"
"vfasb %%v0,%%v0,%%v16 \n\t"
"vfasb %%v1,%%v1,%%v17 \n\t"
"vfasb %%v2,%%v2,%%v18 \n\t"
"vfasb %%v3,%%v3,%%v19 \n\t"
"vfasb %%v0,%%v0,%%v20 \n\t"
"vfasb %%v1,%%v1,%%v21 \n\t"
"vfasb %%v2,%%v2,%%v22 \n\t"
"vfasb %%v3,%%v3,%%v23 \n\t"
"vfasb %%v24,%%v24,%%v16\n\t"
"vfasb %%v25,%%v25,%%v17\n\t"
"vfasb %%v26,%%v26,%%v18\n\t"
"vfasb %%v27,%%v27,%%v19\n\t"
"vfasb %%v28,%%v28,%%v20\n\t"
"vfasb %%v29,%%v29,%%v21\n\t"
"vfasb %%v30,%%v30,%%v22\n\t"
"vfasb %%v31,%%v31,%%v23\n\t"
"agfi %%r1,256\n\t"
"brctg %%r0,0b \n\t"
"vfasb %%v0,%%v0,%%v1 \n\t"
"vfasb %%v0,%%v0,%%v2 \n\t"
"vfasb %%v0,%%v0,%%v3 \n\t"
"veslg %%v1,%%v0,32 \n\t"
"vfasb %%v0,%%v0,%%v1 \n\t"
"vrepf %%v1,%%v0,2 \n\t"
"aebr %%f0,%%f1 \n\t"
"ler %0,%%f0 "
:"=f"(asum)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23"
);
"brctg %[n],0b\n\t"
"vfasb %%v24,%%v24,%%v25\n\t"
"vfasb %%v24,%%v24,%%v26\n\t"
"vfasb %%v24,%%v24,%%v27\n\t"
"vfasb %%v24,%%v24,%%v28\n\t"
"vfasb %%v24,%%v24,%%v29\n\t"
"vfasb %%v24,%%v24,%%v30\n\t"
"vfasb %%v24,%%v24,%%v31\n\t"
"veslg %%v25,%%v24,32\n\t"
"vfasb %%v24,%%v24,%%v25\n\t"
"vrepf %%v25,%%v24,2\n\t"
"vfasb %%v24,%%v24,%%v25\n\t"
"vstef %%v24,%[asum],0"
: [asum] "=Q"(asum),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
: "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
"v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
return asum;
}
@ -125,7 +121,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
FLOAT sumf = 0.0;
BLASLONG n1;
if (n <= 0 || inc_x <= 0) return sumf;
if (n <= 0 || inc_x <= 0)
return sumf;
if (inc_x == 1) {
@ -166,9 +163,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
j++;
}
}
return sumf;
}

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -27,107 +27,96 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
/*
 * saxpy_kernel_64: vector kernel that updates y in place with alpha * x + y.
 *
 * What the assembly does, as far as the visible lines show:
 *  - vlrepf replicates *alpha into every element of v0;
 *  - srlg ...,6 turns n into a trip count of n / 64, which brctg counts down;
 *  - r1 is zeroed with xgr and used as the byte offset into both x and y,
 *    advancing by 256 per trip (agfi);
 *  - each trip is handled as two 128-byte halves: vl loads x and y, vfmasb
 *    forms v0 * x + y, and vst writes the result back over y;
 *  - pfd issues prefetch hints 1024 bytes ahead of the current offset.
 *
 * NOTE(review): this span carries two renderings of the function interleaved
 * line by line. One uses positional operands (%0..%3), r0 as the counter and
 * a "memory" clobber; the other uses named operands ([n], [x], [y], [alpha]),
 * counts down n itself ("+&r") and describes x and y as "m"/"+m" operands.
 * They also differ in where the second group of vfmasb results lands
 * (v20..v23 versus v24..v27). Confirm which rendering is the live one.
 *
 * NOTE(review): a trip count of 0 is not guarded inside the asm; presumably
 * the caller only passes an n that is a non-zero multiple of 64 - verify.
 */
static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
__asm__ volatile(
"vlrepf %%v0,%3 \n\t"
"srlg %%r0,%0,6 \n\t"
static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) {
__asm__("vlrepf %%v0,%[alpha]\n\t"
"srlg %[n],%[n],6\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%1) \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"
"vl %%v16,0(%%r1,%1) \n\t"
"vl %%v17,16(%%r1,%1) \n\t"
"vl %%v18,32(%%r1,%1) \n\t"
"vl %%v19,48(%%r1,%1) \n\t"
"vl %%v20,0(%%r1,%2) \n\t"
"vl %%v21,16(%%r1,%2) \n\t"
"vl %%v22,32(%%r1,%2) \n\t"
"vl %%v23,48(%%r1,%2) \n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"pfd 2, 1024(%%r1,%[y])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,0(%%r1,%[y])\n\t"
"vl %%v21,16(%%r1,%[y])\n\t"
"vl %%v22,32(%%r1,%[y])\n\t"
"vl %%v23,48(%%r1,%[y])\n\t"
"vl %%v24,64(%%r1,%[x])\n\t"
"vl %%v25,80(%%r1,%[x])\n\t"
"vl %%v26,96(%%r1,%[x])\n\t"
"vl %%v27,112(%%r1,%[x])\n\t"
"vl %%v28,64(%%r1,%[y])\n\t"
"vl %%v29,80(%%r1,%[y])\n\t"
"vl %%v30,96(%%r1,%[y])\n\t"
"vl %%v31,112(%%r1,%[y])\n\t"
"vfmasb %%v16,%%v0,%%v16,%%v20\n\t"
"vfmasb %%v17,%%v0,%%v17,%%v21\n\t"
"vfmasb %%v18,%%v0,%%v18,%%v22\n\t"
"vfmasb %%v19,%%v0,%%v19,%%v23\n\t"
"vl %%v24,64(%%r1,%1) \n\t"
"vl %%v25,80(%%r1,%1) \n\t"
"vl %%v26,96(%%r1,%1) \n\t"
"vl %%v27,112(%%r1,%1) \n\t"
"vl %%v28,64(%%r1,%2) \n\t"
"vl %%v29,80(%%r1,%2) \n\t"
"vl %%v30,96(%%r1,%2) \n\t"
"vl %%v31,112(%%r1,%2) \n\t"
"vfmasb %%v20,%%v0,%%v24,%%v28 \n\t"
"vfmasb %%v21,%%v0,%%v25,%%v29 \n\t"
"vfmasb %%v22,%%v0,%%v26,%%v30 \n\t"
"vfmasb %%v23,%%v0,%%v27,%%v31 \n\t"
"vst %%v16,0(%%r1,%2) \n\t"
"vst %%v17,16(%%r1,%2) \n\t"
"vst %%v18,32(%%r1,%2) \n\t"
"vst %%v19,48(%%r1,%2) \n\t"
"vst %%v20,64(%%r1,%2) \n\t"
"vst %%v21,80(%%r1,%2) \n\t"
"vst %%v22,96(%%r1,%2) \n\t"
"vst %%v23,112(%%r1,%2) \n\t"
"vl %%v16,128(%%r1,%1) \n\t"
"vl %%v17,144(%%r1,%1) \n\t"
"vl %%v18,160(%%r1,%1) \n\t"
"vl %%v19,176(%%r1,%1) \n\t"
"vl %%v20,128(%%r1,%2) \n\t"
"vl %%v21,144(%%r1,%2) \n\t"
"vl %%v22,160(%%r1,%2) \n\t"
"vl %%v23,176(%%r1,%2) \n\t"
"vfmasb %%v24,%%v0,%%v24,%%v28\n\t"
"vfmasb %%v25,%%v0,%%v25,%%v29\n\t"
"vfmasb %%v26,%%v0,%%v26,%%v30\n\t"
"vfmasb %%v27,%%v0,%%v27,%%v31\n\t"
"vst %%v16,0(%%r1,%[y])\n\t"
"vst %%v17,16(%%r1,%[y])\n\t"
"vst %%v18,32(%%r1,%[y])\n\t"
"vst %%v19,48(%%r1,%[y])\n\t"
"vst %%v24,64(%%r1,%[y])\n\t"
"vst %%v25,80(%%r1,%[y])\n\t"
"vst %%v26,96(%%r1,%[y])\n\t"
"vst %%v27,112(%%r1,%[y])\n\t"
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v17,144(%%r1,%[x])\n\t"
"vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v19,176(%%r1,%[x])\n\t"
"vl %%v20,128(%%r1,%[y])\n\t"
"vl %%v21,144(%%r1,%[y])\n\t"
"vl %%v22,160(%%r1,%[y])\n\t"
"vl %%v23,176(%%r1,%[y])\n\t"
"vl %%v24,192(%%r1,%[x])\n\t"
"vl %%v25,208(%%r1,%[x])\n\t"
"vl %%v26,224(%%r1,%[x])\n\t"
"vl %%v27,240(%%r1,%[x])\n\t"
"vl %%v28,192(%%r1,%[y])\n\t"
"vl %%v29,208(%%r1,%[y])\n\t"
"vl %%v30,224(%%r1,%[y])\n\t"
"vl %%v31,240(%%r1,%[y])\n\t"
"vfmasb %%v16,%%v0,%%v16,%%v20\n\t"
"vfmasb %%v17,%%v0,%%v17,%%v21\n\t"
"vfmasb %%v18,%%v0,%%v18,%%v22\n\t"
"vfmasb %%v19,%%v0,%%v19,%%v23\n\t"
"vl %%v24,192(%%r1,%1) \n\t"
"vl %%v25,208(%%r1,%1) \n\t"
"vl %%v26,224(%%r1,%1) \n\t"
"vl %%v27,240(%%r1,%1) \n\t"
"vl %%v28,192(%%r1,%2) \n\t"
"vl %%v29,208(%%r1,%2) \n\t"
"vl %%v30,224(%%r1,%2) \n\t"
"vl %%v31,240(%%r1,%2) \n\t"
"vfmasb %%v20,%%v0,%%v24,%%v28 \n\t"
"vfmasb %%v21,%%v0,%%v25,%%v29 \n\t"
"vfmasb %%v22,%%v0,%%v26,%%v30 \n\t"
"vfmasb %%v23,%%v0,%%v27,%%v31 \n\t"
"vst %%v16,128(%%r1,%2) \n\t"
"vst %%v17,144(%%r1,%2) \n\t"
"vst %%v18,160(%%r1,%2) \n\t"
"vst %%v19,176(%%r1,%2) \n\t"
"vst %%v20,192(%%r1,%2) \n\t"
"vst %%v21,208(%%r1,%2) \n\t"
"vst %%v22,224(%%r1,%2) \n\t"
"vst %%v23,240(%%r1,%2) \n\t"
"vfmasb %%v24,%%v0,%%v24,%%v28\n\t"
"vfmasb %%v25,%%v0,%%v25,%%v29\n\t"
"vfmasb %%v26,%%v0,%%v26,%%v30\n\t"
"vfmasb %%v27,%%v0,%%v27,%%v31\n\t"
"vst %%v16,128(%%r1,%[y])\n\t"
"vst %%v17,144(%%r1,%[y])\n\t"
"vst %%v18,160(%%r1,%[y])\n\t"
"vst %%v19,176(%%r1,%[y])\n\t"
"vst %%v24,192(%%r1,%[y])\n\t"
"vst %%v25,208(%%r1,%[y])\n\t"
"vst %%v26,224(%%r1,%[y])\n\t"
"vst %%v27,240(%%r1,%[y])\n\t"
"agfi %%r1,256\n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y),"m"(*alpha)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
"brctg %[n],0b"
: "+m"(*(struct { FLOAT x[n]; } *) y),[n] "+&r"(n)
: [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),
[alpha] "Q"(*alpha)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
}
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x,
BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
BLASLONG dummy2) {
BLASLONG i = 0;
BLASLONG ix = 0, iy = 0;
if ( n <= 0 ) return 0 ;
if (n <= 0)
return 0;
if ( (inc_x == 1) && (inc_y == 1) )
{
if ((inc_x == 1) && (inc_y == 1)) {
BLASLONG n1 = n & -64;
@ -135,8 +124,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
saxpy_kernel_64(n1, x, y, &da);
i = n1;
while(i < n)
{
while (i < n) {
y[i] += da * x[i];
i++;
@ -144,13 +132,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
}
return 0;
}
BLASLONG n1 = n & -4;
while(i < n1)
{
while (i < n1) {
FLOAT m1 = da * x[ix];
FLOAT m2 = da * x[ix + inc_x];
@ -168,8 +154,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
}
while(i < n)
{
while (i < n) {
y[iy] += da * x[ix];
ix += inc_x;
@ -180,5 +165,3 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
return 0;
}

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -27,30 +27,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
/*
 * scopy_kernel_64: block-copy kernel that moves data from x to y.
 *
 * What the assembly does, as far as the visible lines show:
 *  - srlg ...,6 turns n into a trip count of n / 64, which brctg counts down;
 *  - each trip copies one 256-byte chunk with a single mvc and then advances
 *    both the source and the destination address by 256;
 *  - pfd issues prefetch hints 1024 bytes ahead of each address.
 *
 * NOTE(review): this span carries two renderings of the function interleaved
 * line by line. One copies x and y into r1/r2 with lgr, counts in r0, steps
 * with agfi and declares a "memory" clobber; the other steps the [x] and [y]
 * operands themselves with la ("+&a"), counts down n in place ("+&r") and
 * describes the buffers as "=m"/"m" operands. Confirm which one is live.
 *
 * NOTE(review): a trip count of 0 is not guarded inside the asm; presumably
 * the caller only passes an n that is a non-zero multiple of 64 - verify.
 */
static void scopy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y)
{
__asm__ volatile (
"lgr %%r1,%1 \n\t"
"lgr %%r2,%2 \n\t"
"srlg %%r0,%0,6 \n\t"
static void scopy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y) {
__asm__("srlg %[n],%[n],6\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1) \n\t"
"pfd 2, 1024(%%r2) \n\t"
"mvc 0(256,%%r2),0(%%r1) \n\t"
"agfi %%r1,256 \n\t"
"agfi %%r2,256 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"a"((const FLOAT (*)[n])x),"a"((FLOAT (*)[n])y)
:"memory","cc","r0","r1","r2"
);
"pfd 1, 1024(%[x])\n\t"
"pfd 2, 1024(%[y])\n\t"
"mvc 0(256,%[y]),0(%[x])\n\t"
"la %[x],256(%[x])\n\t"
"la %[y],256(%[y])\n\t"
"brctg %[n],0b"
: "=m"(*(struct { FLOAT x[n]; } *) y),[x] "+&a"(x),[y] "+&a"(y),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n]; } *) x)
: "cc");
}
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
BLASLONG i = 0;
BLASLONG ix = 0, iy = 0;
if (n <= 0) return 0;
if (n <= 0)
return 0;
if ((inc_x == 1) && (inc_y == 1)) {
@ -66,7 +62,6 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
}
} else {
while (i < n) {
@ -81,5 +76,4 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
}
return 0;
}

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2013-2018,The OpenBLAS Project
Copyright (c) 2013-2019,The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms,with or without
modification,are permitted provided that the following conditions are
@ -27,72 +27,82 @@ USE OF THIS SOFTWARE,EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
/*
 * sdot_kernel_32: vector kernel that returns the dot product of x and y.
 *
 * What the assembly does, as far as the visible lines show:
 *  - srlg ...,5 turns n into a trip count of n / 32, which brctg counts down;
 *  - r1 is zeroed with xgr and used as the byte offset into both x and y,
 *    advancing by 128 per trip (agfi);
 *  - each trip loads eight 16-byte vectors from x (v16..v23) and eight from y
 *    (v24..v31) and accumulates their element-wise products with vfmasb;
 *  - after the loop the partial sums are folded: vrepf copies elements 1..3 of
 *    v0 into v1..v3 and aebr adds them into f0, which ler hands back as dot
 *    (presumably relying on f0..f3 overlaying the leading element of v0..v3 -
 *    TODO confirm against the architecture manual).
 *
 * NOTE(review): this span carries two renderings of the function interleaved
 * line by line. One accumulates everything into v0 alone, counts in r0 and
 * declares a "memory" clobber; the other zeroes eight accumulators v0..v7,
 * gives each vfmasb its own accumulator, sums them with vfasb after the loop,
 * counts down n in place ("+&r") and describes x and y as "m" operands.
 * The two orderings add in a different sequence, so results may differ in the
 * last bits. Confirm which rendering is the live one.
 *
 * NOTE(review): a trip count of 0 is not guarded inside the asm; presumably
 * the caller only passes an n that is a non-zero multiple of 32 - verify.
 */
static FLOAT sdot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
{
static FLOAT sdot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) {
FLOAT dot;
__asm__ volatile (
"vzero %%v0 \n\t"
"srlg %%r0,%1,5 \n\t"
__asm__("vzero %%v0\n\t"
"vzero %%v1\n\t"
"vzero %%v2\n\t"
"vzero %%v3\n\t"
"vzero %%v4\n\t"
"vzero %%v5\n\t"
"vzero %%v6\n\t"
"vzero %%v7\n\t"
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1,1024(%%r1,%2) \n\t"
"pfd 1,1024(%%r1,%3) \n\t"
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"vl %%v24,0(%%r1,%3) \n\t"
"pfd 1,1024(%%r1,%[x])\n\t"
"pfd 1,1024(%%r1,%[y])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v24,0(%%r1,%[y])\n\t"
"vl %%v25,16(%%r1,%[y])\n\t"
"vl %%v26,32(%%r1,%[y])\n\t"
"vl %%v27,48(%%r1,%[y])\n\t"
"vl %%v28,64(%%r1,%[y])\n\t"
"vl %%v29,80(%%r1,%[y])\n\t"
"vl %%v30,96(%%r1,%[y])\n\t"
"vl %%v31,112(%%r1,%[y])\n\t"
"vfmasb %%v0,%%v16,%%v24,%%v0\n\t"
"vl %%v25,16(%%r1,%3) \n\t"
"vfmasb %%v0,%%v17,%%v25,%%v0 \n\t"
"vl %%v26,32(%%r1,%3) \n\t"
"vfmasb %%v0,%%v18,%%v26,%%v0 \n\t"
"vl %%v27,48(%%r1,%3) \n\t"
"vfmasb %%v0,%%v19,%%v27,%%v0 \n\t"
"vl %%v28,64(%%r1,%3) \n\t"
"vfmasb %%v0,%%v20,%%v28,%%v0 \n\t"
"vl %%v29,80(%%r1,%3) \n\t"
"vfmasb %%v0,%%v21,%%v29,%%v0 \n\t"
"vl %%v30,96(%%r1,%3) \n\t"
"vfmasb %%v0,%%v22,%%v30,%%v0 \n\t"
"vl %%v31,112(%%r1,%3) \n\t"
"vfmasb %%v0,%%v23,%%v31,%%v0 \n\t"
"vfmasb %%v1,%%v17,%%v25,%%v1\n\t"
"vfmasb %%v2,%%v18,%%v26,%%v2\n\t"
"vfmasb %%v3,%%v19,%%v27,%%v3\n\t"
"vfmasb %%v4,%%v20,%%v28,%%v4\n\t"
"vfmasb %%v5,%%v21,%%v29,%%v5\n\t"
"vfmasb %%v6,%%v22,%%v30,%%v6\n\t"
"vfmasb %%v7,%%v23,%%v31,%%v7\n\t"
"agfi %%r1,128\n\t"
"brctg %%r0,0b \n\t"
"brctg %[n],0b\n\t"
"vfasb %%v0,%%v0,%%v1\n\t"
"vfasb %%v0,%%v0,%%v2\n\t"
"vfasb %%v0,%%v0,%%v3\n\t"
"vfasb %%v0,%%v0,%%v4\n\t"
"vfasb %%v0,%%v0,%%v5\n\t"
"vfasb %%v0,%%v0,%%v6\n\t"
"vfasb %%v0,%%v0,%%v7\n\t"
"vrepf %%v1,%%v0,1\n\t"
"vrepf %%v2,%%v0,2\n\t"
"vrepf %%v3,%%v0,3\n\t"
"aebr %%f0,%%f1\n\t"
"aebr %%f0,%%f2\n\t"
"aebr %%f0,%%f3\n\t"
"ler %0,%%f0 "
:"=f"(dot)
:"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((const FLOAT (*)[n])y)
:"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
"ler %[dot],%%f0"
: [dot] "=f"(dot),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),
"m"(*(const struct { FLOAT x[n]; } *) y),[y] "a"(y)
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");
return dot;
}
FLOAT CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y)
{
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
BLASLONG i = 0;
BLASLONG ix = 0, iy = 0;
FLOAT dot = 0.0;
if ( n <= 0 ) return(dot);
if (n <= 0)
return (dot);
if ( (inc_x == 1) && (inc_y == 1) )
{
if ((inc_x == 1) && (inc_y == 1)) {
BLASLONG n1 = n & -32;
@ -100,8 +110,7 @@ FLOAT CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y)
dot = sdot_kernel_32(n1, x, y);
i = n1;
while(i < n)
{
while (i < n) {
dot += y[i] * x[i];
i++;
@ -109,13 +118,11 @@ FLOAT CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y)
}
return (dot);
}
BLASLONG n1 = n & -2;
while(i < n1)
{
while (i < n1) {
dot += y[iy] * x[ix] + y[iy + inc_y] * x[ix + inc_x];
ix += inc_x * 2;
@ -124,8 +131,7 @@ FLOAT CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y)
}
while(i < n)
{
while (i < n) {
dot += y[iy] * x[ix];
ix += inc_x;
@ -136,5 +142,3 @@ FLOAT CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y)
return (dot);
}

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2017, The OpenBLAS Project
Copyright (c) 2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -29,364 +29,329 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define NBMAX 2048
static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
{
__asm__ volatile (
"vlrepf %%v0,0(%5) \n\t"
"vlrepf %%v1,4(%5) \n\t"
"vlrepf %%v2,8(%5) \n\t"
"vlrepf %%v3,12(%5) \n\t"
"vlrepf %%v4,%7 \n\t"
static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y,
FLOAT *alpha) {
register FLOAT *ap0 = ap[0];
register FLOAT *ap1 = ap[1];
register FLOAT *ap2 = ap[2];
register FLOAT *ap3 = ap[3];
__asm__("vlrepf %%v0,0(%[x])\n\t"
"vlrepf %%v1,4(%[x])\n\t"
"vlrepf %%v2,8(%[x])\n\t"
"vlrepf %%v3,12(%[x])\n\t"
"vlrepf %%v4,%[alpha]\n\t"
"vfmsb %%v0,%%v0,%%v4\n\t"
"vfmsb %%v1,%%v1,%%v4\n\t"
"vfmsb %%v2,%%v2,%%v4\n\t"
"vfmsb %%v3,%%v3,%%v4\n\t"
"xgr %%r1,%%r1\n\t"
"lghi %%r0,-32\n\t"
"ngr %%r0,%0 \n\t"
"ngr %%r0,%[n]\n\t"
"ltgr %%r0,%%r0\n\t"
"jz 1f\n\t"
"srlg %%r0,%%r0,5\n\t"
"0:\n\t"
"pfd 1,1024(%%r1,%1) \n\t"
"pfd 1,1024(%%r1,%2) \n\t"
"pfd 1,1024(%%r1,%3) \n\t"
"pfd 1,1024(%%r1,%4) \n\t"
"pfd 2,1024(%%r1,%6) \n\t"
"vl %%v16,0(%%r1,%1) \n\t"
"vl %%v17,0(%%r1,%2) \n\t"
"vl %%v18,0(%%r1,%3) \n\t"
"vl %%v19,0(%%r1,%4) \n\t"
"vl %%v20,16(%%r1,%1) \n\t"
"vl %%v21,16(%%r1,%2) \n\t"
"vl %%v22,16(%%r1,%3) \n\t"
"vl %%v23,16(%%r1,%4) \n\t"
"vl %%v24,32(%%r1,%1) \n\t"
"vl %%v25,32(%%r1,%2) \n\t"
"vl %%v26,32(%%r1,%3) \n\t"
"vl %%v27,32(%%r1,%4) \n\t"
"vl %%v28,48(%%r1,%1) \n\t"
"vl %%v29,48(%%r1,%2) \n\t"
"vl %%v30,48(%%r1,%3) \n\t"
"vl %%v31,48(%%r1,%4) \n\t"
"vl %%v4,0(%%r1,%6) \n\t"
"pfd 1,1024(%%r1,%[ap0])\n\t"
"pfd 1,1024(%%r1,%[ap1])\n\t"
"pfd 1,1024(%%r1,%[ap2])\n\t"
"pfd 1,1024(%%r1,%[ap3])\n\t"
"pfd 2,1024(%%r1,%[y])\n\t"
"vl %%v16,0(%%r1,%[ap0])\n\t"
"vl %%v17,0(%%r1,%[ap1])\n\t"
"vl %%v18,0(%%r1,%[ap2])\n\t"
"vl %%v19,0(%%r1,%[ap3])\n\t"
"vl %%v20,16(%%r1,%[ap0])\n\t"
"vl %%v21,16(%%r1,%[ap1])\n\t"
"vl %%v22,16(%%r1,%[ap2])\n\t"
"vl %%v23,16(%%r1,%[ap3])\n\t"
"vl %%v24,32(%%r1,%[ap0])\n\t"
"vl %%v25,32(%%r1,%[ap1])\n\t"
"vl %%v26,32(%%r1,%[ap2])\n\t"
"vl %%v27,32(%%r1,%[ap3])\n\t"
"vl %%v28,48(%%r1,%[ap0])\n\t"
"vl %%v29,48(%%r1,%[ap1])\n\t"
"vl %%v30,48(%%r1,%[ap2])\n\t"
"vl %%v31,48(%%r1,%[ap3])\n\t"
"vl %%v4,0(%%r1,%[y])\n\t"
"vl %%v5,16(%%r1,%[y])\n\t"
"vl %%v6,32(%%r1,%[y])\n\t"
"vl %%v7,48(%%r1,%[y])\n\t"
"vfmasb %%v4,%%v16,%%v0,%%v4\n\t"
"vfmasb %%v5,%%v20,%%v0,%%v5\n\t"
"vfmasb %%v6,%%v24,%%v0,%%v6\n\t"
"vfmasb %%v7,%%v28,%%v0,%%v7\n\t"
"vfmasb %%v4,%%v17,%%v1,%%v4\n\t"
"vfmasb %%v5,%%v21,%%v1,%%v5\n\t"
"vfmasb %%v6,%%v25,%%v1,%%v6\n\t"
"vfmasb %%v7,%%v29,%%v1,%%v7\n\t"
"vfmasb %%v4,%%v18,%%v2,%%v4\n\t"
"vfmasb %%v5,%%v22,%%v2,%%v5\n\t"
"vfmasb %%v6,%%v26,%%v2,%%v6\n\t"
"vfmasb %%v7,%%v30,%%v2,%%v7\n\t"
"vfmasb %%v4,%%v19,%%v3,%%v4\n\t"
"vst %%v4,0(%%r1,%6) \n\t"
"vl %%v4,16(%%r1,%6) \n\t"
"vfmasb %%v4,%%v20,%%v0,%%v4 \n\t"
"vfmasb %%v4,%%v21,%%v1,%%v4 \n\t"
"vfmasb %%v4,%%v22,%%v2,%%v4 \n\t"
"vfmasb %%v4,%%v23,%%v3,%%v4 \n\t"
"vst %%v4,16(%%r1,%6) \n\t"
"vl %%v4,32(%%r1,%6) \n\t"
"vfmasb %%v4,%%v24,%%v0,%%v4 \n\t"
"vfmasb %%v4,%%v25,%%v1,%%v4 \n\t"
"vfmasb %%v4,%%v26,%%v2,%%v4 \n\t"
"vfmasb %%v4,%%v27,%%v3,%%v4 \n\t"
"vst %%v4,32(%%r1,%6) \n\t"
"vl %%v4,48(%%r1,%6) \n\t"
"vfmasb %%v4,%%v28,%%v0,%%v4 \n\t"
"vfmasb %%v4,%%v29,%%v1,%%v4 \n\t"
"vfmasb %%v4,%%v30,%%v2,%%v4 \n\t"
"vfmasb %%v4,%%v31,%%v3,%%v4 \n\t"
"vst %%v4,48(%%r1,%6) \n\t"
"vl %%v16,64(%%r1,%1) \n\t"
"vl %%v17,64(%%r1,%2) \n\t"
"vl %%v18,64(%%r1,%3) \n\t"
"vl %%v19,64(%%r1,%4) \n\t"
"vl %%v20,80(%%r1,%1) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,80(%%r1,%3) \n\t"
"vl %%v23,80(%%r1,%4) \n\t"
"vl %%v24,96(%%r1,%1) \n\t"
"vl %%v25,96(%%r1,%2) \n\t"
"vl %%v26,96(%%r1,%3) \n\t"
"vl %%v27,96(%%r1,%4) \n\t"
"vl %%v28,112(%%r1,%1) \n\t"
"vl %%v29,112(%%r1,%2) \n\t"
"vl %%v30,112(%%r1,%3) \n\t"
"vl %%v31,112(%%r1,%4) \n\t"
"vl %%v4,64(%%r1,%6) \n\t"
"vfmasb %%v5,%%v23,%%v3,%%v5\n\t"
"vfmasb %%v6,%%v27,%%v3,%%v6\n\t"
"vfmasb %%v7,%%v31,%%v3,%%v7\n\t"
"vst %%v4,0(%%r1,%[y])\n\t"
"vst %%v5,16(%%r1,%[y])\n\t"
"vst %%v6,32(%%r1,%[y])\n\t"
"vst %%v7,48(%%r1,%[y])\n\t"
"vl %%v16,64(%%r1,%[ap0])\n\t"
"vl %%v17,64(%%r1,%[ap1])\n\t"
"vl %%v18,64(%%r1,%[ap2])\n\t"
"vl %%v19,64(%%r1,%[ap3])\n\t"
"vl %%v20,80(%%r1,%[ap0])\n\t"
"vl %%v21,80(%%r1,%[ap1])\n\t"
"vl %%v22,80(%%r1,%[ap2])\n\t"
"vl %%v23,80(%%r1,%[ap3])\n\t"
"vl %%v24,96(%%r1,%[ap0])\n\t"
"vl %%v25,96(%%r1,%[ap1])\n\t"
"vl %%v26,96(%%r1,%[ap2])\n\t"
"vl %%v27,96(%%r1,%[ap3])\n\t"
"vl %%v28,112(%%r1,%[ap0])\n\t"
"vl %%v29,112(%%r1,%[ap1])\n\t"
"vl %%v30,112(%%r1,%[ap2])\n\t"
"vl %%v31,112(%%r1,%[ap3])\n\t"
"vl %%v4,64(%%r1,%[y])\n\t"
"vl %%v5,80(%%r1,%[y])\n\t"
"vl %%v6,96(%%r1,%[y])\n\t"
"vl %%v7,112(%%r1,%[y])\n\t"
"vfmasb %%v4,%%v16,%%v0,%%v4\n\t"
"vfmasb %%v5,%%v20,%%v0,%%v5\n\t"
"vfmasb %%v6,%%v24,%%v0,%%v6\n\t"
"vfmasb %%v7,%%v28,%%v0,%%v7\n\t"
"vfmasb %%v4,%%v17,%%v1,%%v4\n\t"
"vfmasb %%v5,%%v21,%%v1,%%v5\n\t"
"vfmasb %%v6,%%v25,%%v1,%%v6\n\t"
"vfmasb %%v7,%%v29,%%v1,%%v7\n\t"
"vfmasb %%v4,%%v18,%%v2,%%v4\n\t"
"vfmasb %%v5,%%v22,%%v2,%%v5\n\t"
"vfmasb %%v6,%%v26,%%v2,%%v6\n\t"
"vfmasb %%v7,%%v30,%%v2,%%v7\n\t"
"vfmasb %%v4,%%v19,%%v3,%%v4\n\t"
"vst %%v4,64(%%r1,%6) \n\t"
"vl %%v4,80(%%r1,%6) \n\t"
"vfmasb %%v4,%%v20,%%v0,%%v4 \n\t"
"vfmasb %%v4,%%v21,%%v1,%%v4 \n\t"
"vfmasb %%v4,%%v22,%%v2,%%v4 \n\t"
"vfmasb %%v4,%%v23,%%v3,%%v4 \n\t"
"vst %%v4,80(%%r1,%6) \n\t"
"vl %%v4,96(%%r1,%6) \n\t"
"vfmasb %%v4,%%v24,%%v0,%%v4 \n\t"
"vfmasb %%v4,%%v25,%%v1,%%v4 \n\t"
"vfmasb %%v4,%%v26,%%v2,%%v4 \n\t"
"vfmasb %%v4,%%v27,%%v3,%%v4 \n\t"
"vst %%v4,96(%%r1,%6) \n\t"
"vl %%v4,112(%%r1,%6) \n\t"
"vfmasb %%v4,%%v28,%%v0,%%v4 \n\t"
"vfmasb %%v4,%%v29,%%v1,%%v4 \n\t"
"vfmasb %%v4,%%v30,%%v2,%%v4 \n\t"
"vfmasb %%v4,%%v31,%%v3,%%v4 \n\t"
"vst %%v4,112(%%r1,%6) \n\t"
"vfmasb %%v5,%%v23,%%v3,%%v5\n\t"
"vfmasb %%v6,%%v27,%%v3,%%v6\n\t"
"vfmasb %%v7,%%v31,%%v3,%%v7\n\t"
"vst %%v4,64(%%r1,%[y])\n\t"
"vst %%v5,80(%%r1,%[y])\n\t"
"vst %%v6,96(%%r1,%[y])\n\t"
"vst %%v7,112(%%r1,%[y])\n\t"
"agfi %%r1,128\n\t"
"brctg %%r0,0b\n\t"
"1:\n\t"
"lghi %%r0,28\n\t"
"ngr %%r0,%0 \n\t"
"ngr %%r0,%[n]\n\t"
"ltgr %%r0,%%r0\n\t"
"jz 3f\n\t"
"srlg %%r0,%%r0,2\n\t"
"2:\n\t"
"vl %%v16,0(%%r1,%1) \n\t"
"vl %%v17,0(%%r1,%2) \n\t"
"vl %%v18,0(%%r1,%3) \n\t"
"vl %%v19,0(%%r1,%4) \n\t"
"vl %%v4,0(%%r1,%6) \n\t"
"vl %%v16,0(%%r1,%[ap0])\n\t"
"vl %%v17,0(%%r1,%[ap1])\n\t"
"vl %%v18,0(%%r1,%[ap2])\n\t"
"vl %%v19,0(%%r1,%[ap3])\n\t"
"vl %%v4,0(%%r1,%[y])\n\t"
"vfmasb %%v4,%%v16,%%v0,%%v4\n\t"
"vfmasb %%v4,%%v17,%%v1,%%v4\n\t"
"vfmasb %%v4,%%v18,%%v2,%%v4\n\t"
"vfmasb %%v4,%%v19,%%v3,%%v4\n\t"
"vst %%v4,0(%%r1,%6) \n\t"
"vst %%v4,0(%%r1,%[y])\n\t"
"agfi %%r1,16\n\t"
"brctg %%r0,2b\n\t"
"3:\n\t"
"nop"
:
:"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])ap[2]),"ZR"((const FLOAT (*)[n])ap[3]),"ZQ"((const FLOAT (*)[4])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
: "+m"(*(struct { FLOAT x[n]; } *) y)
: [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0),
"m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1),
"m"(*(const struct { FLOAT x[n]; } *) ap2),[ap2] "a"(ap2),
"m"(*(const struct { FLOAT x[n]; } *) ap3),[ap3] "a"(ap3),
"m"(*(const struct { FLOAT x[4]; } *) x),[x] "a"(x),[alpha] "Q"(*alpha),
[n] "r"(n)
: "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
"v26", "v27", "v28", "v29", "v30", "v31");
}
static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
{
__asm__ volatile (
"vlrepf %%v0,0(%3) \n\t"
"vlrepf %%v1,4(%3) \n\t"
"vlrepf %%v2,%5 \n\t"
static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y,
FLOAT *alpha) {
register FLOAT *ap0 = ap[0];
register FLOAT *ap1 = ap[1];
__asm__("vlrepf %%v0,0(%[x])\n\t"
"vlrepf %%v1,4(%[x])\n\t"
"vlrepf %%v2,%[alpha]\n\t"
"vfmsb %%v0,%%v0,%%v2\n\t"
"vfmsb %%v1,%%v1,%%v2\n\t"
"xgr %%r1,%%r1\n\t"
"lghi %%r0,-32\n\t"
"ngr %%r0,%0 \n\t"
"ngr %%r0,%[n]\n\t"
"ltgr %%r0,%%r0\n\t"
"jz 1f\n\t"
"srlg %%r0,%%r0,5\n\t"
"0:\n\t"
"pfd 1,1024(%%r1,%1) \n\t"
"pfd 1,1024(%%r1,%2) \n\t"
"pfd 2,1024(%%r1,%4) \n\t"
"vl %%v16,0(%%r1,%1) \n\t"
"vl %%v17,0(%%r1,%2) \n\t"
"vl %%v18,16(%%r1,%1) \n\t"
"vl %%v19,16(%%r1,%2) \n\t"
"vl %%v20,32(%%r1,%1) \n\t"
"vl %%v21,32(%%r1,%2) \n\t"
"vl %%v22,48(%%r1,%1) \n\t"
"vl %%v23,48(%%r1,%2) \n\t"
"vl %%v24,64(%%r1,%1) \n\t"
"vl %%v25,64(%%r1,%2) \n\t"
"vl %%v26,80(%%r1,%1) \n\t"
"vl %%v27,80(%%r1,%2) \n\t"
"vl %%v28,96(%%r1,%1) \n\t"
"vl %%v29,96(%%r1,%2) \n\t"
"vl %%v30,112(%%r1,%1) \n\t"
"vl %%v31,112(%%r1,%2) \n\t"
"vl %%v2,0(%%r1,%4) \n\t"
"pfd 1,1024(%%r1,%[ap0])\n\t"
"pfd 1,1024(%%r1,%[ap1])\n\t"
"pfd 2,1024(%%r1,%[y])\n\t"
"vl %%v16,0(%%r1,%[ap0])\n\t"
"vl %%v17,0(%%r1,%[ap1])\n\t"
"vl %%v18,16(%%r1,%[ap0])\n\t"
"vl %%v19,16(%%r1,%[ap1])\n\t"
"vl %%v20,32(%%r1,%[ap0])\n\t"
"vl %%v21,32(%%r1,%[ap1])\n\t"
"vl %%v22,48(%%r1,%[ap0])\n\t"
"vl %%v23,48(%%r1,%[ap1])\n\t"
"vl %%v24,64(%%r1,%[ap0])\n\t"
"vl %%v25,64(%%r1,%[ap1])\n\t"
"vl %%v26,80(%%r1,%[ap0])\n\t"
"vl %%v27,80(%%r1,%[ap1])\n\t"
"vl %%v28,96(%%r1,%[ap0])\n\t"
"vl %%v29,96(%%r1,%[ap1])\n\t"
"vl %%v30,112(%%r1,%[ap0])\n\t"
"vl %%v31,112(%%r1,%[ap1])\n\t"
"vl %%v2,0(%%r1,%[y])\n\t"
"vl %%v3,16(%%r1,%[y])\n\t"
"vl %%v4,32(%%r1,%[y])\n\t"
"vl %%v5,48(%%r1,%[y])\n\t"
"vl %%v6,64(%%r1,%[y])\n\t"
"vl %%v7,80(%%r1,%[y])\n\t"
"vl %%v8,96(%%r1,%[y])\n\t"
"vl %%v9,112(%%r1,%[y])\n\t"
"vfmasb %%v2,%%v16,%%v0,%%v2\n\t"
"vfmasb %%v3,%%v18,%%v0,%%v3\n\t"
"vfmasb %%v4,%%v20,%%v0,%%v4\n\t"
"vfmasb %%v5,%%v22,%%v0,%%v5\n\t"
"vfmasb %%v6,%%v24,%%v0,%%v6\n\t"
"vfmasb %%v7,%%v26,%%v0,%%v7\n\t"
"vfmasb %%v8,%%v28,%%v0,%%v8\n\t"
"vfmasb %%v9,%%v30,%%v0,%%v9\n\t"
"vfmasb %%v2,%%v17,%%v1,%%v2\n\t"
"vst %%v2,0(%%r1,%4) \n\t"
"vl %%v2,16(%%r1,%4) \n\t"
"vfmasb %%v2,%%v18,%%v0,%%v2 \n\t"
"vfmasb %%v2,%%v19,%%v1,%%v2 \n\t"
"vst %%v2,16(%%r1,%4) \n\t"
"vl %%v2,32(%%r1,%4) \n\t"
"vfmasb %%v2,%%v20,%%v0,%%v2 \n\t"
"vfmasb %%v2,%%v21,%%v1,%%v2 \n\t"
"vst %%v2,32(%%r1,%4) \n\t"
"vl %%v2,48(%%r1,%4) \n\t"
"vfmasb %%v2,%%v22,%%v0,%%v2 \n\t"
"vfmasb %%v2,%%v23,%%v1,%%v2 \n\t"
"vst %%v2,48(%%r1,%4) \n\t"
"vl %%v2,64(%%r1,%4) \n\t"
"vfmasb %%v2,%%v24,%%v0,%%v2 \n\t"
"vfmasb %%v2,%%v25,%%v1,%%v2 \n\t"
"vst %%v2,64(%%r1,%4) \n\t"
"vl %%v2,80(%%r1,%4) \n\t"
"vfmasb %%v2,%%v26,%%v0,%%v2 \n\t"
"vfmasb %%v2,%%v27,%%v1,%%v2 \n\t"
"vst %%v2,80(%%r1,%4) \n\t"
"vl %%v2,96(%%r1,%4) \n\t"
"vfmasb %%v2,%%v28,%%v0,%%v2 \n\t"
"vfmasb %%v2,%%v29,%%v1,%%v2 \n\t"
"vst %%v2,96(%%r1,%4) \n\t"
"vl %%v2,112(%%r1,%4) \n\t"
"vfmasb %%v2,%%v30,%%v0,%%v2 \n\t"
"vfmasb %%v2,%%v31,%%v1,%%v2 \n\t"
"vst %%v2,112(%%r1,%4) \n\t"
"vfmasb %%v3,%%v19,%%v1,%%v3\n\t"
"vfmasb %%v4,%%v21,%%v1,%%v4\n\t"
"vfmasb %%v5,%%v23,%%v1,%%v5\n\t"
"vfmasb %%v6,%%v25,%%v1,%%v6\n\t"
"vfmasb %%v7,%%v27,%%v1,%%v7\n\t"
"vfmasb %%v8,%%v29,%%v1,%%v8\n\t"
"vfmasb %%v9,%%v31,%%v1,%%v9\n\t"
"vst %%v2,0(%%r1,%[y])\n\t"
"vst %%v3,16(%%r1,%[y])\n\t"
"vst %%v4,32(%%r1,%[y])\n\t"
"vst %%v5,48(%%r1,%[y])\n\t"
"vst %%v6,64(%%r1,%[y])\n\t"
"vst %%v7,80(%%r1,%[y])\n\t"
"vst %%v8,96(%%r1,%[y])\n\t"
"vst %%v9,112(%%r1,%[y])\n\t"
"agfi %%r1,128\n\t"
"brctg %%r0,0b\n\t"
"1:\n\t"
"lghi %%r0,28\n\t"
"ngr %%r0,%0 \n\t"
"ngr %%r0,%[n]\n\t"
"ltgr %%r0,%%r0\n\t"
"jz 3f\n\t"
"srlg %%r0,%%r0,2\n\t"
"2:\n\t"
"vl %%v16,0(%%r1,%1) \n\t"
"vl %%v17,0(%%r1,%2) \n\t"
"vl %%v2,0(%%r1,%4) \n\t"
"vl %%v16,0(%%r1,%[ap0])\n\t"
"vl %%v17,0(%%r1,%[ap1])\n\t"
"vl %%v2,0(%%r1,%[y])\n\t"
"vfmasb %%v2,%%v16,%%v0,%%v2\n\t"
"vfmasb %%v2,%%v17,%%v1,%%v2\n\t"
"vst %%v2,0(%%r1,%4) \n\t"
"vst %%v2,0(%%r1,%[y])\n\t"
"agfi %%r1,16\n\t"
"brctg %%r0,2b\n\t"
"3:\n\t"
"nop"
:
:"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZQ"((const FLOAT (*)[2])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha)
:"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
: "+m"(*(struct { FLOAT x[n]; } *) y)
: [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0),
"m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1),
"m"(*(const struct { FLOAT x[2]; } *) x),[x] "a"(x),[alpha] "Q"(*alpha),
[n] "r"(n)
: "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
"v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
}
static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *xo, FLOAT *y, FLOAT *alpha)
{
__asm__ volatile (
"vlrepf %%v0,0(%2) \n\t"
"vlrepf %%v1,%4 \n\t"
"vfmsb %%v0,%%v0,%%v1 \n\t"
static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y,
FLOAT *alpha) {
__asm__("vlrepf %%v0,0(%[x])\n\t"
"vlrepf %%v16,%[alpha]\n\t"
"vfmsb %%v0,%%v0,%%v16\n\t"
"xgr %%r1,%%r1\n\t"
"lghi %%r0,-32\n\t"
"ngr %%r0,%0 \n\t"
"ngr %%r0,%[n]\n\t"
"ltgr %%r0,%%r0\n\t"
"jz 1f\n\t"
"srlg %%r0,%%r0,5\n\t"
"0:\n\t"
"pfd 1,1024(%%r1,%1) \n\t"
"pfd 2,1024(%%r1,%3) \n\t"
"vl %%v16,0(%%r1,%1) \n\t"
"vl %%v17,16(%%r1,%1) \n\t"
"vl %%v18,32(%%r1,%1) \n\t"
"vl %%v19,48(%%r1,%1) \n\t"
"vl %%v20,64(%%r1,%1) \n\t"
"vl %%v21,80(%%r1,%1) \n\t"
"vl %%v22,96(%%r1,%1) \n\t"
"vl %%v23,112(%%r1,%1) \n\t"
"vl %%v1,0(%%r1,%3) \n\t"
"vfmasb %%v1,%%v16,%%v0,%%v1 \n\t"
"vst %%v1,0(%%r1,%3) \n\t"
"vl %%v1,16(%%r1,%3) \n\t"
"vfmasb %%v1,%%v17,%%v0,%%v1 \n\t"
"vst %%v1,16(%%r1,%3) \n\t"
"vl %%v1,32(%%r1,%3) \n\t"
"vfmasb %%v1,%%v18,%%v0,%%v1 \n\t"
"vst %%v1,32(%%r1,%3) \n\t"
"vl %%v1,48(%%r1,%3) \n\t"
"vfmasb %%v1,%%v19,%%v0,%%v1 \n\t"
"vst %%v1,48(%%r1,%3) \n\t"
"vl %%v1,64(%%r1,%3) \n\t"
"vfmasb %%v1,%%v20,%%v0,%%v1 \n\t"
"vst %%v1,64(%%r1,%3) \n\t"
"vl %%v1,80(%%r1,%3) \n\t"
"vfmasb %%v1,%%v21,%%v0,%%v1 \n\t"
"vst %%v1,80(%%r1,%3) \n\t"
"vl %%v1,96(%%r1,%3) \n\t"
"vfmasb %%v1,%%v22,%%v0,%%v1 \n\t"
"vst %%v1,96(%%r1,%3) \n\t"
"vl %%v1,112(%%r1,%3) \n\t"
"vfmasb %%v1,%%v23,%%v0,%%v1 \n\t"
"vst %%v1,112(%%r1,%3) \n\t"
"pfd 1,1024(%%r1,%[a0])\n\t"
"pfd 2,1024(%%r1,%[y])\n\t"
"vl %%v16,0(%%r1,%[a0])\n\t"
"vl %%v17,16(%%r1,%[a0])\n\t"
"vl %%v18,32(%%r1,%[a0])\n\t"
"vl %%v19,48(%%r1,%[a0])\n\t"
"vl %%v20,64(%%r1,%[a0])\n\t"
"vl %%v21,80(%%r1,%[a0])\n\t"
"vl %%v22,96(%%r1,%[a0])\n\t"
"vl %%v23,112(%%r1,%[a0])\n\t"
"vl %%v24,0(%%r1,%[y])\n\t"
"vl %%v25,16(%%r1,%[y])\n\t"
"vl %%v26,32(%%r1,%[y])\n\t"
"vl %%v27,48(%%r1,%[y])\n\t"
"vl %%v28,64(%%r1,%[y])\n\t"
"vl %%v29,80(%%r1,%[y])\n\t"
"vl %%v30,96(%%r1,%[y])\n\t"
"vl %%v31,112(%%r1,%[y])\n\t"
"vfmasb %%v24,%%v16,%%v0,%%v24\n\t"
"vfmasb %%v25,%%v17,%%v0,%%v25\n\t"
"vfmasb %%v26,%%v18,%%v0,%%v26\n\t"
"vfmasb %%v27,%%v19,%%v0,%%v27\n\t"
"vfmasb %%v28,%%v20,%%v0,%%v28\n\t"
"vfmasb %%v29,%%v21,%%v0,%%v29\n\t"
"vfmasb %%v30,%%v22,%%v0,%%v30\n\t"
"vfmasb %%v31,%%v23,%%v0,%%v31\n\t"
"vst %%v24,0(%%r1,%[y])\n\t"
"vst %%v25,16(%%r1,%[y])\n\t"
"vst %%v26,32(%%r1,%[y])\n\t"
"vst %%v27,48(%%r1,%[y])\n\t"
"vst %%v28,64(%%r1,%[y])\n\t"
"vst %%v29,80(%%r1,%[y])\n\t"
"vst %%v30,96(%%r1,%[y])\n\t"
"vst %%v31,112(%%r1,%[y])\n\t"
"agfi %%r1,128\n\t"
"brctg %%r0,0b\n\t"
"1:\n\t"
"lghi %%r0,28\n\t"
"ngr %%r0,%0 \n\t"
"ngr %%r0,%[n]\n\t"
"ltgr %%r0,%%r0\n\t"
"jz 3f\n\t"
"srlg %%r0,%%r0,2\n\t"
"2:\n\t"
"vl %%v16,0(%%r1,%1) \n\t"
"vl %%v1,0(%%r1,%3) \n\t"
"vfmasb %%v1,%%v16,%%v0,%%v1 \n\t"
"vst %%v1,0(%%r1,%3) \n\t"
"vl %%v16,0(%%r1,%[a0])\n\t"
"vl %%v17,0(%%r1,%[y])\n\t"
"vfmasb %%v17,%%v16,%%v0,%%v17\n\t"
"vst %%v17,0(%%r1,%[y])\n\t"
"agfi %%r1,16\n\t"
"brctg %%r0,2b\n\t"
"3:\n\t"
"nop"
:
:"r"(n),"ZR"((const FLOAT (*)[n])a0),"ZQ"((const FLOAT (*)[1])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha)
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
: "+m"(*(struct { FLOAT x[n]; } *) y)
: [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) a0),[a0] "a"(a0),
"m"(*(const FLOAT (*)[1]) x),[x] "a"(x),[alpha] "Q"(*alpha),
[n] "r"(n)
: "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21",
"v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
}
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
{
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) {
BLASLONG i;
for (i = 0; i < n; i++)
{
for (i = 0; i < n; i++) {
*dest += src[i];
dest += inc_dest;
}
}
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a,
BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
FLOAT *buffer) {
BLASLONG i;
FLOAT *a_ptr;
FLOAT *x_ptr;
@ -400,8 +365,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
BLASLONG lda4 = lda << 2;
FLOAT xbuffer[8], *ybuffer;
if ( m < 1 ) return(0);
if ( n < 1 ) return(0);
if (m < 1)
return (0);
if (n < 1)
return (0);
ybuffer = buffer;
@ -416,13 +383,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
BLASLONG NB = NBMAX;
while ( NB == NBMAX )
{
while (NB == NBMAX) {
m1 -= NB;
if ( m1 < 0)
{
if ( m2 == 0 ) break;
if (m1 < 0) {
if (m2 == 0)
break;
NB = m2;
}
@ -439,12 +405,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
else
ybuffer = y_ptr;
if ( inc_x == 1 )
{
if (inc_x == 1) {
for( i = 0; i < n1 ; i++)
{
for (i = 0; i < n1; i++) {
sgemv_kernel_4x4(NB, ap, x_ptr, ybuffer, &alpha);
ap[0] += lda4;
ap[1] += lda4;
@ -454,29 +417,22 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
x_ptr += 4;
}
if ( n2 & 2 )
{
if (n2 & 2) {
sgemv_kernel_4x2(NB, ap, x_ptr, ybuffer, &alpha);
a_ptr += lda * 2;
x_ptr += 2;
}
if ( n2 & 1 )
{
if (n2 & 1) {
sgemv_kernel_4x1(NB, a_ptr, x_ptr, ybuffer, &alpha);
/* a_ptr += lda;
x_ptr += 1; */
}
} else {
}
else
{
for( i = 0; i < n1 ; i++)
{
for (i = 0; i < n1; i++) {
xbuffer[0] = x_ptr[0];
x_ptr += inc_x;
xbuffer[1] = x_ptr[0];
@ -493,8 +449,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
a_ptr += lda4;
}
for( i = 0; i < n2 ; i++)
{
for (i = 0; i < n2; i++) {
xbuffer[0] = x_ptr[0];
x_ptr += inc_x;
sgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, &alpha);
@ -505,30 +460,26 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
}
a += NB;
if ( inc_y != 1 )
{
if (inc_y != 1) {
add_y(NB, ybuffer, y_ptr, inc_y);
y_ptr += NB * inc_y;
}
else
} else
y_ptr += NB;
}
if ( m3 == 0 ) return(0);
if (m3 == 0)
return (0);
if ( m3 == 3 )
{
if (m3 == 3) {
a_ptr = a;
x_ptr = x;
FLOAT temp0 = 0.0;
FLOAT temp1 = 0.0;
FLOAT temp2 = 0.0;
if ( lda == 3 && inc_x ==1 )
{
if (lda == 3 && inc_x == 1) {
for( i = 0; i < ( n & -4 ); i+=4 )
{
for (i = 0; i < (n & -4); i += 4) {
temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1];
temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1];
@ -542,8 +493,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
x_ptr += 4;
}
for( ; i < n; i++ )
{
for (; i < n; i++) {
temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0];
temp2 += a_ptr[2] * x_ptr[0];
@ -551,19 +501,15 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
x_ptr++;
}
}
else
{
} else {
for( i = 0; i < n; i++ )
{
for (i = 0; i < n; i++) {
temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0];
temp2 += a_ptr[2] * x_ptr[0];
a_ptr += lda;
x_ptr += inc_x;
}
}
@ -575,18 +521,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
return (0);
}
if ( m3 == 2 )
{
if (m3 == 2) {
a_ptr = a;
x_ptr = x;
FLOAT temp0 = 0.0;
FLOAT temp1 = 0.0;
if ( lda == 2 && inc_x ==1 )
{
if (lda == 2 && inc_x == 1) {
for( i = 0; i < (n & -4) ; i+=4 )
{
for (i = 0; i < (n & -4); i += 4) {
temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1];
temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1];
temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3];
@ -596,27 +538,21 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
}
for( ; i < n; i++ )
{
for (; i < n; i++) {
temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0];
a_ptr += 2;
x_ptr++;
}
}
else
{
} else {
for( i = 0; i < n; i++ )
{
for (i = 0; i < n; i++) {
temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0];
a_ptr += lda;
x_ptr += inc_x;
}
}
@ -626,31 +562,27 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
return (0);
}
if ( m3 == 1 )
{
if (m3 == 1) {
a_ptr = a;
x_ptr = x;
FLOAT temp = 0.0;
if ( lda == 1 && inc_x ==1 )
{
if (lda == 1 && inc_x == 1) {
for( i = 0; i < (n & -4); i+=4 )
{
temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3];
for (i = 0; i < (n & -4); i += 4) {
temp +=
a_ptr[i] * x_ptr[i] + a_ptr[i + 1] * x_ptr[i + 1] + a_ptr[i +
2] *
x_ptr[i + 2] + a_ptr[i + 3] * x_ptr[i + 3];
}
for( ; i < n; i++ )
{
for (; i < n; i++) {
temp += a_ptr[i] * x_ptr[i];
}
}
else
{
} else {
for( i = 0; i < n; i++ )
{
for (i = 0; i < n; i++) {
temp += a_ptr[0] * x_ptr[0];
a_ptr += lda;
x_ptr += inc_x;
@ -661,8 +593,5 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
return (0);
}
return (0);
}

File diff suppressed because it is too large Load Diff

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -27,34 +27,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
static FLOAT smax_kernel_64(BLASLONG n, FLOAT *x)
{
static FLOAT smax_kernel_64(BLASLONG n, FLOAT *x) {
FLOAT max;
__asm__ volatile (
"vl %%v0,0(%2) \n\t"
"srlg %%r0,%1,6 \n\t"
__asm__("vl %%v0,0(%[x])\n\t"
"srlg %[n],%[n],6\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%2) \n\t"
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"vl %%v24,128(%%r1,%2) \n\t"
"vl %%v25,144(%%r1,%2) \n\t"
"vl %%v26,160(%%r1,%2) \n\t"
"vl %%v27,176(%%r1,%2) \n\t"
"vl %%v28,192(%%r1,%2) \n\t"
"vl %%v29,208(%%r1,%2) \n\t"
"vl %%v30,224(%%r1,%2) \n\t"
"vl %%v31,240(%%r1,%2) \n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v24,128(%%r1,%[x])\n\t"
"vl %%v25,144(%%r1,%[x])\n\t"
"vl %%v26,160(%%r1,%[x])\n\t"
"vl %%v27,176(%%r1,%[x])\n\t"
"vl %%v28,192(%%r1,%[x])\n\t"
"vl %%v29,208(%%r1,%[x])\n\t"
"vl %%v30,224(%%r1,%[x])\n\t"
"vl %%v31,240(%%r1,%[x])\n\t"
"vfmaxsb %%v16,%%v16,%%v24,0\n\t"
"vfmaxsb %%v17,%%v17,%%v25,0\n\t"
"vfmaxsb %%v18,%%v18,%%v26,0\n\t"
@ -63,32 +59,25 @@ static FLOAT smax_kernel_64(BLASLONG n, FLOAT *x)
"vfmaxsb %%v21,%%v21,%%v29,0\n\t"
"vfmaxsb %%v22,%%v22,%%v30,0\n\t"
"vfmaxsb %%v23,%%v23,%%v31,0\n\t"
"vfmaxsb %%v16,%%v16,%%v20,0\n\t"
"vfmaxsb %%v17,%%v17,%%v21,0\n\t"
"vfmaxsb %%v18,%%v18,%%v22,0\n\t"
"vfmaxsb %%v19,%%v19,%%v23,0\n\t"
"vfmaxsb %%v16,%%v16,%%v18,0\n\t"
"vfmaxsb %%v17,%%v17,%%v19,0\n\t"
"vfmaxsb %%v16,%%v16,%%v17,0\n\t"
"vfmaxsb %%v0,%%v0,%%v16,0\n\t"
"agfi %%r1, 256\n\t"
"brctg %%r0, 0b \n\t"
"brctg %[n], 0b\n\t"
"veslg %%v16,%%v0,32\n\t"
"vfmaxsb %%v0,%%v0,%%v16,0\n\t"
"vrepf %%v16,%%v0,2\n\t"
"wfmaxsb %%v0,%%v0,%%v16,0\n\t"
"ler %0,%%f0 "
:"=f"(max)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
"ler %[max],%%f0"
: [max] "=f"(max),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
return max;
}
@ -98,7 +87,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG j = 0;
FLOAT maxf = 0.0;
if (n <= 0 || inc_x <= 0) return (maxf);
if (n <= 0 || inc_x <= 0)
return (maxf);
if (inc_x == 1) {
@ -108,9 +98,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
maxf = smax_kernel_64(n1, x);
i = n1;
}
else
{
} else {
maxf = x[0];
i++;
}
@ -149,7 +137,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
}
while (j < n) {
if (x[i] > maxf) {
maxf = x[i];

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -27,34 +27,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
static FLOAT smin_kernel_64(BLASLONG n, FLOAT *x)
{
static FLOAT smin_kernel_64(BLASLONG n, FLOAT *x) {
FLOAT min;
__asm__ volatile (
"vl %%v0,0(%2) \n\t"
"srlg %%r0,%1,6 \n\t"
__asm__("vl %%v0,0(%[x])\n\t"
"srlg %[n],%[n],6\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%2) \n\t"
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"vl %%v24,128(%%r1,%2) \n\t"
"vl %%v25,144(%%r1,%2) \n\t"
"vl %%v26,160(%%r1,%2) \n\t"
"vl %%v27,176(%%r1,%2) \n\t"
"vl %%v28,192(%%r1,%2) \n\t"
"vl %%v29,208(%%r1,%2) \n\t"
"vl %%v30,224(%%r1,%2) \n\t"
"vl %%v31,240(%%r1,%2) \n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v24,128(%%r1,%[x])\n\t"
"vl %%v25,144(%%r1,%[x])\n\t"
"vl %%v26,160(%%r1,%[x])\n\t"
"vl %%v27,176(%%r1,%[x])\n\t"
"vl %%v28,192(%%r1,%[x])\n\t"
"vl %%v29,208(%%r1,%[x])\n\t"
"vl %%v30,224(%%r1,%[x])\n\t"
"vl %%v31,240(%%r1,%[x])\n\t"
"vfminsb %%v16,%%v16,%%v24,0\n\t"
"vfminsb %%v17,%%v17,%%v25,0\n\t"
"vfminsb %%v18,%%v18,%%v26,0\n\t"
@ -63,32 +59,25 @@ static FLOAT smin_kernel_64(BLASLONG n, FLOAT *x)
"vfminsb %%v21,%%v21,%%v29,0\n\t"
"vfminsb %%v22,%%v22,%%v30,0\n\t"
"vfminsb %%v23,%%v23,%%v31,0\n\t"
"vfminsb %%v16,%%v16,%%v20,0\n\t"
"vfminsb %%v17,%%v17,%%v21,0\n\t"
"vfminsb %%v18,%%v18,%%v22,0\n\t"
"vfminsb %%v19,%%v19,%%v23,0\n\t"
"vfminsb %%v16,%%v16,%%v18,0\n\t"
"vfminsb %%v17,%%v17,%%v19,0\n\t"
"vfminsb %%v16,%%v16,%%v17,0\n\t"
"vfminsb %%v0,%%v0,%%v16,0\n\t"
"agfi %%r1, 256\n\t"
"brctg %%r0, 0b \n\t"
"brctg %[n], 0b\n\t"
"veslg %%v16,%%v0,32\n\t"
"vfminsb %%v0,%%v0,%%v16,0\n\t"
"vrepf %%v16,%%v0,2\n\t"
"wfminsb %%v0,%%v0,%%v16,0\n\t"
"ler %0,%%f0 "
:"=f"(min)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
"ler %[min],%%f0"
: [min] "=f"(min),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
return min;
}
@ -98,7 +87,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG j = 0;
FLOAT minf = 0.0;
if (n <= 0 || inc_x <= 0) return (minf);
if (n <= 0 || inc_x <= 0)
return (minf);
if (inc_x == 1) {
@ -108,9 +98,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
minf = smin_kernel_64(n1, x);
i = n1;
}
else
{
} else {
minf = x[0];
i++;
}
@ -149,7 +137,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
}
while (j < n) {
if (x[i] < minf) {
minf = x[i];

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -27,25 +27,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
static void srot_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
{
__asm__ (
"vlrepf %%v0,%3 \n\t"
"vlrepf %%v1,%4 \n\t"
"srlg %%r0,%0,6 \n\t"
static void srot_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) {
__asm__("vlrepf %%v0,%[c]\n\t"
"vlrepf %%v1,%[s]\n\t"
"srlg %[n],%[n],6\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 2, 1024(%%r1,%1) \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"
"vl %%v24, 0(%%r1,%1) \n\t"
"vl %%v25, 16(%%r1,%1) \n\t"
"vl %%v26, 32(%%r1,%1) \n\t"
"vl %%v27, 48(%%r1,%1) \n\t"
"vl %%v16, 0(%%r1,%2) \n\t"
"vl %%v17, 16(%%r1,%2) \n\t"
"vl %%v18, 32(%%r1,%2) \n\t"
"vl %%v19, 48(%%r1,%2) \n\t"
"pfd 2, 1024(%%r1,%[x])\n\t"
"pfd 2, 1024(%%r1,%[y])\n\t"
"vl %%v24, 0(%%r1,%[x])\n\t"
"vl %%v25, 16(%%r1,%[x])\n\t"
"vl %%v26, 32(%%r1,%[x])\n\t"
"vl %%v27, 48(%%r1,%[x])\n\t"
"vl %%v16, 0(%%r1,%[y])\n\t"
"vl %%v17, 16(%%r1,%[y])\n\t"
"vl %%v18, 32(%%r1,%[y])\n\t"
"vl %%v19, 48(%%r1,%[y])\n\t"
"vfmsb %%v28,%%v24,%%v0\n\t"
"vfmsb %%v29,%%v25,%%v0\n\t"
"vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
@ -63,25 +60,22 @@ static void srot_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
"vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmasb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vst %%v28, 0(%%r1,%1) \n\t"
"vst %%v29, 16(%%r1,%1) \n\t"
"vst %%v30, 32(%%r1,%1) \n\t"
"vst %%v31, 48(%%r1,%1) \n\t"
"vst %%v20, 0(%%r1,%2) \n\t"
"vst %%v21, 16(%%r1,%2) \n\t"
"vst %%v22, 32(%%r1,%2) \n\t"
"vst %%v23, 48(%%r1,%2) \n\t"
"vl %%v24, 64(%%r1,%1) \n\t"
"vl %%v25, 80(%%r1,%1) \n\t"
"vl %%v26, 96(%%r1,%1) \n\t"
"vl %%v27, 112(%%r1,%1) \n\t"
"vl %%v16, 64(%%r1,%2) \n\t"
"vl %%v17, 80(%%r1,%2) \n\t"
"vl %%v18, 96(%%r1,%2) \n\t"
"vl %%v19, 112(%%r1,%2) \n\t"
"vst %%v28, 0(%%r1,%[x])\n\t"
"vst %%v29, 16(%%r1,%[x])\n\t"
"vst %%v30, 32(%%r1,%[x])\n\t"
"vst %%v31, 48(%%r1,%[x])\n\t"
"vst %%v20, 0(%%r1,%[y])\n\t"
"vst %%v21, 16(%%r1,%[y])\n\t"
"vst %%v22, 32(%%r1,%[y])\n\t"
"vst %%v23, 48(%%r1,%[y])\n\t"
"vl %%v24, 64(%%r1,%[x])\n\t"
"vl %%v25, 80(%%r1,%[x])\n\t"
"vl %%v26, 96(%%r1,%[x])\n\t"
"vl %%v27, 112(%%r1,%[x])\n\t"
"vl %%v16, 64(%%r1,%[y])\n\t"
"vl %%v17, 80(%%r1,%[y])\n\t"
"vl %%v18, 96(%%r1,%[y])\n\t"
"vl %%v19, 112(%%r1,%[y])\n\t"
"vfmsb %%v28,%%v24,%%v0\n\t"
"vfmsb %%v29,%%v25,%%v0\n\t"
"vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
@ -99,25 +93,22 @@ static void srot_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
"vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmasb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vst %%v28, 64(%%r1,%1) \n\t"
"vst %%v29, 80(%%r1,%1) \n\t"
"vst %%v30, 96(%%r1,%1) \n\t"
"vst %%v31, 112(%%r1,%1) \n\t"
"vst %%v20, 64(%%r1,%2) \n\t"
"vst %%v21, 80(%%r1,%2) \n\t"
"vst %%v22, 96(%%r1,%2) \n\t"
"vst %%v23, 112(%%r1,%2) \n\t"
"vl %%v24, 128(%%r1,%1) \n\t"
"vl %%v25, 144(%%r1,%1) \n\t"
"vl %%v26, 160(%%r1,%1) \n\t"
"vl %%v27, 176(%%r1,%1) \n\t"
"vl %%v16, 128(%%r1,%2) \n\t"
"vl %%v17, 144(%%r1,%2) \n\t"
"vl %%v18, 160(%%r1,%2) \n\t"
"vl %%v19, 176(%%r1,%2) \n\t"
"vst %%v28, 64(%%r1,%[x])\n\t"
"vst %%v29, 80(%%r1,%[x])\n\t"
"vst %%v30, 96(%%r1,%[x])\n\t"
"vst %%v31, 112(%%r1,%[x])\n\t"
"vst %%v20, 64(%%r1,%[y])\n\t"
"vst %%v21, 80(%%r1,%[y])\n\t"
"vst %%v22, 96(%%r1,%[y])\n\t"
"vst %%v23, 112(%%r1,%[y])\n\t"
"vl %%v24, 128(%%r1,%[x])\n\t"
"vl %%v25, 144(%%r1,%[x])\n\t"
"vl %%v26, 160(%%r1,%[x])\n\t"
"vl %%v27, 176(%%r1,%[x])\n\t"
"vl %%v16, 128(%%r1,%[y])\n\t"
"vl %%v17, 144(%%r1,%[y])\n\t"
"vl %%v18, 160(%%r1,%[y])\n\t"
"vl %%v19, 176(%%r1,%[y])\n\t"
"vfmsb %%v28,%%v24,%%v0\n\t"
"vfmsb %%v29,%%v25,%%v0\n\t"
"vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
@ -135,25 +126,22 @@ static void srot_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
"vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmasb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vst %%v28, 128(%%r1,%1) \n\t"
"vst %%v29, 144(%%r1,%1) \n\t"
"vst %%v30, 160(%%r1,%1) \n\t"
"vst %%v31, 176(%%r1,%1) \n\t"
"vst %%v20, 128(%%r1,%2) \n\t"
"vst %%v21, 144(%%r1,%2) \n\t"
"vst %%v22, 160(%%r1,%2) \n\t"
"vst %%v23, 176(%%r1,%2) \n\t"
"vl %%v24, 192(%%r1,%1) \n\t"
"vl %%v25, 208(%%r1,%1) \n\t"
"vl %%v26, 224(%%r1,%1) \n\t"
"vl %%v27, 240(%%r1,%1) \n\t"
"vl %%v16, 192(%%r1,%2) \n\t"
"vl %%v17, 208(%%r1,%2) \n\t"
"vl %%v18, 224(%%r1,%2) \n\t"
"vl %%v19, 240(%%r1,%2) \n\t"
"vst %%v28, 128(%%r1,%[x])\n\t"
"vst %%v29, 144(%%r1,%[x])\n\t"
"vst %%v30, 160(%%r1,%[x])\n\t"
"vst %%v31, 176(%%r1,%[x])\n\t"
"vst %%v20, 128(%%r1,%[y])\n\t"
"vst %%v21, 144(%%r1,%[y])\n\t"
"vst %%v22, 160(%%r1,%[y])\n\t"
"vst %%v23, 176(%%r1,%[y])\n\t"
"vl %%v24, 192(%%r1,%[x])\n\t"
"vl %%v25, 208(%%r1,%[x])\n\t"
"vl %%v26, 224(%%r1,%[x])\n\t"
"vl %%v27, 240(%%r1,%[x])\n\t"
"vl %%v16, 192(%%r1,%[y])\n\t"
"vl %%v17, 208(%%r1,%[y])\n\t"
"vl %%v18, 224(%%r1,%[y])\n\t"
"vl %%v19, 240(%%r1,%[y])\n\t"
"vfmsb %%v28,%%v24,%%v0\n\t"
"vfmsb %%v29,%%v25,%%v0\n\t"
"vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
@ -171,39 +159,38 @@ static void srot_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
"vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmasb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vst %%v28, 192(%%r1,%1) \n\t"
"vst %%v29, 208(%%r1,%1) \n\t"
"vst %%v30, 224(%%r1,%1) \n\t"
"vst %%v31, 240(%%r1,%1) \n\t"
"vst %%v20, 192(%%r1,%2) \n\t"
"vst %%v21, 208(%%r1,%2) \n\t"
"vst %%v22, 224(%%r1,%2) \n\t"
"vst %%v23, 240(%%r1,%2) \n\t"
"vst %%v28, 192(%%r1,%[x])\n\t"
"vst %%v29, 208(%%r1,%[x])\n\t"
"vst %%v30, 224(%%r1,%[x])\n\t"
"vst %%v31, 240(%%r1,%[x])\n\t"
"vst %%v20, 192(%%r1,%[y])\n\t"
"vst %%v21, 208(%%r1,%[y])\n\t"
"vst %%v22, 224(%%r1,%[y])\n\t"
"vst %%v23, 240(%%r1,%[y])\n\t"
"agfi %%r1,256\n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y),"m"(*c),"m"(*s)
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
"brctg %[n],0b"
: "+m"(*(struct { FLOAT x[n]; } *) x), "+m"(*(struct { FLOAT x[n]; } *) y),
[n] "+&r"(n)
: [x] "a"(x),[y] "a"(y),[c] "Q"(*c),[s] "Q"(*s)
: "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
"v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
}
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
{
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
FLOAT c, FLOAT s) {
BLASLONG i = 0;
BLASLONG ix = 0, iy = 0;
FLOAT temp;
if ( n <= 0 ) return(0);
if (n <= 0)
return (0);
if ( (inc_x == 1) && (inc_y == 1) )
{
if ((inc_x == 1) && (inc_y == 1)) {
BLASLONG n1 = n & -64;
if ( n1 > 0 )
{
if (n1 > 0) {
FLOAT cosa, sina;
cosa = c;
sina = s;
@ -211,8 +198,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
i = n1;
}
while(i < n)
{
while (i < n) {
temp = c * x[i] + s * y[i];
y[i] = c * y[i] - s * x[i];
x[i] = temp;
@ -221,13 +207,9 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
}
} else {
}
else
{
while(i < n)
{
while (i < n) {
temp = c * x[ix] + s * y[iy];
y[iy] = c * y[iy] - s * x[ix];
x[ix] = temp;
@ -242,5 +224,3 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
return (0);
}

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -27,128 +27,106 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
/*
 * sscal_kernel_32: scale a contiguous vector in place, x[i] = da * x[i],
 * using z/Architecture vector instructions.
 *
 * NOTE(review): this span looks like a diff rendering in which the
 * pre-change and post-change text of the function are interleaved without
 * +/- markers; both forms are kept verbatim below.  The form with positional
 * operands (%0, %1, %2) keeps the loop count in r0 and relies on a "memory"
 * clobber; the form with named operands (%[n], %[x], %[da]) counts down in
 * n itself and describes x[0..n) through a "+m" operand instead.
 *
 * Common to both forms:
 *   - vlrepf replicates da into every element of v0;
 *   - srlg shifts n right by 5, i.e. the loop runs n / 32 times;
 *   - r1 is cleared and used as the running byte offset into x;
 *   - each iteration loads, multiplies by v0 (vfmsb) and stores eight
 *     16-byte vectors, i.e. 128 bytes of x.
 *
 * The count is consumed by brctg with no zero check beforehand, so n is
 * expected to be a non-zero multiple of 32; the caller visible in this file
 * passes n & -32 and only when that value is > 0.
 */
static void sscal_kernel_32(BLASLONG n, FLOAT da, FLOAT *x)
{
__asm__ volatile (
"vlrepf %%v0,%1 \n\t"
"srlg %%r0,%0,5 \n\t"
static void sscal_kernel_32(BLASLONG n, FLOAT da, FLOAT *x) {
__asm__("vlrepf %%v0,%[da]\n\t"
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 2, 1024(%%r1,%2) \n\t"
"vl %%v24, 0(%%r1,%2) \n\t"
"pfd 2, 1024(%%r1,%[x])\n\t"
"vl %%v24,0(%%r1,%[x])\n\t"
"vfmsb %%v24,%%v24,%%v0\n\t"
"vst %%v24, 0(%%r1,%2) \n\t"
"vl %%v25, 16(%%r1,%2) \n\t"
"vst %%v24,0(%%r1,%[x])\n\t"
"vl %%v25,16(%%r1,%[x])\n\t"
"vfmsb %%v25,%%v25,%%v0\n\t"
"vst %%v25, 16(%%r1,%2) \n\t"
"vl %%v26, 32(%%r1,%2) \n\t"
"vst %%v25,16(%%r1,%[x])\n\t"
"vl %%v26,32(%%r1,%[x])\n\t"
"vfmsb %%v26,%%v26,%%v0\n\t"
"vst %%v26, 32(%%r1,%2) \n\t"
"vl %%v27, 48(%%r1,%2) \n\t"
"vst %%v26,32(%%r1,%[x])\n\t"
"vl %%v27,48(%%r1,%[x])\n\t"
"vfmsb %%v27,%%v27,%%v0\n\t"
"vst %%v27, 48(%%r1,%2) \n\t"
"vl %%v24, 64(%%r1,%2) \n\t"
"vfmsb %%v24,%%v24,%%v0 \n\t"
"vst %%v24, 64(%%r1,%2) \n\t"
"vl %%v25, 80(%%r1,%2) \n\t"
"vfmsb %%v25,%%v25,%%v0 \n\t"
"vst %%v25, 80(%%r1,%2) \n\t"
"vl %%v26, 96(%%r1,%2) \n\t"
"vfmsb %%v26,%%v26,%%v0 \n\t"
"vst %%v26, 96(%%r1,%2) \n\t"
"vl %%v27, 112(%%r1,%2) \n\t"
"vfmsb %%v27,%%v27,%%v0 \n\t"
"vst %%v27, 112(%%r1,%2) \n\t"
"vst %%v27,48(%%r1,%[x])\n\t"
"vl %%v28,64(%%r1,%[x])\n\t"
"vfmsb %%v28,%%v28,%%v0\n\t"
"vst %%v28,64(%%r1,%[x])\n\t"
"vl %%v29,80(%%r1,%[x])\n\t"
"vfmsb %%v29,%%v29,%%v0\n\t"
"vst %%v29,80(%%r1,%[x])\n\t"
"vl %%v30,96(%%r1,%[x])\n\t"
"vfmsb %%v30,%%v30,%%v0\n\t"
"vst %%v30,96(%%r1,%[x])\n\t"
"vl %%v31,112(%%r1,%[x])\n\t"
"vfmsb %%v31,%%v31,%%v0\n\t"
"vst %%v31,112(%%r1,%[x])\n\t"
/* Step the byte offset past the 128 bytes just scaled, then count down. */
"agfi %%r1,128\n\t"
"brctg %%r0,0b "
:
:"r"(n),"m"(da),"ZR"((FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v24","v25","v26","v27"
);
"brctg %[n],0b"
: "+m"(*(struct { FLOAT x[n]; } *) x),[n] "+&r"(n)
: [x] "a"(x),[da] "Q"(da)
: "cc", "r1", "v0", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
}
/*
 * sscal_kernel_32_zero: overwrite a contiguous vector x with zeros using
 * z/Architecture vector stores (the da == 0 path of the scaling routine
 * that calls it in this file).
 *
 * NOTE(review): this span looks like a diff rendering in which the
 * pre-change and post-change text of the function are interleaved without
 * +/- markers; both forms are kept verbatim below.  The positional-operand
 * form zeroes v24..v27, counts in r0 and uses a "memory" clobber; the
 * named-operand form zeroes only v0, counts down in n itself and declares
 * x[0..n) as a write-only "=m" output.
 *
 * Common to both forms: srlg shifts n right by 5 to get the iteration
 * count, r1 is cleared and used as the byte offset, and each iteration
 * issues eight 16-byte stores (128 bytes) before advancing r1 by 128.
 * Existing contents of x are never loaded.
 *
 * The count is consumed by brctg with no zero check beforehand, so n is
 * expected to be a non-zero multiple of 32; the caller visible in this file
 * passes n & -32 and only when that value is > 0.
 */
static void sscal_kernel_32_zero(BLASLONG n, FLOAT *x)
{
__asm__ volatile(
"vzero %%v24 \n\t"
"vzero %%v25 \n\t"
"vzero %%v26 \n\t"
"vzero %%v27 \n\t"
"srlg %%r0,%0,5 \n\t"
static void sscal_kernel_32_zero(BLASLONG n, FLOAT *x) {
__asm__("vzero %%v0\n\t"
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 2, 1024(%%r1,%1) \n\t"
"vst %%v24,0(%%r1,%1) \n\t"
"vst %%v25,16(%%r1,%1) \n\t"
"vst %%v26,32(%%r1,%1) \n\t"
"vst %%v27,48(%%r1,%1) \n\t"
"vst %%v24,64(%%r1,%1) \n\t"
"vst %%v25,80(%%r1,%1) \n\t"
"vst %%v26,96(%%r1,%1) \n\t"
"vst %%v27,112(%%r1,%1) \n\t"
"pfd 2, 1024(%%r1,%[x])\n\t"
"vst %%v0,0(%%r1,%[x])\n\t"
"vst %%v0,16(%%r1,%[x])\n\t"
"vst %%v0,32(%%r1,%[x])\n\t"
"vst %%v0,48(%%r1,%[x])\n\t"
"vst %%v0,64(%%r1,%[x])\n\t"
"vst %%v0,80(%%r1,%[x])\n\t"
"vst %%v0,96(%%r1,%[x])\n\t"
"vst %%v0,112(%%r1,%[x])\n\t"
/* Step the byte offset past the 128 bytes just cleared, then count down. */
"agfi %%r1,128\n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((FLOAT (*)[n])x)
:"memory","cc","r0","r1","v24","v25","v26","v27"
);
"brctg %[n],0b"
: "=m"(*(struct { FLOAT x[n]; } *) x),[n] "+&r"(n)
: [x] "a"(x)
: "cc", "r1", "v0");
}
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x,
BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
BLASLONG dummy2) {
BLASLONG i = 0, j = 0;
if (n <= 0 || inc_x <= 0)
return (0);
if (inc_x == 1) {
if ( inc_x == 1 )
{
if ( da == 0.0 )
{
if (da == 0.0) {
BLASLONG n1 = n & -32;
if ( n1 > 0 )
{
if (n1 > 0) {
sscal_kernel_32_zero(n1, x);
j = n1;
}
while(j < n)
{
while (j < n) {
x[j] = 0.0;
j++;
}
}
else
{
} else {
BLASLONG n1 = n & -32;
if ( n1 > 0 )
{
if (n1 > 0) {
sscal_kernel_32(n1, da, x);
j = n1;
}
while(j < n)
{
while (j < n) {
x[j] = da * x[j];
j++;
}
}
} else {
}
else
{
if ( da == 0.0 )
{
if (da == 0.0) {
BLASLONG n1 = n & -2;
@ -161,17 +139,14 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
j += 2;
}
while(j < n)
{
while (j < n) {
x[i] = 0.0;
i += inc_x;
j++;
}
}
else
{
} else {
BLASLONG n1 = n & -2;
while (j < n1) {
@ -184,8 +159,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
}
while(j < n)
{
while (j < n) {
x[i] = da * x[i];
i += inc_x;
@ -197,5 +171,3 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
return 0;
}

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -27,111 +27,105 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
/*
 * sswap_kernel_64: exchange the contents of two contiguous vectors x and y
 * using z/Architecture vector loads and stores.
 *
 * NOTE(review): this span looks like a diff rendering in which the
 * pre-change and post-change text of the function are interleaved without
 * +/- markers; both forms are kept verbatim below.  The positional-operand
 * form (%0, %1, %2) counts in r0 and uses a "memory" clobber; the
 * named-operand form (%[n], %[x], %[y]) counts down in n itself and
 * declares x[0..n) and y[0..n) as "+m" operands.
 *
 * Common to both forms, per loop iteration (r1 is the byte offset):
 *   1. load 256 bytes of x into v16..v31;
 *   2. copy y to x in two 128-byte halves, staged through v0..v7;
 *   3. store the saved x data (v16..v31) to y;
 *   4. advance r1 by 256.
 * srlg shifts n right by 6, so the loop runs n / 64 times.
 *
 * The count is consumed by brctg with no zero check beforehand, so n is
 * expected to be a non-zero multiple of 64; the caller visible in this file
 * passes n & -64 and only when that value is > 0.
 */
static void sswap_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y)
{
__asm__ volatile(
"srlg %%r0,%0,6 \n\t"
static void sswap_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y) {
__asm__("srlg %[n],%[n],6\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 2, 1024(%%r1,%1) \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"
"vl %%v16, 0(%%r1,%1) \n\t"
"vl %%v17, 16(%%r1,%1) \n\t"
"vl %%v18, 32(%%r1,%1) \n\t"
"vl %%v19, 48(%%r1,%1) \n\t"
"vl %%v20, 64(%%r1,%1) \n\t"
"vl %%v21, 80(%%r1,%1) \n\t"
"vl %%v22, 96(%%r1,%1) \n\t"
"vl %%v23, 112(%%r1,%1) \n\t"
"vl %%v24, 128(%%r1,%1) \n\t"
"vl %%v25, 144(%%r1,%1) \n\t"
"vl %%v26, 160(%%r1,%1) \n\t"
"vl %%v27, 176(%%r1,%1) \n\t"
"vl %%v28, 192(%%r1,%1) \n\t"
"vl %%v29, 208(%%r1,%1) \n\t"
"vl %%v30, 224(%%r1,%1) \n\t"
"vl %%v31, 240(%%r1,%1) \n\t"
"vl %%v0, 0(%%r1,%2) \n\t"
"vl %%v1, 16(%%r1,%2) \n\t"
"vl %%v2, 32(%%r1,%2) \n\t"
"vl %%v3, 48(%%r1,%2) \n\t"
"vl %%v4, 64(%%r1,%2) \n\t"
"vl %%v5, 80(%%r1,%2) \n\t"
"vl %%v6, 96(%%r1,%2) \n\t"
"vl %%v7, 112(%%r1,%2) \n\t"
"vst %%v0, 0(%%r1,%1) \n\t"
"vst %%v1, 16(%%r1,%1) \n\t"
"vst %%v2, 32(%%r1,%1) \n\t"
"vst %%v3, 48(%%r1,%1) \n\t"
"vst %%v4, 64(%%r1,%1) \n\t"
"vst %%v5, 80(%%r1,%1) \n\t"
"vst %%v6, 96(%%r1,%1) \n\t"
"vst %%v7, 112(%%r1,%1) \n\t"
"vl %%v0, 128(%%r1,%2) \n\t"
"vl %%v1, 144(%%r1,%2) \n\t"
"vl %%v2, 160(%%r1,%2) \n\t"
"vl %%v3, 176(%%r1,%2) \n\t"
"vl %%v4, 192(%%r1,%2) \n\t"
"vl %%v5, 208(%%r1,%2) \n\t"
"vl %%v6, 224(%%r1,%2) \n\t"
"vl %%v7, 240(%%r1,%2) \n\t"
"vst %%v0, 128(%%r1,%1) \n\t"
"vst %%v1, 144(%%r1,%1) \n\t"
"vst %%v2, 160(%%r1,%1) \n\t"
"vst %%v3, 176(%%r1,%1) \n\t"
"vst %%v4, 192(%%r1,%1) \n\t"
"vst %%v5, 208(%%r1,%1) \n\t"
"vst %%v6, 224(%%r1,%1) \n\t"
"vst %%v7, 240(%%r1,%1) \n\t"
"vst %%v16, 0(%%r1,%2) \n\t"
"vst %%v17, 16(%%r1,%2) \n\t"
"vst %%v18, 32(%%r1,%2) \n\t"
"vst %%v19, 48(%%r1,%2) \n\t"
"vst %%v20, 64(%%r1,%2) \n\t"
"vst %%v21, 80(%%r1,%2) \n\t"
"vst %%v22, 96(%%r1,%2) \n\t"
"vst %%v23, 112(%%r1,%2) \n\t"
"vst %%v24, 128(%%r1,%2) \n\t"
"vst %%v25, 144(%%r1,%2) \n\t"
"vst %%v26, 160(%%r1,%2) \n\t"
"vst %%v27, 176(%%r1,%2) \n\t"
"vst %%v28, 192(%%r1,%2) \n\t"
"vst %%v29, 208(%%r1,%2) \n\t"
"vst %%v30, 224(%%r1,%2) \n\t"
"vst %%v31, 240(%%r1,%2) \n\t"
"pfd 2, 1024(%%r1,%[x])\n\t"
"pfd 2, 1024(%%r1,%[y])\n\t"
/* Save the current 256-byte slice of x in v16..v31. */
"vl %%v16, 0(%%r1,%[x])\n\t"
"vl %%v17, 16(%%r1,%[x])\n\t"
"vl %%v18, 32(%%r1,%[x])\n\t"
"vl %%v19, 48(%%r1,%[x])\n\t"
"vl %%v20, 64(%%r1,%[x])\n\t"
"vl %%v21, 80(%%r1,%[x])\n\t"
"vl %%v22, 96(%%r1,%[x])\n\t"
"vl %%v23, 112(%%r1,%[x])\n\t"
"vl %%v24, 128(%%r1,%[x])\n\t"
"vl %%v25, 144(%%r1,%[x])\n\t"
"vl %%v26, 160(%%r1,%[x])\n\t"
"vl %%v27, 176(%%r1,%[x])\n\t"
"vl %%v28, 192(%%r1,%[x])\n\t"
"vl %%v29, 208(%%r1,%[x])\n\t"
"vl %%v30, 224(%%r1,%[x])\n\t"
"vl %%v31, 240(%%r1,%[x])\n\t"
/* Copy the first 128 bytes of the y slice into x via v0..v7. */
"vl %%v0, 0(%%r1,%[y])\n\t"
"vl %%v1, 16(%%r1,%[y])\n\t"
"vl %%v2, 32(%%r1,%[y])\n\t"
"vl %%v3, 48(%%r1,%[y])\n\t"
"vl %%v4, 64(%%r1,%[y])\n\t"
"vl %%v5, 80(%%r1,%[y])\n\t"
"vl %%v6, 96(%%r1,%[y])\n\t"
"vl %%v7, 112(%%r1,%[y])\n\t"
"vst %%v0, 0(%%r1,%[x])\n\t"
"vst %%v1, 16(%%r1,%[x])\n\t"
"vst %%v2, 32(%%r1,%[x])\n\t"
"vst %%v3, 48(%%r1,%[x])\n\t"
"vst %%v4, 64(%%r1,%[x])\n\t"
"vst %%v5, 80(%%r1,%[x])\n\t"
"vst %%v6, 96(%%r1,%[x])\n\t"
"vst %%v7, 112(%%r1,%[x])\n\t"
/* Copy the second 128 bytes of the y slice into x, reusing v0..v7. */
"vl %%v0, 128(%%r1,%[y])\n\t"
"vl %%v1, 144(%%r1,%[y])\n\t"
"vl %%v2, 160(%%r1,%[y])\n\t"
"vl %%v3, 176(%%r1,%[y])\n\t"
"vl %%v4, 192(%%r1,%[y])\n\t"
"vl %%v5, 208(%%r1,%[y])\n\t"
"vl %%v6, 224(%%r1,%[y])\n\t"
"vl %%v7, 240(%%r1,%[y])\n\t"
"vst %%v0, 128(%%r1,%[x])\n\t"
"vst %%v1, 144(%%r1,%[x])\n\t"
"vst %%v2, 160(%%r1,%[x])\n\t"
"vst %%v3, 176(%%r1,%[x])\n\t"
"vst %%v4, 192(%%r1,%[x])\n\t"
"vst %%v5, 208(%%r1,%[x])\n\t"
"vst %%v6, 224(%%r1,%[x])\n\t"
"vst %%v7, 240(%%r1,%[x])\n\t"
/* Write the saved x data out to y, completing the exchange. */
"vst %%v16, 0(%%r1,%[y])\n\t"
"vst %%v17, 16(%%r1,%[y])\n\t"
"vst %%v18, 32(%%r1,%[y])\n\t"
"vst %%v19, 48(%%r1,%[y])\n\t"
"vst %%v20, 64(%%r1,%[y])\n\t"
"vst %%v21, 80(%%r1,%[y])\n\t"
"vst %%v22, 96(%%r1,%[y])\n\t"
"vst %%v23, 112(%%r1,%[y])\n\t"
"vst %%v24, 128(%%r1,%[y])\n\t"
"vst %%v25, 144(%%r1,%[y])\n\t"
"vst %%v26, 160(%%r1,%[y])\n\t"
"vst %%v27, 176(%%r1,%[y])\n\t"
"vst %%v28, 192(%%r1,%[y])\n\t"
"vst %%v29, 208(%%r1,%[y])\n\t"
"vst %%v30, 224(%%r1,%[y])\n\t"
"vst %%v31, 240(%%r1,%[y])\n\t"
/* Step the byte offset past the 256 bytes just swapped, then count down. */
"agfi %%r1,256\n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
"brctg %[n],0b"
: "+m"(*(struct { FLOAT x[n]; } *) x), "+m"(*(struct { FLOAT x[n]; } *) y),
[n] "+&r"(n)
: [x] "a"(x),[y] "a"(y)
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");
}
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,
BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
BLASLONG dummy2) {
BLASLONG i = 0;
BLASLONG ix = 0, iy = 0;
FLOAT temp;
if ( n <= 0 ) return(0);
if (n <= 0)
return (0);
if ( (inc_x == 1) && (inc_y == 1 ))
{
if ((inc_x == 1) && (inc_y == 1)) {
BLASLONG n1 = n & -64;
if ( n1 > 0 )
{
if (n1 > 0) {
sswap_kernel_64(n1, x, y);
i = n1;
}
while(i < n)
{
while (i < n) {
temp = y[i];
y[i] = x[i];
x[i] = temp;
@ -139,13 +133,9 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,
}
} else {
}
else
{
while(i < n)
{
while (i < n) {
temp = y[iy];
y[iy] = x[ix];
x[ix] = temp;
@ -158,7 +148,4 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,
}
return (0);
}

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -28,64 +28,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
#define CABS1(x,i) (fabs(x[i]) + fabs(x[i + 1]))
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))
static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x)
{
static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x) {
FLOAT amax;
__asm__ volatile (
"vleg %%v0,0(%2),0 \n\t"
"vleg %%v16,8(%2),0 \n\t"
"vleg %%v0,16(%2),1 \n\t"
"vleg %%v16,24(%2),1 \n\t"
__asm__("vleg %%v0,0(%[x]),0\n\t"
"vleg %%v16,8(%[x]),0\n\t"
"vleg %%v0,16(%[x]),1\n\t"
"vleg %%v16,24(%[x]),1\n\t"
"vflpdb %%v0,%%v0\n\t"
"vflpdb %%v16,%%v16\n\t"
"vfadb %%v0,%%v0,%%v16\n\t"
"srlg %%r0,%1,4 \n\t"
"srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%2) \n\t"
"vleg %%v16,0(%%r1,%2),0 \n\t"
"vleg %%v17,8(%%r1,%2),0 \n\t"
"vleg %%v16,16(%%r1,%2),1 \n\t"
"vleg %%v17,24(%%r1,%2),1 \n\t"
"vleg %%v18,32(%%r1,%2),0 \n\t"
"vleg %%v19,40(%%r1,%2),0 \n\t"
"vleg %%v18,48(%%r1,%2),1 \n\t"
"vleg %%v19,56(%%r1,%2),1 \n\t"
"vleg %%v20,64(%%r1,%2),0 \n\t"
"vleg %%v21,72(%%r1,%2),0 \n\t"
"vleg %%v20,80(%%r1,%2),1 \n\t"
"vleg %%v21,88(%%r1,%2),1 \n\t"
"vleg %%v22,96(%%r1,%2),0 \n\t"
"vleg %%v23,104(%%r1,%2),0 \n\t"
"vleg %%v22,112(%%r1,%2),1 \n\t"
"vleg %%v23,120(%%r1,%2),1 \n\t"
"vleg %%v24,128(%%r1,%2),0 \n\t"
"vleg %%v25,136(%%r1,%2),0 \n\t"
"vleg %%v24,144(%%r1,%2),1 \n\t"
"vleg %%v25,152(%%r1,%2),1 \n\t"
"vleg %%v26,160(%%r1,%2),0 \n\t"
"vleg %%v27,168(%%r1,%2),0 \n\t"
"vleg %%v26,176(%%r1,%2),1 \n\t"
"vleg %%v27,184(%%r1,%2),1 \n\t"
"vleg %%v28,192(%%r1,%2),0 \n\t"
"vleg %%v29,200(%%r1,%2),0 \n\t"
"vleg %%v28,208(%%r1,%2),1 \n\t"
"vleg %%v29,216(%%r1,%2),1 \n\t"
"vleg %%v30,224(%%r1,%2),0 \n\t"
"vleg %%v31,232(%%r1,%2),0 \n\t"
"vleg %%v30,240(%%r1,%2),1 \n\t"
"vleg %%v31,248(%%r1,%2),1 \n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vleg %%v16,0(%%r1,%[x]),0\n\t"
"vleg %%v17,8(%%r1,%[x]),0\n\t"
"vleg %%v16,16(%%r1,%[x]),1\n\t"
"vleg %%v17,24(%%r1,%[x]),1\n\t"
"vleg %%v18,32(%%r1,%[x]),0\n\t"
"vleg %%v19,40(%%r1,%[x]),0\n\t"
"vleg %%v18,48(%%r1,%[x]),1\n\t"
"vleg %%v19,56(%%r1,%[x]),1\n\t"
"vleg %%v20,64(%%r1,%[x]),0\n\t"
"vleg %%v21,72(%%r1,%[x]),0\n\t"
"vleg %%v20,80(%%r1,%[x]),1\n\t"
"vleg %%v21,88(%%r1,%[x]),1\n\t"
"vleg %%v22,96(%%r1,%[x]),0\n\t"
"vleg %%v23,104(%%r1,%[x]),0\n\t"
"vleg %%v22,112(%%r1,%[x]),1\n\t"
"vleg %%v23,120(%%r1,%[x]),1\n\t"
"vleg %%v24,128(%%r1,%[x]),0\n\t"
"vleg %%v25,136(%%r1,%[x]),0\n\t"
"vleg %%v24,144(%%r1,%[x]),1\n\t"
"vleg %%v25,152(%%r1,%[x]),1\n\t"
"vleg %%v26,160(%%r1,%[x]),0\n\t"
"vleg %%v27,168(%%r1,%[x]),0\n\t"
"vleg %%v26,176(%%r1,%[x]),1\n\t"
"vleg %%v27,184(%%r1,%[x]),1\n\t"
"vleg %%v28,192(%%r1,%[x]),0\n\t"
"vleg %%v29,200(%%r1,%[x]),0\n\t"
"vleg %%v28,208(%%r1,%[x]),1\n\t"
"vleg %%v29,216(%%r1,%[x]),1\n\t"
"vleg %%v30,224(%%r1,%[x]),0\n\t"
"vleg %%v31,232(%%r1,%[x]),0\n\t"
"vleg %%v30,240(%%r1,%[x]),1\n\t"
"vleg %%v31,248(%%r1,%[x]),1\n\t"
"vflpdb %%v16,%%v16\n\t"
"vflpdb %%v17,%%v17\n\t"
"vflpdb %%v18,%%v18\n\t"
@ -102,7 +92,6 @@ static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x)
"vflpdb %%v29,%%v29\n\t"
"vflpdb %%v30,%%v30\n\t"
"vflpdb %%v31,%%v31\n\t"
"vfadb %%v16,%%v16,%%v17\n\t"
"vfadb %%v18,%%v18,%%v19\n\t"
"vfadb %%v20,%%v20,%%v21\n\t"
@ -111,29 +100,23 @@ static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x)
"vfadb %%v26,%%v26,%%v27\n\t"
"vfadb %%v28,%%v28,%%v29\n\t"
"vfadb %%v30,%%v30,%%v31\n\t"
"vfmaxdb %%v16,%%v16,%%v24,0\n\t"
"vfmaxdb %%v18,%%v18,%%v26,0\n\t"
"vfmaxdb %%v20,%%v20,%%v28,0\n\t"
"vfmaxdb %%v22,%%v22,%%v30,0\n\t"
"vfmaxdb %%v16,%%v16,%%v20,0\n\t"
"vfmaxdb %%v18,%%v18,%%v22,0\n\t"
"vfmaxdb %%v16,%%v16,%%v18,0\n\t"
"vfmaxdb %%v0,%%v0,%%v16,0\n\t"
"agfi %%r1, 256\n\t"
"brctg %%r0, 0b \n\t"
"brctg %[n], 0b\n\t"
"vrepg %%v16,%%v0,1\n\t"
"wfmaxdb %%v0,%%v0,%%v16,0\n\t"
"ldr %0,%%f0 "
:"=f"(amax)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
"ldr %[amax],%%f0"
: [amax] "=f"(amax),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
return amax;
}
@ -144,7 +127,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
FLOAT maxf = 0.0;
BLASLONG inc_x2;
if (n <= 0 || inc_x <= 0) return (maxf);
if (n <= 0 || inc_x <= 0)
return (maxf);
if (inc_x == 1) {
@ -154,9 +138,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
maxf = zamax_kernel_16(n1, x);
ix = n1 * 2;
i = n1;
}
else
{
} else {
maxf = CABS1(x, 0);
ix += 2;
i++;
@ -198,7 +180,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
}
while (i < n) {
if (CABS1(x, ix) > maxf) {
maxf = CABS1(x, ix);

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -28,47 +28,38 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
#define CABS1(x,i) (fabs(x[i]) + fabs(x[i + 1]))
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))
static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x)
{
static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x) {
FLOAT amax;
__asm__ volatile (
"vleg %%v0,0(%2),0 \n\t"
"vleg %%v16,8(%2),0 \n\t"
"vleg %%v0,16(%2),1 \n\t"
"vleg %%v16,24(%2),1 \n\t"
__asm__("vleg %%v0,0(%[x]),0\n\t"
"vleg %%v16,8(%[x]),0\n\t"
"vleg %%v0,16(%[x]),1\n\t"
"vleg %%v16,24(%[x]),1\n\t"
"vflpdb %%v0,%%v0\n\t"
"vflpdb %%v16,%%v16\n\t"
"vfadb %%v0,%%v0,%%v16\n\t"
"srlg %%r0,%1,4 \n\t"
"srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%2) \n\t"
"vleg %%v16,0(%%r1,%2),0 \n\t"
"vleg %%v17,8(%%r1,%2),0 \n\t"
"vleg %%v16,16(%%r1,%2),1 \n\t"
"vleg %%v17,24(%%r1,%2),1 \n\t"
"vleg %%v18,32(%%r1,%2),0 \n\t"
"vleg %%v19,40(%%r1,%2),0 \n\t"
"vleg %%v18,48(%%r1,%2),1 \n\t"
"vleg %%v19,56(%%r1,%2),1 \n\t"
"vleg %%v20,64(%%r1,%2),0 \n\t"
"vleg %%v21,72(%%r1,%2),0 \n\t"
"vleg %%v20,80(%%r1,%2),1 \n\t"
"vleg %%v21,88(%%r1,%2),1 \n\t"
"vleg %%v22,96(%%r1,%2),0 \n\t"
"vleg %%v23,104(%%r1,%2),0 \n\t"
"vleg %%v22,112(%%r1,%2),1 \n\t"
"vleg %%v23,120(%%r1,%2),1 \n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vleg %%v16,0(%%r1,%[x]),0\n\t"
"vleg %%v17,8(%%r1,%[x]),0\n\t"
"vleg %%v16,16(%%r1,%[x]),1\n\t"
"vleg %%v17,24(%%r1,%[x]),1\n\t"
"vleg %%v18,32(%%r1,%[x]),0\n\t"
"vleg %%v19,40(%%r1,%[x]),0\n\t"
"vleg %%v18,48(%%r1,%[x]),1\n\t"
"vleg %%v19,56(%%r1,%[x]),1\n\t"
"vleg %%v20,64(%%r1,%[x]),0\n\t"
"vleg %%v21,72(%%r1,%[x]),0\n\t"
"vleg %%v20,80(%%r1,%[x]),1\n\t"
"vleg %%v21,88(%%r1,%[x]),1\n\t"
"vleg %%v22,96(%%r1,%[x]),0\n\t"
"vleg %%v23,104(%%r1,%[x]),0\n\t"
"vleg %%v22,112(%%r1,%[x]),1\n\t"
"vleg %%v23,120(%%r1,%[x]),1\n\t"
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
@ -81,34 +72,30 @@ static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x)
"vfadb %%v17,%%v18,%%v19\n\t"
"vfadb %%v18,%%v20,%%v21\n\t"
"vfadb %%v19,%%v22,%%v23\n\t"
"vfchdb %%v24,%%v16,%%v17\n\t"
"vfchdb %%v25,%%v18,%%v19\n\t"
"vsel %%v24,%%v16,%%v17,%%v24\n\t"
"vsel %%v25,%%v18,%%v19,%%v25\n\t"
"vfchdb %%v26,%%v24,%%v25\n\t"
"vsel %%v26,%%v24,%%v25,%%v26\n\t"
"vfchdb %%v27,%%v26,%%v0\n\t"
"vsel %%v0,%%v26,%%v0,%%v27\n\t"
"vleg %%v16,128(%%r1,%2),0 \n\t"
"vleg %%v17,136(%%r1,%2),0 \n\t"
"vleg %%v16,144(%%r1,%2),1 \n\t"
"vleg %%v17,152(%%r1,%2),1 \n\t"
"vleg %%v18,160(%%r1,%2),0 \n\t"
"vleg %%v19,168(%%r1,%2),0 \n\t"
"vleg %%v18,176(%%r1,%2),1 \n\t"
"vleg %%v19,184(%%r1,%2),1 \n\t"
"vleg %%v20,192(%%r1,%2),0 \n\t"
"vleg %%v21,200(%%r1,%2),0 \n\t"
"vleg %%v20,208(%%r1,%2),1 \n\t"
"vleg %%v21,216(%%r1,%2),1 \n\t"
"vleg %%v22,224(%%r1,%2),0 \n\t"
"vleg %%v23,232(%%r1,%2),0 \n\t"
"vleg %%v22,240(%%r1,%2),1 \n\t"
"vleg %%v23,248(%%r1,%2),1 \n\t"
"vleg %%v16,128(%%r1,%[x]),0\n\t"
"vleg %%v17,136(%%r1,%[x]),0\n\t"
"vleg %%v16,144(%%r1,%[x]),1\n\t"
"vleg %%v17,152(%%r1,%[x]),1\n\t"
"vleg %%v18,160(%%r1,%[x]),0\n\t"
"vleg %%v19,168(%%r1,%[x]),0\n\t"
"vleg %%v18,176(%%r1,%[x]),1\n\t"
"vleg %%v19,184(%%r1,%[x]),1\n\t"
"vleg %%v20,192(%%r1,%[x]),0\n\t"
"vleg %%v21,200(%%r1,%[x]),0\n\t"
"vleg %%v20,208(%%r1,%[x]),1\n\t"
"vleg %%v21,216(%%r1,%[x]),1\n\t"
"vleg %%v22,224(%%r1,%[x]),0\n\t"
"vleg %%v23,232(%%r1,%[x]),0\n\t"
"vleg %%v22,240(%%r1,%[x]),1\n\t"
"vleg %%v23,248(%%r1,%[x]),1\n\t"
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
@ -121,29 +108,24 @@ static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x)
"vfadb %%v17,%%v18,%%v19\n\t"
"vfadb %%v18,%%v20,%%v21\n\t"
"vfadb %%v19,%%v22,%%v23\n\t"
"vfchdb %%v24,%%v16,%%v17\n\t"
"vfchdb %%v25,%%v18,%%v19\n\t"
"vsel %%v24,%%v16,%%v17,%%v24\n\t"
"vsel %%v25,%%v18,%%v19,%%v25\n\t"
"vfchdb %%v26,%%v24,%%v25\n\t"
"vsel %%v26,%%v24,%%v25,%%v26\n\t"
"vfchdb %%v27,%%v26,%%v0\n\t"
"vsel %%v0,%%v26,%%v0,%%v27\n\t"
"agfi %%r1, 256\n\t"
"brctg %%r0, 0b \n\t"
"brctg %[n], 0b\n\t"
"vrepg %%v16,%%v0,1\n\t"
"wfchdb %%v17,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v17\n\t"
"ldr %0,%%f0 "
:"=f"(amax)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27"
);
"ldr %[amax],%%f0"
: [amax] "=f"(amax),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23", "v24", "v25", "v26", "v27");
return amax;
}
@ -154,7 +136,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
FLOAT maxf = 0.0;
BLASLONG inc_x2;
if (n <= 0 || inc_x <= 0) return (maxf);
if (n <= 0 || inc_x <= 0)
return (maxf);
if (inc_x == 1) {
@ -164,9 +147,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
maxf = zamax_kernel_16(n1, x);
ix = n1 * 2;
i = n1;
}
else
{
} else {
maxf = CABS1(x, 0);
ix += 2;
i++;
@ -208,7 +189,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
}
while (i < n) {
if (CABS1(x, ix) > maxf) {
maxf = CABS1(x, ix);

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -28,64 +28,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
#define CABS1(x,i) (fabs(x[i]) + fabs(x[i + 1]))
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))
static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x)
{
static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x) {
FLOAT amin;
__asm__ volatile (
"vleg %%v0,0(%2),0 \n\t"
"vleg %%v16,8(%2),0 \n\t"
"vleg %%v0,16(%2),1 \n\t"
"vleg %%v16,24(%2),1 \n\t"
__asm__("vleg %%v0,0(%[x]),0\n\t"
"vleg %%v16,8(%[x]),0\n\t"
"vleg %%v0,16(%[x]),1\n\t"
"vleg %%v16,24(%[x]),1\n\t"
"vflpdb %%v0,%%v0\n\t"
"vflpdb %%v16,%%v16\n\t"
"vfadb %%v0,%%v0,%%v16\n\t"
"srlg %%r0,%1,4 \n\t"
"srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%2) \n\t"
"vleg %%v16,0(%%r1,%2),0 \n\t"
"vleg %%v17,8(%%r1,%2),0 \n\t"
"vleg %%v16,16(%%r1,%2),1 \n\t"
"vleg %%v17,24(%%r1,%2),1 \n\t"
"vleg %%v18,32(%%r1,%2),0 \n\t"
"vleg %%v19,40(%%r1,%2),0 \n\t"
"vleg %%v18,48(%%r1,%2),1 \n\t"
"vleg %%v19,56(%%r1,%2),1 \n\t"
"vleg %%v20,64(%%r1,%2),0 \n\t"
"vleg %%v21,72(%%r1,%2),0 \n\t"
"vleg %%v20,80(%%r1,%2),1 \n\t"
"vleg %%v21,88(%%r1,%2),1 \n\t"
"vleg %%v22,96(%%r1,%2),0 \n\t"
"vleg %%v23,104(%%r1,%2),0 \n\t"
"vleg %%v22,112(%%r1,%2),1 \n\t"
"vleg %%v23,120(%%r1,%2),1 \n\t"
"vleg %%v24,128(%%r1,%2),0 \n\t"
"vleg %%v25,136(%%r1,%2),0 \n\t"
"vleg %%v24,144(%%r1,%2),1 \n\t"
"vleg %%v25,152(%%r1,%2),1 \n\t"
"vleg %%v26,160(%%r1,%2),0 \n\t"
"vleg %%v27,168(%%r1,%2),0 \n\t"
"vleg %%v26,176(%%r1,%2),1 \n\t"
"vleg %%v27,184(%%r1,%2),1 \n\t"
"vleg %%v28,192(%%r1,%2),0 \n\t"
"vleg %%v29,200(%%r1,%2),0 \n\t"
"vleg %%v28,208(%%r1,%2),1 \n\t"
"vleg %%v29,216(%%r1,%2),1 \n\t"
"vleg %%v30,224(%%r1,%2),0 \n\t"
"vleg %%v31,232(%%r1,%2),0 \n\t"
"vleg %%v30,240(%%r1,%2),1 \n\t"
"vleg %%v31,248(%%r1,%2),1 \n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vleg %%v16,0(%%r1,%[x]),0\n\t"
"vleg %%v17,8(%%r1,%[x]),0\n\t"
"vleg %%v16,16(%%r1,%[x]),1\n\t"
"vleg %%v17,24(%%r1,%[x]),1\n\t"
"vleg %%v18,32(%%r1,%[x]),0\n\t"
"vleg %%v19,40(%%r1,%[x]),0\n\t"
"vleg %%v18,48(%%r1,%[x]),1\n\t"
"vleg %%v19,56(%%r1,%[x]),1\n\t"
"vleg %%v20,64(%%r1,%[x]),0\n\t"
"vleg %%v21,72(%%r1,%[x]),0\n\t"
"vleg %%v20,80(%%r1,%[x]),1\n\t"
"vleg %%v21,88(%%r1,%[x]),1\n\t"
"vleg %%v22,96(%%r1,%[x]),0\n\t"
"vleg %%v23,104(%%r1,%[x]),0\n\t"
"vleg %%v22,112(%%r1,%[x]),1\n\t"
"vleg %%v23,120(%%r1,%[x]),1\n\t"
"vleg %%v24,128(%%r1,%[x]),0\n\t"
"vleg %%v25,136(%%r1,%[x]),0\n\t"
"vleg %%v24,144(%%r1,%[x]),1\n\t"
"vleg %%v25,152(%%r1,%[x]),1\n\t"
"vleg %%v26,160(%%r1,%[x]),0\n\t"
"vleg %%v27,168(%%r1,%[x]),0\n\t"
"vleg %%v26,176(%%r1,%[x]),1\n\t"
"vleg %%v27,184(%%r1,%[x]),1\n\t"
"vleg %%v28,192(%%r1,%[x]),0\n\t"
"vleg %%v29,200(%%r1,%[x]),0\n\t"
"vleg %%v28,208(%%r1,%[x]),1\n\t"
"vleg %%v29,216(%%r1,%[x]),1\n\t"
"vleg %%v30,224(%%r1,%[x]),0\n\t"
"vleg %%v31,232(%%r1,%[x]),0\n\t"
"vleg %%v30,240(%%r1,%[x]),1\n\t"
"vleg %%v31,248(%%r1,%[x]),1\n\t"
"vflpdb %%v16,%%v16\n\t"
"vflpdb %%v17,%%v17\n\t"
"vflpdb %%v18,%%v18\n\t"
@ -102,7 +92,6 @@ static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x)
"vflpdb %%v29,%%v29\n\t"
"vflpdb %%v30,%%v30\n\t"
"vflpdb %%v31,%%v31\n\t"
"vfadb %%v16,%%v16,%%v17\n\t"
"vfadb %%v18,%%v18,%%v19\n\t"
"vfadb %%v20,%%v20,%%v21\n\t"
@ -111,29 +100,23 @@ static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x)
"vfadb %%v26,%%v26,%%v27\n\t"
"vfadb %%v28,%%v28,%%v29\n\t"
"vfadb %%v30,%%v30,%%v31\n\t"
"vfmindb %%v16,%%v16,%%v24,0\n\t"
"vfmindb %%v18,%%v18,%%v26,0\n\t"
"vfmindb %%v20,%%v20,%%v28,0\n\t"
"vfmindb %%v22,%%v22,%%v30,0\n\t"
"vfmindb %%v16,%%v16,%%v20,0\n\t"
"vfmindb %%v18,%%v18,%%v22,0\n\t"
"vfmindb %%v16,%%v16,%%v18,0\n\t"
"vfmindb %%v0,%%v0,%%v16,0\n\t"
"agfi %%r1, 256\n\t"
"brctg %%r0, 0b \n\t"
"brctg %[n], 0b\n\t"
"vrepg %%v16,%%v0,1\n\t"
"wfmindb %%v0,%%v0,%%v16,0\n\t"
"ldr %0,%%f0 "
:"=f"(amin)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
"ldr %[amin],%%f0"
: [amin] "=f"(amin),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
return amin;
}
@ -144,7 +127,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
FLOAT minf = 0.0;
BLASLONG inc_x2;
if (n <= 0 || inc_x <= 0) return (minf);
if (n <= 0 || inc_x <= 0)
return (minf);
if (inc_x == 1) {
@ -154,9 +138,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
minf = zamin_kernel_16(n1, x);
ix = n1 * 2;
i = n1;
}
else
{
} else {
minf = CABS1(x, 0);
ix += 2;
i++;
@ -198,7 +180,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
}
while (i < n) {
if (CABS1(x, ix) < minf) {
minf = CABS1(x, ix);

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -28,47 +28,38 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
#define CABS1(x,i) (fabs(x[i]) + fabs(x[i + 1]))
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))
static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x)
{
static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x) {
FLOAT amin;
__asm__ volatile (
"vleg %%v0,0(%2),0 \n\t"
"vleg %%v16,8(%2),0 \n\t"
"vleg %%v0,16(%2),1 \n\t"
"vleg %%v16,24(%2),1 \n\t"
__asm__("vleg %%v0,0(%[x]),0\n\t"
"vleg %%v16,8(%[x]),0\n\t"
"vleg %%v0,16(%[x]),1\n\t"
"vleg %%v16,24(%[x]),1\n\t"
"vflpdb %%v0,%%v0\n\t"
"vflpdb %%v16,%%v16\n\t"
"vfadb %%v0,%%v0,%%v16\n\t"
"srlg %%r0,%1,4 \n\t"
"srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%2) \n\t"
"vleg %%v16,0(%%r1,%2),0 \n\t"
"vleg %%v17,8(%%r1,%2),0 \n\t"
"vleg %%v16,16(%%r1,%2),1 \n\t"
"vleg %%v17,24(%%r1,%2),1 \n\t"
"vleg %%v18,32(%%r1,%2),0 \n\t"
"vleg %%v19,40(%%r1,%2),0 \n\t"
"vleg %%v18,48(%%r1,%2),1 \n\t"
"vleg %%v19,56(%%r1,%2),1 \n\t"
"vleg %%v20,64(%%r1,%2),0 \n\t"
"vleg %%v21,72(%%r1,%2),0 \n\t"
"vleg %%v20,80(%%r1,%2),1 \n\t"
"vleg %%v21,88(%%r1,%2),1 \n\t"
"vleg %%v22,96(%%r1,%2),0 \n\t"
"vleg %%v23,104(%%r1,%2),0 \n\t"
"vleg %%v22,112(%%r1,%2),1 \n\t"
"vleg %%v23,120(%%r1,%2),1 \n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vleg %%v16,0(%%r1,%[x]),0\n\t"
"vleg %%v17,8(%%r1,%[x]),0\n\t"
"vleg %%v16,16(%%r1,%[x]),1\n\t"
"vleg %%v17,24(%%r1,%[x]),1\n\t"
"vleg %%v18,32(%%r1,%[x]),0\n\t"
"vleg %%v19,40(%%r1,%[x]),0\n\t"
"vleg %%v18,48(%%r1,%[x]),1\n\t"
"vleg %%v19,56(%%r1,%[x]),1\n\t"
"vleg %%v20,64(%%r1,%[x]),0\n\t"
"vleg %%v21,72(%%r1,%[x]),0\n\t"
"vleg %%v20,80(%%r1,%[x]),1\n\t"
"vleg %%v21,88(%%r1,%[x]),1\n\t"
"vleg %%v22,96(%%r1,%[x]),0\n\t"
"vleg %%v23,104(%%r1,%[x]),0\n\t"
"vleg %%v22,112(%%r1,%[x]),1\n\t"
"vleg %%v23,120(%%r1,%[x]),1\n\t"
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
@ -81,34 +72,30 @@ static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x)
"vfadb %%v17,%%v18,%%v19\n\t"
"vfadb %%v18,%%v20,%%v21\n\t"
"vfadb %%v19,%%v22,%%v23\n\t"
"vfchdb %%v24,%%v17,%%v16\n\t"
"vfchdb %%v25,%%v19,%%v18\n\t"
"vsel %%v24,%%v16,%%v17,%%v24\n\t"
"vsel %%v25,%%v18,%%v19,%%v25\n\t"
"vfchdb %%v26,%%v25,%%v24\n\t"
"vsel %%v26,%%v24,%%v25,%%v26\n\t"
"vfchdb %%v27,%%v0,%%v26\n\t"
"vsel %%v0,%%v26,%%v0,%%v27\n\t"
"vleg %%v16,128(%%r1,%2),0 \n\t"
"vleg %%v17,136(%%r1,%2),0 \n\t"
"vleg %%v16,144(%%r1,%2),1 \n\t"
"vleg %%v17,152(%%r1,%2),1 \n\t"
"vleg %%v18,160(%%r1,%2),0 \n\t"
"vleg %%v19,168(%%r1,%2),0 \n\t"
"vleg %%v18,176(%%r1,%2),1 \n\t"
"vleg %%v19,184(%%r1,%2),1 \n\t"
"vleg %%v20,192(%%r1,%2),0 \n\t"
"vleg %%v21,200(%%r1,%2),0 \n\t"
"vleg %%v20,208(%%r1,%2),1 \n\t"
"vleg %%v21,216(%%r1,%2),1 \n\t"
"vleg %%v22,224(%%r1,%2),0 \n\t"
"vleg %%v23,232(%%r1,%2),0 \n\t"
"vleg %%v22,240(%%r1,%2),1 \n\t"
"vleg %%v23,248(%%r1,%2),1 \n\t"
"vleg %%v16,128(%%r1,%[x]),0\n\t"
"vleg %%v17,136(%%r1,%[x]),0\n\t"
"vleg %%v16,144(%%r1,%[x]),1\n\t"
"vleg %%v17,152(%%r1,%[x]),1\n\t"
"vleg %%v18,160(%%r1,%[x]),0\n\t"
"vleg %%v19,168(%%r1,%[x]),0\n\t"
"vleg %%v18,176(%%r1,%[x]),1\n\t"
"vleg %%v19,184(%%r1,%[x]),1\n\t"
"vleg %%v20,192(%%r1,%[x]),0\n\t"
"vleg %%v21,200(%%r1,%[x]),0\n\t"
"vleg %%v20,208(%%r1,%[x]),1\n\t"
"vleg %%v21,216(%%r1,%[x]),1\n\t"
"vleg %%v22,224(%%r1,%[x]),0\n\t"
"vleg %%v23,232(%%r1,%[x]),0\n\t"
"vleg %%v22,240(%%r1,%[x]),1\n\t"
"vleg %%v23,248(%%r1,%[x]),1\n\t"
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
@ -121,29 +108,24 @@ static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x)
"vfadb %%v17,%%v18,%%v19\n\t"
"vfadb %%v18,%%v20,%%v21\n\t"
"vfadb %%v19,%%v22,%%v23\n\t"
"vfchdb %%v24,%%v17,%%v16\n\t"
"vfchdb %%v25,%%v19,%%v18\n\t"
"vsel %%v24,%%v16,%%v17,%%v24\n\t"
"vsel %%v25,%%v18,%%v19,%%v25\n\t"
"vfchdb %%v26,%%v25,%%v24\n\t"
"vsel %%v26,%%v24,%%v25,%%v26\n\t"
"vfchdb %%v27,%%v0,%%v26\n\t"
"vsel %%v0,%%v26,%%v0,%%v27\n\t"
"agfi %%r1, 256\n\t"
"brctg %%r0, 0b \n\t"
"brctg %[n], 0b\n\t"
"vrepg %%v16,%%v0,1\n\t"
"wfchdb %%v17,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v17\n\t"
"ldr %0,%%f0 "
:"=f"(amin)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27"
);
"ldr %[amin],%%f0"
: [amin] "=f"(amin),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23", "v24", "v25", "v26", "v27");
return amin;
}
@ -154,7 +136,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
FLOAT minf = 0.0;
BLASLONG inc_x2;
if (n <= 0 || inc_x <= 0) return (minf);
if (n <= 0 || inc_x <= 0)
return (minf);
if (inc_x == 1) {
@ -164,9 +147,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
minf = zamin_kernel_16(n1, x);
ix = n1 * 2;
i = n1;
}
else
{
} else {
minf = CABS1(x, 0);
ix += 2;
i++;
@ -208,7 +189,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
}
while (i < n) {
if (CABS1(x, ix) < minf) {
minf = CABS1(x, ix);

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -28,34 +28,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x)
{
static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x) {
FLOAT asum;
__asm__ (
"vzero %%v0 \n\t"
"vzero %%v1 \n\t"
"vzero %%v2 \n\t"
"vzero %%v3 \n\t"
"srlg %%r0,%1,4 \n\t"
__asm__("vzero %%v24\n\t"
"vzero %%v25\n\t"
"vzero %%v26\n\t"
"vzero %%v27\n\t"
"vzero %%v28\n\t"
"vzero %%v29\n\t"
"vzero %%v30\n\t"
"vzero %%v31\n\t"
"srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%2) \n\t"
"vl %%v16, 0(%%r1,%2) \n\t"
"vl %%v17, 16(%%r1,%2) \n\t"
"vl %%v18, 32(%%r1,%2) \n\t"
"vl %%v19, 48(%%r1,%2) \n\t"
"vl %%v20, 64(%%r1,%2) \n\t"
"vl %%v21, 80(%%r1,%2) \n\t"
"vl %%v22, 96(%%r1,%2) \n\t"
"vl %%v23, 112(%%r1,%2) \n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16, 0(%%r1,%[x])\n\t"
"vl %%v17, 16(%%r1,%[x])\n\t"
"vl %%v18, 32(%%r1,%[x])\n\t"
"vl %%v19, 48(%%r1,%[x])\n\t"
"vl %%v20, 64(%%r1,%[x])\n\t"
"vl %%v21, 80(%%r1,%[x])\n\t"
"vl %%v22, 96(%%r1,%[x])\n\t"
"vl %%v23, 112(%%r1,%[x])\n\t"
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
@ -64,25 +61,22 @@ static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x)
"vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t"
"vfadb %%v0,%%v0,%%v16 \n\t"
"vfadb %%v1,%%v1,%%v17 \n\t"
"vfadb %%v2,%%v2,%%v18 \n\t"
"vfadb %%v3,%%v3,%%v19 \n\t"
"vfadb %%v0,%%v0,%%v20 \n\t"
"vfadb %%v1,%%v1,%%v21 \n\t"
"vfadb %%v2,%%v2,%%v22 \n\t"
"vfadb %%v3,%%v3,%%v23 \n\t"
"vl %%v16, 128(%%r1,%2) \n\t"
"vl %%v17, 144(%%r1,%2) \n\t"
"vl %%v18, 160(%%r1,%2) \n\t"
"vl %%v19, 176(%%r1,%2) \n\t"
"vl %%v20, 192(%%r1,%2) \n\t"
"vl %%v21, 208(%%r1,%2) \n\t"
"vl %%v22, 224(%%r1,%2) \n\t"
"vl %%v23, 240(%%r1,%2) \n\t"
"vfadb %%v24,%%v24,%%v16\n\t"
"vfadb %%v25,%%v25,%%v17\n\t"
"vfadb %%v26,%%v26,%%v18\n\t"
"vfadb %%v27,%%v27,%%v19\n\t"
"vfadb %%v28,%%v28,%%v20\n\t"
"vfadb %%v29,%%v29,%%v21\n\t"
"vfadb %%v30,%%v30,%%v22\n\t"
"vfadb %%v31,%%v31,%%v23\n\t"
"vl %%v16, 128(%%r1,%[x])\n\t"
"vl %%v17, 144(%%r1,%[x])\n\t"
"vl %%v18, 160(%%r1,%[x])\n\t"
"vl %%v19, 176(%%r1,%[x])\n\t"
"vl %%v20, 192(%%r1,%[x])\n\t"
"vl %%v21, 208(%%r1,%[x])\n\t"
"vl %%v22, 224(%%r1,%[x])\n\t"
"vl %%v23, 240(%%r1,%[x])\n\t"
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
@ -91,68 +85,64 @@ static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x)
"vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t"
"vfadb %%v0,%%v0,%%v16 \n\t"
"vfadb %%v1,%%v1,%%v17 \n\t"
"vfadb %%v2,%%v2,%%v18 \n\t"
"vfadb %%v3,%%v3,%%v19 \n\t"
"vfadb %%v0,%%v0,%%v20 \n\t"
"vfadb %%v1,%%v1,%%v21 \n\t"
"vfadb %%v2,%%v2,%%v22 \n\t"
"vfadb %%v3,%%v3,%%v23 \n\t"
"vfadb %%v24,%%v24,%%v16\n\t"
"vfadb %%v25,%%v25,%%v17\n\t"
"vfadb %%v26,%%v26,%%v18\n\t"
"vfadb %%v27,%%v27,%%v19\n\t"
"vfadb %%v28,%%v28,%%v20\n\t"
"vfadb %%v29,%%v29,%%v21\n\t"
"vfadb %%v30,%%v30,%%v22\n\t"
"vfadb %%v31,%%v31,%%v23\n\t"
"agfi %%r1,256\n\t"
"brctg %%r0,0b \n\t"
"vfadb %%v0,%%v0,%%v1 \n\t"
"vfadb %%v0,%%v0,%%v2 \n\t"
"vfadb %%v0,%%v0,%%v3 \n\t"
"vrepg %%v1,%%v0,1 \n\t"
"adbr %%f0,%%f1 \n\t"
"ldr %0,%%f0 "
:"=f"(asum)
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x)
:"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23"
);
"brctg %[n],0b\n\t"
"vfadb %%v24,%%v24,%%v25\n\t"
"vfadb %%v24,%%v24,%%v26\n\t"
"vfadb %%v24,%%v24,%%v27\n\t"
"vfadb %%v24,%%v24,%%v28\n\t"
"vfadb %%v24,%%v24,%%v29\n\t"
"vfadb %%v24,%%v24,%%v30\n\t"
"vfadb %%v24,%%v24,%%v31\n\t"
"vrepg %%v25,%%v24,1\n\t"
"vfadb %%v24,%%v24,%%v25\n\t"
"vsteg %%v24,%[asum],0"
: [asum] "=Q"(asum),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
: "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
"v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
return asum;
}
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0;
BLASLONG ip = 0;
FLOAT sumf = 0.0;
BLASLONG n1;
BLASLONG inc_x2;
if (n <= 0 || inc_x <= 0) return(sumf);
if (n <= 0 || inc_x <= 0)
return (sumf);
if ( inc_x == 1 )
{
if (inc_x == 1) {
n1 = n & -16;
if ( n1 > 0 )
{
if (n1 > 0) {
sumf = zasum_kernel_16(n1, x);
i = n1;
ip = 2 * n1;
}
while(i < n)
{
while (i < n) {
sumf += ABS(x[ip]) + ABS(x[ip + 1]);
i++;
ip += 2;
}
}
else
{
} else {
inc_x2 = 2 * inc_x;
while(i < n)
{
while (i < n) {
sumf += ABS(x[ip]) + ABS(x[ip + 1]);
ip += inc_x2;
i++;
@ -161,5 +151,3 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
}
return (sumf);
}

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -27,96 +27,91 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
/*
 * zaxpy_kernel_8 - vectorised complex update of y from x and alpha.
 *
 * Without CONJ the two fused multiply-adds per vector compute
 *   y_re += alpha[0]*x_re - alpha[1]*x_im,  y_im += alpha[0]*x_im + alpha[1]*x_re
 * i.e. y += alpha * x; with CONJ the sign placement in v0/v1 changes so the
 * result is y += alpha * conj(x).
 *
 * n     - element count; the loop runs n >> 3 times and consumes 128 bytes of
 *         x and y per pass (8 elements of two 64-bit parts each).
 *         NOTE(review): brctg decrements before testing, so n is presumably a
 *         positive multiple of 8 - confirm against the caller.
 * x     - input vector, read as FLOAT[n * 2].
 * y     - input/output vector, read and written as FLOAT[n * 2].
 * alpha - two FLOATs: alpha[0] at offset 0, alpha[1] at offset 8.
 *
 * NOTE(review): this span is a diff rendering. The previous definition
 * (numbered operands %0..%3, r0 loop counter, "memory" clobber) and the new
 * one (named operands, v8-v15 added, precise "m" operands) are interleaved.
 */
static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
__asm__ volatile(
static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) {
__asm__(
/* Build the multiplier vectors: v0 pairs with x as loaded, v1 with x after
   its two 64-bit halves have been swapped by vpdi. */
#if !defined(CONJ)
"vlrepg %%v0,0(%3) \n\t"
"vleg %%v1,8(%3),0 \n\t"
"vlrepg %%v0,0(%[alpha])\n\t"
"vleg %%v1,8(%[alpha]),0\n\t"
"wflcdb %%v1,%%v1\n\t"
"vleg %%v1,8(%3),1 \n\t"
"vleg %%v1,8(%[alpha]),1\n\t"
#else
"vleg %%v0,0(%3),1 \n\t"
"vleg %%v0,0(%[alpha]),1\n\t"
"vflcdb %%v0,%%v0\n\t"
"vleg %%v0,0(%3),0 \n\t"
"vlrepg %%v1,8(%3) \n\t"
"vleg %%v0,0(%[alpha]),0\n\t"
"vlrepg %%v1,8(%[alpha])\n\t"
#endif
/* Loop count = n / 8; r1 is the running byte offset into x and y. */
"srlg %%r0,%0,3 \n\t"
"srlg %[n],%[n],3\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%1) \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"
"vl %%v16,0(%%r1,%1) \n\t"
"vl %%v17,16(%%r1,%1) \n\t"
"vl %%v18,32(%%r1,%1) \n\t"
"vl %%v19,48(%%r1,%1) \n\t"
"vl %%v20,0(%%r1,%2) \n\t"
"vl %%v21,16(%%r1,%2) \n\t"
"vl %%v22,32(%%r1,%2) \n\t"
"vl %%v23,48(%%r1,%2) \n\t"
"vpdi %%v24,%%v16,%%v16,4 \n\t"
"vpdi %%v25,%%v17,%%v17,4 \n\t"
"vpdi %%v26,%%v18,%%v18,4 \n\t"
"vpdi %%v27,%%v19,%%v19,4 \n\t"
"vfmadb %%v28,%%v16,%%v0,%%v20 \n\t"
"vfmadb %%v29,%%v17,%%v0,%%v21 \n\t"
"vfmadb %%v30,%%v18,%%v0,%%v22 \n\t"
"vfmadb %%v31,%%v19,%%v0,%%v23 \n\t"
"vfmadb %%v28,%%v24,%%v1,%%v28 \n\t"
"vfmadb %%v29,%%v25,%%v1,%%v29 \n\t"
"vfmadb %%v30,%%v26,%%v1,%%v30 \n\t"
"vfmadb %%v31,%%v27,%%v1,%%v31 \n\t"
"vst %%v28,0(%%r1,%2) \n\t"
"vst %%v29,16(%%r1,%2) \n\t"
"vst %%v30,32(%%r1,%2) \n\t"
"vst %%v31,48(%%r1,%2) \n\t"
"vl %%v16,64(%%r1,%1) \n\t"
"vl %%v17,80(%%r1,%1) \n\t"
"vl %%v18,96(%%r1,%1) \n\t"
"vl %%v19,112(%%r1,%1) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"vpdi %%v24,%%v16,%%v16,4 \n\t"
"vpdi %%v25,%%v17,%%v17,4 \n\t"
"vpdi %%v26,%%v18,%%v18,4 \n\t"
"vpdi %%v27,%%v19,%%v19,4 \n\t"
"vfmadb %%v28,%%v16,%%v0,%%v20 \n\t"
"vfmadb %%v29,%%v17,%%v0,%%v21 \n\t"
"vfmadb %%v30,%%v18,%%v0,%%v22 \n\t"
"vfmadb %%v31,%%v19,%%v0,%%v23 \n\t"
"vfmadb %%v28,%%v24,%%v1,%%v28 \n\t"
"vfmadb %%v29,%%v25,%%v1,%%v29 \n\t"
"vfmadb %%v30,%%v26,%%v1,%%v30 \n\t"
"vfmadb %%v31,%%v27,%%v1,%%v31 \n\t"
"vst %%v28,64(%%r1,%2) \n\t"
"vst %%v29,80(%%r1,%2) \n\t"
"vst %%v30,96(%%r1,%2) \n\t"
"vst %%v31,112(%%r1,%2) \n\t"
/* New-version loop body: all 128 bytes of x (v8-v11, v16-v19) and y
   (v12-v15, v20-v23) are loaded before any arithmetic is issued. */
"pfd 1, 1024(%%r1,%[x])\n\t"
"pfd 2, 1024(%%r1,%[y])\n\t"
"vl %%v8,0(%%r1,%[x])\n\t"
"vl %%v9,16(%%r1,%[x])\n\t"
"vl %%v10,32(%%r1,%[x])\n\t"
"vl %%v11,48(%%r1,%[x])\n\t"
"vl %%v12,0(%%r1,%[y])\n\t"
"vl %%v13,16(%%r1,%[y])\n\t"
"vl %%v14,32(%%r1,%[y])\n\t"
"vl %%v15,48(%%r1,%[y])\n\t"
"vl %%v16,64(%%r1,%[x])\n\t"
"vl %%v17,80(%%r1,%[x])\n\t"
"vl %%v18,96(%%r1,%[x])\n\t"
"vl %%v19,112(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[y])\n\t"
"vl %%v21,80(%%r1,%[y])\n\t"
"vl %%v22,96(%%r1,%[y])\n\t"
"vl %%v23,112(%%r1,%[y])\n\t"
"vpdi %%v24,%%v8,%%v8,4\n\t"
"vpdi %%v25,%%v9,%%v9,4\n\t"
"vpdi %%v26,%%v10,%%v10,4\n\t"
"vpdi %%v27,%%v11,%%v11,4\n\t"
"vpdi %%v28,%%v16,%%v16,4\n\t"
"vpdi %%v29,%%v17,%%v17,4\n\t"
"vpdi %%v30,%%v18,%%v18,4\n\t"
"vpdi %%v31,%%v19,%%v19,4\n\t"
"vfmadb %%v8,%%v8,%%v0,%%v12\n\t"
"vfmadb %%v9,%%v9,%%v0,%%v13\n\t"
"vfmadb %%v10,%%v10,%%v0,%%v14\n\t"
"vfmadb %%v11,%%v11,%%v0,%%v15\n\t"
"vfmadb %%v16,%%v16,%%v0,%%v20\n\t"
"vfmadb %%v17,%%v17,%%v0,%%v21\n\t"
"vfmadb %%v18,%%v18,%%v0,%%v22\n\t"
"vfmadb %%v19,%%v19,%%v0,%%v23\n\t"
"vfmadb %%v8,%%v24,%%v1,%%v8\n\t"
"vfmadb %%v9,%%v25,%%v1,%%v9\n\t"
"vfmadb %%v10,%%v26,%%v1,%%v10\n\t"
"vfmadb %%v11,%%v27,%%v1,%%v11\n\t"
"vfmadb %%v16,%%v28,%%v1,%%v16\n\t"
"vfmadb %%v17,%%v29,%%v1,%%v17\n\t"
"vfmadb %%v18,%%v30,%%v1,%%v18\n\t"
"vfmadb %%v19,%%v31,%%v1,%%v19\n\t"
"vst %%v8,0(%%r1,%[y])\n\t"
"vst %%v9,16(%%r1,%[y])\n\t"
"vst %%v10,32(%%r1,%[y])\n\t"
"vst %%v11,48(%%r1,%[y])\n\t"
"vst %%v16,64(%%r1,%[y])\n\t"
"vst %%v17,80(%%r1,%[y])\n\t"
"vst %%v18,96(%%r1,%[y])\n\t"
"vst %%v19,112(%%r1,%[y])\n\t"
"agfi %%r1,128\n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"ZQ"((const FLOAT (*)[2])alpha)
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
"brctg %[n],0b"
: "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n)
: [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x),
"m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha)
: "cc", "r1", "v0", "v1", "v8", "v9", "v10", "v11", "v12", "v13",
"v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
"v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
}
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) {
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
BLASLONG dummy2) {
BLASLONG i = 0;
BLASLONG ix = 0, iy = 0;
FLOAT da[2] __attribute__ ((aligned(16)));
if (n <= 0) return (0);
if (n <= 0)
return (0);
if ((inc_x == 1) && (inc_y == 1)) {
@ -143,7 +138,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
}
return (0);
}
inc_x *= 2;
@ -166,5 +160,3 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
return (0);
}

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -27,46 +27,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
/*
 * zcopy_kernel_16 - block copy from x to y, 256 bytes per loop iteration.
 *
 * The loop count is n >> 4 and every pass issues one "mvc 0(256,...)" and
 * advances both pointers by 256 bytes, so each of the n elements accounts for
 * 16 bytes (the memory operands are declared as FLOAT[n * 2]).
 * brctg decrements the counter before testing it, so a zero loop count would
 * wrap instead of exiting; the caller visible in this file only passes
 * n1 = n & -16 after checking n1 > 0.
 *
 * NOTE(review): this span is a diff rendering. The previous definition
 * (numbered operands copied into r1/r2, r0 loop counter, "memory" clobber) and
 * the new one (named in/out operands, precise "m" operands) are interleaved.
 */
static void zcopy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
{
__asm__ volatile (
"lgr %%r1,%1 \n\t"
"lgr %%r2,%2 \n\t"
"srlg %%r0,%0,4 \n\t"
static void zcopy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) {
__asm__("srlg %[n],%[n],4\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1) \n\t"
"pfd 2, 1024(%%r2) \n\t"
"mvc 0(256,%%r2),0(%%r1) \n\t"
"agfi %%r1,256 \n\t"
"agfi %%r2,256 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"a"((const FLOAT (*)[n * 2])x),"a"((FLOAT (*)[n * 2])y)
:"memory","cc","r0","r1","r2"
);
/* New version: x and y are themselves advanced (hence the "+&a" operands),
   using la instead of agfi on scratch copies. */
"pfd 1, 1024(%[x])\n\t"
"pfd 2, 1024(%[y])\n\t"
"mvc 0(256,%[y]),0(%[x])\n\t"
"la %[x],256(%[x])\n\t"
"la %[y],256(%[y])\n\t"
"brctg %[n],0b"
: "=m"(*(struct { FLOAT x[n * 2]; } *) y),[x] "+&a"(x),[y] "+&a"(y),
[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n * 2]; } *) x)
: "cc");
}
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
BLASLONG i = 0;
BLASLONG ix = 0, iy = 0;
if ( n <= 0 ) return(0);
if (n <= 0)
return (0);
if ( (inc_x == 1) && (inc_y == 1 ))
{
if ((inc_x == 1) && (inc_y == 1)) {
BLASLONG n1 = n & -16;
if ( n1 > 0 )
{
if (n1 > 0) {
zcopy_kernel_16(n1, x, y);
i = n1;
ix = n1 * 2;
iy = n1 * 2;
}
while(i < n)
{
while (i < n) {
y[iy] = x[iy];
y[iy + 1] = x[ix + 1];
ix += 2;
@ -75,16 +68,12 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
}
}
else
{
} else {
BLASLONG inc_x2 = 2 * inc_x;
BLASLONG inc_y2 = 2 * inc_y;
while(i < n)
{
while (i < n) {
y[iy] = x[ix];
y[iy + 1] = x[ix + 1];
ix += inc_x2;

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -27,10 +27,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
{
__asm__ volatile(
"vzero %%v24 \n\t"
static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) {
__asm__("vzero %%v24\n\t"
"vzero %%v25\n\t"
"vzero %%v26\n\t"
"vzero %%v27\n\t"
@ -38,25 +36,23 @@ static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
"vzero %%v29\n\t"
"vzero %%v30\n\t"
"vzero %%v31\n\t"
"srlg %%r0,%0,3 \n\t"
"srlg %[n],%[n],3\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%1) \n\t"
"pfd 1, 1024(%%r1,%2) \n\t"
"vl %%v16, 0(%%r1,%1) \n\t"
"vl %%v17, 16(%%r1,%1) \n\t"
"vl %%v18, 32(%%r1,%1) \n\t"
"vl %%v19, 48(%%r1,%1) \n\t"
"vl %%v0, 0(%%r1,%2) \n\t"
"vl %%v1, 16(%%r1,%2) \n\t"
"vl %%v2, 32(%%r1,%2) \n\t"
"vl %%v3, 48(%%r1,%2) \n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"pfd 1, 1024(%%r1,%[y])\n\t"
"vl %%v16, 0(%%r1,%[x])\n\t"
"vl %%v17, 16(%%r1,%[x])\n\t"
"vl %%v18, 32(%%r1,%[x])\n\t"
"vl %%v19, 48(%%r1,%[x])\n\t"
"vl %%v0, 0(%%r1,%[y])\n\t"
"vl %%v1, 16(%%r1,%[y])\n\t"
"vl %%v2, 32(%%r1,%[y])\n\t"
"vl %%v3, 48(%%r1,%[y])\n\t"
"vpdi %%v20,%%v16,%%v16,4\n\t"
"vpdi %%v21,%%v17,%%v17,4\n\t"
"vpdi %%v22,%%v18,%%v18,4\n\t"
"vpdi %%v23,%%v19,%%v19,4\n\t"
"vfmadb %%v24,%%v16,%%v0,%%v24\n\t"
"vfmadb %%v25,%%v20,%%v0,%%v25\n\t"
"vfmadb %%v26,%%v17,%%v1,%%v26\n\t"
@ -65,20 +61,18 @@ static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
"vfmadb %%v29,%%v22,%%v2,%%v29\n\t"
"vfmadb %%v30,%%v19,%%v3,%%v30\n\t"
"vfmadb %%v31,%%v23,%%v3,%%v31\n\t"
"vl %%v16, 64(%%r1,%1) \n\t"
"vl %%v17, 80(%%r1,%1) \n\t"
"vl %%v18, 96(%%r1,%1) \n\t"
"vl %%v19, 112(%%r1,%1) \n\t"
"vl %%v0, 64(%%r1,%2) \n\t"
"vl %%v1, 80(%%r1,%2) \n\t"
"vl %%v2, 96(%%r1,%2) \n\t"
"vl %%v3, 112(%%r1,%2) \n\t"
"vl %%v16, 64(%%r1,%[x])\n\t"
"vl %%v17, 80(%%r1,%[x])\n\t"
"vl %%v18, 96(%%r1,%[x])\n\t"
"vl %%v19, 112(%%r1,%[x])\n\t"
"vl %%v0, 64(%%r1,%[y])\n\t"
"vl %%v1, 80(%%r1,%[y])\n\t"
"vl %%v2, 96(%%r1,%[y])\n\t"
"vl %%v3, 112(%%r1,%[y])\n\t"
"vpdi %%v20,%%v16,%%v16,4\n\t"
"vpdi %%v21,%%v17,%%v17,4\n\t"
"vpdi %%v22,%%v18,%%v18,4\n\t"
"vpdi %%v23,%%v19,%%v19,4\n\t"
"vfmadb %%v24,%%v16,%%v0,%%v24\n\t"
"vfmadb %%v25,%%v20,%%v0,%%v25\n\t"
"vfmadb %%v26,%%v17,%%v1,%%v26\n\t"
@ -87,30 +81,33 @@ static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
"vfmadb %%v29,%%v22,%%v2,%%v29\n\t"
"vfmadb %%v30,%%v19,%%v3,%%v30\n\t"
"vfmadb %%v31,%%v23,%%v3,%%v31\n\t"
"agfi %%r1,128\n\t"
"brctg %%r0,0b \n\t"
"brctg %[n],0b\n\t"
"vfadb %%v24,%%v24,%%v26\n\t"
"vfadb %%v24,%%v24,%%v28\n\t"
"vfadb %%v24,%%v24,%%v30\n\t"
"vfadb %%v25,%%v25,%%v27\n\t"
"vfadb %%v25,%%v25,%%v29\n\t"
"vfadb %%v25,%%v25,%%v31\n\t"
"vsteg %%v24,0(%3),0 \n\t"
"vsteg %%v24,8(%3),1 \n\t"
"vsteg %%v25,16(%3),1 \n\t"
"vsteg %%v25,24(%3),0 "
:
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((const FLOAT (*)[n * 2])y),"ZQ"((FLOAT (*)[4])d)
:"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
"vsteg %%v24,0(%[d]),0\n\t"
"vsteg %%v24,8(%[d]),1\n\t"
"vsteg %%v25,16(%[d]),1\n\t"
"vsteg %%v25,24(%[d]),0"
: "=m"(*(struct { FLOAT x[4]; } *) d),[n] "+&r"(n)
: [d] "a"(d), "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x),
"m"(*(const struct { FLOAT x[n * 2]; } *) y),[y] "a"(y)
: "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
}
OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y,
BLASLONG inc_y) {
BLASLONG i;
BLASLONG ix, iy;
OPENBLAS_COMPLEX_FLOAT result;
FLOAT dot[4] __attribute__ ((aligned(16))) = {0.0, 0.0, 0.0, 0.0};
FLOAT dot[4] __attribute__ ((aligned(16))) = {
0.0, 0.0, 0.0, 0.0};
if (n <= 0) {
CREAL(result) = 0.0;
@ -141,7 +138,6 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
}
} else {
i = 0;
ix = 0;
@ -174,5 +170,3 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
return (result);
}

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
Copyright (c) 2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -25,276 +25,259 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdlib.h>
#include <stdio.h>
#include "common.h"
#define NBMAX 1024
/*
 * zgemv_kernel_4x4 - accumulate four matrix columns times four x entries
 * into y, using complex multiply-accumulate.
 *
 * For every element i handled, y[i] receives the sum over j = 0..3 of the
 * product of ap[j][i] and x[j]. Each product is formed from two fused
 * multiply-adds: the replicated first half of the matrix entry times x[j] as
 * loaded (v16-v19), and the replicated second half times a swapped,
 * partly negated copy of x[j] (v20-v23).
 * The #if selects which half of that copy is negated.
 * NOTE(review): the #else branch therefore conjugates one factor - confirm
 * which one against the CONJ/XCONJ conventions of the caller.
 *
 * n  - element count; the loop runs n >> 1 times and handles 32 bytes (two
 *      16-byte elements) of each column and of y per pass.
 *      NOTE(review): brctg decrements before testing, so n is presumably
 *      even and non-zero - confirm against the caller.
 * ap - array of four column pointers, each read as FLOAT[n * 2].
 * x  - eight FLOATs (four two-part values).
 * y  - input/output vector, read and written as FLOAT[n * 2].
 *
 * NOTE(review): this span is a diff rendering. The previous definition
 * (numbered operands, one y vector per half-iteration) and the new one (named
 * operands, both y vectors v0/v1 kept live across the pass) are interleaved.
 */
static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
__asm__ volatile (
"vl %%v16,0(%5) \n\t"
"vl %%v17,16(%5) \n\t"
"vl %%v18,32(%5) \n\t"
"vl %%v19,48(%5) \n\t"
static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) {
register FLOAT *ap0 = ap[0];
register FLOAT *ap1 = ap[1];
register FLOAT *ap2 = ap[2];
register FLOAT *ap3 = ap[3];
__asm__("vl %%v16,0(%[x])\n\t"
"vl %%v17,16(%[x])\n\t"
"vl %%v18,32(%[x])\n\t"
"vl %%v19,48(%[x])\n\t"
/* v20-v23: the two halves of each x[j] swapped, with one half negated. */
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
"vleg %%v20,8(%5),0 \n\t"
"vleg %%v20,8(%[x]),0\n\t"
"wflcdb %%v20,%%v20\n\t"
"vleg %%v20,0(%5),1 \n\t"
"vleg %%v21,24(%5),0 \n\t"
"vleg %%v20,0(%[x]),1\n\t"
"vleg %%v21,24(%[x]),0\n\t"
"wflcdb %%v21,%%v21\n\t"
"vleg %%v21,16(%5),1 \n\t"
"vleg %%v22,40(%5),0 \n\t"
"vleg %%v21,16(%[x]),1\n\t"
"vleg %%v22,40(%[x]),0\n\t"
"wflcdb %%v22,%%v22\n\t"
"vleg %%v22,32(%5),1 \n\t"
"vleg %%v23,56(%5),0 \n\t"
"vleg %%v22,32(%[x]),1\n\t"
"vleg %%v23,56(%[x]),0\n\t"
"wflcdb %%v23,%%v23\n\t"
"vleg %%v23,48(%5),1 \n\t"
"vleg %%v23,48(%[x]),1\n\t"
#else
"vleg %%v20,0(%5),1 \n\t"
"vleg %%v20,0(%[x]),1\n\t"
"vflcdb %%v20,%%v20\n\t"
"vleg %%v20,8(%5),0 \n\t"
"vleg %%v21,16(%5),1 \n\t"
"vleg %%v20,8(%[x]),0\n\t"
"vleg %%v21,16(%[x]),1\n\t"
"vflcdb %%v21,%%v21\n\t"
"vleg %%v21,24(%5),0 \n\t"
"vleg %%v22,32(%5),1 \n\t"
"vleg %%v21,24(%[x]),0\n\t"
"vleg %%v22,32(%[x]),1\n\t"
"vflcdb %%v22,%%v22\n\t"
"vleg %%v22,40(%5),0 \n\t"
"vleg %%v23,48(%5),1 \n\t"
"vleg %%v22,40(%[x]),0\n\t"
"vleg %%v23,48(%[x]),1\n\t"
"vflcdb %%v23,%%v23\n\t"
"vleg %%v23,56(%5),0 \n\t"
"vleg %%v23,56(%[x]),0\n\t"
#endif
/* r1 = running byte offset; loop count = n / 2. */
"xgr %%r1,%%r1\n\t"
"srlg %%r0,%0,1 \n\t"
"srlg %[n],%[n],1\n\t"
"0:\n\t"
"pfd 1,1024(%%r1,%1) \n\t"
"pfd 1,1024(%%r1,%2) \n\t"
"pfd 1,1024(%%r1,%3) \n\t"
"pfd 1,1024(%%r1,%4) \n\t"
"pfd 2,1024(%%r1,%6) \n\t"
"vlrepg %%v24,0(%%r1,%1) \n\t"
"vlrepg %%v25,8(%%r1,%1) \n\t"
"vlrepg %%v26,0(%%r1,%2) \n\t"
"vlrepg %%v27,8(%%r1,%2) \n\t"
"vl %%v0,0(%%r1,%6) \n\t"
"pfd 1,1024(%%r1,%[ap0])\n\t"
"pfd 1,1024(%%r1,%[ap1])\n\t"
"pfd 1,1024(%%r1,%[ap2])\n\t"
"pfd 1,1024(%%r1,%[ap3])\n\t"
"pfd 2,1024(%%r1,%[y])\n\t"
"vl %%v0,0(%%r1,%[y])\n\t"
"vl %%v1,16(%%r1,%[y])\n\t"
"vlrepg %%v24,0(%%r1,%[ap0])\n\t"
"vlrepg %%v25,8(%%r1,%[ap0])\n\t"
"vlrepg %%v26,0(%%r1,%[ap1])\n\t"
"vlrepg %%v27,8(%%r1,%[ap1])\n\t"
"vlrepg %%v28,16(%%r1,%[ap0])\n\t"
"vlrepg %%v29,24(%%r1,%[ap0])\n\t"
"vlrepg %%v30,16(%%r1,%[ap1])\n\t"
"vlrepg %%v31,24(%%r1,%[ap1])\n\t"
"vfmadb %%v0,%%v24,%%v16,%%v0\n\t"
"vfmadb %%v1,%%v28,%%v16,%%v1\n\t"
"vfmadb %%v0,%%v25,%%v20,%%v0\n\t"
"vfmadb %%v1,%%v29,%%v20,%%v1\n\t"
"vfmadb %%v0,%%v26,%%v17,%%v0\n\t"
"vfmadb %%v1,%%v30,%%v17,%%v1\n\t"
"vfmadb %%v0,%%v27,%%v21,%%v0\n\t"
"vlrepg %%v28,0(%%r1,%3) \n\t"
"vlrepg %%v29,8(%%r1,%3) \n\t"
"vlrepg %%v30,0(%%r1,%4) \n\t"
"vlrepg %%v31,8(%%r1,%4) \n\t"
"vfmadb %%v0,%%v28,%%v18,%%v0 \n\t"
"vfmadb %%v0,%%v29,%%v22,%%v0 \n\t"
"vfmadb %%v0,%%v30,%%v19,%%v0 \n\t"
"vfmadb %%v0,%%v31,%%v23,%%v0 \n\t"
"vst %%v0,0(%%r1,%6) \n\t"
"vlrepg %%v24,16(%%r1,%1) \n\t"
"vlrepg %%v25,24(%%r1,%1) \n\t"
"vlrepg %%v26,16(%%r1,%2) \n\t"
"vlrepg %%v27,24(%%r1,%2) \n\t"
"vl %%v0,16(%%r1,%6) \n\t"
"vfmadb %%v0,%%v24,%%v16,%%v0 \n\t"
"vfmadb %%v0,%%v25,%%v20,%%v0 \n\t"
"vfmadb %%v0,%%v26,%%v17,%%v0 \n\t"
"vfmadb %%v0,%%v27,%%v21,%%v0 \n\t"
"vlrepg %%v28,16(%%r1,%3) \n\t"
"vlrepg %%v29,24(%%r1,%3) \n\t"
"vlrepg %%v30,16(%%r1,%4) \n\t"
"vlrepg %%v31,24(%%r1,%4) \n\t"
"vfmadb %%v0,%%v28,%%v18,%%v0 \n\t"
"vfmadb %%v0,%%v29,%%v22,%%v0 \n\t"
"vfmadb %%v0,%%v30,%%v19,%%v0 \n\t"
"vfmadb %%v0,%%v31,%%v23,%%v0 \n\t"
"vst %%v0,16(%%r1,%6) \n\t"
"vfmadb %%v1,%%v31,%%v21,%%v1\n\t"
"vlrepg %%v24,0(%%r1,%[ap2])\n\t"
"vlrepg %%v25,8(%%r1,%[ap2])\n\t"
"vlrepg %%v26,0(%%r1,%[ap3])\n\t"
"vlrepg %%v27,8(%%r1,%[ap3])\n\t"
"vlrepg %%v28,16(%%r1,%[ap2])\n\t"
"vlrepg %%v29,24(%%r1,%[ap2])\n\t"
"vlrepg %%v30,16(%%r1,%[ap3])\n\t"
"vlrepg %%v31,24(%%r1,%[ap3])\n\t"
"vfmadb %%v0,%%v24,%%v18,%%v0\n\t"
"vfmadb %%v1,%%v28,%%v18,%%v1\n\t"
"vfmadb %%v0,%%v25,%%v22,%%v0\n\t"
"vfmadb %%v1,%%v29,%%v22,%%v1\n\t"
"vfmadb %%v0,%%v26,%%v19,%%v0\n\t"
"vfmadb %%v1,%%v30,%%v19,%%v1\n\t"
"vfmadb %%v0,%%v27,%%v23,%%v0\n\t"
"vfmadb %%v1,%%v31,%%v23,%%v1\n\t"
"vst %%v0,0(%%r1,%[y])\n\t"
"vst %%v1,16(%%r1,%[y])\n\t"
"agfi %%r1,32\n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZR"((const FLOAT (*)[n * 2])ap[2]),"ZR"((const FLOAT (*)[n * 2])ap[3]),"ZQ"((const FLOAT (*)[8])x),"ZR"((FLOAT (*)[n * 2])y)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
"brctg %[n],0b"
: "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n)
: [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0),
"m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1),
"m"(*(const struct { FLOAT x[n * 2]; } *) ap2),[ap2] "a"(ap2),
"m"(*(const struct { FLOAT x[n * 2]; } *) ap3),[ap3] "a"(ap3),
"m"(*(const struct { FLOAT x[8]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
"v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
}
/*
 * zgemv_kernel_4x2 - accumulate two matrix columns times two x entries into
 * y, using complex multiply-accumulate.
 *
 * For every element i handled, y[i] receives the products of ap[0][i] with
 * x[0] and of ap[1][i] with x[1]. v16/v17 hold x[0]/x[1] as loaded; v18/v19
 * hold them with the two halves swapped and one half negated (the #if picks
 * which half).
 * NOTE(review): confirm the conjugation meaning of the #else branch against
 * the CONJ/XCONJ conventions of the caller.
 *
 * n  - element count; the loop runs n >> 1 times, 32 bytes per column and of
 *      y per pass. NOTE(review): presumably even and non-zero - confirm.
 * ap - array of two column pointers, each read as FLOAT[n * 2].
 * x  - four FLOATs.
 * y  - input/output vector, read and written as FLOAT[n * 2].
 *
 * NOTE(review): this span is a diff rendering. The previous definition
 * (numbered operands) and the new one (named operands, v1 and v24-v27 added)
 * are interleaved.
 */
static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
__asm__ volatile (
"vl %%v16,0(%3) \n\t"
"vl %%v17,16(%3) \n\t"
static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) {
register FLOAT *ap0 = ap[0];
register FLOAT *ap1 = ap[1];
__asm__("vl %%v16,0(%[x])\n\t"
"vl %%v17,16(%[x])\n\t"
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
"vleg %%v18,8(%3),0 \n\t"
"vleg %%v18,8(%[x]),0\n\t"
"wflcdb %%v18,%%v18\n\t"
"vleg %%v18,0(%3),1 \n\t"
"vleg %%v19,24(%3),0 \n\t"
"vleg %%v18,0(%[x]),1\n\t"
"vleg %%v19,24(%[x]),0\n\t"
"wflcdb %%v19,%%v19\n\t"
"vleg %%v19,16(%3),1 \n\t"
"vleg %%v19,16(%[x]),1\n\t"
#else
"vleg %%v18,0(%3),1 \n\t"
"vleg %%v18,0(%[x]),1\n\t"
"vflcdb %%v18,%%v18\n\t"
"vleg %%v18,8(%3),0 \n\t"
"vleg %%v19,16(%3),1 \n\t"
"vleg %%v18,8(%[x]),0\n\t"
"vleg %%v19,16(%[x]),1\n\t"
"vflcdb %%v19,%%v19\n\t"
"vleg %%v19,24(%3),0 \n\t"
"vleg %%v19,24(%[x]),0\n\t"
#endif
/* r1 = running byte offset; loop count = n / 2. */
"xgr %%r1,%%r1\n\t"
"srlg %%r0,%0,1 \n\t"
"srlg %[n],%[n],1\n\t"
"0:\n\t"
"pfd 1,1024(%%r1,%1) \n\t"
"pfd 1,1024(%%r1,%2) \n\t"
"pfd 2,1024(%%r1,%4) \n\t"
"vlrepg %%v20,0(%%r1,%1) \n\t"
"vlrepg %%v21,8(%%r1,%1) \n\t"
"vlrepg %%v22,0(%%r1,%2) \n\t"
"vlrepg %%v23,8(%%r1,%2) \n\t"
"vl %%v0,0(%%r1,%4) \n\t"
"pfd 1,1024(%%r1,%[ap0])\n\t"
"pfd 1,1024(%%r1,%[ap1])\n\t"
"pfd 2,1024(%%r1,%[y])\n\t"
"vl %%v0,0(%%r1,%[y])\n\t"
"vl %%v1,16(%%r1,%[y])\n\t"
"vlrepg %%v20,0(%%r1,%[ap0])\n\t"
"vlrepg %%v21,8(%%r1,%[ap0])\n\t"
"vlrepg %%v22,0(%%r1,%[ap1])\n\t"
"vlrepg %%v23,8(%%r1,%[ap1])\n\t"
"vlrepg %%v24,16(%%r1,%[ap0])\n\t"
"vlrepg %%v25,24(%%r1,%[ap0])\n\t"
"vlrepg %%v26,16(%%r1,%[ap1])\n\t"
"vlrepg %%v27,24(%%r1,%[ap1])\n\t"
"vfmadb %%v0,%%v20,%%v16,%%v0\n\t"
"vfmadb %%v1,%%v24,%%v16,%%v1\n\t"
"vfmadb %%v0,%%v21,%%v18,%%v0\n\t"
"vfmadb %%v1,%%v25,%%v18,%%v1\n\t"
"vfmadb %%v0,%%v22,%%v17,%%v0\n\t"
"vfmadb %%v1,%%v26,%%v17,%%v1\n\t"
"vfmadb %%v0,%%v23,%%v19,%%v0\n\t"
"vst %%v0,0(%%r1,%4) \n\t"
"vlrepg %%v20,16(%%r1,%1) \n\t"
"vlrepg %%v21,24(%%r1,%1) \n\t"
"vlrepg %%v22,16(%%r1,%2) \n\t"
"vlrepg %%v23,24(%%r1,%2) \n\t"
"vl %%v0,16(%%r1,%4) \n\t"
"vfmadb %%v0,%%v20,%%v16,%%v0 \n\t"
"vfmadb %%v0,%%v21,%%v18,%%v0 \n\t"
"vfmadb %%v0,%%v22,%%v17,%%v0 \n\t"
"vfmadb %%v0,%%v23,%%v19,%%v0 \n\t"
"vst %%v0,16(%%r1,%4) \n\t"
"vfmadb %%v1,%%v27,%%v19,%%v1\n\t"
"vst %%v0,0(%%r1,%[y])\n\t"
"vst %%v1,16(%%r1,%[y])\n\t"
"agfi %%r1,32\n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZQ"((const FLOAT (*)[4])x),"ZR"((FLOAT (*)[n * 2])y)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23"
);
"brctg %[n],0b"
: "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n)
: [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0),
"m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1),
"m"(*(const struct { FLOAT x[4]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
"v22", "v23", "v24", "v25", "v26", "v27");
}
/*
 * zgemv_kernel_4x1 - accumulate one matrix column times one x entry into y,
 * using complex multiply-accumulate.
 *
 * For every element i handled, y[i] receives the product of ap[i] and x[0].
 * v16 holds x[0] as loaded; v17 holds it with the two halves swapped and one
 * half negated (the #if picks which half).
 * NOTE(review): confirm the conjugation meaning of the #else branch against
 * the CONJ/XCONJ conventions of the caller.
 *
 * n  - element count; the loop runs n >> 1 times, 32 bytes of ap and y per
 *      pass. NOTE(review): presumably even and non-zero - confirm.
 * ap - single column, read as FLOAT[n * 2] (unlike the 4x4/4x2 kernels, this
 *      is a plain pointer, not an array of column pointers).
 * x  - two FLOATs.
 * y  - input/output vector, read and written as FLOAT[n * 2].
 *
 * NOTE(review): this span is a diff rendering. The previous definition
 * (numbered operands) and the new one (named operands, v1/v20/v21 added) are
 * interleaved.
 */
static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
{
__asm__ volatile (
"vl %%v16,0(%2) \n\t"
static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) {
__asm__("vl %%v16,0(%[x])\n\t"
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
"vleg %%v17,8(%2),0 \n\t"
"vleg %%v17,8(%[x]),0\n\t"
"wflcdb %%v17,%%v17\n\t"
"vleg %%v17,0(%2),1 \n\t"
"vleg %%v17,0(%[x]),1\n\t"
#else
"vleg %%v17,0(%2),1 \n\t"
"vleg %%v17,0(%[x]),1\n\t"
"vflcdb %%v17,%%v17\n\t"
"vleg %%v17,8(%2),0 \n\t"
"vleg %%v17,8(%[x]),0\n\t"
#endif
/* r1 = running byte offset; loop count = n / 2. */
"xgr %%r1,%%r1\n\t"
"srlg %%r0,%0,1 \n\t"
"srlg %[n],%[n],1\n\t"
"0:\n\t"
"pfd 1,1024(%%r1,%1) \n\t"
"pfd 2,1024(%%r1,%3) \n\t"
"vlrepg %%v18,0(%%r1,%1) \n\t"
"vlrepg %%v19,8(%%r1,%1) \n\t"
"vl %%v0,0(%%r1,%3) \n\t"
"pfd 1,1024(%%r1,%[ap])\n\t"
"pfd 2,1024(%%r1,%[y])\n\t"
"vl %%v0,0(%%r1,%[y])\n\t"
"vl %%v1,16(%%r1,%[y])\n\t"
"vlrepg %%v18,0(%%r1,%[ap])\n\t"
"vlrepg %%v19,8(%%r1,%[ap])\n\t"
"vlrepg %%v20,16(%%r1,%[ap])\n\t"
"vlrepg %%v21,24(%%r1,%[ap])\n\t"
"vfmadb %%v0,%%v18,%%v16,%%v0\n\t"
"vfmadb %%v1,%%v20,%%v16,%%v1\n\t"
"vfmadb %%v0,%%v19,%%v17,%%v0\n\t"
"vst %%v0,0(%%r1,%3) \n\t"
"vlrepg %%v18,16(%%r1,%1) \n\t"
"vlrepg %%v19,24(%%r1,%1) \n\t"
"vl %%v0,16(%%r1,%3) \n\t"
"vfmadb %%v0,%%v18,%%v16,%%v0 \n\t"
"vfmadb %%v0,%%v19,%%v17,%%v0 \n\t"
"vst %%v0,16(%%r1,%3) \n\t"
"vfmadb %%v1,%%v21,%%v17,%%v1\n\t"
"vst %%v0,0(%%r1,%[y])\n\t"
"vst %%v1,16(%%r1,%[y])\n\t"
"agfi %%r1,32\n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((const FLOAT (*)[n * 2])ap),"ZQ"((const FLOAT (*)[2])x),"ZR"((FLOAT (*)[n * 2])y)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19"
);
"brctg %[n],0b"
: "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n)
: [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap),[ap] "a"(ap),
"m"(*(const struct { FLOAT x[2]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21");
}
/*
 * add_y_4 - scaled complex accumulation of src into dest.
 *
 * Without XCONJ the two fused multiply-adds per vector compute
 *   dest_re += alpha_r*src_re - alpha_i*src_im
 *   dest_im += alpha_r*src_im + alpha_i*src_re
 * i.e. dest += alpha * src; with XCONJ the sign placement in v0/v1 changes so
 * the result is dest += alpha * conj(src).
 *
 * n       - element count; the loop runs n >> 2 times and handles 64 bytes
 *           (four 16-byte elements) of src and dest per pass.
 *           NOTE(review): brctg decrements before testing, so n is presumably
 *           a positive multiple of 4 - confirm against the caller.
 * src     - input vector, read as FLOAT[n * 2].
 * dest    - input/output vector, read and written as FLOAT[n * 2].
 * alpha_r, alpha_i - the two scalar parts of alpha; passed to the asm as memory
 *           operands so vlrepg/vleg can load them directly.
 *
 * NOTE(review): this span is a diff rendering. The previous definition
 * (numbered operands, "m" constraints, "memory" clobber) and the new one
 * (named operands, "Q" constraints) are interleaved.
 */
static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, FLOAT alpha_i)
{
__asm__ volatile (
static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r,
FLOAT alpha_i) {
__asm__(
/* Build the multiplier vectors: v0 pairs with src as loaded, v1 with src
   after its two 64-bit halves have been swapped by vpdi. */
#if !defined(XCONJ)
"vlrepg %%v0,%3 \n\t"
"vleg %%v1,%4,0 \n\t"
"vlrepg %%v0,%[alpha_r]\n\t"
"vleg %%v1,%[alpha_i],0\n\t"
"wflcdb %%v1,%%v1\n\t"
"vleg %%v1,%4,1 \n\t"
"vleg %%v1,%[alpha_i],1\n\t"
#else
"vleg %%v0,%3,1 \n\t"
"vleg %%v0,%[alpha_r],1\n\t"
"vflcdb %%v0,%%v0\n\t"
"vleg %%v0,%3,0 \n\t"
"vlrepg %%v1,%4 \n\t"
"vleg %%v0,%[alpha_r],0\n\t"
"vlrepg %%v1,%[alpha_i]\n\t"
#endif
/* r1 = running byte offset; loop count = n / 4. */
"xgr %%r1,%%r1\n\t"
"srlg %%r0,%0,2 \n\t"
"srlg %[n],%[n],2\n\t"
"0:\n\t"
"pfd 1,1024(%%r1,%1) \n\t"
"pfd 2,1024(%%r1,%2) \n\t"
"vl %%v16,0(%%r1,%1) \n\t"
"vl %%v17,16(%%r1,%1) \n\t"
"vl %%v18,32(%%r1,%1) \n\t"
"vl %%v19,48(%%r1,%1) \n\t"
"vl %%v20,0(%%r1,%2) \n\t"
"vl %%v21,16(%%r1,%2) \n\t"
"vl %%v22,32(%%r1,%2) \n\t"
"vl %%v23,48(%%r1,%2) \n\t"
"pfd 1,1024(%%r1,%[src])\n\t"
"pfd 2,1024(%%r1,%[dest])\n\t"
"vl %%v16,0(%%r1,%[src])\n\t"
"vl %%v17,16(%%r1,%[src])\n\t"
"vl %%v18,32(%%r1,%[src])\n\t"
"vl %%v19,48(%%r1,%[src])\n\t"
"vl %%v20,0(%%r1,%[dest])\n\t"
"vl %%v21,16(%%r1,%[dest])\n\t"
"vl %%v22,32(%%r1,%[dest])\n\t"
"vl %%v23,48(%%r1,%[dest])\n\t"
"vpdi %%v24,%%v16,%%v16,4\n\t"
"vpdi %%v25,%%v17,%%v17,4\n\t"
"vpdi %%v26,%%v18,%%v18,4\n\t"
"vpdi %%v27,%%v19,%%v19,4\n\t"
"vfmadb %%v28,%%v16,%%v0,%%v20\n\t"
"vfmadb %%v29,%%v17,%%v0,%%v21\n\t"
"vfmadb %%v30,%%v18,%%v0,%%v22\n\t"
"vfmadb %%v31,%%v19,%%v0,%%v23\n\t"
"vfmadb %%v28,%%v24,%%v1,%%v28\n\t"
"vfmadb %%v29,%%v25,%%v1,%%v29\n\t"
"vfmadb %%v30,%%v26,%%v1,%%v30\n\t"
"vfmadb %%v31,%%v27,%%v1,%%v31\n\t"
"vst %%v28,0(%%r1,%2) \n\t"
"vst %%v29,16(%%r1,%2) \n\t"
"vst %%v30,32(%%r1,%2) \n\t"
"vst %%v31,48(%%r1,%2) \n\t"
"vst %%v28,0(%%r1,%[dest])\n\t"
"vst %%v29,16(%%r1,%[dest])\n\t"
"vst %%v30,32(%%r1,%[dest])\n\t"
"vst %%v31,48(%%r1,%[dest])\n\t"
"agfi %%r1,64\n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((const FLOAT (*)[n * 2])src),"ZR"((FLOAT (*)[n * 2])dest),"m"(alpha_r),"m"(alpha_i)
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
"brctg %[n],0b"
: "+m"(*(struct { FLOAT x[n * 2]; } *) dest),[n] "+&r"(n)
: [dest] "a"(dest), "m"(*(const struct { FLOAT x[n * 2]; } *) src),
[src] "a"(src),[alpha_r] "Q"(alpha_r),[alpha_i] "Q"(alpha_i)
: "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
"v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
}
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i)
{
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,
FLOAT alpha_r, FLOAT alpha_i) {
BLASLONG i;
if ( inc_dest != 2 )
{
if (inc_dest != 2) {
FLOAT temp_r;
FLOAT temp_i;
for ( i=0; i<n; i++ )
{
for (i = 0; i < n; i++) {
#if !defined(XCONJ)
temp_r = alpha_r * src[0] - alpha_i * src[1];
temp_i = alpha_r * src[1] + alpha_i * src[0];
@ -315,8 +298,9 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT
add_y_4(n, src, dest, alpha_r, alpha_i);
}
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y,
BLASLONG inc_y, FLOAT *buffer) {
BLASLONG i;
FLOAT *a_ptr;
FLOAT *x_ptr;
@ -330,8 +314,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
BLASLONG lda4;
FLOAT xbuffer[8], *ybuffer;
if ( m < 1 ) return(0);
if ( n < 1 ) return(0);
if (m < 1)
return (0);
if (n < 1)
return (0);
ybuffer = buffer;
@ -351,13 +337,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
BLASLONG NB = NBMAX;
while ( NB == NBMAX )
{
while (NB == NBMAX) {
m1 -= NB;
if ( m1 < 0)
{
if ( m2 == 0 ) break;
if (m1 < 0) {
if (m2 == 0)
break;
NB = m2;
}
@ -370,11 +355,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
//zero_y(NB,ybuffer);
memset(ybuffer, 0, NB * 16);
if ( inc_x == 2 )
{
if (inc_x == 2) {
for( i = 0; i < n1 ; i++)
{
for (i = 0; i < n1; i++) {
zgemv_kernel_4x4(NB, ap, x_ptr, ybuffer);
ap[0] += lda4;
ap[1] += lda4;
@ -384,27 +367,22 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
x_ptr += 8;
}
if ( n2 & 2 )
{
if (n2 & 2) {
zgemv_kernel_4x2(NB, ap, x_ptr, ybuffer);
x_ptr += 4;
a_ptr += 2 * lda;
}
if ( n2 & 1 )
{
if (n2 & 1) {
zgemv_kernel_4x1(NB, a_ptr, x_ptr, ybuffer);
/* x_ptr += 2;
a_ptr += lda; */
}
}
else
{
} else {
for( i = 0; i < n1 ; i++)
{
for (i = 0; i < n1; i++) {
xbuffer[0] = x_ptr[0];
xbuffer[1] = x_ptr[1];
@ -427,8 +405,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
a_ptr += lda4;
}
for( i = 0; i < n2 ; i++)
{
for (i = 0; i < n2; i++) {
xbuffer[0] = x_ptr[0];
xbuffer[1] = x_ptr[1];
x_ptr += inc_x;
@ -444,21 +421,18 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
y_ptr += NB * inc_y;
}
if ( m3 == 0 ) return(0);
if (m3 == 0)
return (0);
if ( m3 == 1 )
{
if (m3 == 1) {
a_ptr = a;
x_ptr = x;
FLOAT temp_r = 0.0;
FLOAT temp_i = 0.0;
if ( lda == 2 && inc_x == 2 )
{
if (lda == 2 && inc_x == 2) {
for( i=0 ; i < (n & -2); i+=2 )
{
for (i = 0; i < (n & -2); i += 2) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
@ -475,10 +449,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
x_ptr += 4;
}
for( ; i < n; i++ )
{
for (; i < n; i++) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
@ -491,13 +462,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
x_ptr += 2;
}
} else {
}
else
{
for( i = 0; i < n; i++ )
{
for (i = 0; i < n; i++) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
@ -521,8 +488,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
return (0);
}
if ( m3 == 2 )
{
if (m3 == 2) {
a_ptr = a;
x_ptr = x;
FLOAT temp_r0 = 0.0;
@ -530,11 +496,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
FLOAT temp_r1 = 0.0;
FLOAT temp_i1 = 0.0;
if ( lda == 4 && inc_x == 2 )
{
if (lda == 4 && inc_x == 2) {
for( i = 0; i < (n & -2); i+=2 )
{
for (i = 0; i < (n & -2); i += 2) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
@ -564,9 +528,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
x_ptr += 4;
}
for( ; i < n; i++ )
{
for (; i < n; i++) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
@ -583,13 +545,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
x_ptr += 2;
}
} else {
}
else
{
for( i=0 ; i < n; i++ )
{
for (i = 0; i < n; i++) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
@ -606,7 +564,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
x_ptr += inc_x;
}
}
#if !defined(XCONJ)
y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
@ -624,9 +581,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
return (0);
}
if ( m3 == 3 )
{
if (m3 == 3) {
a_ptr = a;
x_ptr = x;
FLOAT temp_r0 = 0.0;
@ -636,11 +591,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
FLOAT temp_r2 = 0.0;
FLOAT temp_i2 = 0.0;
if ( lda == 6 && inc_x == 2 )
{
if (lda == 6 && inc_x == 2) {
for( i=0 ; i < n; i++ )
{
for (i = 0; i < n; i++) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
@ -661,13 +614,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
x_ptr += 2;
}
} else {
}
else
{
for( i = 0; i < n; i++ )
{
for (i = 0; i < n; i++) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
Copyright (c) 2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -29,106 +29,106 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define NBMAX 1024
static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
__asm__ volatile (
"vzero %%v16 \n\t"
static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y,
FLOAT *alpha) {
register FLOAT *ap0 = ap[0];
register FLOAT *ap1 = ap[1];
register FLOAT *ap2 = ap[2];
register FLOAT *ap3 = ap[3];
__asm__("vzero %%v16\n\t"
"vzero %%v17\n\t"
"vzero %%v18\n\t"
"vzero %%v19\n\t"
"vzero %%v20\n\t"
"vzero %%v21\n\t"
"vzero %%v22\n\t"
"vzero %%v23\n\t"
"xgr %%r1,%%r1\n\t"
"srlg %%r0,%0,1 \n\t"
"srlg %[n],%[n],1\n\t"
"0:\n\t"
"pfd 1,1024(%%r1,%1) \n\t"
"pfd 1,1024(%%r1,%2) \n\t"
"pfd 1,1024(%%r1,%3) \n\t"
"pfd 1,1024(%%r1,%4) \n\t"
"pfd 1,1024(%%r1,%5) \n\t"
"vl %%v20,0(%%r1,%5) \n\t"
"pfd 1,1024(%%r1,%[ap0])\n\t"
"pfd 1,1024(%%r1,%[ap1])\n\t"
"pfd 1,1024(%%r1,%[ap2])\n\t"
"pfd 1,1024(%%r1,%[ap3])\n\t"
"pfd 1,1024(%%r1,%[x])\n\t"
"vl %%v0,0(%%r1,%[x])\n\t"
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
"vleg %%v21,8(%%r1,%5),0 \n\t"
"wflcdb %%v21,%%v21 \n\t"
"vleg %%v21,0(%%r1,%5),1 \n\t"
"vleg %%v1,8(%%r1,%[x]),0\n\t"
"wflcdb %%v1,%%v1\n\t"
"vleg %%v1,0(%%r1,%[x]),1\n\t"
#else
"vleg %%v21,0(%%r1,%5),1 \n\t"
"vflcdb %%v21,%%v21 \n\t"
"vleg %%v21,8(%%r1,%5),0 \n\t"
"vleg %%v1,0(%%r1,%[x]),1\n\t"
"vflcdb %%v1,%%v1\n\t"
"vleg %%v1,8(%%r1,%[x]),0\n\t"
#endif
"vlrepg %%v24,0(%%r1,%1) \n\t"
"vlrepg %%v25,8(%%r1,%1) \n\t"
"vlrepg %%v26,0(%%r1,%2) \n\t"
"vlrepg %%v27,8(%%r1,%2) \n\t"
"vfmadb %%v16,%%v24,%%v20,%%v16 \n\t"
"vfmadb %%v16,%%v25,%%v21,%%v16 \n\t"
"vfmadb %%v17,%%v26,%%v20,%%v17 \n\t"
"vfmadb %%v17,%%v27,%%v21,%%v17 \n\t"
"vlrepg %%v28,0(%%r1,%3) \n\t"
"vlrepg %%v29,8(%%r1,%3) \n\t"
"vlrepg %%v30,0(%%r1,%4) \n\t"
"vlrepg %%v31,8(%%r1,%4) \n\t"
"vfmadb %%v18,%%v28,%%v20,%%v18 \n\t"
"vfmadb %%v18,%%v29,%%v21,%%v18 \n\t"
"vfmadb %%v19,%%v30,%%v20,%%v19 \n\t"
"vfmadb %%v19,%%v31,%%v21,%%v19 \n\t"
"vl %%v22,16(%%r1,%5) \n\t"
"vlrepg %%v24,0(%%r1,%[ap0])\n\t"
"vlrepg %%v25,8(%%r1,%[ap0])\n\t"
"vlrepg %%v26,0(%%r1,%[ap1])\n\t"
"vlrepg %%v27,8(%%r1,%[ap1])\n\t"
"vlrepg %%v28,0(%%r1,%[ap2])\n\t"
"vlrepg %%v29,8(%%r1,%[ap2])\n\t"
"vlrepg %%v30,0(%%r1,%[ap3])\n\t"
"vlrepg %%v31,8(%%r1,%[ap3])\n\t"
"vfmadb %%v16,%%v24,%%v0,%%v16\n\t"
"vfmadb %%v20,%%v25,%%v1,%%v20\n\t"
"vfmadb %%v17,%%v26,%%v0,%%v17\n\t"
"vfmadb %%v21,%%v27,%%v1,%%v21\n\t"
"vfmadb %%v18,%%v28,%%v0,%%v18\n\t"
"vfmadb %%v22,%%v29,%%v1,%%v22\n\t"
"vfmadb %%v19,%%v30,%%v0,%%v19\n\t"
"vfmadb %%v23,%%v31,%%v1,%%v23\n\t"
"vl %%v0,16(%%r1,%[x])\n\t"
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
"vleg %%v23,24(%%r1,%5),0 \n\t"
"wflcdb %%v23,%%v23 \n\t"
"vleg %%v23,16(%%r1,%5),1 \n\t"
"vleg %%v1,24(%%r1,%[x]),0\n\t"
"wflcdb %%v1,%%v1\n\t"
"vleg %%v1,16(%%r1,%[x]),1\n\t"
#else
"vleg %%v23,16(%%r1,%5),1 \n\t"
"vflcdb %%v23,%%v23 \n\t"
"vleg %%v23,24(%%r1,%5),0 \n\t"
"vleg %%v1,16(%%r1,%[x]),1\n\t"
"vflcdb %%v1,%%v1\n\t"
"vleg %%v1,24(%%r1,%[x]),0\n\t"
#endif
"vlrepg %%v24,16(%%r1,%1) \n\t"
"vlrepg %%v25,24(%%r1,%1) \n\t"
"vlrepg %%v26,16(%%r1,%2) \n\t"
"vlrepg %%v27,24(%%r1,%2) \n\t"
"vfmadb %%v16,%%v24,%%v22,%%v16 \n\t"
"vfmadb %%v16,%%v25,%%v23,%%v16 \n\t"
"vfmadb %%v17,%%v26,%%v22,%%v17 \n\t"
"vfmadb %%v17,%%v27,%%v23,%%v17 \n\t"
"vlrepg %%v28,16(%%r1,%3) \n\t"
"vlrepg %%v29,24(%%r1,%3) \n\t"
"vlrepg %%v30,16(%%r1,%4) \n\t"
"vlrepg %%v31,24(%%r1,%4) \n\t"
"vfmadb %%v18,%%v28,%%v22,%%v18 \n\t"
"vfmadb %%v18,%%v29,%%v23,%%v18 \n\t"
"vfmadb %%v19,%%v30,%%v22,%%v19 \n\t"
"vfmadb %%v19,%%v31,%%v23,%%v19 \n\t"
"vlrepg %%v24,16(%%r1,%[ap0])\n\t"
"vlrepg %%v25,24(%%r1,%[ap0])\n\t"
"vlrepg %%v26,16(%%r1,%[ap1])\n\t"
"vlrepg %%v27,24(%%r1,%[ap1])\n\t"
"vlrepg %%v28,16(%%r1,%[ap2])\n\t"
"vlrepg %%v29,24(%%r1,%[ap2])\n\t"
"vlrepg %%v30,16(%%r1,%[ap3])\n\t"
"vlrepg %%v31,24(%%r1,%[ap3])\n\t"
"vfmadb %%v16,%%v24,%%v0,%%v16\n\t"
"vfmadb %%v20,%%v25,%%v1,%%v20\n\t"
"vfmadb %%v17,%%v26,%%v0,%%v17\n\t"
"vfmadb %%v21,%%v27,%%v1,%%v21\n\t"
"vfmadb %%v18,%%v28,%%v0,%%v18\n\t"
"vfmadb %%v22,%%v29,%%v1,%%v22\n\t"
"vfmadb %%v19,%%v30,%%v0,%%v19\n\t"
"vfmadb %%v23,%%v31,%%v1,%%v23\n\t"
"agfi %%r1,32\n\t"
"brctg %%r0,0b \n\t"
"brctg %[n],0b\n\t"
"vfadb %%v16,%%v16,%%v20\n\t"
"vfadb %%v17,%%v17,%%v21\n\t"
"vfadb %%v18,%%v18,%%v22\n\t"
"vfadb %%v19,%%v19,%%v23\n\t"
"vpdi %%v20,%%v16,%%v16,4\n\t"
"vpdi %%v21,%%v17,%%v17,4\n\t"
"vpdi %%v22,%%v18,%%v18,4\n\t"
"vpdi %%v23,%%v19,%%v19,4\n\t"
#if !defined(XCONJ)
"vlrepg %%v24,0(%7) \n\t"
"vleg %%v25,8(%7),0 \n\t"
"vlrepg %%v24,0(%[alpha])\n\t"
"vleg %%v25,8(%[alpha]),0\n\t"
"wflcdb %%v25,%%v25\n\t"
"vleg %%v25,8(%7),1 \n\t"
"vleg %%v25,8(%[alpha]),1\n\t"
#else
"vleg %%v24,0(%7),1 \n\t"
"vleg %%v24,0(%[alpha]),1\n\t"
"vflcdb %%v24,%%v24\n\t"
"vleg %%v24,0(%7),0 \n\t"
"vlrepg %%v25,8(%7) \n\t"
"vleg %%v24,0(%[alpha]),0\n\t"
"vlrepg %%v25,8(%[alpha])\n\t"
#endif
"vl %%v26,0(%6) \n\t"
"vl %%v27,16(%6) \n\t"
"vl %%v28,32(%6) \n\t"
"vl %%v29,48(%6) \n\t"
"vl %%v26,0(%[y])\n\t"
"vl %%v27,16(%[y])\n\t"
"vl %%v28,32(%[y])\n\t"
"vl %%v29,48(%[y])\n\t"
"vfmadb %%v26,%%v16,%%v24,%%v26\n\t"
"vfmadb %%v26,%%v20,%%v25,%%v26\n\t"
"vfmadb %%v27,%%v17,%%v24,%%v27\n\t"
@ -137,174 +137,173 @@ static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *
"vfmadb %%v28,%%v22,%%v25,%%v28\n\t"
"vfmadb %%v29,%%v19,%%v24,%%v29\n\t"
"vfmadb %%v29,%%v23,%%v25,%%v29\n\t"
"vst %%v26,0(%6) \n\t"
"vst %%v27,16(%6) \n\t"
"vst %%v28,32(%6) \n\t"
"vst %%v29,48(%6) "
:
:"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZR"((const FLOAT (*)[n * 2])ap[2]),"ZR"((const FLOAT (*)[n * 2])ap[3]),"ZR"((const FLOAT (*)[n * 2])x),"ZQ"((FLOAT (*)[8])y),"ZQ"((const FLOAT (*)[2])alpha)
:"memory","cc","r0","r1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
"vst %%v26,0(%[y])\n\t"
"vst %%v27,16(%[y])\n\t"
"vst %%v28,32(%[y])\n\t"
"vst %%v29,48(%[y])"
: "+m"(*(struct { FLOAT x[8]; } *) y),[n] "+&r"(n)
: [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0),
"m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1),
"m"(*(const struct { FLOAT x[n * 2]; } *) ap2),[ap2] "a"(ap2),
"m"(*(const struct { FLOAT x[n * 2]; } *) ap3),[ap3] "a"(ap3),
"m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x),
"m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha)
: "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
"v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
}
static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
__asm__ volatile (
"vzero %%v16 \n\t"
static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y,
FLOAT *alpha) {
register FLOAT *ap0 = ap[0];
register FLOAT *ap1 = ap[1];
__asm__("vzero %%v16\n\t"
"vzero %%v17\n\t"
"vzero %%v18\n\t"
"vzero %%v19\n\t"
"xgr %%r1,%%r1\n\t"
"srlg %%r0,%0,1 \n\t"
"srlg %[n],%[n],1\n\t"
"0:\n\t"
"pfd 1,1024(%%r1,%1) \n\t"
"pfd 1,1024(%%r1,%2) \n\t"
"pfd 1,1024(%%r1,%3) \n\t"
"vl %%v18,0(%%r1,%3) \n\t"
"pfd 1,1024(%%r1,%[ap0])\n\t"
"pfd 1,1024(%%r1,%[ap1])\n\t"
"pfd 1,1024(%%r1,%[x])\n\t"
"vl %%v0,0(%%r1,%[x])\n\t"
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
"vleg %%v19,8(%%r1,%3),0 \n\t"
"wflcdb %%v19,%%v19 \n\t"
"vleg %%v19,0(%%r1,%3),1 \n\t"
"vleg %%v1,8(%%r1,%[x]),0\n\t"
"wflcdb %%v1,%%v1\n\t"
"vleg %%v1,0(%%r1,%[x]),1\n\t"
#else
"vleg %%v19,0(%%r1,%3),1 \n\t"
"vflcdb %%v19,%%v19 \n\t"
"vleg %%v19,8(%%r1,%3),0 \n\t"
"vleg %%v1,0(%%r1,%[x]),1\n\t"
"vflcdb %%v1,%%v1\n\t"
"vleg %%v1,8(%%r1,%[x]),0\n\t"
#endif
"vlrepg %%v20,0(%%r1,%1) \n\t"
"vlrepg %%v21,8(%%r1,%1) \n\t"
"vlrepg %%v22,0(%%r1,%2) \n\t"
"vlrepg %%v23,8(%%r1,%2) \n\t"
"vfmadb %%v16,%%v20,%%v18,%%v16 \n\t"
"vfmadb %%v16,%%v21,%%v19,%%v16 \n\t"
"vfmadb %%v17,%%v22,%%v18,%%v17 \n\t"
"vfmadb %%v17,%%v23,%%v19,%%v17 \n\t"
"vl %%v18,16(%%r1,%3) \n\t"
"vlrepg %%v20,0(%%r1,%[ap0])\n\t"
"vlrepg %%v21,8(%%r1,%[ap0])\n\t"
"vlrepg %%v22,0(%%r1,%[ap1])\n\t"
"vlrepg %%v23,8(%%r1,%[ap1])\n\t"
"vfmadb %%v16,%%v20,%%v0,%%v16\n\t"
"vfmadb %%v18,%%v21,%%v1,%%v18\n\t"
"vfmadb %%v17,%%v22,%%v0,%%v17\n\t"
"vfmadb %%v19,%%v23,%%v1,%%v19\n\t"
"vl %%v0,16(%%r1,%[x])\n\t"
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
"vleg %%v19,24(%%r1,%3),0 \n\t"
"wflcdb %%v19,%%v19 \n\t"
"vleg %%v19,16(%%r1,%3),1 \n\t"
"vleg %%v1,24(%%r1,%[x]),0\n\t"
"wflcdb %%v1,%%v1\n\t"
"vleg %%v1,16(%%r1,%[x]),1\n\t"
#else
"vleg %%v19,16(%%r1,%3),1 \n\t"
"vflcdb %%v19,%%v19 \n\t"
"vleg %%v19,24(%%r1,%3),0 \n\t"
"vleg %%v1,16(%%r1,%[x]),1\n\t"
"vflcdb %%v1,%%v1\n\t"
"vleg %%v1,24(%%r1,%[x]),0\n\t"
#endif
"vlrepg %%v20,16(%%r1,%1) \n\t"
"vlrepg %%v21,24(%%r1,%1) \n\t"
"vlrepg %%v22,16(%%r1,%2) \n\t"
"vlrepg %%v23,24(%%r1,%2) \n\t"
"vfmadb %%v16,%%v20,%%v18,%%v16 \n\t"
"vfmadb %%v16,%%v21,%%v19,%%v16 \n\t"
"vfmadb %%v17,%%v22,%%v18,%%v17 \n\t"
"vfmadb %%v17,%%v23,%%v19,%%v17 \n\t"
"vlrepg %%v20,16(%%r1,%[ap0])\n\t"
"vlrepg %%v21,24(%%r1,%[ap0])\n\t"
"vlrepg %%v22,16(%%r1,%[ap1])\n\t"
"vlrepg %%v23,24(%%r1,%[ap1])\n\t"
"vfmadb %%v16,%%v20,%%v0,%%v16\n\t"
"vfmadb %%v18,%%v21,%%v1,%%v18\n\t"
"vfmadb %%v17,%%v22,%%v0,%%v17\n\t"
"vfmadb %%v19,%%v23,%%v1,%%v19\n\t"
"agfi %%r1,32\n\t"
"brctg %%r0,0b \n\t"
"brctg %[n],0b\n\t"
"vfadb %%v16,%%v16,%%v18\n\t"
"vfadb %%v17,%%v17,%%v19\n\t"
"vpdi %%v18,%%v16,%%v16,4\n\t"
"vpdi %%v19,%%v17,%%v17,4\n\t"
#if !defined(XCONJ)
"vlrepg %%v20,0(%5) \n\t"
"vleg %%v21,8(%5),0 \n\t"
"vlrepg %%v20,0(%[alpha])\n\t"
"vleg %%v21,8(%[alpha]),0\n\t"
"wflcdb %%v21,%%v21\n\t"
"vleg %%v21,8(%5),1 \n\t"
"vleg %%v21,8(%[alpha]),1\n\t"
#else
"vleg %%v20,0(%5),1 \n\t"
"vleg %%v20,0(%[alpha]),1\n\t"
"vflcdb %%v20,%%v20\n\t"
"vleg %%v20,0(%5),0 \n\t"
"vlrepg %%v21,8(%5) \n\t"
"vleg %%v20,0(%[alpha]),0\n\t"
"vlrepg %%v21,8(%[alpha])\n\t"
#endif
"vl %%v22,0(%4) \n\t"
"vl %%v23,16(%4) \n\t"
"vl %%v22,0(%[y])\n\t"
"vl %%v23,16(%[y])\n\t"
"vfmadb %%v22,%%v16,%%v20,%%v22\n\t"
"vfmadb %%v22,%%v18,%%v21,%%v22\n\t"
"vfmadb %%v23,%%v17,%%v20,%%v23\n\t"
"vfmadb %%v23,%%v19,%%v21,%%v23\n\t"
"vst %%v22,0(%4) \n\t"
"vst %%v23,16(%4) \n\t"
:
:"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZR"((const FLOAT (*)[n * 2])x),"ZQ"((FLOAT (*)[4])y),"ZQ"((const FLOAT (*)[2])alpha)
:"memory","cc","r0","r1","v16","v17","v18","v19","v20","v21","v22","v23"
);
"vst %%v22,0(%[y])\n\t"
"vst %%v23,16(%[y])\n\t"
: "+m"(*(struct { FLOAT x[4]; } *) y),[n] "+&r"(n)
: [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0),
"m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1),
"m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x),
"m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha)
: "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
"v22", "v23");
}
static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
__asm__ volatile (
"vzero %%v16 \n\t"
static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y,
FLOAT *alpha) {
__asm__("vzero %%v16\n\t"
"vzero %%v17\n\t"
"xgr %%r1,%%r1\n\t"
"srlg %%r0,%0,1 \n\t"
"srlg %[n],%[n],1\n\t"
"0:\n\t"
"pfd 1,1024(%%r1,%1) \n\t"
"pfd 1,1024(%%r1,%2) \n\t"
"vl %%v17,0(%%r1,%2) \n\t"
"pfd 1,1024(%%r1,%[ap])\n\t"
"pfd 1,1024(%%r1,%[x])\n\t"
"vl %%v0,0(%%r1,%[x])\n\t"
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
"vleg %%v18,8(%%r1,%2),0 \n\t"
"wflcdb %%v18,%%v18 \n\t"
"vleg %%v18,0(%%r1,%2),1 \n\t"
"vleg %%v1,8(%%r1,%[x]),0\n\t"
"wflcdb %%v1,%%v1\n\t"
"vleg %%v1,0(%%r1,%[x]),1\n\t"
#else
"vleg %%v18,0(%%r1,%2),1 \n\t"
"vflcdb %%v18,%%v18 \n\t"
"vleg %%v18,8(%%r1,%2),0 \n\t"
"vleg %%v1,0(%%r1,%[x]),1\n\t"
"vflcdb %%v1,%%v1\n\t"
"vleg %%v1,8(%%r1,%[x]),0\n\t"
#endif
"vlrepg %%v19,0(%%r1,%1) \n\t"
"vlrepg %%v20,8(%%r1,%1) \n\t"
"vfmadb %%v16,%%v19,%%v17,%%v16 \n\t"
"vfmadb %%v16,%%v20,%%v18,%%v16 \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vlrepg %%v18,0(%%r1,%[ap])\n\t"
"vlrepg %%v19,8(%%r1,%[ap])\n\t"
"vfmadb %%v16,%%v18,%%v0,%%v16\n\t"
"vfmadb %%v17,%%v19,%%v1,%%v17\n\t"
"vl %%v0,16(%%r1,%[x])\n\t"
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
"vleg %%v18,24(%%r1,%2),0 \n\t"
"wflcdb %%v18,%%v18 \n\t"
"vleg %%v18,16(%%r1,%2),1 \n\t"
"vleg %%v1,24(%%r1,%[x]),0\n\t"
"wflcdb %%v1,%%v1\n\t"
"vleg %%v1,16(%%r1,%[x]),1\n\t"
#else
"vleg %%v18,16(%%r1,%2),1 \n\t"
"vflcdb %%v18,%%v18 \n\t"
"vleg %%v18,24(%%r1,%2),0 \n\t"
"vleg %%v1,16(%%r1,%[x]),1\n\t"
"vflcdb %%v1,%%v1\n\t"
"vleg %%v1,24(%%r1,%[x]),0\n\t"
#endif
"vlrepg %%v19,16(%%r1,%1) \n\t"
"vlrepg %%v20,24(%%r1,%1) \n\t"
"vfmadb %%v16,%%v19,%%v17,%%v16 \n\t"
"vfmadb %%v16,%%v20,%%v18,%%v16 \n\t"
"vlrepg %%v18,16(%%r1,%[ap])\n\t"
"vlrepg %%v19,24(%%r1,%[ap])\n\t"
"vfmadb %%v16,%%v18,%%v0,%%v16\n\t"
"vfmadb %%v17,%%v19,%%v1,%%v17\n\t"
"agfi %%r1,32\n\t"
"brctg %%r0,0b \n\t"
"brctg %[n],0b\n\t"
"vfadb %%v16,%%v16,%%v17\n\t"
"vpdi %%v17,%%v16,%%v16,4\n\t"
#if !defined(XCONJ)
"vlrepg %%v18,0(%4) \n\t"
"vleg %%v19,8(%4),0 \n\t"
"vlrepg %%v18,0(%[alpha])\n\t"
"vleg %%v19,8(%[alpha]),0\n\t"
"wflcdb %%v19,%%v19\n\t"
"vleg %%v19,8(%4),1 \n\t"
"vleg %%v19,8(%[alpha]),1\n\t"
#else
"vleg %%v18,0(%4),1 \n\t"
"vleg %%v18,0(%[alpha]),1\n\t"
"vflcdb %%v18,%%v18\n\t"
"vleg %%v18,0(%4),0 \n\t"
"vlrepg %%v19,8(%4) \n\t"
"vleg %%v18,0(%[alpha]),0\n\t"
"vlrepg %%v19,8(%[alpha])\n\t"
#endif
"vl %%v20,0(%3) \n\t"
"vfmadb %%v20,%%v16,%%v18,%%v20 \n\t"
"vfmadb %%v20,%%v17,%%v19,%%v20 \n\t"
"vst %%v20,0(%3) \n\t"
:
:"r"(n),"ZR"((const FLOAT (*)[n * 2])ap),"ZR"((const FLOAT (*)[n * 2])x),"ZQ"((FLOAT (*)[2])y),"ZQ"((const FLOAT (*)[2])alpha)
:"memory","cc","r0","r1","v16","v17","v18","v19","v20"
);
"vl %%v0,0(%[y])\n\t"
"vfmadb %%v0,%%v16,%%v18,%%v0\n\t"
"vfmadb %%v0,%%v17,%%v19,%%v0\n\t"
"vst %%v0,0(%[y])\n\t"
: "+m"(*(struct { FLOAT x[2]; } *) y),[n] "+&r"(n)
: [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap),[ap] "a"(ap),
"m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x),
"m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha)
: "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19");
}
static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src)
{
static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) {
BLASLONG i;
for ( i=0; i<n; i++ )
{
for (i = 0; i < n; i++) {
*dest = *src;
*(dest + 1) = *(src + 1);
dest += 2;
@ -312,8 +311,9 @@ static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src)
}
}
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y,
BLASLONG inc_y, FLOAT *buffer) {
BLASLONG i;
BLASLONG j;
FLOAT *a_ptr;
@ -329,8 +329,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
FLOAT ybuffer[8], *xbuffer;
FLOAT alpha[2];
if ( m < 1 ) return(0);
if ( n < 1 ) return(0);
if (m < 1)
return (0);
if (n < 1)
return (0);
inc_x <<= 1;
inc_y <<= 1;
@ -351,13 +353,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
BLASLONG NB = NBMAX;
while ( NB == NBMAX )
{
while (NB == NBMAX) {
m1 -= NB;
if ( m1 < 0)
{
if ( m2 == 0 ) break;
if (m1 < 0) {
if (m2 == 0)
break;
NB = m2;
}
@ -373,11 +374,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
else
xbuffer = x_ptr;
if ( inc_y == 2 )
{
if (inc_y == 2) {
for( i = 0; i < n1 ; i++)
{
for (i = 0; i < n1; i++) {
zgemv_kernel_4x4(NB, ap, xbuffer, y_ptr, alpha);
ap[0] += lda4;
ap[1] += lda4;
@ -388,28 +387,23 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
}
if ( n2 & 2 )
{
if (n2 & 2) {
zgemv_kernel_4x2(NB, ap, xbuffer, y_ptr, alpha);
a_ptr += lda * 2;
y_ptr += 4;
}
if ( n2 & 1 )
{
if (n2 & 1) {
zgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha);
/* a_ptr += lda;
y_ptr += 2; */
}
}
else
{
} else {
for( i = 0; i < n1 ; i++)
{
for (i = 0; i < n1; i++) {
memset(ybuffer, 0, sizeof(ybuffer));
zgemv_kernel_4x4(NB, ap, xbuffer, ybuffer, alpha);
ap[0] += lda4;
@ -433,8 +427,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
}
for( i = 0; i < n2 ; i++)
{
for (i = 0; i < n2; i++) {
memset(ybuffer, 0, sizeof(ybuffer));
zgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, alpha);
a_ptr += lda;
@ -449,17 +442,15 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
x += NB * inc_x;
}
if ( m3 == 0 ) return(0);
if (m3 == 0)
return (0);
x_ptr = x;
j = 0;
a_ptr = a;
y_ptr = y;
if ( m3 == 3 )
{
if (m3 == 3) {
FLOAT temp_r;
FLOAT temp_i;
@ -471,8 +462,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
x_ptr += inc_x;
FLOAT x4 = x_ptr[0];
FLOAT x5 = x_ptr[1];
while ( j < n)
{
while (j < n) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
@ -505,9 +495,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
return (0);
}
if ( m3 == 2 )
{
if (m3 == 2) {
FLOAT temp_r;
FLOAT temp_i;
@ -521,8 +509,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
FLOAT ar = alpha[0];
FLOAT ai = alpha[1];
while ( j < ( n & -2 ))
{
while (j < (n & -2)) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
@ -565,9 +552,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
j += 2;
}
while ( j < n)
{
while (j < n) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
@ -597,9 +582,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
return (0);
}
if ( m3 == 1 )
{
if (m3 == 1) {
FLOAT temp_r;
FLOAT temp_i;
@ -610,8 +593,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
FLOAT ar = alpha[0];
FLOAT ai = alpha[1];
while ( j < ( n & -2 ))
{
while (j < (n & -2)) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
@ -646,8 +628,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
j += 2;
}
while ( j < n)
{
while (j < n) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -27,25 +27,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
{
__asm__ (
"vlrepg %%v0,%3 \n\t"
"vlrepg %%v1,%4 \n\t"
"srlg %%r0,%0,4 \n\t"
static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) {
__asm__("vlrepg %%v0,%[c]\n\t"
"vlrepg %%v1,%[s]\n\t"
"srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 2, 1024(%%r1,%1) \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"
"vl %%v24, 0(%%r1,%1) \n\t"
"vl %%v25, 16(%%r1,%1) \n\t"
"vl %%v26, 32(%%r1,%1) \n\t"
"vl %%v27, 48(%%r1,%1) \n\t"
"vl %%v16, 0(%%r1,%2) \n\t"
"vl %%v17, 16(%%r1,%2) \n\t"
"vl %%v18, 32(%%r1,%2) \n\t"
"vl %%v19, 48(%%r1,%2) \n\t"
"pfd 2, 1024(%%r1,%[x])\n\t"
"pfd 2, 1024(%%r1,%[y])\n\t"
"vl %%v24, 0(%%r1,%[x])\n\t"
"vl %%v25, 16(%%r1,%[x])\n\t"
"vl %%v26, 32(%%r1,%[x])\n\t"
"vl %%v27, 48(%%r1,%[x])\n\t"
"vl %%v16, 0(%%r1,%[y])\n\t"
"vl %%v17, 16(%%r1,%[y])\n\t"
"vl %%v18, 32(%%r1,%[y])\n\t"
"vl %%v19, 48(%%r1,%[y])\n\t"
"vfmdb %%v28,%%v24,%%v0\n\t"
"vfmdb %%v29,%%v25,%%v0\n\t"
"vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
@ -63,25 +60,22 @@ static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
"vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vst %%v28, 0(%%r1,%1) \n\t"
"vst %%v29, 16(%%r1,%1) \n\t"
"vst %%v30, 32(%%r1,%1) \n\t"
"vst %%v31, 48(%%r1,%1) \n\t"
"vst %%v20, 0(%%r1,%2) \n\t"
"vst %%v21, 16(%%r1,%2) \n\t"
"vst %%v22, 32(%%r1,%2) \n\t"
"vst %%v23, 48(%%r1,%2) \n\t"
"vl %%v24, 64(%%r1,%1) \n\t"
"vl %%v25, 80(%%r1,%1) \n\t"
"vl %%v26, 96(%%r1,%1) \n\t"
"vl %%v27, 112(%%r1,%1) \n\t"
"vl %%v16, 64(%%r1,%2) \n\t"
"vl %%v17, 80(%%r1,%2) \n\t"
"vl %%v18, 96(%%r1,%2) \n\t"
"vl %%v19, 112(%%r1,%2) \n\t"
"vst %%v28, 0(%%r1,%[x])\n\t"
"vst %%v29, 16(%%r1,%[x])\n\t"
"vst %%v30, 32(%%r1,%[x])\n\t"
"vst %%v31, 48(%%r1,%[x])\n\t"
"vst %%v20, 0(%%r1,%[y])\n\t"
"vst %%v21, 16(%%r1,%[y])\n\t"
"vst %%v22, 32(%%r1,%[y])\n\t"
"vst %%v23, 48(%%r1,%[y])\n\t"
"vl %%v24, 64(%%r1,%[x])\n\t"
"vl %%v25, 80(%%r1,%[x])\n\t"
"vl %%v26, 96(%%r1,%[x])\n\t"
"vl %%v27, 112(%%r1,%[x])\n\t"
"vl %%v16, 64(%%r1,%[y])\n\t"
"vl %%v17, 80(%%r1,%[y])\n\t"
"vl %%v18, 96(%%r1,%[y])\n\t"
"vl %%v19, 112(%%r1,%[y])\n\t"
"vfmdb %%v28,%%v24,%%v0\n\t"
"vfmdb %%v29,%%v25,%%v0\n\t"
"vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
@ -99,25 +93,22 @@ static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
"vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vst %%v28, 64(%%r1,%1) \n\t"
"vst %%v29, 80(%%r1,%1) \n\t"
"vst %%v30, 96(%%r1,%1) \n\t"
"vst %%v31, 112(%%r1,%1) \n\t"
"vst %%v20, 64(%%r1,%2) \n\t"
"vst %%v21, 80(%%r1,%2) \n\t"
"vst %%v22, 96(%%r1,%2) \n\t"
"vst %%v23, 112(%%r1,%2) \n\t"
"vl %%v24, 128(%%r1,%1) \n\t"
"vl %%v25, 144(%%r1,%1) \n\t"
"vl %%v26, 160(%%r1,%1) \n\t"
"vl %%v27, 176(%%r1,%1) \n\t"
"vl %%v16, 128(%%r1,%2) \n\t"
"vl %%v17, 144(%%r1,%2) \n\t"
"vl %%v18, 160(%%r1,%2) \n\t"
"vl %%v19, 176(%%r1,%2) \n\t"
"vst %%v28, 64(%%r1,%[x])\n\t"
"vst %%v29, 80(%%r1,%[x])\n\t"
"vst %%v30, 96(%%r1,%[x])\n\t"
"vst %%v31, 112(%%r1,%[x])\n\t"
"vst %%v20, 64(%%r1,%[y])\n\t"
"vst %%v21, 80(%%r1,%[y])\n\t"
"vst %%v22, 96(%%r1,%[y])\n\t"
"vst %%v23, 112(%%r1,%[y])\n\t"
"vl %%v24, 128(%%r1,%[x])\n\t"
"vl %%v25, 144(%%r1,%[x])\n\t"
"vl %%v26, 160(%%r1,%[x])\n\t"
"vl %%v27, 176(%%r1,%[x])\n\t"
"vl %%v16, 128(%%r1,%[y])\n\t"
"vl %%v17, 144(%%r1,%[y])\n\t"
"vl %%v18, 160(%%r1,%[y])\n\t"
"vl %%v19, 176(%%r1,%[y])\n\t"
"vfmdb %%v28,%%v24,%%v0\n\t"
"vfmdb %%v29,%%v25,%%v0\n\t"
"vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
@ -135,25 +126,22 @@ static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
"vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vst %%v28, 128(%%r1,%1) \n\t"
"vst %%v29, 144(%%r1,%1) \n\t"
"vst %%v30, 160(%%r1,%1) \n\t"
"vst %%v31, 176(%%r1,%1) \n\t"
"vst %%v20, 128(%%r1,%2) \n\t"
"vst %%v21, 144(%%r1,%2) \n\t"
"vst %%v22, 160(%%r1,%2) \n\t"
"vst %%v23, 176(%%r1,%2) \n\t"
"vl %%v24, 192(%%r1,%1) \n\t"
"vl %%v25, 208(%%r1,%1) \n\t"
"vl %%v26, 224(%%r1,%1) \n\t"
"vl %%v27, 240(%%r1,%1) \n\t"
"vl %%v16, 192(%%r1,%2) \n\t"
"vl %%v17, 208(%%r1,%2) \n\t"
"vl %%v18, 224(%%r1,%2) \n\t"
"vl %%v19, 240(%%r1,%2) \n\t"
"vst %%v28, 128(%%r1,%[x])\n\t"
"vst %%v29, 144(%%r1,%[x])\n\t"
"vst %%v30, 160(%%r1,%[x])\n\t"
"vst %%v31, 176(%%r1,%[x])\n\t"
"vst %%v20, 128(%%r1,%[y])\n\t"
"vst %%v21, 144(%%r1,%[y])\n\t"
"vst %%v22, 160(%%r1,%[y])\n\t"
"vst %%v23, 176(%%r1,%[y])\n\t"
"vl %%v24, 192(%%r1,%[x])\n\t"
"vl %%v25, 208(%%r1,%[x])\n\t"
"vl %%v26, 224(%%r1,%[x])\n\t"
"vl %%v27, 240(%%r1,%[x])\n\t"
"vl %%v16, 192(%%r1,%[y])\n\t"
"vl %%v17, 208(%%r1,%[y])\n\t"
"vl %%v18, 224(%%r1,%[y])\n\t"
"vl %%v19, 240(%%r1,%[y])\n\t"
"vfmdb %%v28,%%v24,%%v0\n\t"
"vfmdb %%v29,%%v25,%%v0\n\t"
"vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
@ -171,40 +159,39 @@ static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
"vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vst %%v28, 192(%%r1,%1) \n\t"
"vst %%v29, 208(%%r1,%1) \n\t"
"vst %%v30, 224(%%r1,%1) \n\t"
"vst %%v31, 240(%%r1,%1) \n\t"
"vst %%v20, 192(%%r1,%2) \n\t"
"vst %%v21, 208(%%r1,%2) \n\t"
"vst %%v22, 224(%%r1,%2) \n\t"
"vst %%v23, 240(%%r1,%2) \n\t"
"vst %%v28, 192(%%r1,%[x])\n\t"
"vst %%v29, 208(%%r1,%[x])\n\t"
"vst %%v30, 224(%%r1,%[x])\n\t"
"vst %%v31, 240(%%r1,%[x])\n\t"
"vst %%v20, 192(%%r1,%[y])\n\t"
"vst %%v21, 208(%%r1,%[y])\n\t"
"vst %%v22, 224(%%r1,%[y])\n\t"
"vst %%v23, 240(%%r1,%[y])\n\t"
"agfi %%r1,256\n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"m"(*c),"m"(*s)
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
"brctg %[n],0b"
: "+m"(*(struct { FLOAT x[n * 2]; } *) x),
"+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n)
: [x] "a"(x),[y] "a"(y),[c] "Q"(*c),[s] "Q"(*s)
: "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
"v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
}
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
{
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
FLOAT c, FLOAT s) {
BLASLONG i = 0;
BLASLONG ix = 0, iy = 0;
FLOAT temp[2];
BLASLONG inc_x2;
BLASLONG inc_y2;
if ( n <= 0 ) return(0);
if (n <= 0)
return (0);
if ( (inc_x == 1) && (inc_y == 1) )
{
if ((inc_x == 1) && (inc_y == 1)) {
BLASLONG n1 = n & -16;
if ( n1 > 0 )
{
if (n1 > 0) {
FLOAT cosa, sina;
cosa = c;
sina = s;
@ -213,8 +200,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
ix = 2 * n1;
}
while(i < n)
{
while (i < n) {
temp[0] = c * x[ix] + s * y[ix];
temp[1] = c * x[ix + 1] + s * y[ix + 1];
y[ix] = c * y[ix] - s * x[ix];
@ -227,14 +213,10 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
}
}
else
{
} else {
inc_x2 = 2 * inc_x;
inc_y2 = 2 * inc_y;
while(i < n)
{
while (i < n) {
temp[0] = c * x[ix] + s * y[iy];
temp[1] = c * x[ix + 1] + s * y[iy + 1];
y[iy] = c * y[iy] - s * x[ix];
@ -252,5 +234,3 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
return (0);
}

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2013 - 2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -27,26 +27,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
static void zscal_kernel_8(BLASLONG n, FLOAT *alpha, FLOAT *x)
{
__asm__ volatile(
"vlrepg %%v0,0(%1) \n\t"
"vleg %%v1,8(%1),0 \n\t"
static void zscal_kernel_8(BLASLONG n, FLOAT *alpha, FLOAT *x) {
__asm__("vlrepg %%v0,0(%[alpha])\n\t"
"vleg %%v1,8(%[alpha]),0\n\t"
"wflcdb %%v1,%%v1\n\t"
"vleg %%v1,8(%1),1 \n\t"
"srlg %%r0,%0,3 \n\t"
"vleg %%v1,8(%[alpha]),1\n\t"
"srlg %[n],%[n],3\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 2, 1024(%%r1,%2) \n\t"
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"pfd 2, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vpdi %%v24,%%v16,%%v16,4\n\t"
"vpdi %%v25,%%v17,%%v17,4\n\t"
"vpdi %%v26,%%v18,%%v18,4\n\t"
@ -55,7 +52,6 @@ static void zscal_kernel_8(BLASLONG n, FLOAT *alpha, FLOAT *x)
"vpdi %%v29,%%v21,%%v21,4\n\t"
"vpdi %%v30,%%v22,%%v22,4\n\t"
"vpdi %%v31,%%v23,%%v23,4\n\t"
"vfmdb %%v16,%%v16,%%v0\n\t"
"vfmdb %%v17,%%v17,%%v0\n\t"
"vfmdb %%v18,%%v18,%%v0\n\t"
@ -72,43 +68,40 @@ static void zscal_kernel_8(BLASLONG n, FLOAT *alpha, FLOAT *x)
"vfmadb %%v21,%%v29,%%v1,%%v21\n\t"
"vfmadb %%v22,%%v30,%%v1,%%v22\n\t"
"vfmadb %%v23,%%v31,%%v1,%%v23\n\t"
"vst %%v16,0(%%r1,%2) \n\t"
"vst %%v17,16(%%r1,%2) \n\t"
"vst %%v18,32(%%r1,%2) \n\t"
"vst %%v19,48(%%r1,%2) \n\t"
"vst %%v20,64(%%r1,%2) \n\t"
"vst %%v21,80(%%r1,%2) \n\t"
"vst %%v22,96(%%r1,%2) \n\t"
"vst %%v23,112(%%r1,%2) \n\t"
"vst %%v16,0(%%r1,%[x])\n\t"
"vst %%v17,16(%%r1,%[x])\n\t"
"vst %%v18,32(%%r1,%[x])\n\t"
"vst %%v19,48(%%r1,%[x])\n\t"
"vst %%v20,64(%%r1,%[x])\n\t"
"vst %%v21,80(%%r1,%[x])\n\t"
"vst %%v22,96(%%r1,%[x])\n\t"
"vst %%v23,112(%%r1,%[x])\n\t"
"agfi %%r1,128\n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x)
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
"brctg %[n],0b"
: "+m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n)
: [x] "a"(x), "m"(*(const struct { FLOAT x[2]; } *) alpha),
[alpha] "a"(alpha)
: "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
"v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
}
static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x)
{
__asm__ volatile(
"vleg %%v0,8(%1),0 \n\t"
static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) {
__asm__("vleg %%v0,8(%[alpha]),0\n\t"
"wflcdb %%v0,%%v0\n\t"
"vleg %%v0,8(%1),1 \n\t"
"srlg %%r0,%0,3 \n\t"
"vleg %%v0,8(%[alpha]),1\n\t"
"srlg %[n],%[n],3\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 2, 1024(%%r1,%2) \n\t"
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"pfd 2, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vpdi %%v16,%%v16,%%v16,4\n\t"
"vpdi %%v17,%%v17,%%v17,4\n\t"
"vpdi %%v18,%%v18,%%v18,4\n\t"
@ -117,7 +110,6 @@ static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x)
"vpdi %%v21,%%v21,%%v21,4\n\t"
"vpdi %%v22,%%v22,%%v22,4\n\t"
"vpdi %%v23,%%v23,%%v23,4\n\t"
"vfmdb %%v16,%%v16,%%v0\n\t"
"vfmdb %%v17,%%v17,%%v0\n\t"
"vfmdb %%v18,%%v18,%%v0\n\t"
@ -126,42 +118,37 @@ static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x)
"vfmdb %%v21,%%v21,%%v0\n\t"
"vfmdb %%v22,%%v22,%%v0\n\t"
"vfmdb %%v23,%%v23,%%v0\n\t"
"vst %%v16,0(%%r1,%2) \n\t"
"vst %%v17,16(%%r1,%2) \n\t"
"vst %%v18,32(%%r1,%2) \n\t"
"vst %%v19,48(%%r1,%2) \n\t"
"vst %%v20,64(%%r1,%2) \n\t"
"vst %%v21,80(%%r1,%2) \n\t"
"vst %%v22,96(%%r1,%2) \n\t"
"vst %%v23,112(%%r1,%2) \n\t"
"vst %%v16,0(%%r1,%[x])\n\t"
"vst %%v17,16(%%r1,%[x])\n\t"
"vst %%v18,32(%%r1,%[x])\n\t"
"vst %%v19,48(%%r1,%[x])\n\t"
"vst %%v20,64(%%r1,%[x])\n\t"
"vst %%v21,80(%%r1,%[x])\n\t"
"vst %%v22,96(%%r1,%[x])\n\t"
"vst %%v23,112(%%r1,%[x])\n\t"
"agfi %%r1,128\n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23"
);
"brctg %[n],0b"
: "+m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n)
: [x] "a"(x), "m"(*(const struct { FLOAT x[2]; } *) alpha),
[alpha] "a"(alpha)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23");
}
static void zscal_kernel_8_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x)
{
__asm__ volatile(
"vlrepg %%v0,0(%1) \n\t"
"srlg %%r0,%0,3 \n\t"
static void zscal_kernel_8_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) {
__asm__("vlrepg %%v0,0(%[alpha])\n\t"
"srlg %[n],%[n],3\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 2, 1024(%%r1,%2) \n\t"
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"pfd 2, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vfmdb %%v16,%%v16,%%v0\n\t"
"vfmdb %%v17,%%v17,%%v0\n\t"
"vfmdb %%v18,%%v18,%%v0\n\t"
@ -170,55 +157,46 @@ static void zscal_kernel_8_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x)
"vfmdb %%v21,%%v21,%%v0\n\t"
"vfmdb %%v22,%%v22,%%v0\n\t"
"vfmdb %%v23,%%v23,%%v0\n\t"
"vst %%v16,0(%%r1,%2) \n\t"
"vst %%v17,16(%%r1,%2) \n\t"
"vst %%v18,32(%%r1,%2) \n\t"
"vst %%v19,48(%%r1,%2) \n\t"
"vst %%v20,64(%%r1,%2) \n\t"
"vst %%v21,80(%%r1,%2) \n\t"
"vst %%v22,96(%%r1,%2) \n\t"
"vst %%v23,112(%%r1,%2) \n\t"
"vst %%v16,0(%%r1,%[x])\n\t"
"vst %%v17,16(%%r1,%[x])\n\t"
"vst %%v18,32(%%r1,%[x])\n\t"
"vst %%v19,48(%%r1,%[x])\n\t"
"vst %%v20,64(%%r1,%[x])\n\t"
"vst %%v21,80(%%r1,%[x])\n\t"
"vst %%v22,96(%%r1,%[x])\n\t"
"vst %%v23,112(%%r1,%[x])\n\t"
"agfi %%r1,128\n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23"
);
"brctg %[n],0b"
: "+m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n)
: [x] "a"(x), "m"(*(const struct { FLOAT x[2]; } *) alpha),
[alpha] "a"(alpha)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23");
}
/*
 * Zero-fill kernel for the contiguous complex scal path (alpha == 0 + 0i).
 *
 * Stores zeros over x[0 .. 2*n-1].  Each loop iteration writes eight
 * 16-byte vectors (128 bytes) and the loop runs n >> 3 times, so the
 * kernel covers exactly n * 2 FLOATs only when FLOAT is 8 bytes wide
 * (assumed from the z-prefixed file; confirm against the build flags).
 *
 * n  must be a positive multiple of 8.  The loop is bottom-tested with
 *    BRCTG, so a count of 0 would wrap instead of skipping the body; the
 *    visible caller passes n1 = n & -8 and only calls when n1 > 0.
 * x  destination buffer; its previous contents are never read.
 */
static void zscal_kernel_8_zero(BLASLONG n, FLOAT *x) {
__asm__("vzero %%v0\n\t"                 /* v0 = all-zero vector            */
"srlg %[n],%[n],3\n\t"                   /* n = number of 128-byte blocks   */
"xgr %%r1,%%r1\n\t"                      /* r1 = running byte offset, 0     */
"0:\n\t"
"pfd 2, 1024(%%r1,%[x])\n\t"             /* prefetch 1 KiB ahead            */
"vst %%v0,0(%%r1,%[x])\n\t"
"vst %%v0,16(%%r1,%[x])\n\t"
"vst %%v0,32(%%r1,%[x])\n\t"
"vst %%v0,48(%%r1,%[x])\n\t"
"vst %%v0,64(%%r1,%[x])\n\t"
"vst %%v0,80(%%r1,%[x])\n\t"
"vst %%v0,96(%%r1,%[x])\n\t"
"vst %%v0,112(%%r1,%[x])\n\t"
"agfi %%r1,128\n\t"                      /* advance to the next block       */
"brctg %[n],0b"                          /* --n; loop while n != 0          */
/*
 * The written region is described as a write-only array lvalue, so no
 * blanket "memory" clobber is needed.  The array-pointer cast is used
 * instead of a struct wrapping a variable-length array because the latter
 * is a GCC-only extension.  n is read-write and early-clobbered: it is
 * overwritten by srlg before x is last used, so it must not share a
 * register with x.
 */
: "=m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n)
: [x] "a"(x)                             /* "a": address register           */
: "cc", "r1", "v0");
}
static void zscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_x)
{
static void zscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x,
BLASLONG inc_x) {
BLASLONG i;
BLASLONG inc_x2 = 2 * inc_x;
BLASLONG inc_x3 = inc_x2 + inc_x;
@ -226,8 +204,7 @@ static void zscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_
FLOAT da_r = alpha[0];
FLOAT da_i = alpha[1];
for (i = 0; i < n; i += 4)
{
for (i = 0; i < n; i += 4) {
t0 = da_r * x[0] - da_i * x[1];
t1 = da_r * x[inc_x] - da_i * x[inc_x + 1];
t2 = da_r * x[inc_x2] - da_i * x[inc_x2 + 1];
@ -247,7 +224,9 @@ static void zscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_
}
}
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) {
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
BLASLONG dummy2) {
BLASLONG i = 0, j = 0;
FLOAT temp0;
FLOAT temp1;
@ -307,13 +286,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
}
}
} else {
if (da_i == 0.0) {
BLASLONG n1 = n & -2;
@ -368,7 +344,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
return (0);
}
BLASLONG n1 = n & -8;
if (n1 > 0) {
@ -380,8 +355,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
zscal_kernel_8_zero(n1, x);
else
zscal_kernel_8_zero_r(n1, alpha, x);
else
if (da_i == 0)
else if (da_i == 0)
zscal_kernel_8_zero_i(n1, alpha, x);
else
zscal_kernel_8(n1, alpha, x);
@ -390,7 +364,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
j = n1;
}
if (da_r == 0.0) {
if (da_i == 0.0) {

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -27,114 +27,108 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
/*
 * Vectorised swap kernel for the contiguous (unit-stride) complex swap path.
 *
 * Exchanges x[0 .. 2*n-1] with y[0 .. 2*n-1].  Each loop iteration moves
 * 256 bytes in each direction and the loop runs n >> 4 times, so the kernel
 * covers exactly n * 2 FLOATs only when FLOAT is 8 bytes wide (assumed from
 * the z-prefixed file; confirm against the build flags).
 *
 * n     must be a positive multiple of 16.  The loop is bottom-tested with
 *       BRCTG, so a count of 0 would wrap instead of skipping the body; the
 *       visible caller passes n1 = n & -16 and only calls when n1 > 0.
 * x, y  the two buffers to exchange; both are read and written.
 */
static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) {
__asm__("srlg %[n],%[n],4\n\t"           /* n = number of 256-byte blocks   */
"xgr %%r1,%%r1\n\t"                      /* r1 = running byte offset, 0     */
"0:\n\t"
"pfd 2, 1024(%%r1,%[x])\n\t"             /* prefetch both streams 1 KiB     */
"pfd 2, 1024(%%r1,%[y])\n\t"             /* ahead                           */
/* Hold the whole 256-byte block of x in v16..v31. */
"vl %%v16, 0(%%r1,%[x])\n\t"
"vl %%v17, 16(%%r1,%[x])\n\t"
"vl %%v18, 32(%%r1,%[x])\n\t"
"vl %%v19, 48(%%r1,%[x])\n\t"
"vl %%v20, 64(%%r1,%[x])\n\t"
"vl %%v21, 80(%%r1,%[x])\n\t"
"vl %%v22, 96(%%r1,%[x])\n\t"
"vl %%v23, 112(%%r1,%[x])\n\t"
"vl %%v24, 128(%%r1,%[x])\n\t"
"vl %%v25, 144(%%r1,%[x])\n\t"
"vl %%v26, 160(%%r1,%[x])\n\t"
"vl %%v27, 176(%%r1,%[x])\n\t"
"vl %%v28, 192(%%r1,%[x])\n\t"
"vl %%v29, 208(%%r1,%[x])\n\t"
"vl %%v30, 224(%%r1,%[x])\n\t"
"vl %%v31, 240(%%r1,%[x])\n\t"
/* Copy the first 128 bytes of y into x through v0..v7. */
"vl %%v0, 0(%%r1,%[y])\n\t"
"vl %%v1, 16(%%r1,%[y])\n\t"
"vl %%v2, 32(%%r1,%[y])\n\t"
"vl %%v3, 48(%%r1,%[y])\n\t"
"vl %%v4, 64(%%r1,%[y])\n\t"
"vl %%v5, 80(%%r1,%[y])\n\t"
"vl %%v6, 96(%%r1,%[y])\n\t"
"vl %%v7, 112(%%r1,%[y])\n\t"
"vst %%v0, 0(%%r1,%[x])\n\t"
"vst %%v1, 16(%%r1,%[x])\n\t"
"vst %%v2, 32(%%r1,%[x])\n\t"
"vst %%v3, 48(%%r1,%[x])\n\t"
"vst %%v4, 64(%%r1,%[x])\n\t"
"vst %%v5, 80(%%r1,%[x])\n\t"
"vst %%v6, 96(%%r1,%[x])\n\t"
"vst %%v7, 112(%%r1,%[x])\n\t"
/* Copy the second 128 bytes of y into x, reusing v0..v7. */
"vl %%v0, 128(%%r1,%[y])\n\t"
"vl %%v1, 144(%%r1,%[y])\n\t"
"vl %%v2, 160(%%r1,%[y])\n\t"
"vl %%v3, 176(%%r1,%[y])\n\t"
"vl %%v4, 192(%%r1,%[y])\n\t"
"vl %%v5, 208(%%r1,%[y])\n\t"
"vl %%v6, 224(%%r1,%[y])\n\t"
"vl %%v7, 240(%%r1,%[y])\n\t"
"vst %%v0, 128(%%r1,%[x])\n\t"
"vst %%v1, 144(%%r1,%[x])\n\t"
"vst %%v2, 160(%%r1,%[x])\n\t"
"vst %%v3, 176(%%r1,%[x])\n\t"
"vst %%v4, 192(%%r1,%[x])\n\t"
"vst %%v5, 208(%%r1,%[x])\n\t"
"vst %%v6, 224(%%r1,%[x])\n\t"
"vst %%v7, 240(%%r1,%[x])\n\t"
/* Write the saved x block out to y. */
"vst %%v16, 0(%%r1,%[y])\n\t"
"vst %%v17, 16(%%r1,%[y])\n\t"
"vst %%v18, 32(%%r1,%[y])\n\t"
"vst %%v19, 48(%%r1,%[y])\n\t"
"vst %%v20, 64(%%r1,%[y])\n\t"
"vst %%v21, 80(%%r1,%[y])\n\t"
"vst %%v22, 96(%%r1,%[y])\n\t"
"vst %%v23, 112(%%r1,%[y])\n\t"
"vst %%v24, 128(%%r1,%[y])\n\t"
"vst %%v25, 144(%%r1,%[y])\n\t"
"vst %%v26, 160(%%r1,%[y])\n\t"
"vst %%v27, 176(%%r1,%[y])\n\t"
"vst %%v28, 192(%%r1,%[y])\n\t"
"vst %%v29, 208(%%r1,%[y])\n\t"
"vst %%v30, 224(%%r1,%[y])\n\t"
"vst %%v31, 240(%%r1,%[y])\n\t"
"agfi %%r1,256\n\t"                      /* advance to the next block       */
"brctg %[n],0b"                          /* --n; loop while n != 0          */
/*
 * Both buffers are described as read-write array lvalues, so no blanket
 * "memory" clobber is needed.  The array-pointer cast is used instead of a
 * struct wrapping a variable-length array because the latter is a GCC-only
 * extension.  n is read-write and early-clobbered: it is overwritten by
 * srlg before x and y are last used, so it must not share a register with
 * either of them.
 */
: "+m"(*(FLOAT (*)[n * 2]) x),
"+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n)
: [x] "a"(x),[y] "a"(y)                  /* "a": address registers          */
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");
}
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3,
FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
FLOAT *dummy, BLASLONG dummy2) {
BLASLONG i = 0;
BLASLONG ix = 0, iy = 0;
FLOAT temp[2];
BLASLONG inc_x2, inc_y2;
if ( n <= 0 ) return(0);
if (n <= 0)
return (0);
if ( (inc_x == 1) && (inc_y == 1 ))
{
if ((inc_x == 1) && (inc_y == 1)) {
BLASLONG n1 = n & -16;
if ( n1 > 0 )
{
if (n1 > 0) {
zswap_kernel_16(n1, x, y);
i = n1;
ix = 2 * n1;
iy = 2 * n1;
}
while(i < n)
{
while (i < n) {
temp[0] = x[ix];
temp[1] = x[ix + 1];
@ -147,19 +141,14 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm
iy += 2;
i++;
}
}
else
{
} else {
inc_x2 = 2 * inc_x;
inc_y2 = 2 * inc_y;
while(i < n)
{
while (i < n) {
temp[0] = x[ix];
temp[1] = x[ix + 1];
@ -177,7 +166,4 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm
}
return (0);
}