Merge pull request #2012 from maamountki/z14

[ZARCH] Many improvements
This commit is contained in:
Martin Kroeker 2019-02-13 20:15:56 +01:00 committed by GitHub
commit 76bb74fcd4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
67 changed files with 13503 additions and 14618 deletions

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -28,214 +28,188 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>
#if defined(DOUBLE) #define CABS1(x,i) (fabsf(x[i]) + fabsf(x[i + 1]))
#define ABS fabs
#else
#define ABS fabsf
#endif
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x) {
FLOAT amax;
static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x) __asm__("vlef %%v0,0(%[x]),0\n\t"
{ "vlef %%v16,4(%[x]),0\n\t"
FLOAT amax; "vlef %%v0,8(%[x]),1\n\t"
"vlef %%v16,12(%[x]),1\n\t"
"vlef %%v0,16(%[x]),2\n\t"
"vlef %%v16,20(%[x]),2\n\t"
"vlef %%v0,24(%[x]),3\n\t"
"vlef %%v16,28(%[x]),3\n\t"
"vflpsb %%v0,%%v0\n\t"
"vflpsb %%v16,%%v16\n\t"
"vfasb %%v0,%%v0,%%v16\n\t"
"vleib %%v1,0,0\n\t"
"vleib %%v1,1,1\n\t"
"vleib %%v1,2,2\n\t"
"vleib %%v1,3,3\n\t"
"vleib %%v1,8,4\n\t"
"vleib %%v1,9,5\n\t"
"vleib %%v1,10,6\n\t"
"vleib %%v1,11,7\n\t"
"vleib %%v1,16,8\n\t"
"vleib %%v1,17,9\n\t"
"vleib %%v1,18,10\n\t"
"vleib %%v1,19,11\n\t"
"vleib %%v1,24,12\n\t"
"vleib %%v1,25,13\n\t"
"vleib %%v1,26,14\n\t"
"vleib %%v1,27,15\n\t"
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v2,16(%%r1,%[x])\n\t"
"vpkg %%v17,%%v16,%%v2\n\t"
"vperm %%v16,%%v16,%%v2,%%v1\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v2,48(%%r1,%[x])\n\t"
"vpkg %%v19,%%v18,%%v2\n\t"
"vperm %%v18,%%v18,%%v2,%%v1\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v2,80(%%r1,%[x])\n\t"
"vpkg %%v21,%%v20,%%v2\n\t"
"vperm %%v20,%%v20,%%v2,%%v1\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v2,112(%%r1,%[x])\n\t"
"vpkg %%v23,%%v22,%%v2\n\t"
"vperm %%v22,%%v22,%%v2,%%v1\n\t"
"vl %%v24,128(%%r1,%[x])\n\t"
"vl %%v2,144(%%r1,%[x])\n\t"
"vpkg %%v25,%%v24,%%v2\n\t"
"vperm %%v24,%%v24,%%v2,%%v1\n\t"
"vl %%v26,160(%%r1,%[x])\n\t"
"vl %%v2,176(%%r1,%[x])\n\t"
"vpkg %%v27,%%v26,%%v2\n\t"
"vperm %%v26,%%v26,%%v2,%%v1\n\t"
"vl %%v28,192(%%r1,%[x])\n\t"
"vl %%v2,208(%%r1,%[x])\n\t"
"vpkg %%v29,%%v28,%%v2\n\t"
"vperm %%v28,%%v28,%%v2,%%v1\n\t"
"vl %%v30,224(%%r1,%[x])\n\t"
"vl %%v2,240(%%r1,%[x])\n\t"
"vpkg %%v31,%%v30,%%v2\n\t"
"vperm %%v30,%%v30,%%v2,%%v1\n\t"
"vflpsb %%v16,%%v16\n\t"
"vflpsb %%v17,%%v17\n\t"
"vflpsb %%v18,%%v18\n\t"
"vflpsb %%v19,%%v19\n\t"
"vflpsb %%v20,%%v20\n\t"
"vflpsb %%v21,%%v21\n\t"
"vflpsb %%v22,%%v22\n\t"
"vflpsb %%v23,%%v23\n\t"
"vflpsb %%v24,%%v24\n\t"
"vflpsb %%v25,%%v25\n\t"
"vflpsb %%v26,%%v26\n\t"
"vflpsb %%v27,%%v27\n\t"
"vflpsb %%v28,%%v28\n\t"
"vflpsb %%v29,%%v29\n\t"
"vflpsb %%v30,%%v30\n\t"
"vflpsb %%v31,%%v31\n\t"
"vfasb %%v16,%%v16,%%v17\n\t"
"vfasb %%v18,%%v18,%%v19\n\t"
"vfasb %%v20,%%v20,%%v21\n\t"
"vfasb %%v22,%%v22,%%v23\n\t"
"vfasb %%v24,%%v24,%%v25\n\t"
"vfasb %%v26,%%v26,%%v27\n\t"
"vfasb %%v28,%%v28,%%v29\n\t"
"vfasb %%v30,%%v30,%%v31\n\t"
"vfmaxsb %%v16,%%v16,%%v24,0\n\t"
"vfmaxsb %%v18,%%v18,%%v26,0\n\t"
"vfmaxsb %%v20,%%v20,%%v28,0\n\t"
"vfmaxsb %%v22,%%v22,%%v30,0\n\t"
"vfmaxsb %%v16,%%v16,%%v20,0\n\t"
"vfmaxsb %%v18,%%v18,%%v22,0\n\t"
"vfmaxsb %%v16,%%v16,%%v18,0\n\t"
"vfmaxsb %%v0,%%v0,%%v16,0\n\t"
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
"veslg %%v16,%%v0,32\n\t"
"vfmaxsb %%v0,%%v0,%%v16,0\n\t"
"vrepf %%v16,%%v0,2\n\t"
"wfmaxsb %%v0,%%v0,%%v16,0\n\t"
"ler %[amax],%%f0"
: [amax] "=f"(amax),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
__asm__ volatile ( return amax;
"vlef %%v0,0(%2),0 \n\t"
"vlef %%v16,4(%2),0 \n\t"
"vlef %%v0,8(%2),1 \n\t"
"vlef %%v16,12(%2),1 \n\t"
"vlef %%v0,16(%2),2 \n\t"
"vlef %%v16,20(%2),2 \n\t"
"vlef %%v0,24(%2),3 \n\t"
"vlef %%v16,28(%2),3 \n\t"
"vflpsb %%v0,%%v0 \n\t"
"vflpsb %%v16,%%v16 \n\t"
"vfasb %%v0,%%v0,%%v16 \n\t"
"vleib %%v1,0,0 \n\t"
"vleib %%v1,1,1 \n\t"
"vleib %%v1,2,2 \n\t"
"vleib %%v1,3,3 \n\t"
"vleib %%v1,8,4 \n\t"
"vleib %%v1,9,5 \n\t"
"vleib %%v1,10,6 \n\t"
"vleib %%v1,11,7 \n\t"
"vleib %%v1,16,8 \n\t"
"vleib %%v1,17,9 \n\t"
"vleib %%v1,18,10 \n\t"
"vleib %%v1,19,11 \n\t"
"vleib %%v1,24,12 \n\t"
"vleib %%v1,25,13 \n\t"
"vleib %%v1,26,14 \n\t"
"vleib %%v1,27,15 \n\t"
"srlg %%r0,%1,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%2) \n\t"
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v2,16(%%r1,%2) \n\t"
"vpkg %%v17,%%v16,%%v2 \n\t"
"vperm %%v16,%%v16,%%v2,%%v1 \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v2,48(%%r1,%2) \n\t"
"vpkg %%v19,%%v18,%%v2 \n\t"
"vperm %%v18,%%v18,%%v2,%%v1 \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v2,80(%%r1,%2) \n\t"
"vpkg %%v21,%%v20,%%v2 \n\t"
"vperm %%v20,%%v20,%%v2,%%v1 \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v2,112(%%r1,%2) \n\t"
"vpkg %%v23,%%v22,%%v2 \n\t"
"vperm %%v22,%%v22,%%v2,%%v1 \n\t"
"vl %%v24,128(%%r1,%2) \n\t"
"vl %%v2,144(%%r1,%2) \n\t"
"vpkg %%v25,%%v24,%%v2 \n\t"
"vperm %%v24,%%v24,%%v2,%%v1 \n\t"
"vl %%v26,160(%%r1,%2) \n\t"
"vl %%v2,176(%%r1,%2) \n\t"
"vpkg %%v27,%%v26,%%v2 \n\t"
"vperm %%v26,%%v26,%%v2,%%v1 \n\t"
"vl %%v28,192(%%r1,%2) \n\t"
"vl %%v2,208(%%r1,%2) \n\t"
"vpkg %%v29,%%v28,%%v2 \n\t"
"vperm %%v28,%%v28,%%v2,%%v1 \n\t"
"vl %%v30,224(%%r1,%2) \n\t"
"vl %%v2,240(%%r1,%2) \n\t"
"vpkg %%v31,%%v30,%%v2 \n\t"
"vperm %%v30,%%v30,%%v2,%%v1 \n\t"
"vflpsb %%v16,%%v16 \n\t"
"vflpsb %%v17,%%v17 \n\t"
"vflpsb %%v18,%%v18 \n\t"
"vflpsb %%v19,%%v19 \n\t"
"vflpsb %%v20,%%v20 \n\t"
"vflpsb %%v21,%%v21 \n\t"
"vflpsb %%v22,%%v22 \n\t"
"vflpsb %%v23,%%v23 \n\t"
"vflpsb %%v24,%%v24 \n\t"
"vflpsb %%v25,%%v25 \n\t"
"vflpsb %%v26,%%v26 \n\t"
"vflpsb %%v27,%%v27 \n\t"
"vflpsb %%v28,%%v28 \n\t"
"vflpsb %%v29,%%v29 \n\t"
"vflpsb %%v30,%%v30 \n\t"
"vflpsb %%v31,%%v31 \n\t"
"vfasb %%v16,%%v16,%%v17 \n\t"
"vfasb %%v18,%%v18,%%v19 \n\t"
"vfasb %%v20,%%v20,%%v21 \n\t"
"vfasb %%v22,%%v22,%%v23 \n\t"
"vfasb %%v24,%%v24,%%v25 \n\t"
"vfasb %%v26,%%v26,%%v27 \n\t"
"vfasb %%v28,%%v28,%%v29 \n\t"
"vfasb %%v30,%%v30,%%v31 \n\t"
"vfmaxsb %%v16,%%v16,%%v24,0 \n\t"
"vfmaxsb %%v18,%%v18,%%v26,0 \n\t"
"vfmaxsb %%v20,%%v20,%%v28,0 \n\t"
"vfmaxsb %%v22,%%v22,%%v30,0 \n\t"
"vfmaxsb %%v16,%%v16,%%v20,0 \n\t"
"vfmaxsb %%v18,%%v18,%%v22,0 \n\t"
"vfmaxsb %%v16,%%v16,%%v18,0 \n\t"
"vfmaxsb %%v0,%%v0,%%v16,0 \n\t"
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"
"veslg %%v16,%%v0,32 \n\t"
"vfmaxsb %%v0,%%v0,%%v16,0 \n\t"
"vrepf %%v16,%%v0,2 \n\t"
"wfmaxsb %%v0,%%v0,%%v16,0 \n\t"
"ler %0,%%f0 "
:"=f"(amax)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return amax;
} }
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0; BLASLONG i = 0;
BLASLONG ix = 0; BLASLONG ix = 0;
FLOAT maxf = 0.0; FLOAT maxf = 0.0;
BLASLONG inc_x2; BLASLONG inc_x2;
if (n <= 0 || inc_x <= 0) return (maxf); if (n <= 0 || inc_x <= 0)
return (maxf);
if (inc_x == 1) { if (inc_x == 1) {
BLASLONG n1 = n & -32; BLASLONG n1 = n & -32;
if (n1 > 0) { if (n1 > 0) {
maxf = camax_kernel_32(n1, x);
ix = n1 * 2;
i = n1;
}
else
{
maxf=CABS1(x,0);
ix += 2;
i++;
}
while (i < n) {
if (CABS1(x,ix) > maxf) {
maxf = CABS1(x,ix);
}
ix += 2;
i++;
}
return (maxf);
maxf = camax_kernel_32(n1, x);
ix = n1 * 2;
i = n1;
} else { } else {
maxf = CABS1(x, 0);
maxf=CABS1(x,0); ix += 2;
inc_x2 = 2 * inc_x; i++;
BLASLONG n1 = n & -4;
while (i < n1) {
if (CABS1(x,ix) > maxf) {
maxf = CABS1(x,ix);
}
if (CABS1(x,ix+inc_x2) > maxf) {
maxf = CABS1(x,ix+inc_x2);
}
if (CABS1(x,ix+inc_x2*2) > maxf) {
maxf = CABS1(x,ix+inc_x2*2);
}
if (CABS1(x,ix+inc_x2*3) > maxf) {
maxf = CABS1(x,ix+inc_x2*3);
}
ix += inc_x2 * 4;
i += 4;
}
while (i < n) {
if (CABS1(x,ix) > maxf) {
maxf = CABS1(x,ix);
}
ix += inc_x2;
i++;
}
return (maxf);
} }
while (i < n) {
if (CABS1(x, ix) > maxf) {
maxf = CABS1(x, ix);
}
ix += 2;
i++;
}
return (maxf);
} else {
maxf = CABS1(x, 0);
inc_x2 = 2 * inc_x;
BLASLONG n1 = n & -4;
while (i < n1) {
if (CABS1(x, ix) > maxf) {
maxf = CABS1(x, ix);
}
if (CABS1(x, ix + inc_x2) > maxf) {
maxf = CABS1(x, ix + inc_x2);
}
if (CABS1(x, ix + inc_x2 * 2) > maxf) {
maxf = CABS1(x, ix + inc_x2 * 2);
}
if (CABS1(x, ix + inc_x2 * 3) > maxf) {
maxf = CABS1(x, ix + inc_x2 * 3);
}
ix += inc_x2 * 4;
i += 4;
}
while (i < n) {
if (CABS1(x, ix) > maxf) {
maxf = CABS1(x, ix);
}
ix += inc_x2;
i++;
}
return (maxf);
}
} }

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -28,214 +28,188 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>
#if defined(DOUBLE) #define CABS1(x,i) (fabsf(x[i]) + fabsf(x[i + 1]))
#define ABS fabs
#else
#define ABS fabsf
#endif
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x) {
FLOAT amin;
static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x) __asm__("vlef %%v0,0(%[x]),0\n\t"
{ "vlef %%v16,4(%[x]),0\n\t"
FLOAT amin; "vlef %%v0,8(%[x]),1\n\t"
"vlef %%v16,12(%[x]),1\n\t"
"vlef %%v0,16(%[x]),2\n\t"
"vlef %%v16,20(%[x]),2\n\t"
"vlef %%v0,24(%[x]),3\n\t"
"vlef %%v16,28(%[x]),3\n\t"
"vflpsb %%v0,%%v0\n\t"
"vflpsb %%v16,%%v16\n\t"
"vfasb %%v0,%%v0,%%v16\n\t"
"vleib %%v1,0,0\n\t"
"vleib %%v1,1,1\n\t"
"vleib %%v1,2,2\n\t"
"vleib %%v1,3,3\n\t"
"vleib %%v1,8,4\n\t"
"vleib %%v1,9,5\n\t"
"vleib %%v1,10,6\n\t"
"vleib %%v1,11,7\n\t"
"vleib %%v1,16,8\n\t"
"vleib %%v1,17,9\n\t"
"vleib %%v1,18,10\n\t"
"vleib %%v1,19,11\n\t"
"vleib %%v1,24,12\n\t"
"vleib %%v1,25,13\n\t"
"vleib %%v1,26,14\n\t"
"vleib %%v1,27,15\n\t"
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v2,16(%%r1,%[x])\n\t"
"vpkg %%v17,%%v16,%%v2\n\t"
"vperm %%v16,%%v16,%%v2,%%v1\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v2,48(%%r1,%[x])\n\t"
"vpkg %%v19,%%v18,%%v2\n\t"
"vperm %%v18,%%v18,%%v2,%%v1\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v2,80(%%r1,%[x])\n\t"
"vpkg %%v21,%%v20,%%v2\n\t"
"vperm %%v20,%%v20,%%v2,%%v1\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v2,112(%%r1,%[x])\n\t"
"vpkg %%v23,%%v22,%%v2\n\t"
"vperm %%v22,%%v22,%%v2,%%v1\n\t"
"vl %%v24,128(%%r1,%[x])\n\t"
"vl %%v2,144(%%r1,%[x])\n\t"
"vpkg %%v25,%%v24,%%v2\n\t"
"vperm %%v24,%%v24,%%v2,%%v1\n\t"
"vl %%v26,160(%%r1,%[x])\n\t"
"vl %%v2,176(%%r1,%[x])\n\t"
"vpkg %%v27,%%v26,%%v2\n\t"
"vperm %%v26,%%v26,%%v2,%%v1\n\t"
"vl %%v28,192(%%r1,%[x])\n\t"
"vl %%v2,208(%%r1,%[x])\n\t"
"vpkg %%v29,%%v28,%%v2\n\t"
"vperm %%v28,%%v28,%%v2,%%v1\n\t"
"vl %%v30,224(%%r1,%[x])\n\t"
"vl %%v2,240(%%r1,%[x])\n\t"
"vpkg %%v31,%%v30,%%v2\n\t"
"vperm %%v30,%%v30,%%v2,%%v1\n\t"
"vflpsb %%v16,%%v16\n\t"
"vflpsb %%v17,%%v17\n\t"
"vflpsb %%v18,%%v18\n\t"
"vflpsb %%v19,%%v19\n\t"
"vflpsb %%v20,%%v20\n\t"
"vflpsb %%v21,%%v21\n\t"
"vflpsb %%v22,%%v22\n\t"
"vflpsb %%v23,%%v23\n\t"
"vflpsb %%v24,%%v24\n\t"
"vflpsb %%v25,%%v25\n\t"
"vflpsb %%v26,%%v26\n\t"
"vflpsb %%v27,%%v27\n\t"
"vflpsb %%v28,%%v28\n\t"
"vflpsb %%v29,%%v29\n\t"
"vflpsb %%v30,%%v30\n\t"
"vflpsb %%v31,%%v31\n\t"
"vfasb %%v16,%%v16,%%v17\n\t"
"vfasb %%v18,%%v18,%%v19\n\t"
"vfasb %%v20,%%v20,%%v21\n\t"
"vfasb %%v22,%%v22,%%v23\n\t"
"vfasb %%v24,%%v24,%%v25\n\t"
"vfasb %%v26,%%v26,%%v27\n\t"
"vfasb %%v28,%%v28,%%v29\n\t"
"vfasb %%v30,%%v30,%%v31\n\t"
"vfminsb %%v16,%%v16,%%v24,0\n\t"
"vfminsb %%v18,%%v18,%%v26,0\n\t"
"vfminsb %%v20,%%v20,%%v28,0\n\t"
"vfminsb %%v22,%%v22,%%v30,0\n\t"
"vfminsb %%v16,%%v16,%%v20,0\n\t"
"vfminsb %%v18,%%v18,%%v22,0\n\t"
"vfminsb %%v16,%%v16,%%v18,0\n\t"
"vfminsb %%v0,%%v0,%%v16,0\n\t"
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
"veslg %%v16,%%v0,32\n\t"
"vfminsb %%v0,%%v0,%%v16,0\n\t"
"vrepf %%v16,%%v0,2\n\t"
"wfminsb %%v0,%%v0,%%v16,0\n\t"
"ler %[amin],%%f0"
: [amin] "=f"(amin),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
__asm__ volatile ( return amin;
"vlef %%v0,0(%2),0 \n\t"
"vlef %%v16,4(%2),0 \n\t"
"vlef %%v0,8(%2),1 \n\t"
"vlef %%v16,12(%2),1 \n\t"
"vlef %%v0,16(%2),2 \n\t"
"vlef %%v16,20(%2),2 \n\t"
"vlef %%v0,24(%2),3 \n\t"
"vlef %%v16,28(%2),3 \n\t"
"vflpsb %%v0,%%v0 \n\t"
"vflpsb %%v16,%%v16 \n\t"
"vfasb %%v0,%%v0,%%v16 \n\t"
"vleib %%v1,0,0 \n\t"
"vleib %%v1,1,1 \n\t"
"vleib %%v1,2,2 \n\t"
"vleib %%v1,3,3 \n\t"
"vleib %%v1,8,4 \n\t"
"vleib %%v1,9,5 \n\t"
"vleib %%v1,10,6 \n\t"
"vleib %%v1,11,7 \n\t"
"vleib %%v1,16,8 \n\t"
"vleib %%v1,17,9 \n\t"
"vleib %%v1,18,10 \n\t"
"vleib %%v1,19,11 \n\t"
"vleib %%v1,24,12 \n\t"
"vleib %%v1,25,13 \n\t"
"vleib %%v1,26,14 \n\t"
"vleib %%v1,27,15 \n\t"
"srlg %%r0,%1,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%2) \n\t"
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v2,16(%%r1,%2) \n\t"
"vpkg %%v17,%%v16,%%v2 \n\t"
"vperm %%v16,%%v16,%%v2,%%v1 \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v2,48(%%r1,%2) \n\t"
"vpkg %%v19,%%v18,%%v2 \n\t"
"vperm %%v18,%%v18,%%v2,%%v1 \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v2,80(%%r1,%2) \n\t"
"vpkg %%v21,%%v20,%%v2 \n\t"
"vperm %%v20,%%v20,%%v2,%%v1 \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v2,112(%%r1,%2) \n\t"
"vpkg %%v23,%%v22,%%v2 \n\t"
"vperm %%v22,%%v22,%%v2,%%v1 \n\t"
"vl %%v24,128(%%r1,%2) \n\t"
"vl %%v2,144(%%r1,%2) \n\t"
"vpkg %%v25,%%v24,%%v2 \n\t"
"vperm %%v24,%%v24,%%v2,%%v1 \n\t"
"vl %%v26,160(%%r1,%2) \n\t"
"vl %%v2,176(%%r1,%2) \n\t"
"vpkg %%v27,%%v26,%%v2 \n\t"
"vperm %%v26,%%v26,%%v2,%%v1 \n\t"
"vl %%v28,192(%%r1,%2) \n\t"
"vl %%v2,208(%%r1,%2) \n\t"
"vpkg %%v29,%%v28,%%v2 \n\t"
"vperm %%v28,%%v28,%%v2,%%v1 \n\t"
"vl %%v30,224(%%r1,%2) \n\t"
"vl %%v2,240(%%r1,%2) \n\t"
"vpkg %%v31,%%v30,%%v2 \n\t"
"vperm %%v30,%%v30,%%v2,%%v1 \n\t"
"vflpsb %%v16,%%v16 \n\t"
"vflpsb %%v17,%%v17 \n\t"
"vflpsb %%v18,%%v18 \n\t"
"vflpsb %%v19,%%v19 \n\t"
"vflpsb %%v20,%%v20 \n\t"
"vflpsb %%v21,%%v21 \n\t"
"vflpsb %%v22,%%v22 \n\t"
"vflpsb %%v23,%%v23 \n\t"
"vflpsb %%v24,%%v24 \n\t"
"vflpsb %%v25,%%v25 \n\t"
"vflpsb %%v26,%%v26 \n\t"
"vflpsb %%v27,%%v27 \n\t"
"vflpsb %%v28,%%v28 \n\t"
"vflpsb %%v29,%%v29 \n\t"
"vflpsb %%v30,%%v30 \n\t"
"vflpsb %%v31,%%v31 \n\t"
"vfasb %%v16,%%v16,%%v17 \n\t"
"vfasb %%v18,%%v18,%%v19 \n\t"
"vfasb %%v20,%%v20,%%v21 \n\t"
"vfasb %%v22,%%v22,%%v23 \n\t"
"vfasb %%v24,%%v24,%%v25 \n\t"
"vfasb %%v26,%%v26,%%v27 \n\t"
"vfasb %%v28,%%v28,%%v29 \n\t"
"vfasb %%v30,%%v30,%%v31 \n\t"
"vfminsb %%v16,%%v16,%%v24,0 \n\t"
"vfminsb %%v18,%%v18,%%v26,0 \n\t"
"vfminsb %%v20,%%v20,%%v28,0 \n\t"
"vfminsb %%v22,%%v22,%%v30,0 \n\t"
"vfminsb %%v16,%%v16,%%v20,0 \n\t"
"vfminsb %%v18,%%v18,%%v22,0 \n\t"
"vfminsb %%v16,%%v16,%%v18,0 \n\t"
"vfminsb %%v0,%%v0,%%v16,0 \n\t"
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"
"veslg %%v16,%%v0,32 \n\t"
"vfminsb %%v0,%%v0,%%v16,0 \n\t"
"vrepf %%v16,%%v0,2 \n\t"
"wfminsb %%v0,%%v0,%%v16,0 \n\t"
"ler %0,%%f0 "
:"=f"(amin)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return amin;
} }
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0; BLASLONG i = 0;
BLASLONG ix = 0; BLASLONG ix = 0;
FLOAT minf = 0.0; FLOAT minf = 0.0;
BLASLONG inc_x2; BLASLONG inc_x2;
if (n <= 0 || inc_x <= 0) return (minf); if (n <= 0 || inc_x <= 0)
return (minf);
if (inc_x == 1) { if (inc_x == 1) {
BLASLONG n1 = n & -32; BLASLONG n1 = n & -32;
if (n1 > 0) { if (n1 > 0) {
minf = camin_kernel_32(n1, x);
ix = n1 * 2;
i = n1;
}
else
{
minf=CABS1(x,0);
ix += 2;
i++;
}
while (i < n) {
if (CABS1(x,ix) < minf) {
minf = CABS1(x,ix);
}
ix += 2;
i++;
}
return (minf);
minf = camin_kernel_32(n1, x);
ix = n1 * 2;
i = n1;
} else { } else {
minf = CABS1(x, 0);
minf=CABS1(x,0); ix += 2;
inc_x2 = 2 * inc_x; i++;
BLASLONG n1 = n & -4;
while (i < n1) {
if (CABS1(x,ix) < minf) {
minf = CABS1(x,ix);
}
if (CABS1(x,ix+inc_x2) < minf) {
minf = CABS1(x,ix+inc_x2);
}
if (CABS1(x,ix+inc_x2*2) < minf) {
minf = CABS1(x,ix+inc_x2*2);
}
if (CABS1(x,ix+inc_x2*3) < minf) {
minf = CABS1(x,ix+inc_x2*3);
}
ix += inc_x2 * 4;
i += 4;
}
while (i < n) {
if (CABS1(x,ix) < minf) {
minf = CABS1(x,ix);
}
ix += inc_x2;
i++;
}
return (minf);
} }
while (i < n) {
if (CABS1(x, ix) < minf) {
minf = CABS1(x, ix);
}
ix += 2;
i++;
}
return (minf);
} else {
minf = CABS1(x, 0);
inc_x2 = 2 * inc_x;
BLASLONG n1 = n & -4;
while (i < n1) {
if (CABS1(x, ix) < minf) {
minf = CABS1(x, ix);
}
if (CABS1(x, ix + inc_x2) < minf) {
minf = CABS1(x, ix + inc_x2);
}
if (CABS1(x, ix + inc_x2 * 2) < minf) {
minf = CABS1(x, ix + inc_x2 * 2);
}
if (CABS1(x, ix + inc_x2 * 3) < minf) {
minf = CABS1(x, ix + inc_x2 * 3);
}
ix += inc_x2 * 4;
i += 4;
}
while (i < n) {
if (CABS1(x, ix) < minf) {
minf = CABS1(x, ix);
}
ix += inc_x2;
i++;
}
return (minf);
}
} }

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -28,140 +28,128 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf #define ABS fabsf
#endif
static FLOAT casum_kernel_32(BLASLONG n, FLOAT *x) static FLOAT casum_kernel_32(BLASLONG n, FLOAT *x) {
{ FLOAT asum;
FLOAT asum;
__asm__ ( __asm__("vzero %%v24\n\t"
"vzero %%v0 \n\t" "vzero %%v25\n\t"
"vzero %%v1 \n\t" "vzero %%v26\n\t"
"vzero %%v2 \n\t" "vzero %%v27\n\t"
"vzero %%v3 \n\t" "vzero %%v28\n\t"
"srlg %%r0,%1,5 \n\t" "vzero %%v29\n\t"
"xgr %%r1,%%r1 \n\t" "vzero %%v30\n\t"
"0: \n\t" "vzero %%v31\n\t"
"pfd 1, 1024(%%r1,%2) \n\t" "srlg %[n],%[n],5\n\t"
"vl %%v16, 0(%%r1,%2) \n\t" "xgr %%r1,%%r1\n\t"
"vl %%v17, 16(%%r1,%2) \n\t" "0:\n\t"
"vl %%v18, 32(%%r1,%2) \n\t" "pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v19, 48(%%r1,%2) \n\t" "vl %%v16, 0(%%r1,%[x])\n\t"
"vl %%v20, 64(%%r1,%2) \n\t" "vl %%v17, 16(%%r1,%[x])\n\t"
"vl %%v21, 80(%%r1,%2) \n\t" "vl %%v18, 32(%%r1,%[x])\n\t"
"vl %%v22, 96(%%r1,%2) \n\t" "vl %%v19, 48(%%r1,%[x])\n\t"
"vl %%v23, 112(%%r1,%2) \n\t" "vl %%v20, 64(%%r1,%[x])\n\t"
"vl %%v21, 80(%%r1,%[x])\n\t"
"vl %%v22, 96(%%r1,%[x])\n\t"
"vl %%v23, 112(%%r1,%[x])\n\t"
"vflpsb %%v16, %%v16\n\t"
"vflpsb %%v17, %%v17\n\t"
"vflpsb %%v18, %%v18\n\t"
"vflpsb %%v19, %%v19\n\t"
"vflpsb %%v20, %%v20\n\t"
"vflpsb %%v21, %%v21\n\t"
"vflpsb %%v22, %%v22\n\t"
"vflpsb %%v23, %%v23\n\t"
"vfasb %%v24,%%v24,%%v16\n\t"
"vfasb %%v25,%%v25,%%v17\n\t"
"vfasb %%v26,%%v26,%%v18\n\t"
"vfasb %%v27,%%v27,%%v19\n\t"
"vfasb %%v28,%%v28,%%v20\n\t"
"vfasb %%v29,%%v29,%%v21\n\t"
"vfasb %%v30,%%v30,%%v22\n\t"
"vfasb %%v31,%%v31,%%v23\n\t"
"vl %%v16, 128(%%r1,%[x])\n\t"
"vl %%v17, 144(%%r1,%[x])\n\t"
"vl %%v18, 160(%%r1,%[x])\n\t"
"vl %%v19, 176(%%r1,%[x])\n\t"
"vl %%v20, 192(%%r1,%[x])\n\t"
"vl %%v21, 208(%%r1,%[x])\n\t"
"vl %%v22, 224(%%r1,%[x])\n\t"
"vl %%v23, 240(%%r1,%[x])\n\t"
"vflpsb %%v16, %%v16\n\t"
"vflpsb %%v17, %%v17\n\t"
"vflpsb %%v18, %%v18\n\t"
"vflpsb %%v19, %%v19\n\t"
"vflpsb %%v20, %%v20\n\t"
"vflpsb %%v21, %%v21\n\t"
"vflpsb %%v22, %%v22\n\t"
"vflpsb %%v23, %%v23\n\t"
"vfasb %%v24,%%v24,%%v16\n\t"
"vfasb %%v25,%%v25,%%v17\n\t"
"vfasb %%v26,%%v26,%%v18\n\t"
"vfasb %%v27,%%v27,%%v19\n\t"
"vfasb %%v28,%%v28,%%v20\n\t"
"vfasb %%v29,%%v29,%%v21\n\t"
"vfasb %%v30,%%v30,%%v22\n\t"
"vfasb %%v31,%%v31,%%v23\n\t"
"agfi %%r1,256\n\t"
"brctg %[n],0b\n\t"
"vfasb %%v24,%%v24,%%v25\n\t"
"vfasb %%v24,%%v24,%%v26\n\t"
"vfasb %%v24,%%v24,%%v27\n\t"
"vfasb %%v24,%%v24,%%v28\n\t"
"vfasb %%v24,%%v24,%%v29\n\t"
"vfasb %%v24,%%v24,%%v30\n\t"
"vfasb %%v24,%%v24,%%v31\n\t"
"veslg %%v25,%%v24,32\n\t"
"vfasb %%v24,%%v24,%%v25\n\t"
"vrepf %%v25,%%v24,2\n\t"
"vfasb %%v24,%%v24,%%v25\n\t"
"vstef %%v24,%[asum],0"
: [asum] "=Q"(asum),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
: "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
"v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
"vflpsb %%v16, %%v16 \n\t" return asum;
"vflpsb %%v17, %%v17 \n\t"
"vflpsb %%v18, %%v18 \n\t"
"vflpsb %%v19, %%v19 \n\t"
"vflpsb %%v20, %%v20 \n\t"
"vflpsb %%v21, %%v21 \n\t"
"vflpsb %%v22, %%v22 \n\t"
"vflpsb %%v23, %%v23 \n\t"
"vfasb %%v0,%%v0,%%v16 \n\t"
"vfasb %%v1,%%v1,%%v17 \n\t"
"vfasb %%v2,%%v2,%%v18 \n\t"
"vfasb %%v3,%%v3,%%v19 \n\t"
"vfasb %%v0,%%v0,%%v20 \n\t"
"vfasb %%v1,%%v1,%%v21 \n\t"
"vfasb %%v2,%%v2,%%v22 \n\t"
"vfasb %%v3,%%v3,%%v23 \n\t"
"vl %%v16, 128(%%r1,%2) \n\t"
"vl %%v17, 144(%%r1,%2) \n\t"
"vl %%v18, 160(%%r1,%2) \n\t"
"vl %%v19, 176(%%r1,%2) \n\t"
"vl %%v20, 192(%%r1,%2) \n\t"
"vl %%v21, 208(%%r1,%2) \n\t"
"vl %%v22, 224(%%r1,%2) \n\t"
"vl %%v23, 240(%%r1,%2) \n\t"
"vflpsb %%v16, %%v16 \n\t"
"vflpsb %%v17, %%v17 \n\t"
"vflpsb %%v18, %%v18 \n\t"
"vflpsb %%v19, %%v19 \n\t"
"vflpsb %%v20, %%v20 \n\t"
"vflpsb %%v21, %%v21 \n\t"
"vflpsb %%v22, %%v22 \n\t"
"vflpsb %%v23, %%v23 \n\t"
"vfasb %%v0,%%v0,%%v16 \n\t"
"vfasb %%v1,%%v1,%%v17 \n\t"
"vfasb %%v2,%%v2,%%v18 \n\t"
"vfasb %%v3,%%v3,%%v19 \n\t"
"vfasb %%v0,%%v0,%%v20 \n\t"
"vfasb %%v1,%%v1,%%v21 \n\t"
"vfasb %%v2,%%v2,%%v22 \n\t"
"vfasb %%v3,%%v3,%%v23 \n\t"
"agfi %%r1,256 \n\t"
"brctg %%r0,0b \n\t"
"vfasb %%v0,%%v0,%%v1 \n\t"
"vfasb %%v0,%%v0,%%v2 \n\t"
"vfasb %%v0,%%v0,%%v3 \n\t"
"veslg %%v1,%%v0,32 \n\t"
"vfasb %%v0,%%v0,%%v1 \n\t"
"vrepf %%v1,%%v0,2 \n\t"
"aebr %%f0,%%f1 \n\t"
"ler %0,%%f0 "
:"=f"(asum)
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x)
:"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23"
);
return asum;
} }
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
{ BLASLONG i = 0;
BLASLONG i=0; BLASLONG ip = 0;
BLASLONG ip=0; FLOAT sumf = 0.0;
FLOAT sumf = 0.0; BLASLONG n1;
BLASLONG n1; BLASLONG inc_x2;
BLASLONG inc_x2;
if (n <= 0 || inc_x <= 0) return(sumf); if (n <= 0 || inc_x <= 0)
return (sumf);
if ( inc_x == 1 ) if (inc_x == 1) {
{
n1 = n & -32; n1 = n & -32;
if ( n1 > 0 ) if (n1 > 0) {
{
sumf = casum_kernel_32(n1, x);
i=n1;
ip=2*n1;
}
while(i < n)
{
sumf += ABS(x[ip]) + ABS(x[ip+1]);
i++;
ip+=2;
}
sumf = casum_kernel_32(n1, x);
i = n1;
ip = 2 * n1;
} }
else
{
inc_x2 = 2* inc_x;
while(i < n)
{
sumf += ABS(x[ip]) + ABS(x[ip+1]);
ip+=inc_x2;
i++;
}
while (i < n) {
sumf += ABS(x[ip]) + ABS(x[ip + 1]);
i++;
ip += 2;
} }
return(sumf);
} else {
inc_x2 = 2 * inc_x;
while (i < n) {
sumf += ABS(x[ip]) + ABS(x[ip + 1]);
ip += inc_x2;
i++;
}
}
return (sumf);
} }

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -27,148 +27,140 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) {
{ __asm__(
__asm__ volatile(
#if !defined(CONJ) #if !defined(CONJ)
"vlrepf %%v0,0(%3) \n\t" "vlrepf %%v0,0(%[alpha])\n\t"
"vlef %%v1,4(%3),0 \n\t" "vlef %%v1,4(%[alpha]),0\n\t"
"vlef %%v1,4(%3),2 \n\t" "vlef %%v1,4(%[alpha]),2\n\t"
"vflcsb %%v1,%%v1 \n\t" "vflcsb %%v1,%%v1\n\t"
"vlef %%v1,4(%3),1 \n\t" "vlef %%v1,4(%[alpha]),1\n\t"
"vlef %%v1,4(%3),3 \n\t" "vlef %%v1,4(%[alpha]),3\n\t"
#else #else
"vlef %%v0,0(%3),1 \n\t" "vlef %%v0,0(%[alpha]),1\n\t"
"vlef %%v0,0(%3),3 \n\t" "vlef %%v0,0(%[alpha]),3\n\t"
"vflcsb %%v0,%%v0 \n\t" "vflcsb %%v0,%%v0\n\t"
"vlef %%v0,0(%3),0 \n\t" "vlef %%v0,0(%[alpha]),0\n\t"
"vlef %%v0,0(%3),2 \n\t" "vlef %%v0,0(%[alpha]),2\n\t"
"vlrepf %%v1,4(%3) \n\t" "vlrepf %%v1,4(%[alpha])\n\t"
#endif #endif
"srlg %%r0,%0,4 \n\t" "srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1 \n\t" "xgr %%r1,%%r1\n\t"
"0: \n\t" "0:\n\t"
"pfd 1, 1024(%%r1,%1) \n\t" "pfd 1, 1024(%%r1,%[x])\n\t"
"pfd 2, 1024(%%r1,%2) \n\t" "pfd 2, 1024(%%r1,%[y])\n\t"
"vl %%v8,0(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%1) \n\t" "vl %%v9,16(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%1) \n\t" "vl %%v10,32(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%1) \n\t" "vl %%v11,48(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%1) \n\t" "vl %%v12,0(%%r1,%[y])\n\t"
"vl %%v20,0(%%r1,%2) \n\t" "vl %%v13,16(%%r1,%[y])\n\t"
"vl %%v21,16(%%r1,%2) \n\t" "vl %%v14,32(%%r1,%[y])\n\t"
"vl %%v22,32(%%r1,%2) \n\t" "vl %%v15,48(%%r1,%[y])\n\t"
"vl %%v23,48(%%r1,%2) \n\t" "vl %%v16,64(%%r1,%[x])\n\t"
"verllg %%v24,%%v16,32 \n\t" "vl %%v17,80(%%r1,%[x])\n\t"
"verllg %%v25,%%v17,32 \n\t" "vl %%v18,96(%%r1,%[x])\n\t"
"verllg %%v26,%%v18,32 \n\t" "vl %%v19,112(%%r1,%[x])\n\t"
"verllg %%v27,%%v19,32 \n\t" "vl %%v20,64(%%r1,%[y])\n\t"
"vl %%v21,80(%%r1,%[y])\n\t"
"vfmasb %%v28,%%v16,%%v0,%%v20 \n\t" "vl %%v22,96(%%r1,%[y])\n\t"
"vfmasb %%v29,%%v17,%%v0,%%v21 \n\t" "vl %%v23,112(%%r1,%[y])\n\t"
"vfmasb %%v30,%%v18,%%v0,%%v22 \n\t" "verllg %%v24,%%v8,32\n\t"
"vfmasb %%v31,%%v19,%%v0,%%v23 \n\t" "verllg %%v25,%%v9,32\n\t"
"verllg %%v26,%%v10,32\n\t"
"vfmasb %%v28,%%v24,%%v1,%%v28 \n\t" "verllg %%v27,%%v11,32\n\t"
"vfmasb %%v29,%%v25,%%v1,%%v29 \n\t" "verllg %%v28,%%v16,32\n\t"
"vfmasb %%v30,%%v26,%%v1,%%v30 \n\t" "verllg %%v29,%%v17,32\n\t"
"vfmasb %%v31,%%v27,%%v1,%%v31 \n\t" "verllg %%v30,%%v18,32\n\t"
"verllg %%v31,%%v19,32\n\t"
"vst %%v28,0(%%r1,%2) \n\t" "vfmasb %%v8,%%v8,%%v0,%%v12\n\t"
"vst %%v29,16(%%r1,%2) \n\t" "vfmasb %%v9,%%v9,%%v0,%%v13\n\t"
"vst %%v30,32(%%r1,%2) \n\t" "vfmasb %%v10,%%v10,%%v0,%%v14\n\t"
"vst %%v31,48(%%r1,%2) \n\t" "vfmasb %%v11,%%v11,%%v0,%%v15\n\t"
"vfmasb %%v16,%%v16,%%v0,%%v20\n\t"
"vl %%v16,64(%%r1,%1) \n\t" "vfmasb %%v17,%%v17,%%v0,%%v21\n\t"
"vl %%v17,80(%%r1,%1) \n\t" "vfmasb %%v18,%%v18,%%v0,%%v22\n\t"
"vl %%v18,96(%%r1,%1) \n\t" "vfmasb %%v19,%%v19,%%v0,%%v23\n\t"
"vl %%v19,112(%%r1,%1) \n\t" "vfmasb %%v8,%%v24,%%v1,%%v8\n\t"
"vl %%v20,64(%%r1,%2) \n\t" "vfmasb %%v9,%%v25,%%v1,%%v9\n\t"
"vl %%v21,80(%%r1,%2) \n\t" "vfmasb %%v10,%%v26,%%v1,%%v10\n\t"
"vl %%v22,96(%%r1,%2) \n\t" "vfmasb %%v11,%%v27,%%v1,%%v11\n\t"
"vl %%v23,112(%%r1,%2) \n\t" "vfmasb %%v16,%%v28,%%v1,%%v16\n\t"
"verllg %%v24,%%v16,32 \n\t" "vfmasb %%v17,%%v29,%%v1,%%v17\n\t"
"verllg %%v25,%%v17,32 \n\t" "vfmasb %%v18,%%v30,%%v1,%%v18\n\t"
"verllg %%v26,%%v18,32 \n\t" "vfmasb %%v19,%%v31,%%v1,%%v19\n\t"
"verllg %%v27,%%v19,32 \n\t" "vst %%v8,0(%%r1,%[y])\n\t"
"vst %%v9,16(%%r1,%[y])\n\t"
"vfmasb %%v28,%%v16,%%v0,%%v20 \n\t" "vst %%v10,32(%%r1,%[y])\n\t"
"vfmasb %%v29,%%v17,%%v0,%%v21 \n\t" "vst %%v11,48(%%r1,%[y])\n\t"
"vfmasb %%v30,%%v18,%%v0,%%v22 \n\t" "vst %%v16,64(%%r1,%[y])\n\t"
"vfmasb %%v31,%%v19,%%v0,%%v23 \n\t" "vst %%v17,80(%%r1,%[y])\n\t"
"vst %%v18,96(%%r1,%[y])\n\t"
"vfmasb %%v28,%%v24,%%v1,%%v28 \n\t" "vst %%v19,112(%%r1,%[y])\n\t"
"vfmasb %%v29,%%v25,%%v1,%%v29 \n\t" "agfi %%r1,128\n\t"
"vfmasb %%v30,%%v26,%%v1,%%v30 \n\t" "brctg %[n],0b"
"vfmasb %%v31,%%v27,%%v1,%%v31 \n\t" : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n)
: [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x),
"vst %%v28,64(%%r1,%2) \n\t" "m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha)
"vst %%v29,80(%%r1,%2) \n\t" : "cc", "r1", "v0", "v1", "v8", "v9", "v10", "v11", "v12", "v13",
"vst %%v30,96(%%r1,%2) \n\t" "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
"vst %%v31,112(%%r1,%2) \n\t" "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
"agfi %%r1,128 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"ZQ"((const FLOAT (*)[2])alpha)
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
} }
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
BLASLONG i = 0; FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
BLASLONG ix = 0, iy = 0; BLASLONG dummy2) {
FLOAT da[2] __attribute__ ((aligned(16))); BLASLONG i = 0;
BLASLONG ix = 0, iy = 0;
FLOAT da[2] __attribute__ ((aligned(16)));
if (n <= 0) return (0); if (n <= 0)
return (0);
if ((inc_x == 1) && (inc_y == 1)) { if ((inc_x == 1) && (inc_y == 1)) {
BLASLONG n1 = n & -16;
if (n1) {
da[0] = da_r;
da[1] = da_i;
caxpy_kernel_16(n1, x, y, da);
ix = 2 * n1;
}
i = n1;
while (i < n) {
#if !defined(CONJ)
y[ix] += (da_r * x[ix] - da_i * x[ix + 1]);
y[ix + 1] += (da_r * x[ix + 1] + da_i * x[ix]);
#else
y[ix] += (da_r * x[ix] + da_i * x[ix + 1]);
y[ix + 1] -= (da_r * x[ix + 1] - da_i * x[ix]);
#endif
i++;
ix += 2;
}
return (0);
BLASLONG n1 = n & -16;
if (n1) {
da[0] = da_r;
da[1] = da_i;
caxpy_kernel_16(n1, x, y, da);
ix = 2 * n1;
} }
i = n1;
inc_x *= 2;
inc_y *= 2;
while (i < n) { while (i < n) {
#if !defined(CONJ) #if !defined(CONJ)
y[iy] += (da_r * x[ix] - da_i * x[ix + 1]); y[ix] += (da_r * x[ix] - da_i * x[ix + 1]);
y[iy + 1] += (da_r * x[ix + 1] + da_i * x[ix]); y[ix + 1] += (da_r * x[ix + 1] + da_i * x[ix]);
#else #else
y[iy] += (da_r * x[ix] + da_i * x[ix + 1]); y[ix] += (da_r * x[ix] + da_i * x[ix + 1]);
y[iy + 1] -= (da_r * x[ix + 1] - da_i * x[ix]); y[ix + 1] -= (da_r * x[ix + 1] - da_i * x[ix]);
#endif #endif
ix += inc_x; i++;
iy += inc_y; ix += 2;
i++;
} }
return (0); return (0);
}
inc_x *= 2;
inc_y *= 2;
while (i < n) {
#if !defined(CONJ)
y[iy] += (da_r * x[ix] - da_i * x[ix + 1]);
y[iy + 1] += (da_r * x[ix + 1] + da_i * x[ix]);
#else
y[iy] += (da_r * x[ix] + da_i * x[ix + 1]);
y[iy + 1] -= (da_r * x[ix + 1] - da_i * x[ix]);
#endif
ix += inc_x;
iy += inc_y;
i++;
}
return (0);
} }

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -27,73 +27,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
static void ccopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) static void ccopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) {
{ __asm__("srlg %[n],%[n],5\n\t"
__asm__ volatile ( "0:\n\t"
"lgr %%r1,%1 \n\t" "pfd 1, 1024(%[x])\n\t"
"lgr %%r2,%2 \n\t" "pfd 2, 1024(%[y])\n\t"
"srlg %%r0,%0,5 \n\t" "mvc 0(256,%[y]),0(%[x])\n\t"
"0: \n\t" "la %[x],256(%[x])\n\t"
"pfd 1, 1024(%%r1) \n\t" "la %[y],256(%[y])\n\t"
"pfd 2, 1024(%%r2) \n\t" "brctg %[n],0b"
"mvc 0(256,%%r2),0(%%r1) \n\t" : "=m"(*(struct { FLOAT x[n * 2]; } *) y),[x] "+&a"(x),[y] "+&a"(y),
"agfi %%r1,256 \n\t" [n] "+&r"(n)
"agfi %%r2,256 \n\t" : "m"(*(const struct { FLOAT x[n * 2]; } *) x)
"brctg %%r0,0b " : "cc");
:
:"r"(n),"a"((const FLOAT (*)[n * 2])x),"a"((FLOAT (*)[n * 2])y)
:"memory","cc","r0","r1","r2"
);
} }
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
{ BLASLONG i = 0;
BLASLONG i=0; BLASLONG ix = 0, iy = 0;
BLASLONG ix=0,iy=0;
if ( n <= 0 ) return(0); if (n <= 0)
return (0);
if ( (inc_x == 1) && (inc_y == 1 )) if ((inc_x == 1) && (inc_y == 1)) {
{
BLASLONG n1 = n & -32; BLASLONG n1 = n & -32;
if ( n1 > 0 ) if (n1 > 0) {
{ ccopy_kernel_32(n1, x, y);
ccopy_kernel_32(n1, x, y); i = n1;
i=n1; ix = n1 * 2;
ix=n1*2; iy = n1 * 2;
iy=n1*2; }
}
while(i < n)
{
y[iy] = x[iy] ;
y[iy+1] = x[ix+1] ;
ix+=2;
iy+=2;
i++ ;
}
while (i < n) {
y[iy] = x[iy];
y[iy + 1] = x[ix + 1];
ix += 2;
iy += 2;
i++;
} }
else
{
BLASLONG inc_x2 = 2 * inc_x; } else {
BLASLONG inc_y2 = 2 * inc_y;
while(i < n) BLASLONG inc_x2 = 2 * inc_x;
{ BLASLONG inc_y2 = 2 * inc_y;
y[iy] = x[ix] ;
y[iy+1] = x[ix+1] ;
ix += inc_x2 ;
iy += inc_y2 ;
i++ ;
} while (i < n) {
y[iy] = x[ix];
y[iy + 1] = x[ix + 1];
ix += inc_x2;
iy += inc_y2;
i++;
} }
return(0); }
return (0);
} }

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -27,156 +27,150 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
static void cdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) static void cdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) {
{ __asm__("vzero %%v24\n\t"
__asm__ volatile( "vzero %%v25\n\t"
"vzero %%v24 \n\t" "vzero %%v26\n\t"
"vzero %%v25 \n\t" "vzero %%v27\n\t"
"vzero %%v26 \n\t" "vzero %%v28\n\t"
"vzero %%v27 \n\t" "vzero %%v29\n\t"
"vzero %%v28 \n\t" "vzero %%v30\n\t"
"vzero %%v29 \n\t" "vzero %%v31\n\t"
"vzero %%v30 \n\t" "srlg %[n],%[n],4\n\t"
"vzero %%v31 \n\t" "xgr %%r1,%%r1\n\t"
"srlg %%r0,%0,4 \n\t" "0:\n\t"
"xgr %%r1,%%r1 \n\t" "pfd 1, 1024(%%r1,%[x])\n\t"
"0: \n\t" "pfd 1, 1024(%%r1,%[y])\n\t"
"pfd 1, 1024(%%r1,%1) \n\t" "vl %%v16, 0(%%r1,%[x])\n\t"
"pfd 1, 1024(%%r1,%2) \n\t" "vl %%v17, 16(%%r1,%[x])\n\t"
"vl %%v18, 32(%%r1,%[x])\n\t"
"vl %%v16, 0(%%r1,%1) \n\t" "vl %%v19, 48(%%r1,%[x])\n\t"
"vl %%v17, 16(%%r1,%1) \n\t" "vl %%v0, 0(%%r1,%[y])\n\t"
"vl %%v18, 32(%%r1,%1) \n\t" "vl %%v1, 16(%%r1,%[y])\n\t"
"vl %%v19, 48(%%r1,%1) \n\t" "vl %%v2, 32(%%r1,%[y])\n\t"
"vl %%v0, 0(%%r1,%2) \n\t" "vl %%v3, 48(%%r1,%[y])\n\t"
"vl %%v1, 16(%%r1,%2) \n\t" "verllg %%v20,%%v16,32\n\t"
"vl %%v2, 32(%%r1,%2) \n\t" "verllg %%v21,%%v17,32\n\t"
"vl %%v3, 48(%%r1,%2) \n\t" "verllg %%v22,%%v18,32\n\t"
"verllg %%v20,%%v16,32 \n\t" "verllg %%v23,%%v19,32\n\t"
"verllg %%v21,%%v17,32 \n\t" "vfmasb %%v24,%%v16,%%v0,%%v24\n\t"
"verllg %%v22,%%v18,32 \n\t" "vfmasb %%v25,%%v20,%%v0,%%v25\n\t"
"verllg %%v23,%%v19,32 \n\t" "vfmasb %%v26,%%v17,%%v1,%%v26\n\t"
"vfmasb %%v27,%%v21,%%v1,%%v27\n\t"
"vfmasb %%v24,%%v16,%%v0,%%v24 \n\t" "vfmasb %%v28,%%v18,%%v2,%%v28\n\t"
"vfmasb %%v25,%%v20,%%v0,%%v25 \n\t" "vfmasb %%v29,%%v22,%%v2,%%v29\n\t"
"vfmasb %%v26,%%v17,%%v1,%%v26 \n\t" "vfmasb %%v30,%%v19,%%v3,%%v30\n\t"
"vfmasb %%v27,%%v21,%%v1,%%v27 \n\t" "vfmasb %%v31,%%v23,%%v3,%%v31\n\t"
"vfmasb %%v28,%%v18,%%v2,%%v28 \n\t" "vl %%v16, 64(%%r1,%[x])\n\t"
"vfmasb %%v29,%%v22,%%v2,%%v29 \n\t" "vl %%v17, 80(%%r1,%[x])\n\t"
"vfmasb %%v30,%%v19,%%v3,%%v30 \n\t" "vl %%v18, 96(%%r1,%[x])\n\t"
"vfmasb %%v31,%%v23,%%v3,%%v31 \n\t" "vl %%v19, 112(%%r1,%[x])\n\t"
"vl %%v0, 64(%%r1,%[y])\n\t"
"vl %%v16, 64(%%r1,%1) \n\t" "vl %%v1, 80(%%r1,%[y])\n\t"
"vl %%v17, 80(%%r1,%1) \n\t" "vl %%v2, 96(%%r1,%[y])\n\t"
"vl %%v18, 96(%%r1,%1) \n\t" "vl %%v3, 112(%%r1,%[y])\n\t"
"vl %%v19, 112(%%r1,%1) \n\t" "verllg %%v20,%%v16,32\n\t"
"vl %%v0, 64(%%r1,%2) \n\t" "verllg %%v21,%%v17,32\n\t"
"vl %%v1, 80(%%r1,%2) \n\t" "verllg %%v22,%%v18,32\n\t"
"vl %%v2, 96(%%r1,%2) \n\t" "verllg %%v23,%%v19,32\n\t"
"vl %%v3, 112(%%r1,%2) \n\t" "vfmasb %%v24,%%v16,%%v0,%%v24\n\t"
"verllg %%v20,%%v16,32 \n\t" "vfmasb %%v25,%%v20,%%v0,%%v25\n\t"
"verllg %%v21,%%v17,32 \n\t" "vfmasb %%v26,%%v17,%%v1,%%v26\n\t"
"verllg %%v22,%%v18,32 \n\t" "vfmasb %%v27,%%v21,%%v1,%%v27\n\t"
"verllg %%v23,%%v19,32 \n\t" "vfmasb %%v28,%%v18,%%v2,%%v28\n\t"
"vfmasb %%v29,%%v22,%%v2,%%v29\n\t"
"vfmasb %%v24,%%v16,%%v0,%%v24 \n\t" "vfmasb %%v30,%%v19,%%v3,%%v30\n\t"
"vfmasb %%v25,%%v20,%%v0,%%v25 \n\t" "vfmasb %%v31,%%v23,%%v3,%%v31\n\t"
"vfmasb %%v26,%%v17,%%v1,%%v26 \n\t" "agfi %%r1,128\n\t"
"vfmasb %%v27,%%v21,%%v1,%%v27 \n\t" "brctg %[n],0b\n\t"
"vfmasb %%v28,%%v18,%%v2,%%v28 \n\t" "vfasb %%v24,%%v24,%%v26\n\t"
"vfmasb %%v29,%%v22,%%v2,%%v29 \n\t" "vfasb %%v24,%%v24,%%v28\n\t"
"vfmasb %%v30,%%v19,%%v3,%%v30 \n\t" "vfasb %%v24,%%v24,%%v30\n\t"
"vfmasb %%v31,%%v23,%%v3,%%v31 \n\t" "vrepg %%v26,%%v24,1\n\t"
"vfasb %%v24,%%v24,%%v26\n\t"
"agfi %%r1,128 \n\t" "vfasb %%v25,%%v25,%%v27\n\t"
"brctg %%r0,0b \n\t" "vfasb %%v25,%%v25,%%v29\n\t"
"vfasb %%v24,%%v24,%%v26 \n\t" "vfasb %%v25,%%v25,%%v31\n\t"
"vfasb %%v24,%%v24,%%v28 \n\t" "vrepg %%v27,%%v25,1\n\t"
"vfasb %%v24,%%v24,%%v30 \n\t" "vfasb %%v25,%%v25,%%v27\n\t"
"vrepg %%v26,%%v24,1 \n\t" "vstef %%v24,0(%[d]),0\n\t"
"vfasb %%v24,%%v24,%%v26 \n\t" "vstef %%v24,4(%[d]),1\n\t"
"vfasb %%v25,%%v25,%%v27 \n\t" "vstef %%v25,8(%[d]),1\n\t"
"vfasb %%v25,%%v25,%%v29 \n\t" "vstef %%v25,12(%[d]),0"
"vfasb %%v25,%%v25,%%v31 \n\t" : "=m"(*(struct { FLOAT x[4]; } *) d),[n] "+&r"(n)
"vrepg %%v27,%%v25,1 \n\t" : [d] "a"(d), "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x),
"vfasb %%v25,%%v25,%%v27 \n\t" "m"(*(const struct { FLOAT x[n * 2]; } *) y),[y] "a"(y)
"vstef %%v24,0(%3),0 \n\t" : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20",
"vstef %%v24,4(%3),1 \n\t" "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"vstef %%v25,8(%3),1 \n\t" "v31");
"vstef %%v25,12(%3),0 "
:
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((const FLOAT (*)[n * 2])y),"ZQ"((FLOAT (*)[4])d)
:"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
} }
OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y,
BLASLONG i; BLASLONG inc_y) {
BLASLONG ix, iy; BLASLONG i;
OPENBLAS_COMPLEX_FLOAT result; BLASLONG ix, iy;
FLOAT dot[4] __attribute__ ((aligned(16))) = {0.0, 0.0, 0.0, 0.0}; OPENBLAS_COMPLEX_FLOAT result;
FLOAT dot[4] __attribute__ ((aligned(16))) = {
0.0, 0.0, 0.0, 0.0};
if (n <= 0) { if (n <= 0) {
CREAL(result) = 0.0; CREAL(result) = 0.0;
CIMAG(result) = 0.0; CIMAG(result) = 0.0;
return (result); return (result);
}
if ((inc_x == 1) && (inc_y == 1)) {
BLASLONG n1 = n & -16;
if (n1)
cdot_kernel_16(n1, x, y, dot);
i = n1;
BLASLONG j = i * 2;
while (i < n) {
dot[0] += x[j] * y[j];
dot[1] += x[j + 1] * y[j + 1];
dot[2] += x[j] * y[j + 1];
dot[3] += x[j + 1] * y[j];
j += 2;
i++;
} }
if ((inc_x == 1) && (inc_y == 1)) { } else {
i = 0;
ix = 0;
iy = 0;
inc_x <<= 1;
inc_y <<= 1;
while (i < n) {
BLASLONG n1 = n & -16; dot[0] += x[ix] * y[iy];
dot[1] += x[ix + 1] * y[iy + 1];
dot[2] += x[ix] * y[iy + 1];
dot[3] += x[ix + 1] * y[iy];
if (n1) ix += inc_x;
cdot_kernel_16(n1, x, y, dot); iy += inc_y;
i++;
i = n1;
BLASLONG j = i * 2;
while (i < n) {
dot[0] += x[j] * y[j];
dot[1] += x[j + 1] * y[j + 1];
dot[2] += x[j] * y[j + 1];
dot[3] += x[j + 1] * y[j];
j += 2;
i++;
}
} else {
i = 0;
ix = 0;
iy = 0;
inc_x <<= 1;
inc_y <<= 1;
while (i < n) {
dot[0] += x[ix] * y[iy];
dot[1] += x[ix + 1] * y[iy + 1];
dot[2] += x[ix] * y[iy + 1];
dot[3] += x[ix + 1] * y[iy];
ix += inc_x;
iy += inc_y;
i++;
}
} }
}
#if !defined(CONJ) #if !defined(CONJ)
CREAL(result) = dot[0] - dot[1]; CREAL(result) = dot[0] - dot[1];
CIMAG(result) = dot[2] + dot[3]; CIMAG(result) = dot[2] + dot[3];
#else #else
CREAL(result) = dot[0] + dot[1]; CREAL(result) = dot[0] + dot[1];
CIMAG(result) = dot[2] - dot[3]; CIMAG(result) = dot[2] - dot[3];
#endif #endif
return (result); return (result);
} }

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -27,230 +27,210 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
static void crot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) static void crot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) {
{ __asm__("vlrepf %%v0,%[c]\n\t"
__asm__ ( "vlrepf %%v1,%[s]\n\t"
"vlrepf %%v0,%3 \n\t" "srlg %[n],%[n],5\n\t"
"vlrepf %%v1,%4 \n\t" "xgr %%r1,%%r1\n\t"
"srlg %%r0,%0,5 \n\t" "0:\n\t"
"xgr %%r1,%%r1 \n\t" "pfd 2, 1024(%%r1,%[x])\n\t"
"0: \n\t" "pfd 2, 1024(%%r1,%[y])\n\t"
"pfd 2, 1024(%%r1,%1) \n\t" "vl %%v24, 0(%%r1,%[x])\n\t"
"pfd 2, 1024(%%r1,%2) \n\t" "vl %%v25, 16(%%r1,%[x])\n\t"
"vl %%v24, 0(%%r1,%1) \n\t" "vl %%v26, 32(%%r1,%[x])\n\t"
"vl %%v25, 16(%%r1,%1) \n\t" "vl %%v27, 48(%%r1,%[x])\n\t"
"vl %%v26, 32(%%r1,%1) \n\t" "vl %%v16, 0(%%r1,%[y])\n\t"
"vl %%v27, 48(%%r1,%1) \n\t" "vl %%v17, 16(%%r1,%[y])\n\t"
"vl %%v16, 0(%%r1,%2) \n\t" "vl %%v18, 32(%%r1,%[y])\n\t"
"vl %%v17, 16(%%r1,%2) \n\t" "vl %%v19, 48(%%r1,%[y])\n\t"
"vl %%v18, 32(%%r1,%2) \n\t" "vfmsb %%v28,%%v24,%%v0\n\t"
"vl %%v19, 48(%%r1,%2) \n\t" "vfmsb %%v29,%%v25,%%v0\n\t"
"vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
"vfmsb %%v28,%%v24,%%v0 \n\t" "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
"vfmsb %%v29,%%v25,%%v0 \n\t" "vfmsb %%v30,%%v26,%%v0\n\t"
"vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
"vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ "vfmsb %%v31,%%v27,%%v0\n\t"
"vfmsb %%v30,%%v26,%%v0 \n\t" "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
"vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ /* 2nd parts */
"vfmsb %%v31,%%v27,%%v0 \n\t" "vfmasb %%v28,%%v16,%%v1,%%v28\n\t"
"vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
/* 2nd parts*/ "vfmasb %%v29,%%v17,%%v1,%%v29\n\t"
"vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
"vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ "vfmasb %%v30,%%v18,%%v1,%%v30\n\t"
"vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ "vfmasb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ "vst %%v28, 0(%%r1,%[x])\n\t"
"vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" "vst %%v29, 16(%%r1,%[x])\n\t"
"vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ "vst %%v30, 32(%%r1,%[x])\n\t"
"vst %%v31, 48(%%r1,%[x])\n\t"
"vst %%v28, 0(%%r1,%1) \n\t" "vst %%v20, 0(%%r1,%[y])\n\t"
"vst %%v29, 16(%%r1,%1) \n\t" "vst %%v21, 16(%%r1,%[y])\n\t"
"vst %%v30, 32(%%r1,%1) \n\t" "vst %%v22, 32(%%r1,%[y])\n\t"
"vst %%v31, 48(%%r1,%1) \n\t" "vst %%v23, 48(%%r1,%[y])\n\t"
"vst %%v20, 0(%%r1,%2) \n\t" "vl %%v24, 64(%%r1,%[x])\n\t"
"vst %%v21, 16(%%r1,%2) \n\t" "vl %%v25, 80(%%r1,%[x])\n\t"
"vst %%v22, 32(%%r1,%2) \n\t" "vl %%v26, 96(%%r1,%[x])\n\t"
"vst %%v23, 48(%%r1,%2) \n\t" "vl %%v27, 112(%%r1,%[x])\n\t"
"vl %%v16, 64(%%r1,%[y])\n\t"
"vl %%v24, 64(%%r1,%1) \n\t" "vl %%v17, 80(%%r1,%[y])\n\t"
"vl %%v25, 80(%%r1,%1) \n\t" "vl %%v18, 96(%%r1,%[y])\n\t"
"vl %%v26, 96(%%r1,%1) \n\t" "vl %%v19, 112(%%r1,%[y])\n\t"
"vl %%v27, 112(%%r1,%1) \n\t" "vfmsb %%v28,%%v24,%%v0\n\t"
"vl %%v16, 64(%%r1,%2) \n\t" "vfmsb %%v29,%%v25,%%v0\n\t"
"vl %%v17, 80(%%r1,%2) \n\t" "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
"vl %%v18, 96(%%r1,%2) \n\t" "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
"vl %%v19, 112(%%r1,%2) \n\t" "vfmsb %%v30,%%v26,%%v0\n\t"
"vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
"vfmsb %%v28,%%v24,%%v0 \n\t" "vfmsb %%v31,%%v27,%%v0\n\t"
"vfmsb %%v29,%%v25,%%v0 \n\t" "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
"vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ /* 2nd parts */
"vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ "vfmasb %%v28,%%v16,%%v1,%%v28\n\t"
"vfmsb %%v30,%%v26,%%v0 \n\t" "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
"vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ "vfmasb %%v29,%%v17,%%v1,%%v29\n\t"
"vfmsb %%v31,%%v27,%%v0 \n\t" "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
"vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ "vfmasb %%v30,%%v18,%%v1,%%v30\n\t"
/* 2nd parts*/ "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" "vfmasb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" "vst %%v28, 64(%%r1,%[x])\n\t"
"vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ "vst %%v29, 80(%%r1,%[x])\n\t"
"vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" "vst %%v30, 96(%%r1,%[x])\n\t"
"vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ "vst %%v31, 112(%%r1,%[x])\n\t"
"vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" "vst %%v20, 64(%%r1,%[y])\n\t"
"vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ "vst %%v21, 80(%%r1,%[y])\n\t"
"vst %%v22, 96(%%r1,%[y])\n\t"
"vst %%v28, 64(%%r1,%1) \n\t" "vst %%v23, 112(%%r1,%[y])\n\t"
"vst %%v29, 80(%%r1,%1) \n\t" "vl %%v24, 128(%%r1,%[x])\n\t"
"vst %%v30, 96(%%r1,%1) \n\t" "vl %%v25, 144(%%r1,%[x])\n\t"
"vst %%v31, 112(%%r1,%1) \n\t" "vl %%v26, 160(%%r1,%[x])\n\t"
"vst %%v20, 64(%%r1,%2) \n\t" "vl %%v27, 176(%%r1,%[x])\n\t"
"vst %%v21, 80(%%r1,%2) \n\t" "vl %%v16, 128(%%r1,%[y])\n\t"
"vst %%v22, 96(%%r1,%2) \n\t" "vl %%v17, 144(%%r1,%[y])\n\t"
"vst %%v23, 112(%%r1,%2) \n\t" "vl %%v18, 160(%%r1,%[y])\n\t"
"vl %%v19, 176(%%r1,%[y])\n\t"
"vl %%v24, 128(%%r1,%1) \n\t" "vfmsb %%v28,%%v24,%%v0\n\t"
"vl %%v25, 144(%%r1,%1) \n\t" "vfmsb %%v29,%%v25,%%v0\n\t"
"vl %%v26, 160(%%r1,%1) \n\t" "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
"vl %%v27, 176(%%r1,%1) \n\t" "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
"vl %%v16, 128(%%r1,%2) \n\t" "vfmsb %%v30,%%v26,%%v0\n\t"
"vl %%v17, 144(%%r1,%2) \n\t" "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
"vl %%v18, 160(%%r1,%2) \n\t" "vfmsb %%v31,%%v27,%%v0\n\t"
"vl %%v19, 176(%%r1,%2) \n\t" "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
/* 2nd parts */
"vfmsb %%v28,%%v24,%%v0 \n\t" "vfmasb %%v28,%%v16,%%v1,%%v28\n\t"
"vfmsb %%v29,%%v25,%%v0 \n\t" "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
"vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ "vfmasb %%v29,%%v17,%%v1,%%v29\n\t"
"vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
"vfmsb %%v30,%%v26,%%v0 \n\t" "vfmasb %%v30,%%v18,%%v1,%%v30\n\t"
"vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmsb %%v31,%%v27,%%v0 \n\t" "vfmasb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
/* 2nd parts*/ "vst %%v28, 128(%%r1,%[x])\n\t"
"vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" "vst %%v29, 144(%%r1,%[x])\n\t"
"vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ "vst %%v30, 160(%%r1,%[x])\n\t"
"vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" "vst %%v31, 176(%%r1,%[x])\n\t"
"vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ "vst %%v20, 128(%%r1,%[y])\n\t"
"vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" "vst %%v21, 144(%%r1,%[y])\n\t"
"vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ "vst %%v22, 160(%%r1,%[y])\n\t"
"vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" "vst %%v23, 176(%%r1,%[y])\n\t"
"vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ "vl %%v24, 192(%%r1,%[x])\n\t"
"vl %%v25, 208(%%r1,%[x])\n\t"
"vst %%v28, 128(%%r1,%1) \n\t" "vl %%v26, 224(%%r1,%[x])\n\t"
"vst %%v29, 144(%%r1,%1) \n\t" "vl %%v27, 240(%%r1,%[x])\n\t"
"vst %%v30, 160(%%r1,%1) \n\t" "vl %%v16, 192(%%r1,%[y])\n\t"
"vst %%v31, 176(%%r1,%1) \n\t" "vl %%v17, 208(%%r1,%[y])\n\t"
"vst %%v20, 128(%%r1,%2) \n\t" "vl %%v18, 224(%%r1,%[y])\n\t"
"vst %%v21, 144(%%r1,%2) \n\t" "vl %%v19, 240(%%r1,%[y])\n\t"
"vst %%v22, 160(%%r1,%2) \n\t" "vfmsb %%v28,%%v24,%%v0\n\t"
"vst %%v23, 176(%%r1,%2) \n\t" "vfmsb %%v29,%%v25,%%v0\n\t"
"vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
"vl %%v24, 192(%%r1,%1) \n\t" "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
"vl %%v25, 208(%%r1,%1) \n\t" "vfmsb %%v30,%%v26,%%v0\n\t"
"vl %%v26, 224(%%r1,%1) \n\t" "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
"vl %%v27, 240(%%r1,%1) \n\t" "vfmsb %%v31,%%v27,%%v0\n\t"
"vl %%v16, 192(%%r1,%2) \n\t" "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
"vl %%v17, 208(%%r1,%2) \n\t" /* 2nd parts */
"vl %%v18, 224(%%r1,%2) \n\t" "vfmasb %%v28,%%v16,%%v1,%%v28\n\t"
"vl %%v19, 240(%%r1,%2) \n\t" "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
"vfmasb %%v29,%%v17,%%v1,%%v29\n\t"
"vfmsb %%v28,%%v24,%%v0 \n\t" "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
"vfmsb %%v29,%%v25,%%v0 \n\t" "vfmasb %%v30,%%v18,%%v1,%%v30\n\t"
"vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ "vfmasb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmsb %%v30,%%v26,%%v0 \n\t" "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ "vst %%v28, 192(%%r1,%[x])\n\t"
"vfmsb %%v31,%%v27,%%v0 \n\t" "vst %%v29, 208(%%r1,%[x])\n\t"
"vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ "vst %%v30, 224(%%r1,%[x])\n\t"
/* 2nd parts*/ "vst %%v31, 240(%%r1,%[x])\n\t"
"vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" "vst %%v20, 192(%%r1,%[y])\n\t"
"vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ "vst %%v21, 208(%%r1,%[y])\n\t"
"vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" "vst %%v22, 224(%%r1,%[y])\n\t"
"vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ "vst %%v23, 240(%%r1,%[y])\n\t"
"vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" "agfi %%r1,256\n\t"
"vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ "brctg %[n],0b"
"vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" : "+m"(*(struct { FLOAT x[n * 2]; } *) x),
"vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n)
: [x] "a"(x),[y] "a"(y),[c] "Q"(*c),[s] "Q"(*s)
"vst %%v28, 192(%%r1,%1) \n\t" : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
"vst %%v29, 208(%%r1,%1) \n\t" "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"vst %%v30, 224(%%r1,%1) \n\t" "v31");
"vst %%v31, 240(%%r1,%1) \n\t"
"vst %%v20, 192(%%r1,%2) \n\t"
"vst %%v21, 208(%%r1,%2) \n\t"
"vst %%v22, 224(%%r1,%2) \n\t"
"vst %%v23, 240(%%r1,%2) \n\t"
"agfi %%r1,256 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"m"(*c),"m"(*s)
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
} }
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
{ FLOAT c, FLOAT s) {
BLASLONG i=0; BLASLONG i = 0;
BLASLONG ix=0,iy=0; BLASLONG ix = 0, iy = 0;
FLOAT temp[2]; FLOAT temp[2];
BLASLONG inc_x2; BLASLONG inc_x2;
BLASLONG inc_y2; BLASLONG inc_y2;
if ( n <= 0 ) return(0); if (n <= 0)
return (0);
if ( (inc_x == 1) && (inc_y == 1) ) if ((inc_x == 1) && (inc_y == 1)) {
{
BLASLONG n1 = n & -32; BLASLONG n1 = n & -32;
if ( n1 > 0 ) if (n1 > 0) {
{ FLOAT cosa, sina;
FLOAT cosa,sina; cosa = c;
cosa=c; sina = s;
sina=s; crot_kernel_32(n1, x, y, &cosa, &sina);
crot_kernel_32(n1, x, y, &cosa, &sina); i = n1;
i=n1; ix = 2 * n1;
ix=2*n1; }
}
while(i < n) while (i < n) {
{ temp[0] = c * x[ix] + s * y[ix];
temp[0] = c*x[ix] + s*y[ix] ; temp[1] = c * x[ix + 1] + s * y[ix + 1];
temp[1] = c*x[ix+1] + s*y[ix+1] ; y[ix] = c * y[ix] - s * x[ix];
y[ix] = c*y[ix] - s*x[ix] ; y[ix + 1] = c * y[ix + 1] - s * x[ix + 1];
y[ix+1] = c*y[ix+1] - s*x[ix+1] ; x[ix] = temp[0];
x[ix] = temp[0] ; x[ix + 1] = temp[1];
x[ix+1] = temp[1] ;
ix += 2 ;
i++ ;
}
ix += 2;
i++;
} }
else
{
inc_x2 = 2 * inc_x ;
inc_y2 = 2 * inc_y ;
while(i < n)
{
temp[0] = c*x[ix] + s*y[iy] ;
temp[1] = c*x[ix+1] + s*y[iy+1] ;
y[iy] = c*y[iy] - s*x[ix] ;
y[iy+1] = c*y[iy+1] - s*x[ix+1] ;
x[ix] = temp[0] ;
x[ix+1] = temp[1] ;
ix += inc_x2 ; } else {
iy += inc_y2 ; inc_x2 = 2 * inc_x;
i++ ; inc_y2 = 2 * inc_y;
while (i < n) {
temp[0] = c * x[ix] + s * y[iy];
temp[1] = c * x[ix + 1] + s * y[iy + 1];
y[iy] = c * y[iy] - s * x[ix];
y[iy + 1] = c * y[iy + 1] - s * x[ix + 1];
x[ix] = temp[0];
x[ix + 1] = temp[1];
} ix += inc_x2;
iy += inc_y2;
i++;
} }
return(0);
}
return (0);
} }

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013 - 2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -27,430 +27,403 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
static void cscal_kernel_16(BLASLONG n, FLOAT *alpha, FLOAT *x) static void cscal_kernel_16(BLASLONG n, FLOAT *alpha, FLOAT *x) {
{ __asm__("vlrepf %%v0,0(%[alpha])\n\t"
__asm__ volatile( "vlef %%v1,4(%[alpha]),0\n\t"
"vlrepf %%v0,0(%1) \n\t" "vlef %%v1,4(%[alpha]),2\n\t"
"vlef %%v1,4(%1),0 \n\t" "vflcsb %%v1,%%v1\n\t"
"vlef %%v1,4(%1),2 \n\t" "vlef %%v1,4(%[alpha]),1\n\t"
"vflcsb %%v1,%%v1 \n\t" "vlef %%v1,4(%[alpha]),3\n\t"
"vlef %%v1,4(%1),1 \n\t" "srlg %[n],%[n],4\n\t"
"vlef %%v1,4(%1),3 \n\t" "xgr %%r1,%%r1\n\t"
"srlg %%r0,%0,4 \n\t" "0:\n\t"
"xgr %%r1,%%r1 \n\t" "pfd 2, 1024(%%r1,%[x])\n\t"
"0: \n\t" "vl %%v16,0(%%r1,%[x])\n\t"
"pfd 2, 1024(%%r1,%2) \n\t" "vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%2) \n\t" "vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%2) \n\t" "vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%2) \n\t" "vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%2) \n\t" "vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%2) \n\t" "vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%2) \n\t" "verllg %%v24,%%v16,32\n\t"
"vl %%v22,96(%%r1,%2) \n\t" "verllg %%v25,%%v17,32\n\t"
"vl %%v23,112(%%r1,%2) \n\t" "verllg %%v26,%%v18,32\n\t"
"verllg %%v24,%%v16,32 \n\t" "verllg %%v27,%%v19,32\n\t"
"verllg %%v25,%%v17,32 \n\t" "verllg %%v28,%%v20,32\n\t"
"verllg %%v26,%%v18,32 \n\t" "verllg %%v29,%%v21,32\n\t"
"verllg %%v27,%%v19,32 \n\t" "verllg %%v30,%%v22,32\n\t"
"verllg %%v28,%%v20,32 \n\t" "verllg %%v31,%%v23,32\n\t"
"verllg %%v29,%%v21,32 \n\t" "vfmsb %%v16,%%v16,%%v0\n\t"
"verllg %%v30,%%v22,32 \n\t" "vfmsb %%v17,%%v17,%%v0\n\t"
"verllg %%v31,%%v23,32 \n\t" "vfmsb %%v18,%%v18,%%v0\n\t"
"vfmsb %%v19,%%v19,%%v0\n\t"
"vfmsb %%v16,%%v16,%%v0 \n\t" "vfmsb %%v20,%%v20,%%v0\n\t"
"vfmsb %%v17,%%v17,%%v0 \n\t" "vfmsb %%v21,%%v21,%%v0\n\t"
"vfmsb %%v18,%%v18,%%v0 \n\t" "vfmsb %%v22,%%v22,%%v0\n\t"
"vfmsb %%v19,%%v19,%%v0 \n\t" "vfmsb %%v23,%%v23,%%v0\n\t"
"vfmsb %%v20,%%v20,%%v0 \n\t" "vfmasb %%v16,%%v24,%%v1,%%v16\n\t"
"vfmsb %%v21,%%v21,%%v0 \n\t" "vfmasb %%v17,%%v25,%%v1,%%v17\n\t"
"vfmsb %%v22,%%v22,%%v0 \n\t" "vfmasb %%v18,%%v26,%%v1,%%v18\n\t"
"vfmsb %%v23,%%v23,%%v0 \n\t" "vfmasb %%v19,%%v27,%%v1,%%v19\n\t"
"vfmasb %%v16,%%v24,%%v1,%%v16 \n\t" "vfmasb %%v20,%%v28,%%v1,%%v20\n\t"
"vfmasb %%v17,%%v25,%%v1,%%v17 \n\t" "vfmasb %%v21,%%v29,%%v1,%%v21\n\t"
"vfmasb %%v18,%%v26,%%v1,%%v18 \n\t" "vfmasb %%v22,%%v30,%%v1,%%v22\n\t"
"vfmasb %%v19,%%v27,%%v1,%%v19 \n\t" "vfmasb %%v23,%%v31,%%v1,%%v23\n\t"
"vfmasb %%v20,%%v28,%%v1,%%v20 \n\t" "vst %%v16,0(%%r1,%[x])\n\t"
"vfmasb %%v21,%%v29,%%v1,%%v21 \n\t" "vst %%v17,16(%%r1,%[x])\n\t"
"vfmasb %%v22,%%v30,%%v1,%%v22 \n\t" "vst %%v18,32(%%r1,%[x])\n\t"
"vfmasb %%v23,%%v31,%%v1,%%v23 \n\t" "vst %%v19,48(%%r1,%[x])\n\t"
"vst %%v20,64(%%r1,%[x])\n\t"
"vst %%v16,0(%%r1,%2) \n\t" "vst %%v21,80(%%r1,%[x])\n\t"
"vst %%v17,16(%%r1,%2) \n\t" "vst %%v22,96(%%r1,%[x])\n\t"
"vst %%v18,32(%%r1,%2) \n\t" "vst %%v23,112(%%r1,%[x])\n\t"
"vst %%v19,48(%%r1,%2) \n\t" "agfi %%r1,128\n\t"
"vst %%v20,64(%%r1,%2) \n\t" "brctg %[n],0b"
"vst %%v21,80(%%r1,%2) \n\t" : "+m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n)
"vst %%v22,96(%%r1,%2) \n\t" : [x] "a"(x), "m"(*(const struct { FLOAT x[2]; } *) alpha),
"vst %%v23,112(%%r1,%2) \n\t" [alpha] "a"(alpha)
: "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
"agfi %%r1,128 \n\t" "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"brctg %%r0,0b " "v31");
:
:"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x)
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
static void cscal_kernel_16_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x)
{
__asm__ volatile(
"vlef %%v0,4(%1),0 \n\t"
"vlef %%v0,4(%1),2 \n\t"
"vflcsb %%v0,%%v0 \n\t"
"vlef %%v0,4(%1),1 \n\t"
"vlef %%v0,4(%1),3 \n\t"
"srlg %%r0,%0,4 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"verllg %%v16,%%v16,32 \n\t"
"verllg %%v17,%%v17,32 \n\t"
"verllg %%v18,%%v18,32 \n\t"
"verllg %%v19,%%v19,32 \n\t"
"verllg %%v20,%%v20,32 \n\t"
"verllg %%v21,%%v21,32 \n\t"
"verllg %%v22,%%v22,32 \n\t"
"verllg %%v23,%%v23,32 \n\t"
"vfmsb %%v16,%%v16,%%v0 \n\t"
"vfmsb %%v17,%%v17,%%v0 \n\t"
"vfmsb %%v18,%%v18,%%v0 \n\t"
"vfmsb %%v19,%%v19,%%v0 \n\t"
"vfmsb %%v20,%%v20,%%v0 \n\t"
"vfmsb %%v21,%%v21,%%v0 \n\t"
"vfmsb %%v22,%%v22,%%v0 \n\t"
"vfmsb %%v23,%%v23,%%v0 \n\t"
"vst %%v16,0(%%r1,%2) \n\t"
"vst %%v17,16(%%r1,%2) \n\t"
"vst %%v18,32(%%r1,%2) \n\t"
"vst %%v19,48(%%r1,%2) \n\t"
"vst %%v20,64(%%r1,%2) \n\t"
"vst %%v21,80(%%r1,%2) \n\t"
"vst %%v22,96(%%r1,%2) \n\t"
"vst %%v23,112(%%r1,%2) \n\t"
"agfi %%r1,128 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23"
);
} }
static void cscal_kernel_16_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) static void cscal_kernel_16_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) {
{ __asm__("vlef %%v0,4(%[alpha]),0\n\t"
__asm__ volatile( "vlef %%v0,4(%[alpha]),2\n\t"
"vlrepf %%v0,0(%1) \n\t" "vflcsb %%v0,%%v0\n\t"
"srlg %%r0,%0,4 \n\t" "vlef %%v0,4(%[alpha]),1\n\t"
"xgr %%r1,%%r1 \n\t" "vlef %%v0,4(%[alpha]),3\n\t"
"0: \n\t" "srlg %[n],%[n],4\n\t"
"pfd 2, 1024(%%r1,%2) \n\t" "xgr %%r1,%%r1\n\t"
"0:\n\t"
"vl %%v16,0(%%r1,%2) \n\t" "pfd 2, 1024(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%2) \n\t" "vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%2) \n\t" "vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%2) \n\t" "vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%2) \n\t" "vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%2) \n\t" "vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%2) \n\t" "vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%2) \n\t" "vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vfmsb %%v16,%%v16,%%v0 \n\t" "verllg %%v16,%%v16,32\n\t"
"vfmsb %%v17,%%v17,%%v0 \n\t" "verllg %%v17,%%v17,32\n\t"
"vfmsb %%v18,%%v18,%%v0 \n\t" "verllg %%v18,%%v18,32\n\t"
"vfmsb %%v19,%%v19,%%v0 \n\t" "verllg %%v19,%%v19,32\n\t"
"vfmsb %%v20,%%v20,%%v0 \n\t" "verllg %%v20,%%v20,32\n\t"
"vfmsb %%v21,%%v21,%%v0 \n\t" "verllg %%v21,%%v21,32\n\t"
"vfmsb %%v22,%%v22,%%v0 \n\t" "verllg %%v22,%%v22,32\n\t"
"vfmsb %%v23,%%v23,%%v0 \n\t" "verllg %%v23,%%v23,32\n\t"
"vfmsb %%v16,%%v16,%%v0\n\t"
"vst %%v16,0(%%r1,%2) \n\t" "vfmsb %%v17,%%v17,%%v0\n\t"
"vst %%v17,16(%%r1,%2) \n\t" "vfmsb %%v18,%%v18,%%v0\n\t"
"vst %%v18,32(%%r1,%2) \n\t" "vfmsb %%v19,%%v19,%%v0\n\t"
"vst %%v19,48(%%r1,%2) \n\t" "vfmsb %%v20,%%v20,%%v0\n\t"
"vst %%v20,64(%%r1,%2) \n\t" "vfmsb %%v21,%%v21,%%v0\n\t"
"vst %%v21,80(%%r1,%2) \n\t" "vfmsb %%v22,%%v22,%%v0\n\t"
"vst %%v22,96(%%r1,%2) \n\t" "vfmsb %%v23,%%v23,%%v0\n\t"
"vst %%v23,112(%%r1,%2) \n\t" "vst %%v16,0(%%r1,%[x])\n\t"
"vst %%v17,16(%%r1,%[x])\n\t"
"agfi %%r1,128 \n\t" "vst %%v18,32(%%r1,%[x])\n\t"
"brctg %%r0,0b " "vst %%v19,48(%%r1,%[x])\n\t"
: "vst %%v20,64(%%r1,%[x])\n\t"
:"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x) "vst %%v21,80(%%r1,%[x])\n\t"
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23" "vst %%v22,96(%%r1,%[x])\n\t"
); "vst %%v23,112(%%r1,%[x])\n\t"
"agfi %%r1,128\n\t"
"brctg %[n],0b"
: "+m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n)
: [x] "a"(x), "m"(*(const struct { FLOAT x[2]; } *) alpha),
[alpha] "a"(alpha)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23");
} }
static void cscal_kernel_16_zero(BLASLONG n, FLOAT *x) static void cscal_kernel_16_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) {
{ __asm__("vlrepf %%v0,0(%[alpha])\n\t"
__asm__ volatile( "srlg %[n],%[n],4\n\t"
"vzero %%v24 \n\t" "xgr %%r1,%%r1\n\t"
"vzero %%v25 \n\t" "0:\n\t"
"vzero %%v26 \n\t" "pfd 2, 1024(%%r1,%[x])\n\t"
"vzero %%v27 \n\t" "vl %%v16,0(%%r1,%[x])\n\t"
"srlg %%r0,%0,4 \n\t" "vl %%v17,16(%%r1,%[x])\n\t"
"xgr %%r1,%%r1 \n\t" "vl %%v18,32(%%r1,%[x])\n\t"
"0: \n\t" "vl %%v19,48(%%r1,%[x])\n\t"
"pfd 2, 1024(%%r1,%1) \n\t" "vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vst %%v24,0(%%r1,%1) \n\t" "vl %%v22,96(%%r1,%[x])\n\t"
"vst %%v25,16(%%r1,%1) \n\t" "vl %%v23,112(%%r1,%[x])\n\t"
"vst %%v26,32(%%r1,%1) \n\t" "vfmsb %%v16,%%v16,%%v0\n\t"
"vst %%v27,48(%%r1,%1) \n\t" "vfmsb %%v17,%%v17,%%v0\n\t"
"vst %%v24,64(%%r1,%1) \n\t" "vfmsb %%v18,%%v18,%%v0\n\t"
"vst %%v25,80(%%r1,%1) \n\t" "vfmsb %%v19,%%v19,%%v0\n\t"
"vst %%v26,96(%%r1,%1) \n\t" "vfmsb %%v20,%%v20,%%v0\n\t"
"vst %%v27,112(%%r1,%1) \n\t" "vfmsb %%v21,%%v21,%%v0\n\t"
"vfmsb %%v22,%%v22,%%v0\n\t"
"agfi %%r1,128 \n\t" "vfmsb %%v23,%%v23,%%v0\n\t"
"brctg %%r0,0b " "vst %%v16,0(%%r1,%[x])\n\t"
: "vst %%v17,16(%%r1,%[x])\n\t"
:"r"(n),"ZR"((FLOAT (*)[n * 2])x) "vst %%v18,32(%%r1,%[x])\n\t"
:"memory","cc","r0","r1","v24","v25","v26","v27" "vst %%v19,48(%%r1,%[x])\n\t"
); "vst %%v20,64(%%r1,%[x])\n\t"
"vst %%v21,80(%%r1,%[x])\n\t"
"vst %%v22,96(%%r1,%[x])\n\t"
"vst %%v23,112(%%r1,%[x])\n\t"
"agfi %%r1,128\n\t"
"brctg %[n],0b"
: "+m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n)
: [x] "a"(x), "m"(*(const struct { FLOAT x[2]; } *) alpha),
[alpha] "a"(alpha)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23");
} }
static void cscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_x) static void cscal_kernel_16_zero(BLASLONG n, FLOAT *x) {
{ __asm__("vzero %%v0\n\t"
BLASLONG i; "srlg %[n],%[n],4\n\t"
BLASLONG inc_x2 = 2 * inc_x; "xgr %%r1,%%r1\n\t"
BLASLONG inc_x3 = inc_x2 + inc_x; "0:\n\t"
FLOAT t0, t1, t2, t3; "pfd 2, 1024(%%r1,%[x])\n\t"
FLOAT da_r = alpha[0]; "vst %%v0,0(%%r1,%[x])\n\t"
FLOAT da_i = alpha[1]; "vst %%v0,16(%%r1,%[x])\n\t"
"vst %%v0,32(%%r1,%[x])\n\t"
for (i = 0; i < n; i += 4) "vst %%v0,48(%%r1,%[x])\n\t"
{ "vst %%v0,64(%%r1,%[x])\n\t"
t0 = da_r * x[0] - da_i * x[1]; "vst %%v0,80(%%r1,%[x])\n\t"
t1 = da_r * x[inc_x] - da_i * x[inc_x + 1]; "vst %%v0,96(%%r1,%[x])\n\t"
t2 = da_r * x[inc_x2] - da_i * x[inc_x2 + 1]; "vst %%v0,112(%%r1,%[x])\n\t"
t3 = da_r * x[inc_x3] - da_i * x[inc_x3 + 1]; "agfi %%r1,128\n\t"
"brctg %[n],0b"
x[1] = da_i * x[0] + da_r * x[1]; : "=m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n)
x[inc_x + 1] = da_i * x[inc_x] + da_r * x[inc_x + 1]; : [x] "a"(x)
x[inc_x2 + 1] = da_i * x[inc_x2] + da_r * x[inc_x2 + 1]; : "cc", "r1", "v0");
x[inc_x3 + 1] = da_i * x[inc_x3] + da_r * x[inc_x3 + 1];
x[0] = t0;
x[inc_x] = t1;
x[inc_x2] = t2;
x[inc_x3] = t3;
x += 4 * inc_x;
}
} }
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { static void cscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x,
BLASLONG i = 0, j = 0; BLASLONG inc_x) {
FLOAT temp0; BLASLONG i;
FLOAT temp1; BLASLONG inc_x2 = 2 * inc_x;
FLOAT alpha[2] __attribute__ ((aligned(16))); BLASLONG inc_x3 = inc_x2 + inc_x;
FLOAT t0, t1, t2, t3;
FLOAT da_r = alpha[0];
FLOAT da_i = alpha[1];
if (inc_x != 1) { for (i = 0; i < n; i += 4) {
inc_x <<= 1; t0 = da_r * x[0] - da_i * x[1];
t1 = da_r * x[inc_x] - da_i * x[inc_x + 1];
t2 = da_r * x[inc_x2] - da_i * x[inc_x2 + 1];
t3 = da_r * x[inc_x3] - da_i * x[inc_x3 + 1];
if (da_r == 0.0) { x[1] = da_i * x[0] + da_r * x[1];
x[inc_x + 1] = da_i * x[inc_x] + da_r * x[inc_x + 1];
x[inc_x2 + 1] = da_i * x[inc_x2] + da_r * x[inc_x2 + 1];
x[inc_x3 + 1] = da_i * x[inc_x3] + da_r * x[inc_x3 + 1];
BLASLONG n1 = n & -2; x[0] = t0;
x[inc_x] = t1;
x[inc_x2] = t2;
x[inc_x3] = t3;
if (da_i == 0.0) { x += 4 * inc_x;
}
}
while (j < n1) { int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
x[i] = 0.0; BLASLONG dummy2) {
x[i + 1] = 0.0; BLASLONG i = 0, j = 0;
x[i + inc_x] = 0.0; FLOAT temp0;
x[i + 1 + inc_x] = 0.0; FLOAT temp1;
i += 2 * inc_x; FLOAT alpha[2] __attribute__ ((aligned(16)));
j += 2;
}
while (j < n) {
x[i] = 0.0;
x[i + 1] = 0.0;
i += inc_x;
j++;
}
} else {
while (j < n1) {
temp0 = -da_i * x[i + 1];
x[i + 1] = da_i * x[i];
x[i] = temp0;
temp1 = -da_i * x[i + 1 + inc_x];
x[i + 1 + inc_x] = da_i * x[i + inc_x];
x[i + inc_x] = temp1;
i += 2 * inc_x;
j += 2;
}
while (j < n) {
temp0 = -da_i * x[i + 1];
x[i + 1] = da_i * x[i];
x[i] = temp0;
i += inc_x;
j++;
}
}
} else {
if (da_i == 0.0) {
BLASLONG n1 = n & -2;
while (j < n1) {
temp0 = da_r * x[i];
x[i + 1] = da_r * x[i + 1];
x[i] = temp0;
temp1 = da_r * x[i + inc_x];
x[i + 1 + inc_x] = da_r * x[i + 1 + inc_x];
x[i + inc_x] = temp1;
i += 2 * inc_x;
j += 2;
}
while (j < n) {
temp0 = da_r * x[i];
x[i + 1] = da_r * x[i + 1];
x[i] = temp0;
i += inc_x;
j++;
}
} else {
BLASLONG n1 = n & -8;
if (n1 > 0) {
alpha[0] = da_r;
alpha[1] = da_i;
cscal_kernel_inc_8(n1, alpha, x, inc_x);
j = n1;
i = n1 * inc_x;
}
while (j < n) {
temp0 = da_r * x[i] - da_i * x[i + 1];
x[i + 1] = da_r * x[i + 1] + da_i * x[i];
x[i] = temp0;
i += inc_x;
j++;
}
}
}
return (0);
}
BLASLONG n1 = n & -16;
if (n1 > 0) {
alpha[0] = da_r;
alpha[1] = da_i;
if (da_r == 0.0)
if (da_i == 0)
cscal_kernel_16_zero(n1, x);
else
cscal_kernel_16_zero_r(n1, alpha, x);
else
if (da_i == 0)
cscal_kernel_16_zero_i(n1, alpha, x);
else
cscal_kernel_16(n1, alpha, x);
i = n1 << 1;
j = n1;
}
if (inc_x != 1) {
inc_x <<= 1;
if (da_r == 0.0) { if (da_r == 0.0) {
if (da_i == 0.0) { BLASLONG n1 = n & -2;
while (j < n) { if (da_i == 0.0) {
x[i] = 0.0; while (j < n1) {
x[i + 1] = 0.0;
i += 2;
j++;
} x[i] = 0.0;
x[i + 1] = 0.0;
} else { x[i + inc_x] = 0.0;
x[i + 1 + inc_x] = 0.0;
while (j < n) { i += 2 * inc_x;
j += 2;
temp0 = -da_i * x[i + 1];
x[i + 1] = da_i * x[i];
x[i] = temp0;
i += 2;
j++;
}
} }
while (j < n) {
x[i] = 0.0;
x[i + 1] = 0.0;
i += inc_x;
j++;
}
} else {
while (j < n1) {
temp0 = -da_i * x[i + 1];
x[i + 1] = da_i * x[i];
x[i] = temp0;
temp1 = -da_i * x[i + 1 + inc_x];
x[i + 1 + inc_x] = da_i * x[i + inc_x];
x[i + inc_x] = temp1;
i += 2 * inc_x;
j += 2;
}
while (j < n) {
temp0 = -da_i * x[i + 1];
x[i + 1] = da_i * x[i];
x[i] = temp0;
i += inc_x;
j++;
}
}
} else { } else {
if (da_i == 0.0) { if (da_i == 0.0) {
BLASLONG n1 = n & -2;
while (j < n) { while (j < n1) {
temp0 = da_r * x[i]; temp0 = da_r * x[i];
x[i + 1] = da_r * x[i + 1]; x[i + 1] = da_r * x[i + 1];
x[i] = temp0; x[i] = temp0;
i += 2; temp1 = da_r * x[i + inc_x];
j++; x[i + 1 + inc_x] = da_r * x[i + 1 + inc_x];
x[i + inc_x] = temp1;
} i += 2 * inc_x;
j += 2;
} else {
while (j < n) {
temp0 = da_r * x[i] - da_i * x[i + 1];
x[i + 1] = da_r * x[i + 1] + da_i * x[i];
x[i] = temp0;
i += 2;
j++;
}
} }
while (j < n) {
temp0 = da_r * x[i];
x[i + 1] = da_r * x[i + 1];
x[i] = temp0;
i += inc_x;
j++;
}
} else {
BLASLONG n1 = n & -8;
if (n1 > 0) {
alpha[0] = da_r;
alpha[1] = da_i;
cscal_kernel_inc_8(n1, alpha, x, inc_x);
j = n1;
i = n1 * inc_x;
}
while (j < n) {
temp0 = da_r * x[i] - da_i * x[i + 1];
x[i + 1] = da_r * x[i + 1] + da_i * x[i];
x[i] = temp0;
i += inc_x;
j++;
}
}
} }
return (0); return (0);
}
BLASLONG n1 = n & -16;
if (n1 > 0) {
alpha[0] = da_r;
alpha[1] = da_i;
if (da_r == 0.0)
if (da_i == 0)
cscal_kernel_16_zero(n1, x);
else
cscal_kernel_16_zero_r(n1, alpha, x);
else if (da_i == 0)
cscal_kernel_16_zero_i(n1, alpha, x);
else
cscal_kernel_16(n1, alpha, x);
i = n1 << 1;
j = n1;
}
if (da_r == 0.0) {
if (da_i == 0.0) {
while (j < n) {
x[i] = 0.0;
x[i + 1] = 0.0;
i += 2;
j++;
}
} else {
while (j < n) {
temp0 = -da_i * x[i + 1];
x[i + 1] = da_i * x[i];
x[i] = temp0;
i += 2;
j++;
}
}
} else {
if (da_i == 0.0) {
while (j < n) {
temp0 = da_r * x[i];
x[i + 1] = da_r * x[i + 1];
x[i] = temp0;
i += 2;
j++;
}
} else {
while (j < n) {
temp0 = da_r * x[i] - da_i * x[i + 1];
x[i + 1] = da_r * x[i + 1] + da_i * x[i];
x[i] = temp0;
i += 2;
j++;
}
}
}
return (0);
} }

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -27,157 +27,143 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
static void cswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) static void cswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) {
{ __asm__("srlg %[n],%[n],5\n\t"
__asm__ volatile( "xgr %%r1,%%r1\n\t"
"srlg %%r0,%0,5 \n\t" "0:\n\t"
"xgr %%r1,%%r1 \n\t" "pfd 2, 1024(%%r1,%[x])\n\t"
"0: \n\t" "pfd 2, 1024(%%r1,%[y])\n\t"
"pfd 2, 1024(%%r1,%1) \n\t" "vl %%v16, 0(%%r1,%[x])\n\t"
"pfd 2, 1024(%%r1,%2) \n\t" "vl %%v17, 16(%%r1,%[x])\n\t"
"vl %%v18, 32(%%r1,%[x])\n\t"
"vl %%v16, 0(%%r1,%1) \n\t" "vl %%v19, 48(%%r1,%[x])\n\t"
"vl %%v17, 16(%%r1,%1) \n\t" "vl %%v20, 64(%%r1,%[x])\n\t"
"vl %%v18, 32(%%r1,%1) \n\t" "vl %%v21, 80(%%r1,%[x])\n\t"
"vl %%v19, 48(%%r1,%1) \n\t" "vl %%v22, 96(%%r1,%[x])\n\t"
"vl %%v20, 64(%%r1,%1) \n\t" "vl %%v23, 112(%%r1,%[x])\n\t"
"vl %%v21, 80(%%r1,%1) \n\t" "vl %%v24, 128(%%r1,%[x])\n\t"
"vl %%v22, 96(%%r1,%1) \n\t" "vl %%v25, 144(%%r1,%[x])\n\t"
"vl %%v23, 112(%%r1,%1) \n\t" "vl %%v26, 160(%%r1,%[x])\n\t"
"vl %%v24, 128(%%r1,%1) \n\t" "vl %%v27, 176(%%r1,%[x])\n\t"
"vl %%v25, 144(%%r1,%1) \n\t" "vl %%v28, 192(%%r1,%[x])\n\t"
"vl %%v26, 160(%%r1,%1) \n\t" "vl %%v29, 208(%%r1,%[x])\n\t"
"vl %%v27, 176(%%r1,%1) \n\t" "vl %%v30, 224(%%r1,%[x])\n\t"
"vl %%v28, 192(%%r1,%1) \n\t" "vl %%v31, 240(%%r1,%[x])\n\t"
"vl %%v29, 208(%%r1,%1) \n\t" "vl %%v0, 0(%%r1,%[y])\n\t"
"vl %%v30, 224(%%r1,%1) \n\t" "vl %%v1, 16(%%r1,%[y])\n\t"
"vl %%v31, 240(%%r1,%1) \n\t" "vl %%v2, 32(%%r1,%[y])\n\t"
"vl %%v3, 48(%%r1,%[y])\n\t"
"vl %%v0, 0(%%r1,%2) \n\t" "vl %%v4, 64(%%r1,%[y])\n\t"
"vl %%v1, 16(%%r1,%2) \n\t" "vl %%v5, 80(%%r1,%[y])\n\t"
"vl %%v2, 32(%%r1,%2) \n\t" "vl %%v6, 96(%%r1,%[y])\n\t"
"vl %%v3, 48(%%r1,%2) \n\t" "vl %%v7, 112(%%r1,%[y])\n\t"
"vl %%v4, 64(%%r1,%2) \n\t" "vst %%v0, 0(%%r1,%[x])\n\t"
"vl %%v5, 80(%%r1,%2) \n\t" "vst %%v1, 16(%%r1,%[x])\n\t"
"vl %%v6, 96(%%r1,%2) \n\t" "vst %%v2, 32(%%r1,%[x])\n\t"
"vl %%v7, 112(%%r1,%2) \n\t" "vst %%v3, 48(%%r1,%[x])\n\t"
"vst %%v0, 0(%%r1,%1) \n\t" "vst %%v4, 64(%%r1,%[x])\n\t"
"vst %%v1, 16(%%r1,%1) \n\t" "vst %%v5, 80(%%r1,%[x])\n\t"
"vst %%v2, 32(%%r1,%1) \n\t" "vst %%v6, 96(%%r1,%[x])\n\t"
"vst %%v3, 48(%%r1,%1) \n\t" "vst %%v7, 112(%%r1,%[x])\n\t"
"vst %%v4, 64(%%r1,%1) \n\t" "vl %%v0, 128(%%r1,%[y])\n\t"
"vst %%v5, 80(%%r1,%1) \n\t" "vl %%v1, 144(%%r1,%[y])\n\t"
"vst %%v6, 96(%%r1,%1) \n\t" "vl %%v2, 160(%%r1,%[y])\n\t"
"vst %%v7, 112(%%r1,%1) \n\t" "vl %%v3, 176(%%r1,%[y])\n\t"
"vl %%v4, 192(%%r1,%[y])\n\t"
"vl %%v0, 128(%%r1,%2) \n\t" "vl %%v5, 208(%%r1,%[y])\n\t"
"vl %%v1, 144(%%r1,%2) \n\t" "vl %%v6, 224(%%r1,%[y])\n\t"
"vl %%v2, 160(%%r1,%2) \n\t" "vl %%v7, 240(%%r1,%[y])\n\t"
"vl %%v3, 176(%%r1,%2) \n\t" "vst %%v0, 128(%%r1,%[x])\n\t"
"vl %%v4, 192(%%r1,%2) \n\t" "vst %%v1, 144(%%r1,%[x])\n\t"
"vl %%v5, 208(%%r1,%2) \n\t" "vst %%v2, 160(%%r1,%[x])\n\t"
"vl %%v6, 224(%%r1,%2) \n\t" "vst %%v3, 176(%%r1,%[x])\n\t"
"vl %%v7, 240(%%r1,%2) \n\t" "vst %%v4, 192(%%r1,%[x])\n\t"
"vst %%v0, 128(%%r1,%1) \n\t" "vst %%v5, 208(%%r1,%[x])\n\t"
"vst %%v1, 144(%%r1,%1) \n\t" "vst %%v6, 224(%%r1,%[x])\n\t"
"vst %%v2, 160(%%r1,%1) \n\t" "vst %%v7, 240(%%r1,%[x])\n\t"
"vst %%v3, 176(%%r1,%1) \n\t" "vst %%v16, 0(%%r1,%[y])\n\t"
"vst %%v4, 192(%%r1,%1) \n\t" "vst %%v17, 16(%%r1,%[y])\n\t"
"vst %%v5, 208(%%r1,%1) \n\t" "vst %%v18, 32(%%r1,%[y])\n\t"
"vst %%v6, 224(%%r1,%1) \n\t" "vst %%v19, 48(%%r1,%[y])\n\t"
"vst %%v7, 240(%%r1,%1) \n\t" "vst %%v20, 64(%%r1,%[y])\n\t"
"vst %%v21, 80(%%r1,%[y])\n\t"
"vst %%v16, 0(%%r1,%2) \n\t" "vst %%v22, 96(%%r1,%[y])\n\t"
"vst %%v17, 16(%%r1,%2) \n\t" "vst %%v23, 112(%%r1,%[y])\n\t"
"vst %%v18, 32(%%r1,%2) \n\t" "vst %%v24, 128(%%r1,%[y])\n\t"
"vst %%v19, 48(%%r1,%2) \n\t" "vst %%v25, 144(%%r1,%[y])\n\t"
"vst %%v20, 64(%%r1,%2) \n\t" "vst %%v26, 160(%%r1,%[y])\n\t"
"vst %%v21, 80(%%r1,%2) \n\t" "vst %%v27, 176(%%r1,%[y])\n\t"
"vst %%v22, 96(%%r1,%2) \n\t" "vst %%v28, 192(%%r1,%[y])\n\t"
"vst %%v23, 112(%%r1,%2) \n\t" "vst %%v29, 208(%%r1,%[y])\n\t"
"vst %%v24, 128(%%r1,%2) \n\t" "vst %%v30, 224(%%r1,%[y])\n\t"
"vst %%v25, 144(%%r1,%2) \n\t" "vst %%v31, 240(%%r1,%[y])\n\t"
"vst %%v26, 160(%%r1,%2) \n\t" "agfi %%r1,256\n\t"
"vst %%v27, 176(%%r1,%2) \n\t" "brctg %[n],0b"
"vst %%v28, 192(%%r1,%2) \n\t" : "+m"(*(struct { FLOAT x[n * 2]; } *) x),
"vst %%v29, 208(%%r1,%2) \n\t" "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n)
"vst %%v30, 224(%%r1,%2) \n\t" : [x] "a"(x),[y] "a"(y)
"vst %%v31, 240(%%r1,%2) \n\t" : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"agfi %%r1,256 \n\t" "v27", "v28", "v29", "v30", "v31");
"brctg %%r0,0b "
:
:"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
} }
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3,
{ FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
BLASLONG i=0; FLOAT *dummy, BLASLONG dummy2) {
BLASLONG ix=0,iy=0; BLASLONG i = 0;
FLOAT temp[2]; BLASLONG ix = 0, iy = 0;
BLASLONG inc_x2, inc_y2; FLOAT temp[2];
BLASLONG inc_x2, inc_y2;
if ( n <= 0 ) return(0); if (n <= 0)
return (0);
if ( (inc_x == 1) && (inc_y == 1 )) if ((inc_x == 1) && (inc_y == 1)) {
{
BLASLONG n1 = n & -32; BLASLONG n1 = n & -32;
if ( n1 > 0 ) if (n1 > 0) {
{ cswap_kernel_32(n1, x, y);
cswap_kernel_32(n1, x, y); i = n1;
i=n1; ix = 2 * n1;
ix = 2* n1; iy = 2 * n1;
iy = 2* n1; }
}
while(i < n) while (i < n) {
{
temp[0] = x[ix] ; temp[0] = x[ix];
temp[1] = x[ix+1] ; temp[1] = x[ix + 1];
x[ix] = y[iy] ; x[ix] = y[iy];
x[ix+1] = y[iy+1] ; x[ix + 1] = y[iy + 1];
y[iy] = temp[0] ; y[iy] = temp[0];
y[iy+1] = temp[1] ; y[iy + 1] = temp[1];
ix += 2 ;
iy += 2 ;
i++ ;
}
ix += 2;
iy += 2;
i++;
} }
else
{
inc_x2 = 2 * inc_x; } else {
inc_y2 = 2 * inc_y;
while(i < n) inc_x2 = 2 * inc_x;
{ inc_y2 = 2 * inc_y;
temp[0] = x[ix] ; while (i < n) {
temp[1] = x[ix+1] ;
x[ix] = y[iy] ;
x[ix+1] = y[iy+1] ;
y[iy] = temp[0] ;
y[iy+1] = temp[1] ;
ix += inc_x2 ; temp[0] = x[ix];
iy += inc_y2 ; temp[1] = x[ix + 1];
i++ ; x[ix] = y[iy];
x[ix + 1] = y[iy + 1];
y[iy] = temp[0];
y[iy + 1] = temp[1];
} ix += inc_x2;
iy += inc_y2;
i++;
} }
return(0);
}
return (0);
} }

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -28,139 +28,123 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>
#if defined(DOUBLE)
#define ABS fabs #define ABS fabs
#else
#define ABS fabsf
#endif
static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) {
{ FLOAT amax;
FLOAT amax;
__asm__ volatile ( __asm__("vl %%v0,0(%[x])\n\t"
"vl %%v0,0(%2) \n\t" "srlg %[n],%[n],5\n\t"
"srlg %%r0,%1,5 \n\t" "xgr %%r1,%%r1\n\t"
"xgr %%r1,%%r1 \n\t" "0:\n\t"
"0: \n\t" "pfd 1, 1024(%%r1,%[x])\n\t"
"pfd 1, 1024(%%r1,%2) \n\t" "vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v24,128(%%r1,%[x])\n\t"
"vl %%v25,144(%%r1,%[x])\n\t"
"vl %%v26,160(%%r1,%[x])\n\t"
"vl %%v27,176(%%r1,%[x])\n\t"
"vl %%v28,192(%%r1,%[x])\n\t"
"vl %%v29,208(%%r1,%[x])\n\t"
"vl %%v30,224(%%r1,%[x])\n\t"
"vl %%v31,240(%%r1,%[x])\n\t"
"vfmaxdb %%v16,%%v16,%%v24,8\n\t"
"vfmaxdb %%v17,%%v17,%%v25,8\n\t"
"vfmaxdb %%v18,%%v18,%%v26,8\n\t"
"vfmaxdb %%v19,%%v19,%%v27,8\n\t"
"vfmaxdb %%v20,%%v20,%%v28,8\n\t"
"vfmaxdb %%v21,%%v21,%%v29,8\n\t"
"vfmaxdb %%v22,%%v22,%%v30,8\n\t"
"vfmaxdb %%v23,%%v23,%%v31,8\n\t"
"vfmaxdb %%v16,%%v16,%%v20,8\n\t"
"vfmaxdb %%v17,%%v17,%%v21,8\n\t"
"vfmaxdb %%v18,%%v18,%%v22,8\n\t"
"vfmaxdb %%v19,%%v19,%%v23,8\n\t"
"vfmaxdb %%v16,%%v16,%%v18,8\n\t"
"vfmaxdb %%v17,%%v17,%%v19,8\n\t"
"vfmaxdb %%v16,%%v16,%%v17,8\n\t"
"vfmaxdb %%v0,%%v0,%%v16,8\n\t"
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
"vrepg %%v16,%%v0,1\n\t"
"wfmaxdb %%v0,%%v0,%%v16,8\n\t"
"lpdr %[amax],%%f0"
: [amax] "=f"(amax),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
"vl %%v16,0(%%r1,%2) \n\t" return amax;
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"vl %%v24,128(%%r1,%2) \n\t"
"vl %%v25,144(%%r1,%2) \n\t"
"vl %%v26,160(%%r1,%2) \n\t"
"vl %%v27,176(%%r1,%2) \n\t"
"vl %%v28,192(%%r1,%2) \n\t"
"vl %%v29,208(%%r1,%2) \n\t"
"vl %%v30,224(%%r1,%2) \n\t"
"vl %%v31,240(%%r1,%2) \n\t"
"vfmaxdb %%v16,%%v16,%%v24,8 \n\t"
"vfmaxdb %%v17,%%v17,%%v25,8 \n\t"
"vfmaxdb %%v18,%%v18,%%v26,8 \n\t"
"vfmaxdb %%v19,%%v19,%%v27,8 \n\t"
"vfmaxdb %%v20,%%v20,%%v28,8 \n\t"
"vfmaxdb %%v21,%%v21,%%v29,8 \n\t"
"vfmaxdb %%v22,%%v22,%%v30,8 \n\t"
"vfmaxdb %%v23,%%v23,%%v31,8 \n\t"
"vfmaxdb %%v16,%%v16,%%v20,8 \n\t"
"vfmaxdb %%v17,%%v17,%%v21,8 \n\t"
"vfmaxdb %%v18,%%v18,%%v22,8 \n\t"
"vfmaxdb %%v19,%%v19,%%v23,8 \n\t"
"vfmaxdb %%v16,%%v16,%%v18,8 \n\t"
"vfmaxdb %%v17,%%v17,%%v19,8 \n\t"
"vfmaxdb %%v16,%%v16,%%v17,8 \n\t"
"vfmaxdb %%v0,%%v0,%%v16,8 \n\t"
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"
"vrepg %%v16,%%v0,1 \n\t"
"wfmaxdb %%v0,%%v0,%%v16,8 \n\t"
"lpdr %0,%%f0 "
:"=f"(amax)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return amax;
} }
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0; BLASLONG i = 0;
BLASLONG j = 0; BLASLONG j = 0;
FLOAT maxf = 0.0; FLOAT maxf = 0.0;
if (n <= 0 || inc_x <= 0) return (maxf); if (n <= 0 || inc_x <= 0)
return (maxf);
if (inc_x == 1) { if (inc_x == 1) {
BLASLONG n1 = n & -32; BLASLONG n1 = n & -32;
if (n1 > 0) { if (n1 > 0) {
maxf = damax_kernel_32(n1, x); maxf = damax_kernel_32(n1, x);
i = n1;
}
else
{
maxf=ABS(x[0]);
i++;
}
while (i < n) {
if (ABS(x[i]) > maxf) {
maxf = ABS(x[i]);
}
i++;
}
return (maxf);
i = n1;
} else { } else {
maxf = ABS(x[0]);
maxf=ABS(x[0]); i++;
BLASLONG n1 = n & -4;
while (j < n1) {
if (ABS(x[i]) > maxf) {
maxf = ABS(x[i]);
}
if (ABS(x[i + inc_x]) > maxf) {
maxf = ABS(x[i + inc_x]);
}
if (ABS(x[i + 2 * inc_x]) > maxf) {
maxf = ABS(x[i + 2 * inc_x]);
}
if (ABS(x[i + 3 * inc_x]) > maxf) {
maxf = ABS(x[i + 3 * inc_x]);
}
i += inc_x * 4;
j += 4;
}
while (j < n) {
if (ABS(x[i]) > maxf) {
maxf = ABS(x[i]);
}
i += inc_x;
j++;
}
return (maxf);
} }
while (i < n) {
if (ABS(x[i]) > maxf) {
maxf = ABS(x[i]);
}
i++;
}
return (maxf);
} else {
maxf = ABS(x[0]);
BLASLONG n1 = n & -4;
while (j < n1) {
if (ABS(x[i]) > maxf) {
maxf = ABS(x[i]);
}
if (ABS(x[i + inc_x]) > maxf) {
maxf = ABS(x[i + inc_x]);
}
if (ABS(x[i + 2 * inc_x]) > maxf) {
maxf = ABS(x[i + 2 * inc_x]);
}
if (ABS(x[i + 3 * inc_x]) > maxf) {
maxf = ABS(x[i + 3 * inc_x]);
}
i += inc_x * 4;
j += 4;
}
while (j < n) {
if (ABS(x[i]) > maxf) {
maxf = ABS(x[i]);
}
i += inc_x;
j++;
}
return (maxf);
}
} }

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -28,177 +28,157 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>
#if defined(DOUBLE)
#define ABS fabs #define ABS fabs
#else
#define ABS fabsf
#endif
static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) {
{ FLOAT amax;
FLOAT amax;
__asm__ volatile ( __asm__("vl %%v0,0(%[x])\n\t"
"vl %%v0,0(%2) \n\t" "vflpdb %%v0,%%v0\n\t"
"vflpdb %%v0,%%v0 \n\t" "srlg %[n],%[n],5\n\t"
"srlg %%r0,%1,5 \n\t" "xgr %%r1,%%r1\n\t"
"xgr %%r1,%%r1 \n\t" "0:\n\t"
"0: \n\t" "pfd 1, 1024(%%r1,%[x])\n\t"
"pfd 1, 1024(%%r1,%2) \n\t" "vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
"vflpdb %%v19, %%v19\n\t"
"vflpdb %%v20, %%v20\n\t"
"vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t"
"vfchdb %%v24,%%v16,%%v17\n\t"
"vfchdb %%v25,%%v18,%%v19\n\t"
"vfchdb %%v26,%%v20,%%v21\n\t"
"vfchdb %%v27,%%v22,%%v23\n\t"
"vsel %%v24,%%v16,%%v17,%%v24\n\t"
"vsel %%v25,%%v18,%%v19,%%v25\n\t"
"vsel %%v26,%%v20,%%v21,%%v26\n\t"
"vsel %%v27,%%v22,%%v23,%%v27\n\t"
"vfchdb %%v28,%%v24,%%v25\n\t"
"vfchdb %%v29,%%v26,%%v27\n\t"
"vsel %%v28,%%v24,%%v25,%%v28\n\t"
"vsel %%v29,%%v26,%%v27,%%v29\n\t"
"vfchdb %%v30,%%v28,%%v29\n\t"
"vsel %%v30,%%v28,%%v29,%%v30\n\t"
"vfchdb %%v31,%%v30,%%v0\n\t"
"vsel %%v0,%%v30,%%v0,%%v31\n\t"
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v17,144(%%r1,%[x])\n\t"
"vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v19,176(%%r1,%[x])\n\t"
"vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v21,208(%%r1,%[x])\n\t"
"vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v23,240(%%r1,%[x])\n\t"
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
"vflpdb %%v19, %%v19\n\t"
"vflpdb %%v20, %%v20\n\t"
"vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t"
"vfchdb %%v24,%%v16,%%v17\n\t"
"vfchdb %%v25,%%v18,%%v19\n\t"
"vfchdb %%v26,%%v20,%%v21\n\t"
"vfchdb %%v27,%%v22,%%v23\n\t"
"vsel %%v24,%%v16,%%v17,%%v24\n\t"
"vsel %%v25,%%v18,%%v19,%%v25\n\t"
"vsel %%v26,%%v20,%%v21,%%v26\n\t"
"vsel %%v27,%%v22,%%v23,%%v27\n\t"
"vfchdb %%v28,%%v24,%%v25\n\t"
"vfchdb %%v29,%%v26,%%v27\n\t"
"vsel %%v28,%%v24,%%v25,%%v28\n\t"
"vsel %%v29,%%v26,%%v27,%%v29\n\t"
"vfchdb %%v30,%%v28,%%v29\n\t"
"vsel %%v30,%%v28,%%v29,%%v30\n\t"
"vfchdb %%v31,%%v30,%%v0\n\t"
"vsel %%v0,%%v30,%%v0,%%v31\n\t"
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
"vrepg %%v16,%%v0,1\n\t"
"wfchdb %%v17,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v17\n\t"
"ldr %[amax],%%f0"
: [amax] "=f"(amax),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
"vl %%v16,0(%%r1,%2) \n\t" return amax;
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
"vfchdb %%v24,%%v16,%%v17 \n\t"
"vfchdb %%v25,%%v18,%%v19 \n\t"
"vfchdb %%v26,%%v20,%%v21 \n\t"
"vfchdb %%v27,%%v22,%%v23 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
"vsel %%v26,%%v20,%%v21,%%v26 \n\t"
"vsel %%v27,%%v22,%%v23,%%v27 \n\t"
"vfchdb %%v28,%%v24,%%v25 \n\t"
"vfchdb %%v29,%%v26,%%v27 \n\t"
"vsel %%v28,%%v24,%%v25,%%v28 \n\t"
"vsel %%v29,%%v26,%%v27,%%v29 \n\t"
"vfchdb %%v30,%%v28,%%v29 \n\t"
"vsel %%v30,%%v28,%%v29,%%v30 \n\t"
"vfchdb %%v31,%%v30,%%v0 \n\t"
"vsel %%v0,%%v30,%%v0,%%v31 \n\t"
"vl %%v16,128(%%r1,%2) \n\t"
"vl %%v17,144(%%r1,%2) \n\t"
"vl %%v18,160(%%r1,%2) \n\t"
"vl %%v19,176(%%r1,%2) \n\t"
"vl %%v20,192(%%r1,%2) \n\t"
"vl %%v21,208(%%r1,%2) \n\t"
"vl %%v22,224(%%r1,%2) \n\t"
"vl %%v23,240(%%r1,%2) \n\t"
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
"vfchdb %%v24,%%v16,%%v17 \n\t"
"vfchdb %%v25,%%v18,%%v19 \n\t"
"vfchdb %%v26,%%v20,%%v21 \n\t"
"vfchdb %%v27,%%v22,%%v23 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
"vsel %%v26,%%v20,%%v21,%%v26 \n\t"
"vsel %%v27,%%v22,%%v23,%%v27 \n\t"
"vfchdb %%v28,%%v24,%%v25 \n\t"
"vfchdb %%v29,%%v26,%%v27 \n\t"
"vsel %%v28,%%v24,%%v25,%%v28 \n\t"
"vsel %%v29,%%v26,%%v27,%%v29 \n\t"
"vfchdb %%v30,%%v28,%%v29 \n\t"
"vsel %%v30,%%v28,%%v29,%%v30 \n\t"
"vfchdb %%v31,%%v30,%%v0 \n\t"
"vsel %%v0,%%v30,%%v0,%%v31 \n\t"
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"
"vrepg %%v16,%%v0,1 \n\t"
"wfchdb %%v17,%%v0,%%v16 \n\t"
"vsel %%v0,%%v0,%%v16,%%v17 \n\t"
"ldr %0,%%f0 "
:"=f"(amax)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return amax;
} }
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0; BLASLONG i = 0;
BLASLONG j = 0; BLASLONG j = 0;
FLOAT maxf = 0.0; FLOAT maxf = 0.0;
if (n <= 0 || inc_x <= 0) return (maxf); if (n <= 0 || inc_x <= 0)
return (maxf);
if (inc_x == 1) { if (inc_x == 1) {
BLASLONG n1 = n & -32; BLASLONG n1 = n & -32;
if (n1 > 0) { if (n1 > 0) {
maxf = damax_kernel_32(n1, x); maxf = damax_kernel_32(n1, x);
i = n1;
}
else
{
maxf=ABS(x[0]);
i++;
}
while (i < n) {
if (ABS(x[i]) > maxf) {
maxf = ABS(x[i]);
}
i++;
}
return (maxf);
i = n1;
} else { } else {
maxf = ABS(x[0]);
maxf=ABS(x[0]); i++;
BLASLONG n1 = n & -4;
while (j < n1) {
if (ABS(x[i]) > maxf) {
maxf = ABS(x[i]);
}
if (ABS(x[i + inc_x]) > maxf) {
maxf = ABS(x[i + inc_x]);
}
if (ABS(x[i + 2 * inc_x]) > maxf) {
maxf = ABS(x[i + 2 * inc_x]);
}
if (ABS(x[i + 3 * inc_x]) > maxf) {
maxf = ABS(x[i + 3 * inc_x]);
}
i += inc_x * 4;
j += 4;
}
while (j < n) {
if (ABS(x[i]) > maxf) {
maxf = ABS(x[i]);
}
i += inc_x;
j++;
}
return (maxf);
} }
while (i < n) {
if (ABS(x[i]) > maxf) {
maxf = ABS(x[i]);
}
i++;
}
return (maxf);
} else {
maxf = ABS(x[0]);
BLASLONG n1 = n & -4;
while (j < n1) {
if (ABS(x[i]) > maxf) {
maxf = ABS(x[i]);
}
if (ABS(x[i + inc_x]) > maxf) {
maxf = ABS(x[i + inc_x]);
}
if (ABS(x[i + 2 * inc_x]) > maxf) {
maxf = ABS(x[i + 2 * inc_x]);
}
if (ABS(x[i + 3 * inc_x]) > maxf) {
maxf = ABS(x[i + 3 * inc_x]);
}
i += inc_x * 4;
j += 4;
}
while (j < n) {
if (ABS(x[i]) > maxf) {
maxf = ABS(x[i]);
}
i += inc_x;
j++;
}
return (maxf);
}
} }

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -28,139 +28,123 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>
#if defined(DOUBLE)
#define ABS fabs #define ABS fabs
#else
#define ABS fabsf
#endif
static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) {
{ FLOAT amin;
FLOAT amin;
__asm__ volatile ( __asm__("vl %%v0,0(%[x])\n\t"
"vl %%v0,0(%2) \n\t" "srlg %[n],%[n],5\n\t"
"srlg %%r0,%1,5 \n\t" "xgr %%r1,%%r1\n\t"
"xgr %%r1,%%r1 \n\t" "0:\n\t"
"0: \n\t" "pfd 1, 1024(%%r1,%[x])\n\t"
"pfd 1, 1024(%%r1,%2) \n\t" "vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v24,128(%%r1,%[x])\n\t"
"vl %%v25,144(%%r1,%[x])\n\t"
"vl %%v26,160(%%r1,%[x])\n\t"
"vl %%v27,176(%%r1,%[x])\n\t"
"vl %%v28,192(%%r1,%[x])\n\t"
"vl %%v29,208(%%r1,%[x])\n\t"
"vl %%v30,224(%%r1,%[x])\n\t"
"vl %%v31,240(%%r1,%[x])\n\t"
"vfmindb %%v16,%%v16,%%v24,8\n\t"
"vfmindb %%v17,%%v17,%%v25,8\n\t"
"vfmindb %%v18,%%v18,%%v26,8\n\t"
"vfmindb %%v19,%%v19,%%v27,8\n\t"
"vfmindb %%v20,%%v20,%%v28,8\n\t"
"vfmindb %%v21,%%v21,%%v29,8\n\t"
"vfmindb %%v22,%%v22,%%v30,8\n\t"
"vfmindb %%v23,%%v23,%%v31,8\n\t"
"vfmindb %%v16,%%v16,%%v20,8\n\t"
"vfmindb %%v17,%%v17,%%v21,8\n\t"
"vfmindb %%v18,%%v18,%%v22,8\n\t"
"vfmindb %%v19,%%v19,%%v23,8\n\t"
"vfmindb %%v16,%%v16,%%v18,8\n\t"
"vfmindb %%v17,%%v17,%%v19,8\n\t"
"vfmindb %%v16,%%v16,%%v17,8\n\t"
"vfmindb %%v0,%%v0,%%v16,8\n\t"
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
"vrepg %%v16,%%v0,1\n\t"
"wfmindb %%v0,%%v0,%%v16,8\n\t"
"lpdr %[amin],%%f0"
: [amin] "=f"(amin),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
"vl %%v16,0(%%r1,%2) \n\t" return amin;
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"vl %%v24,128(%%r1,%2) \n\t"
"vl %%v25,144(%%r1,%2) \n\t"
"vl %%v26,160(%%r1,%2) \n\t"
"vl %%v27,176(%%r1,%2) \n\t"
"vl %%v28,192(%%r1,%2) \n\t"
"vl %%v29,208(%%r1,%2) \n\t"
"vl %%v30,224(%%r1,%2) \n\t"
"vl %%v31,240(%%r1,%2) \n\t"
"vfmindb %%v16,%%v16,%%v24,8 \n\t"
"vfmindb %%v17,%%v17,%%v25,8 \n\t"
"vfmindb %%v18,%%v18,%%v26,8 \n\t"
"vfmindb %%v19,%%v19,%%v27,8 \n\t"
"vfmindb %%v20,%%v20,%%v28,8 \n\t"
"vfmindb %%v21,%%v21,%%v29,8 \n\t"
"vfmindb %%v22,%%v22,%%v30,8 \n\t"
"vfmindb %%v23,%%v23,%%v31,8 \n\t"
"vfmindb %%v16,%%v16,%%v20,8 \n\t"
"vfmindb %%v17,%%v17,%%v21,8 \n\t"
"vfmindb %%v18,%%v18,%%v22,8 \n\t"
"vfmindb %%v19,%%v19,%%v23,8 \n\t"
"vfmindb %%v16,%%v16,%%v18,8 \n\t"
"vfmindb %%v17,%%v17,%%v19,8 \n\t"
"vfmindb %%v16,%%v16,%%v17,8 \n\t"
"vfmindb %%v0,%%v0,%%v16,8 \n\t"
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"
"vrepg %%v16,%%v0,1 \n\t"
"wfmindb %%v0,%%v0,%%v16,8 \n\t"
"lpdr %0,%%f0 "
:"=f"(amin)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return amin;
} }
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0; BLASLONG i = 0;
BLASLONG j = 0; BLASLONG j = 0;
FLOAT minf = 0.0; FLOAT minf = 0.0;
if (n <= 0 || inc_x <= 0) return (minf); if (n <= 0 || inc_x <= 0)
return (minf);
if (inc_x == 1) { if (inc_x == 1) {
BLASLONG n1 = n & -32; BLASLONG n1 = n & -32;
if (n1 > 0) { if (n1 > 0) {
minf = damin_kernel_32(n1, x); minf = damin_kernel_32(n1, x);
i = n1;
}
else
{
minf=ABS(x[0]);
i++;
}
while (i < n) {
if (ABS(x[i]) < minf) {
minf = ABS(x[i]);
}
i++;
}
return (minf);
i = n1;
} else { } else {
minf = ABS(x[0]);
minf=ABS(x[0]); i++;
BLASLONG n1 = n & -4;
while (j < n1) {
if (ABS(x[i]) < minf) {
minf = ABS(x[i]);
}
if (ABS(x[i + inc_x]) < minf) {
minf = ABS(x[i + inc_x]);
}
if (ABS(x[i + 2 * inc_x]) < minf) {
minf = ABS(x[i + 2 * inc_x]);
}
if (ABS(x[i + 3 * inc_x]) < minf) {
minf = ABS(x[i + 3 * inc_x]);
}
i += inc_x * 4;
j += 4;
}
while (j < n) {
if (ABS(x[i]) < minf) {
minf = ABS(x[i]);
}
i += inc_x;
j++;
}
return (minf);
} }
while (i < n) {
if (ABS(x[i]) < minf) {
minf = ABS(x[i]);
}
i++;
}
return (minf);
} else {
minf = ABS(x[0]);
BLASLONG n1 = n & -4;
while (j < n1) {
if (ABS(x[i]) < minf) {
minf = ABS(x[i]);
}
if (ABS(x[i + inc_x]) < minf) {
minf = ABS(x[i + inc_x]);
}
if (ABS(x[i + 2 * inc_x]) < minf) {
minf = ABS(x[i + 2 * inc_x]);
}
if (ABS(x[i + 3 * inc_x]) < minf) {
minf = ABS(x[i + 3 * inc_x]);
}
i += inc_x * 4;
j += 4;
}
while (j < n) {
if (ABS(x[i]) < minf) {
minf = ABS(x[i]);
}
i += inc_x;
j++;
}
return (minf);
}
} }

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -28,177 +28,157 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>
#if defined(DOUBLE)
#define ABS fabs #define ABS fabs
#else
#define ABS fabsf
#endif
/*
 * Vector kernel: minimum absolute value of n contiguous doubles at x.
 *
 * The loop count is n >> 5 and every iteration consumes 256 bytes
 * (32 doubles); the caller in this file passes a positive multiple of 32.
 *
 * v0 holds the running per-lane minimum and is seeded with |x[0..1]|.
 * Each half-iteration loads eight vectors, takes absolute values
 * (vflpdb), and reduces them pairwise: vfchdb builds an "a > b" mask and
 * vsel uses it to keep the smaller lane.  After the loop the two lanes
 * of v0 are reduced the same way and the result is read from f0.
 */
static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) {
  FLOAT amin;

  __asm__("vl %%v0,0(%[x])\n\t"
          "vflpdb %%v0,%%v0\n\t"
          "srlg %[n],%[n],5\n\t"
          "xgr %%r1,%%r1\n\t"
          "0:\n\t"
          "pfd 1, 1024(%%r1,%[x])\n\t"
          /* first 16 doubles of the iteration */
          "vl %%v16,0(%%r1,%[x])\n\t"
          "vl %%v17,16(%%r1,%[x])\n\t"
          "vl %%v18,32(%%r1,%[x])\n\t"
          "vl %%v19,48(%%r1,%[x])\n\t"
          "vl %%v20,64(%%r1,%[x])\n\t"
          "vl %%v21,80(%%r1,%[x])\n\t"
          "vl %%v22,96(%%r1,%[x])\n\t"
          "vl %%v23,112(%%r1,%[x])\n\t"
          "vflpdb %%v16, %%v16\n\t"
          "vflpdb %%v17, %%v17\n\t"
          "vflpdb %%v18, %%v18\n\t"
          "vflpdb %%v19, %%v19\n\t"
          "vflpdb %%v20, %%v20\n\t"
          "vflpdb %%v21, %%v21\n\t"
          "vflpdb %%v22, %%v22\n\t"
          "vflpdb %%v23, %%v23\n\t"
          "vfchdb %%v24,%%v17,%%v16\n\t"
          "vfchdb %%v25,%%v19,%%v18\n\t"
          "vfchdb %%v26,%%v21,%%v20\n\t"
          "vfchdb %%v27,%%v23,%%v22\n\t"
          "vsel %%v24,%%v16,%%v17,%%v24\n\t"
          "vsel %%v25,%%v18,%%v19,%%v25\n\t"
          "vsel %%v26,%%v20,%%v21,%%v26\n\t"
          "vsel %%v27,%%v22,%%v23,%%v27\n\t"
          "vfchdb %%v28,%%v25,%%v24\n\t"
          "vfchdb %%v29,%%v27,%%v26\n\t"
          "vsel %%v28,%%v24,%%v25,%%v28\n\t"
          "vsel %%v29,%%v26,%%v27,%%v29\n\t"
          "vfchdb %%v30,%%v29,%%v28\n\t"
          "vsel %%v30,%%v28,%%v29,%%v30\n\t"
          "vfchdb %%v31,%%v0,%%v30\n\t"
          "vsel %%v0,%%v30,%%v0,%%v31\n\t"
          /* second 16 doubles of the iteration */
          "vl %%v16,128(%%r1,%[x])\n\t"
          "vl %%v17,144(%%r1,%[x])\n\t"
          "vl %%v18,160(%%r1,%[x])\n\t"
          "vl %%v19,176(%%r1,%[x])\n\t"
          "vl %%v20,192(%%r1,%[x])\n\t"
          "vl %%v21,208(%%r1,%[x])\n\t"
          "vl %%v22,224(%%r1,%[x])\n\t"
          "vl %%v23,240(%%r1,%[x])\n\t"
          "vflpdb %%v16, %%v16\n\t"
          "vflpdb %%v17, %%v17\n\t"
          "vflpdb %%v18, %%v18\n\t"
          "vflpdb %%v19, %%v19\n\t"
          "vflpdb %%v20, %%v20\n\t"
          "vflpdb %%v21, %%v21\n\t"
          "vflpdb %%v22, %%v22\n\t"
          "vflpdb %%v23, %%v23\n\t"
          "vfchdb %%v24,%%v17,%%v16\n\t"
          "vfchdb %%v25,%%v19,%%v18\n\t"
          "vfchdb %%v26,%%v21,%%v20\n\t"
          "vfchdb %%v27,%%v23,%%v22\n\t"
          "vsel %%v24,%%v16,%%v17,%%v24\n\t"
          "vsel %%v25,%%v18,%%v19,%%v25\n\t"
          "vsel %%v26,%%v20,%%v21,%%v26\n\t"
          "vsel %%v27,%%v22,%%v23,%%v27\n\t"
          "vfchdb %%v28,%%v25,%%v24\n\t"
          "vfchdb %%v29,%%v27,%%v26\n\t"
          "vsel %%v28,%%v24,%%v25,%%v28\n\t"
          "vsel %%v29,%%v26,%%v27,%%v29\n\t"
          "vfchdb %%v30,%%v29,%%v28\n\t"
          "vsel %%v30,%%v28,%%v29,%%v30\n\t"
          "vfchdb %%v31,%%v0,%%v30\n\t"
          "vsel %%v0,%%v30,%%v0,%%v31\n\t"
          "agfi %%r1, 256\n\t"
          "brctg %[n], 0b\n\t"
          /* fold lane 1 into lane 0 */
          "vrepg %%v16,%%v0,1\n\t"
          "wfchdb %%v17,%%v16,%%v0\n\t"
          "vsel %%v0,%%v0,%%v16,%%v17\n\t"
          "ldr %[amin],%%f0"
          : [amin] "=f"(amin),[n] "+&r"(n)
          : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
          : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
            "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");

  return amin;
}
/*
 * Return the smallest absolute value among n elements of x read with
 * stride inc_x.
 *
 * Returns 0.0 when n <= 0 or inc_x <= 0.
 *
 * For unit stride the leading multiple-of-32 prefix is handed to
 * damin_kernel_32 and the remaining elements are folded in one by one.
 * For any other stride a scalar loop, unrolled by four, is used.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
  BLASLONG i = 0; /* element offset into x */
  BLASLONG j = 0; /* number of elements visited (strided path only) */
  FLOAT minf = 0.0;

  if (n <= 0 || inc_x <= 0)
    return (minf);

  if (inc_x == 1) {

    /* Largest multiple of 32 that does not exceed n. */
    BLASLONG n1 = n & -32;
    if (n1 > 0) {

      minf = damin_kernel_32(n1, x);
      i = n1;
    } else {
      /* Fewer than 32 elements: seed the running minimum with |x[0]|. */
      minf = ABS(x[0]);
      i++;
    }

    /* Scalar tail. */
    while (i < n) {
      if (ABS(x[i]) < minf) {
        minf = ABS(x[i]);
      }
      i++;
    }
    return (minf);

  } else {

    minf = ABS(x[0]);

    /* Largest multiple of 4 that does not exceed n. */
    BLASLONG n1 = n & -4;
    while (j < n1) {

      if (ABS(x[i]) < minf) {
        minf = ABS(x[i]);
      }
      if (ABS(x[i + inc_x]) < minf) {
        minf = ABS(x[i + inc_x]);
      }
      if (ABS(x[i + 2 * inc_x]) < minf) {
        minf = ABS(x[i + 2 * inc_x]);
      }
      if (ABS(x[i + 3 * inc_x]) < minf) {
        minf = ABS(x[i + 3 * inc_x]);
      }

      i += inc_x * 4;

      j += 4;

    }

    /* Remaining 0..3 elements. */
    while (j < n) {
      if (ABS(x[i]) < minf) {
        minf = ABS(x[i]);
      }
      i += inc_x;
      j++;
    }
    return (minf);
  }
}

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -28,145 +28,139 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>
#if defined(DOUBLE) #define ABS fabs
#define ABS fabs
#else
#define ABS fabsf
#endif
/*
 * Vector kernel: sum of absolute values of n contiguous doubles at x.
 *
 * The loop count is n >> 5 and every iteration consumes 256 bytes
 * (32 doubles); the caller in this file passes a positive multiple of 32.
 *
 * v24..v31 are eight independent two-lane accumulators, zeroed up front.
 * Each half-iteration loads eight vectors into v16..v23, takes absolute
 * values (vflpdb) and adds them into the matching accumulator.  After the
 * loop the accumulators are summed into v24, its two lanes are added, and
 * lane 0 is stored straight into asum.
 */
static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) {
  FLOAT asum;

  __asm__("vzero %%v24\n\t"
          "vzero %%v25\n\t"
          "vzero %%v26\n\t"
          "vzero %%v27\n\t"
          "vzero %%v28\n\t"
          "vzero %%v29\n\t"
          "vzero %%v30\n\t"
          "vzero %%v31\n\t"
          "srlg %[n],%[n],5\n\t"
          "xgr %%r1,%%r1\n\t"
          "0:\n\t"
          "pfd 1, 1024(%%r1,%[x])\n\t"
          /* first 16 doubles of the iteration */
          "vl %%v16, 0(%%r1,%[x])\n\t"
          "vl %%v17, 16(%%r1,%[x])\n\t"
          "vl %%v18, 32(%%r1,%[x])\n\t"
          "vl %%v19, 48(%%r1,%[x])\n\t"
          "vl %%v20, 64(%%r1,%[x])\n\t"
          "vl %%v21, 80(%%r1,%[x])\n\t"
          "vl %%v22, 96(%%r1,%[x])\n\t"
          "vl %%v23, 112(%%r1,%[x])\n\t"
          "vflpdb %%v16, %%v16\n\t"
          "vflpdb %%v17, %%v17\n\t"
          "vflpdb %%v18, %%v18\n\t"
          "vflpdb %%v19, %%v19\n\t"
          "vflpdb %%v20, %%v20\n\t"
          "vflpdb %%v21, %%v21\n\t"
          "vflpdb %%v22, %%v22\n\t"
          "vflpdb %%v23, %%v23\n\t"
          "vfadb %%v24,%%v24,%%v16\n\t"
          "vfadb %%v25,%%v25,%%v17\n\t"
          "vfadb %%v26,%%v26,%%v18\n\t"
          "vfadb %%v27,%%v27,%%v19\n\t"
          "vfadb %%v28,%%v28,%%v20\n\t"
          "vfadb %%v29,%%v29,%%v21\n\t"
          "vfadb %%v30,%%v30,%%v22\n\t"
          "vfadb %%v31,%%v31,%%v23\n\t"
          /* second 16 doubles of the iteration */
          "vl %%v16, 128(%%r1,%[x])\n\t"
          "vl %%v17, 144(%%r1,%[x])\n\t"
          "vl %%v18, 160(%%r1,%[x])\n\t"
          "vl %%v19, 176(%%r1,%[x])\n\t"
          "vl %%v20, 192(%%r1,%[x])\n\t"
          "vl %%v21, 208(%%r1,%[x])\n\t"
          "vl %%v22, 224(%%r1,%[x])\n\t"
          "vl %%v23, 240(%%r1,%[x])\n\t"
          "vflpdb %%v16, %%v16\n\t"
          "vflpdb %%v17, %%v17\n\t"
          "vflpdb %%v18, %%v18\n\t"
          "vflpdb %%v19, %%v19\n\t"
          "vflpdb %%v20, %%v20\n\t"
          "vflpdb %%v21, %%v21\n\t"
          "vflpdb %%v22, %%v22\n\t"
          "vflpdb %%v23, %%v23\n\t"
          "vfadb %%v24,%%v24,%%v16\n\t"
          "vfadb %%v25,%%v25,%%v17\n\t"
          "vfadb %%v26,%%v26,%%v18\n\t"
          "vfadb %%v27,%%v27,%%v19\n\t"
          "vfadb %%v28,%%v28,%%v20\n\t"
          "vfadb %%v29,%%v29,%%v21\n\t"
          "vfadb %%v30,%%v30,%%v22\n\t"
          "vfadb %%v31,%%v31,%%v23\n\t"
          "agfi %%r1,256\n\t"
          "brctg %[n],0b\n\t"
          /* collapse the eight accumulators, then the two lanes */
          "vfadb %%v24,%%v24,%%v25\n\t"
          "vfadb %%v24,%%v24,%%v26\n\t"
          "vfadb %%v24,%%v24,%%v27\n\t"
          "vfadb %%v24,%%v24,%%v28\n\t"
          "vfadb %%v24,%%v24,%%v29\n\t"
          "vfadb %%v24,%%v24,%%v30\n\t"
          "vfadb %%v24,%%v24,%%v31\n\t"
          "vrepg %%v25,%%v24,1\n\t"
          "vfadb %%v24,%%v24,%%v25\n\t"
          "vsteg %%v24,%[asum],0"
          : [asum] "=Q"(asum),[n] "+&r"(n)
          : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
          : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
            "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");

  return asum;
}
/*
 * Return the sum of absolute values of n elements of x read with stride
 * inc_x.
 *
 * Returns 0.0 when n <= 0 or inc_x <= 0.
 *
 * For unit stride the leading multiple-of-32 prefix is handed to
 * dasum_kernel_32 and the rest is accumulated one element at a time.
 * For any other stride a scalar loop unrolled by four feeds two partial
 * sums that are combined before the tail is added.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
  BLASLONG i = 0; /* element offset into x */
  BLASLONG j = 0; /* number of elements visited (strided path only) */
  FLOAT sumf = 0.0;
  BLASLONG n1;

  if (n <= 0 || inc_x <= 0)
    return sumf;

  if (inc_x == 1) {

    /* Largest multiple of 32 that does not exceed n. */
    n1 = n & -32;

    if (n1 > 0) {

      sumf = dasum_kernel_32(n1, x);
      i = n1;
    }

    /* Scalar tail. */
    while (i < n) {
      sumf += ABS(x[i]);
      i++;
    }

  } else {
    FLOAT sum1 = 0.0;
    FLOAT sum2 = 0.0;

    /* Largest multiple of 4 that does not exceed n.  Reuses the
       function-scope n1 rather than declaring a shadowing local. */
    n1 = n & -4;
    while (j < n1) {

      sum1 += ABS(x[i]);
      sum2 += ABS(x[i + inc_x]);
      sum1 += ABS(x[i + 2 * inc_x]);
      sum2 += ABS(x[i + 3 * inc_x]);

      i += inc_x * 4;
      j += 4;

    }
    sumf = sum1 + sum2;

    /* Remaining 0..3 elements. */
    while (j < n) {

      sumf += ABS(x[i]);
      i += inc_x;
      j++;
    }

  }
  return sumf;
}

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -27,158 +27,141 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
/*
 * Vector kernel: y[k] += (*alpha) * x[k] for n contiguous doubles.
 *
 * The loop count is n >> 5 and every iteration processes 256 bytes
 * (32 doubles) of x and y; the caller in this file passes a non-zero
 * multiple of 32.
 *
 * *alpha is replicated into both lanes of v0.  Each half-iteration loads
 * eight vectors of x (v16..v19, v24..v27) and eight of y (v20..v23,
 * v28..v31), computes alpha * x + y with vfmadb into the x registers and
 * stores those back to y.
 */
static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) {
  __asm__("vlrepg %%v0,%[alpha]\n\t"
          "srlg %[n],%[n],5\n\t"
          "xgr %%r1,%%r1\n\t"
          "0:\n\t"
          "pfd 1, 1024(%%r1,%[x])\n\t"
          "pfd 2, 1024(%%r1,%[y])\n\t"
          /* first 16 doubles of the iteration */
          "vl %%v16,0(%%r1,%[x])\n\t"
          "vl %%v17,16(%%r1,%[x])\n\t"
          "vl %%v18,32(%%r1,%[x])\n\t"
          "vl %%v19,48(%%r1,%[x])\n\t"
          "vl %%v20,0(%%r1,%[y])\n\t"
          "vl %%v21,16(%%r1,%[y])\n\t"
          "vl %%v22,32(%%r1,%[y])\n\t"
          "vl %%v23,48(%%r1,%[y])\n\t"
          "vl %%v24,64(%%r1,%[x])\n\t"
          "vl %%v25,80(%%r1,%[x])\n\t"
          "vl %%v26,96(%%r1,%[x])\n\t"
          "vl %%v27,112(%%r1,%[x])\n\t"
          "vl %%v28,64(%%r1,%[y])\n\t"
          "vl %%v29,80(%%r1,%[y])\n\t"
          "vl %%v30,96(%%r1,%[y])\n\t"
          "vl %%v31,112(%%r1,%[y])\n\t"
          "vfmadb %%v16,%%v0,%%v16,%%v20\n\t"
          "vfmadb %%v17,%%v0,%%v17,%%v21\n\t"
          "vfmadb %%v18,%%v0,%%v18,%%v22\n\t"
          "vfmadb %%v19,%%v0,%%v19,%%v23\n\t"
          "vfmadb %%v24,%%v0,%%v24,%%v28\n\t"
          "vfmadb %%v25,%%v0,%%v25,%%v29\n\t"
          "vfmadb %%v26,%%v0,%%v26,%%v30\n\t"
          "vfmadb %%v27,%%v0,%%v27,%%v31\n\t"
          "vst %%v16,0(%%r1,%[y])\n\t"
          "vst %%v17,16(%%r1,%[y])\n\t"
          "vst %%v18,32(%%r1,%[y])\n\t"
          "vst %%v19,48(%%r1,%[y])\n\t"
          "vst %%v24,64(%%r1,%[y])\n\t"
          "vst %%v25,80(%%r1,%[y])\n\t"
          "vst %%v26,96(%%r1,%[y])\n\t"
          "vst %%v27,112(%%r1,%[y])\n\t"
          /* second 16 doubles of the iteration */
          "vl %%v16,128(%%r1,%[x])\n\t"
          "vl %%v17,144(%%r1,%[x])\n\t"
          "vl %%v18,160(%%r1,%[x])\n\t"
          "vl %%v19,176(%%r1,%[x])\n\t"
          "vl %%v20,128(%%r1,%[y])\n\t"
          "vl %%v21,144(%%r1,%[y])\n\t"
          "vl %%v22,160(%%r1,%[y])\n\t"
          "vl %%v23,176(%%r1,%[y])\n\t"
          "vl %%v24,192(%%r1,%[x])\n\t"
          "vl %%v25,208(%%r1,%[x])\n\t"
          "vl %%v26,224(%%r1,%[x])\n\t"
          "vl %%v27,240(%%r1,%[x])\n\t"
          "vl %%v28,192(%%r1,%[y])\n\t"
          "vl %%v29,208(%%r1,%[y])\n\t"
          "vl %%v30,224(%%r1,%[y])\n\t"
          "vl %%v31,240(%%r1,%[y])\n\t"
          "vfmadb %%v16,%%v0,%%v16,%%v20\n\t"
          "vfmadb %%v17,%%v0,%%v17,%%v21\n\t"
          "vfmadb %%v18,%%v0,%%v18,%%v22\n\t"
          "vfmadb %%v19,%%v0,%%v19,%%v23\n\t"
          "vfmadb %%v24,%%v0,%%v24,%%v28\n\t"
          "vfmadb %%v25,%%v0,%%v25,%%v29\n\t"
          "vfmadb %%v26,%%v0,%%v26,%%v30\n\t"
          "vfmadb %%v27,%%v0,%%v27,%%v31\n\t"
          "vst %%v16,128(%%r1,%[y])\n\t"
          "vst %%v17,144(%%r1,%[y])\n\t"
          "vst %%v18,160(%%r1,%[y])\n\t"
          "vst %%v19,176(%%r1,%[y])\n\t"
          "vst %%v24,192(%%r1,%[y])\n\t"
          "vst %%v25,208(%%r1,%[y])\n\t"
          "vst %%v26,224(%%r1,%[y])\n\t"
          "vst %%v27,240(%%r1,%[y])\n\t"
          "agfi %%r1,256\n\t"
          "brctg %[n],0b"
          : "+m"(*(struct { FLOAT x[n]; } *) y),[n] "+&r"(n)
          : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),
            [alpha] "Q"(*alpha)
          : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
            "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
}
/*
 * y := y + da * x over n elements, reading x with stride inc_x and
 * updating y with stride inc_y.  Always returns 0; does nothing when
 * n <= 0.
 *
 * dummy0, dummy1, dummy and dummy2 are not read by this function.
 *
 * When both strides are 1 the leading multiple-of-32 prefix is handed to
 * daxpy_kernel_32 and the rest is done element by element.  Otherwise a
 * scalar loop unrolled by four is used, followed by a scalar tail.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x,
          BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
          BLASLONG dummy2) {
  BLASLONG i = 0;          /* number of elements processed */
  BLASLONG ix = 0, iy = 0; /* element offsets into x and y */

  if (n <= 0)
    return 0;

  if ((inc_x == 1) && (inc_y == 1)) {

    /* Largest multiple of 32 that does not exceed n. */
    BLASLONG n1 = n & -32;

    if (n1)
      daxpy_kernel_32(n1, x, y, &da);

    i = n1;
    while (i < n) {

      y[i] += da * x[i];
      i++;

    }
    return 0;

  }

  /* Largest multiple of 4 that does not exceed n. */
  BLASLONG n1 = n & -4;

  while (i < n1) {

    /* All four products are formed before any element of y is written. */
    FLOAT m1 = da * x[ix];
    FLOAT m2 = da * x[ix + inc_x];
    FLOAT m3 = da * x[ix + 2 * inc_x];
    FLOAT m4 = da * x[ix + 3 * inc_x];

    y[iy] += m1;
    y[iy + inc_y] += m2;
    y[iy + 2 * inc_y] += m3;
    y[iy + 3 * inc_y] += m4;

    ix += inc_x * 4;
    iy += inc_y * 4;
    i += 4;

  }

  /* Remaining 0..3 elements. */
  while (i < n) {

    y[iy] += da * x[ix];
    ix += inc_x;
    iy += inc_y;
    i++;

  }
  return 0;

}

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -27,59 +27,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
/*
 * Copy n contiguous doubles from x to y in 256-byte chunks.
 *
 * The loop count is n >> 5; each iteration issues one 256-byte mvc and
 * then advances both pointers by 256 with la.  The caller in this file
 * passes a positive multiple of 32.  x, y and n are read-write asm
 * operands, so the local copies of the arguments are modified.
 */
static void dcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) {
  __asm__("srlg %[n],%[n],5\n\t"
          "0:\n\t"
          "pfd 1, 1024(%[x])\n\t"
          "pfd 2, 1024(%[y])\n\t"
          "mvc 0(256,%[y]),0(%[x])\n\t"
          "la %[x],256(%[x])\n\t"
          "la %[y],256(%[y])\n\t"
          "brctg %[n],0b"
          : "=m"(*(struct { FLOAT x[n]; } *) y),[x] "+&a"(x),[y] "+&a"(y),
            [n] "+&r"(n)
          : "m"(*(const struct { FLOAT x[n]; } *) x)
          : "cc");
}
/*
 * Copy n elements from x (stride inc_x) to y (stride inc_y).  Always
 * returns 0; does nothing when n <= 0.
 *
 * When both strides are 1 the leading multiple-of-32 prefix is handed to
 * dcopy_kernel_32 and the rest is copied element by element.  Otherwise
 * a plain scalar loop is used.
 */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
  BLASLONG i = 0;          /* number of elements copied */
  BLASLONG ix = 0, iy = 0; /* element offsets into x and y */

  if (n <= 0)
    return 0;

  if ((inc_x == 1) && (inc_y == 1)) {

    /* Largest multiple of 32 that does not exceed n. */
    BLASLONG n1 = n & -32;
    if (n1 > 0) {
      dcopy_kernel_32(n1, x, y);
      i = n1;
    }

    /* Scalar tail. */
    while (i < n) {
      y[i] = x[i];
      i++;

    }

  } else {

    while (i < n) {

      y[iy] = x[ix];
      ix += inc_x;
      iy += inc_y;
      i++;

    }

  }
  return 0;

}

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -27,123 +27,127 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
/*
 * Vector kernel: dot product of n contiguous doubles at x and y.
 *
 * The loop count is n >> 4 and every iteration consumes 128 bytes
 * (16 doubles) from each input; the caller in this file passes a
 * non-zero multiple of 16.
 *
 * v0..v7 are eight independent two-lane accumulators, zeroed up front.
 * Each iteration loads eight vectors of x (v16..v23) and eight of y
 * (v24..v31) and issues one fused multiply-add per accumulator.  After
 * the loop the accumulators are summed into v0, lane 1 is replicated
 * into v1 and added to lane 0 through the f0/f1 views of v0/v1.
 */
static FLOAT ddot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) {
  FLOAT dot;

  __asm__("vzero %%v0\n\t"
          "vzero %%v1\n\t"
          "vzero %%v2\n\t"
          "vzero %%v3\n\t"
          "vzero %%v4\n\t"
          "vzero %%v5\n\t"
          "vzero %%v6\n\t"
          "vzero %%v7\n\t"
          "srlg %[n],%[n],4\n\t"
          "xgr %%r1,%%r1\n\t"
          "0:\n\t"
          "pfd 1,1024(%%r1,%[x])\n\t"
          "pfd 1,1024(%%r1,%[y])\n\t"
          "vl %%v16,0(%%r1,%[x])\n\t"
          "vl %%v17,16(%%r1,%[x])\n\t"
          "vl %%v18,32(%%r1,%[x])\n\t"
          "vl %%v19,48(%%r1,%[x])\n\t"
          "vl %%v20,64(%%r1,%[x])\n\t"
          "vl %%v21,80(%%r1,%[x])\n\t"
          "vl %%v22,96(%%r1,%[x])\n\t"
          "vl %%v23,112(%%r1,%[x])\n\t"
          "vl %%v24,0(%%r1,%[y])\n\t"
          "vl %%v25,16(%%r1,%[y])\n\t"
          "vl %%v26,32(%%r1,%[y])\n\t"
          "vl %%v27,48(%%r1,%[y])\n\t"
          "vl %%v28,64(%%r1,%[y])\n\t"
          "vl %%v29,80(%%r1,%[y])\n\t"
          "vl %%v30,96(%%r1,%[y])\n\t"
          "vl %%v31,112(%%r1,%[y])\n\t"
          "vfmadb %%v0,%%v16,%%v24,%%v0\n\t"
          "vfmadb %%v1,%%v17,%%v25,%%v1\n\t"
          "vfmadb %%v2,%%v18,%%v26,%%v2\n\t"
          "vfmadb %%v3,%%v19,%%v27,%%v3\n\t"
          "vfmadb %%v4,%%v20,%%v28,%%v4\n\t"
          "vfmadb %%v5,%%v21,%%v29,%%v5\n\t"
          "vfmadb %%v6,%%v22,%%v30,%%v6\n\t"
          "vfmadb %%v7,%%v23,%%v31,%%v7\n\t"
          "agfi %%r1,128\n\t"
          "brctg %[n],0b\n\t"
          /* collapse the eight accumulators, then the two lanes */
          "vfadb %%v0,%%v0,%%v1\n\t"
          "vfadb %%v0,%%v0,%%v2\n\t"
          "vfadb %%v0,%%v0,%%v3\n\t"
          "vfadb %%v0,%%v0,%%v4\n\t"
          "vfadb %%v0,%%v0,%%v5\n\t"
          "vfadb %%v0,%%v0,%%v6\n\t"
          "vfadb %%v0,%%v0,%%v7\n\t"
          "vrepg %%v1,%%v0,1\n\t"
          "adbr %%f0,%%f1\n\t"
          "ldr %[dot],%%f0"
          : [dot] "=f"(dot),[n] "+&r"(n)
          : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),
            "m"(*(const struct { FLOAT x[n]; } *) y),[y] "a"(y)
          : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
            "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
            "v27", "v28", "v29", "v30", "v31");

  return dot;
}
/*
 * Return the dot product of n elements of x (stride inc_x) and y
 * (stride inc_y).  Returns 0.0 when n <= 0.
 *
 * When both strides are 1 the leading multiple-of-16 prefix is handed to
 * ddot_kernel_16 and the rest is accumulated element by element.
 * Otherwise a scalar loop unrolled by four feeds two partial sums
 * (temp1, temp2); the tail goes into temp1 and the two are added at the
 * end.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
  BLASLONG i = 0;          /* number of elements processed */
  BLASLONG ix = 0, iy = 0; /* element offsets into x and y */

  FLOAT dot = 0.0;

  if (n <= 0)
    return (dot);

  if ((inc_x == 1) && (inc_y == 1)) {

    /* Largest multiple of 16 that does not exceed n. */
    BLASLONG n1 = n & -16;

    if (n1)
      dot = ddot_kernel_16(n1, x, y);

    i = n1;
    while (i < n) {

      dot += y[i] * x[i];
      i++;

    }
    return (dot);

  }

  FLOAT temp1 = 0.0;
  FLOAT temp2 = 0.0;

  /* Largest multiple of 4 that does not exceed n. */
  BLASLONG n1 = n & -4;

  while (i < n1) {

    FLOAT m1 = y[iy] * x[ix];
    FLOAT m2 = y[iy + inc_y] * x[ix + inc_x];

    FLOAT m3 = y[iy + 2 * inc_y] * x[ix + 2 * inc_x];
    FLOAT m4 = y[iy + 3 * inc_y] * x[ix + 3 * inc_x];

    ix += inc_x * 4;
    iy += inc_y * 4;

    temp1 += m1 + m3;
    temp2 += m2 + m4;

    i += 4;

  }

  /* Remaining 0..3 elements. */
  while (i < n) {

    temp1 += y[iy] * x[ix];
    ix += inc_x;
    iy += inc_y;
    i++;

  }
  dot = temp1 + temp2;
  return (dot);

}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -27,133 +27,121 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) {
{ FLOAT max;
FLOAT max;
__asm__ volatile ( __asm__("vl %%v0,0(%[x])\n\t"
"vl %%v0,0(%2) \n\t" "srlg %[n],%[n],5\n\t"
"srlg %%r0,%1,5 \n\t" "xgr %%r1,%%r1\n\t"
"xgr %%r1,%%r1 \n\t" "0:\n\t"
"0: \n\t" "pfd 1, 1024(%%r1,%[x])\n\t"
"pfd 1, 1024(%%r1,%2) \n\t" "vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v24,128(%%r1,%[x])\n\t"
"vl %%v25,144(%%r1,%[x])\n\t"
"vl %%v26,160(%%r1,%[x])\n\t"
"vl %%v27,176(%%r1,%[x])\n\t"
"vl %%v28,192(%%r1,%[x])\n\t"
"vl %%v29,208(%%r1,%[x])\n\t"
"vl %%v30,224(%%r1,%[x])\n\t"
"vl %%v31,240(%%r1,%[x])\n\t"
"vfmaxdb %%v16,%%v16,%%v24,0\n\t"
"vfmaxdb %%v17,%%v17,%%v25,0\n\t"
"vfmaxdb %%v18,%%v18,%%v26,0\n\t"
"vfmaxdb %%v19,%%v19,%%v27,0\n\t"
"vfmaxdb %%v20,%%v20,%%v28,0\n\t"
"vfmaxdb %%v21,%%v21,%%v29,0\n\t"
"vfmaxdb %%v22,%%v22,%%v30,0\n\t"
"vfmaxdb %%v23,%%v23,%%v31,0\n\t"
"vfmaxdb %%v16,%%v16,%%v20,0\n\t"
"vfmaxdb %%v17,%%v17,%%v21,0\n\t"
"vfmaxdb %%v18,%%v18,%%v22,0\n\t"
"vfmaxdb %%v19,%%v19,%%v23,0\n\t"
"vfmaxdb %%v16,%%v16,%%v18,0\n\t"
"vfmaxdb %%v17,%%v17,%%v19,0\n\t"
"vfmaxdb %%v16,%%v16,%%v17,0\n\t"
"vfmaxdb %%v0,%%v0,%%v16,0\n\t"
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
"vrepg %%v16,%%v0,1\n\t"
"wfmaxdb %%v0,%%v0,%%v16,0\n\t"
"ldr %[max],%%f0"
: [max] "=f"(max),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
"vl %%v16,0(%%r1,%2) \n\t" return max;
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"vl %%v24,128(%%r1,%2) \n\t"
"vl %%v25,144(%%r1,%2) \n\t"
"vl %%v26,160(%%r1,%2) \n\t"
"vl %%v27,176(%%r1,%2) \n\t"
"vl %%v28,192(%%r1,%2) \n\t"
"vl %%v29,208(%%r1,%2) \n\t"
"vl %%v30,224(%%r1,%2) \n\t"
"vl %%v31,240(%%r1,%2) \n\t"
"vfmaxdb %%v16,%%v16,%%v24,0 \n\t"
"vfmaxdb %%v17,%%v17,%%v25,0 \n\t"
"vfmaxdb %%v18,%%v18,%%v26,0 \n\t"
"vfmaxdb %%v19,%%v19,%%v27,0 \n\t"
"vfmaxdb %%v20,%%v20,%%v28,0 \n\t"
"vfmaxdb %%v21,%%v21,%%v29,0 \n\t"
"vfmaxdb %%v22,%%v22,%%v30,0 \n\t"
"vfmaxdb %%v23,%%v23,%%v31,0 \n\t"
"vfmaxdb %%v16,%%v16,%%v20,0 \n\t"
"vfmaxdb %%v17,%%v17,%%v21,0 \n\t"
"vfmaxdb %%v18,%%v18,%%v22,0 \n\t"
"vfmaxdb %%v19,%%v19,%%v23,0 \n\t"
"vfmaxdb %%v16,%%v16,%%v18,0 \n\t"
"vfmaxdb %%v17,%%v17,%%v19,0 \n\t"
"vfmaxdb %%v16,%%v16,%%v17,0 \n\t"
"vfmaxdb %%v0,%%v0,%%v16,0 \n\t"
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"
"vrepg %%v16,%%v0,1 \n\t"
"wfmaxdb %%v0,%%v0,%%v16,0 \n\t"
"ldr %0,%%f0 "
:"=f"(max)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return max;
} }
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0; BLASLONG i = 0;
BLASLONG j = 0; BLASLONG j = 0;
FLOAT maxf = 0.0; FLOAT maxf = 0.0;
if (n <= 0 || inc_x <= 0) return (maxf); if (n <= 0 || inc_x <= 0)
return (maxf);
if (inc_x == 1) { if (inc_x == 1) {
BLASLONG n1 = n & -32; BLASLONG n1 = n & -32;
if (n1 > 0) { if (n1 > 0) {
maxf = dmax_kernel_32(n1, x); maxf = dmax_kernel_32(n1, x);
i = n1;
}
else
{
maxf=x[0];
i++;
}
while (i < n) {
if (x[i] > maxf) {
maxf = x[i];
}
i++;
}
return (maxf);
i = n1;
} else { } else {
maxf = x[0];
maxf=x[0]; i++;
BLASLONG n1 = n & -4;
while (j < n1) {
if (x[i] > maxf) {
maxf = x[i];
}
if (x[i + inc_x] > maxf) {
maxf = x[i + inc_x];
}
if (x[i + 2 * inc_x] > maxf) {
maxf = x[i + 2 * inc_x];
}
if (x[i + 3 * inc_x] > maxf) {
maxf = x[i + 3 * inc_x];
}
i += inc_x * 4;
j += 4;
}
while (j < n) {
if (x[i] > maxf) {
maxf = x[i];
}
i += inc_x;
j++;
}
return (maxf);
} }
while (i < n) {
if (x[i] > maxf) {
maxf = x[i];
}
i++;
}
return (maxf);
} else {
maxf = x[0];
BLASLONG n1 = n & -4;
while (j < n1) {
if (x[i] > maxf) {
maxf = x[i];
}
if (x[i + inc_x] > maxf) {
maxf = x[i + inc_x];
}
if (x[i + 2 * inc_x] > maxf) {
maxf = x[i + 2 * inc_x];
}
if (x[i + 3 * inc_x] > maxf) {
maxf = x[i + 3 * inc_x];
}
i += inc_x * 4;
j += 4;
}
while (j < n) {
if (x[i] > maxf) {
maxf = x[i];
}
i += inc_x;
j++;
}
return (maxf);
}
} }

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -27,154 +27,138 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) {
{ FLOAT max;
FLOAT max;
__asm__ volatile ( __asm__("vl %%v0,0(%[x])\n\t"
"vl %%v0,0(%2) \n\t" "srlg %[n],%[n],5\n\t"
"srlg %%r0,%1,5 \n\t" "xgr %%r1,%%r1\n\t"
"xgr %%r1,%%r1 \n\t" "0:\n\t"
"0: \n\t" "pfd 1, 1024(%%r1,%[x])\n\t"
"pfd 1, 1024(%%r1,%2) \n\t" "vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vfchdb %%v24,%%v16,%%v17\n\t"
"vfchdb %%v25,%%v18,%%v19\n\t"
"vfchdb %%v26,%%v20,%%v21\n\t"
"vfchdb %%v27,%%v22,%%v23\n\t"
"vsel %%v24,%%v16,%%v17,%%v24\n\t"
"vsel %%v25,%%v18,%%v19,%%v25\n\t"
"vsel %%v26,%%v20,%%v21,%%v26\n\t"
"vsel %%v27,%%v22,%%v23,%%v27\n\t"
"vfchdb %%v28,%%v24,%%v25\n\t"
"vfchdb %%v29,%%v26,%%v27\n\t"
"vsel %%v28,%%v24,%%v25,%%v28\n\t"
"vsel %%v29,%%v26,%%v27,%%v29\n\t"
"vfchdb %%v30,%%v28,%%v29\n\t"
"vsel %%v30,%%v28,%%v29,%%v30\n\t"
"vfchdb %%v31,%%v30,%%v0\n\t"
"vsel %%v0,%%v30,%%v0,%%v31\n\t"
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v17,144(%%r1,%[x])\n\t"
"vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v19,176(%%r1,%[x])\n\t"
"vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v21,208(%%r1,%[x])\n\t"
"vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v23,240(%%r1,%[x])\n\t"
"vfchdb %%v24,%%v16,%%v17\n\t"
"vfchdb %%v25,%%v18,%%v19\n\t"
"vfchdb %%v26,%%v20,%%v21\n\t"
"vfchdb %%v27,%%v22,%%v23\n\t"
"vsel %%v24,%%v16,%%v17,%%v24\n\t"
"vsel %%v25,%%v18,%%v19,%%v25\n\t"
"vsel %%v26,%%v20,%%v21,%%v26\n\t"
"vsel %%v27,%%v22,%%v23,%%v27\n\t"
"vfchdb %%v28,%%v24,%%v25\n\t"
"vfchdb %%v29,%%v26,%%v27\n\t"
"vsel %%v28,%%v24,%%v25,%%v28\n\t"
"vsel %%v29,%%v26,%%v27,%%v29\n\t"
"vfchdb %%v30,%%v28,%%v29\n\t"
"vsel %%v30,%%v28,%%v29,%%v30\n\t"
"vfchdb %%v31,%%v30,%%v0\n\t"
"vsel %%v0,%%v30,%%v0,%%v31\n\t"
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
"vrepg %%v16,%%v0,1\n\t"
"wfchdb %%v17,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v17\n\t"
"ldr %[max],%%f0"
: [max] "=f"(max),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
"vl %%v16,0(%%r1,%2) \n\t" return max;
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"vfchdb %%v24,%%v16,%%v17 \n\t"
"vfchdb %%v25,%%v18,%%v19 \n\t"
"vfchdb %%v26,%%v20,%%v21 \n\t"
"vfchdb %%v27,%%v22,%%v23 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
"vsel %%v26,%%v20,%%v21,%%v26 \n\t"
"vsel %%v27,%%v22,%%v23,%%v27 \n\t"
"vfchdb %%v28,%%v24,%%v25 \n\t"
"vfchdb %%v29,%%v26,%%v27 \n\t"
"vsel %%v28,%%v24,%%v25,%%v28 \n\t"
"vsel %%v29,%%v26,%%v27,%%v29 \n\t"
"vfchdb %%v30,%%v28,%%v29 \n\t"
"vsel %%v30,%%v28,%%v29,%%v30 \n\t"
"vfchdb %%v31,%%v30,%%v0 \n\t"
"vsel %%v0,%%v30,%%v0,%%v31 \n\t"
"vl %%v16,128(%%r1,%2) \n\t"
"vl %%v17,144(%%r1,%2) \n\t"
"vl %%v18,160(%%r1,%2) \n\t"
"vl %%v19,176(%%r1,%2) \n\t"
"vl %%v20,192(%%r1,%2) \n\t"
"vl %%v21,208(%%r1,%2) \n\t"
"vl %%v22,224(%%r1,%2) \n\t"
"vl %%v23,240(%%r1,%2) \n\t"
"vfchdb %%v24,%%v16,%%v17 \n\t"
"vfchdb %%v25,%%v18,%%v19 \n\t"
"vfchdb %%v26,%%v20,%%v21 \n\t"
"vfchdb %%v27,%%v22,%%v23 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
"vsel %%v26,%%v20,%%v21,%%v26 \n\t"
"vsel %%v27,%%v22,%%v23,%%v27 \n\t"
"vfchdb %%v28,%%v24,%%v25 \n\t"
"vfchdb %%v29,%%v26,%%v27 \n\t"
"vsel %%v28,%%v24,%%v25,%%v28 \n\t"
"vsel %%v29,%%v26,%%v27,%%v29 \n\t"
"vfchdb %%v30,%%v28,%%v29 \n\t"
"vsel %%v30,%%v28,%%v29,%%v30 \n\t"
"vfchdb %%v31,%%v30,%%v0 \n\t"
"vsel %%v0,%%v30,%%v0,%%v31 \n\t"
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"
"vrepg %%v16,%%v0,1 \n\t"
"wfchdb %%v17,%%v0,%%v16 \n\t"
"vsel %%v0,%%v0,%%v16,%%v17 \n\t"
"ldr %0,%%f0 "
:"=f"(max)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return max;
} }
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0; BLASLONG i = 0;
BLASLONG j = 0; BLASLONG j = 0;
FLOAT maxf = 0.0; FLOAT maxf = 0.0;
if (n <= 0 || inc_x <= 0) return (maxf); if (n <= 0 || inc_x <= 0)
return (maxf);
if (inc_x == 1) { if (inc_x == 1) {
BLASLONG n1 = n & -32; BLASLONG n1 = n & -32;
if (n1 > 0) { if (n1 > 0) {
maxf = dmax_kernel_32(n1, x); maxf = dmax_kernel_32(n1, x);
i = n1;
}
else
{
maxf=x[0];
i++;
}
while (i < n) {
if (x[i] > maxf) {
maxf = x[i];
}
i++;
}
return (maxf);
i = n1;
} else { } else {
maxf = x[0];
maxf=x[0]; i++;
BLASLONG n1 = n & -4;
while (j < n1) {
if (x[i] > maxf) {
maxf = x[i];
}
if (x[i + inc_x] > maxf) {
maxf = x[i + inc_x];
}
if (x[i + 2 * inc_x] > maxf) {
maxf = x[i + 2 * inc_x];
}
if (x[i + 3 * inc_x] > maxf) {
maxf = x[i + 3 * inc_x];
}
i += inc_x * 4;
j += 4;
}
while (j < n) {
if (x[i] > maxf) {
maxf = x[i];
}
i += inc_x;
j++;
}
return (maxf);
} }
while (i < n) {
if (x[i] > maxf) {
maxf = x[i];
}
i++;
}
return (maxf);
} else {
maxf = x[0];
BLASLONG n1 = n & -4;
while (j < n1) {
if (x[i] > maxf) {
maxf = x[i];
}
if (x[i + inc_x] > maxf) {
maxf = x[i + inc_x];
}
if (x[i + 2 * inc_x] > maxf) {
maxf = x[i + 2 * inc_x];
}
if (x[i + 3 * inc_x] > maxf) {
maxf = x[i + 3 * inc_x];
}
i += inc_x * 4;
j += 4;
}
while (j < n) {
if (x[i] > maxf) {
maxf = x[i];
}
i += inc_x;
j++;
}
return (maxf);
}
} }

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -27,133 +27,121 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) {
{ FLOAT min;
FLOAT min;
__asm__ volatile ( __asm__("vl %%v0,0(%[x])\n\t"
"vl %%v0,0(%2) \n\t" "srlg %[n],%[n],5\n\t"
"srlg %%r0,%1,5 \n\t" "xgr %%r1,%%r1\n\t"
"xgr %%r1,%%r1 \n\t" "0:\n\t"
"0: \n\t" "pfd 1, 1024(%%r1,%[x])\n\t"
"pfd 1, 1024(%%r1,%2) \n\t" "vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v24,128(%%r1,%[x])\n\t"
"vl %%v25,144(%%r1,%[x])\n\t"
"vl %%v26,160(%%r1,%[x])\n\t"
"vl %%v27,176(%%r1,%[x])\n\t"
"vl %%v28,192(%%r1,%[x])\n\t"
"vl %%v29,208(%%r1,%[x])\n\t"
"vl %%v30,224(%%r1,%[x])\n\t"
"vl %%v31,240(%%r1,%[x])\n\t"
"vfmindb %%v16,%%v16,%%v24,0\n\t"
"vfmindb %%v17,%%v17,%%v25,0\n\t"
"vfmindb %%v18,%%v18,%%v26,0\n\t"
"vfmindb %%v19,%%v19,%%v27,0\n\t"
"vfmindb %%v20,%%v20,%%v28,0\n\t"
"vfmindb %%v21,%%v21,%%v29,0\n\t"
"vfmindb %%v22,%%v22,%%v30,0\n\t"
"vfmindb %%v23,%%v23,%%v31,0\n\t"
"vfmindb %%v16,%%v16,%%v20,0\n\t"
"vfmindb %%v17,%%v17,%%v21,0\n\t"
"vfmindb %%v18,%%v18,%%v22,0\n\t"
"vfmindb %%v19,%%v19,%%v23,0\n\t"
"vfmindb %%v16,%%v16,%%v18,0\n\t"
"vfmindb %%v17,%%v17,%%v19,0\n\t"
"vfmindb %%v16,%%v16,%%v17,0\n\t"
"vfmindb %%v0,%%v0,%%v16,0\n\t"
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
"vrepg %%v16,%%v0,1\n\t"
"wfmindb %%v0,%%v0,%%v16,0\n\t"
"ldr %[min],%%f0"
: [min] "=f"(min),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
"vl %%v16,0(%%r1,%2) \n\t" return min;
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"vl %%v24,128(%%r1,%2) \n\t"
"vl %%v25,144(%%r1,%2) \n\t"
"vl %%v26,160(%%r1,%2) \n\t"
"vl %%v27,176(%%r1,%2) \n\t"
"vl %%v28,192(%%r1,%2) \n\t"
"vl %%v29,208(%%r1,%2) \n\t"
"vl %%v30,224(%%r1,%2) \n\t"
"vl %%v31,240(%%r1,%2) \n\t"
"vfmindb %%v16,%%v16,%%v24,0 \n\t"
"vfmindb %%v17,%%v17,%%v25,0 \n\t"
"vfmindb %%v18,%%v18,%%v26,0 \n\t"
"vfmindb %%v19,%%v19,%%v27,0 \n\t"
"vfmindb %%v20,%%v20,%%v28,0 \n\t"
"vfmindb %%v21,%%v21,%%v29,0 \n\t"
"vfmindb %%v22,%%v22,%%v30,0 \n\t"
"vfmindb %%v23,%%v23,%%v31,0 \n\t"
"vfmindb %%v16,%%v16,%%v20,0 \n\t"
"vfmindb %%v17,%%v17,%%v21,0 \n\t"
"vfmindb %%v18,%%v18,%%v22,0 \n\t"
"vfmindb %%v19,%%v19,%%v23,0 \n\t"
"vfmindb %%v16,%%v16,%%v18,0 \n\t"
"vfmindb %%v17,%%v17,%%v19,0 \n\t"
"vfmindb %%v16,%%v16,%%v17,0 \n\t"
"vfmindb %%v0,%%v0,%%v16,0 \n\t"
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"
"vrepg %%v16,%%v0,1 \n\t"
"wfmindb %%v0,%%v0,%%v16,0 \n\t"
"ldr %0,%%f0 "
:"=f"(min)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return min;
} }
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0; BLASLONG i = 0;
BLASLONG j = 0; BLASLONG j = 0;
FLOAT minf = 0.0; FLOAT minf = 0.0;
if (n <= 0 || inc_x <= 0) return (minf); if (n <= 0 || inc_x <= 0)
return (minf);
if (inc_x == 1) { if (inc_x == 1) {
BLASLONG n1 = n & -32; BLASLONG n1 = n & -32;
if (n1 > 0) { if (n1 > 0) {
minf = dmin_kernel_32(n1, x); minf = dmin_kernel_32(n1, x);
i = n1;
}
else
{
minf=x[0];
i++;
}
while (i < n) {
if (x[i] < minf) {
minf = x[i];
}
i++;
}
return (minf);
i = n1;
} else { } else {
minf = x[0];
minf=x[0]; i++;
BLASLONG n1 = n & -4;
while (j < n1) {
if (x[i] < minf) {
minf = x[i];
}
if (x[i + inc_x] < minf) {
minf = x[i + inc_x];
}
if (x[i + 2 * inc_x] < minf) {
minf = x[i + 2 * inc_x];
}
if (x[i + 3 * inc_x] < minf) {
minf = x[i + 3 * inc_x];
}
i += inc_x * 4;
j += 4;
}
while (j < n) {
if (x[i] < minf) {
minf = x[i];
}
i += inc_x;
j++;
}
return (minf);
} }
while (i < n) {
if (x[i] < minf) {
minf = x[i];
}
i++;
}
return (minf);
} else {
minf = x[0];
BLASLONG n1 = n & -4;
while (j < n1) {
if (x[i] < minf) {
minf = x[i];
}
if (x[i + inc_x] < minf) {
minf = x[i + inc_x];
}
if (x[i + 2 * inc_x] < minf) {
minf = x[i + 2 * inc_x];
}
if (x[i + 3 * inc_x] < minf) {
minf = x[i + 3 * inc_x];
}
i += inc_x * 4;
j += 4;
}
while (j < n) {
if (x[i] < minf) {
minf = x[i];
}
i += inc_x;
j++;
}
return (minf);
}
} }

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -27,154 +27,138 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) {
{ FLOAT min;
FLOAT min;
__asm__ volatile ( __asm__("vl %%v0,0(%[x])\n\t"
"vl %%v0,0(%2) \n\t" "srlg %[n],%[n],5\n\t"
"srlg %%r0,%1,5 \n\t" "xgr %%r1,%%r1\n\t"
"xgr %%r1,%%r1 \n\t" "0:\n\t"
"0: \n\t" "pfd 1, 1024(%%r1,%[x])\n\t"
"pfd 1, 1024(%%r1,%2) \n\t" "vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vfchdb %%v24,%%v17,%%v16\n\t"
"vfchdb %%v25,%%v19,%%v18\n\t"
"vfchdb %%v26,%%v21,%%v20\n\t"
"vfchdb %%v27,%%v23,%%v22\n\t"
"vsel %%v24,%%v16,%%v17,%%v24\n\t"
"vsel %%v25,%%v18,%%v19,%%v25\n\t"
"vsel %%v26,%%v20,%%v21,%%v26\n\t"
"vsel %%v27,%%v22,%%v23,%%v27\n\t"
"vfchdb %%v28,%%v25,%%v24\n\t"
"vfchdb %%v29,%%v27,%%v26\n\t"
"vsel %%v28,%%v24,%%v25,%%v28\n\t"
"vsel %%v29,%%v26,%%v27,%%v29\n\t"
"vfchdb %%v30,%%v29,%%v28\n\t"
"vsel %%v30,%%v28,%%v29,%%v30\n\t"
"vfchdb %%v31,%%v0,%%v30\n\t"
"vsel %%v0,%%v30,%%v0,%%v31\n\t"
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v17,144(%%r1,%[x])\n\t"
"vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v19,176(%%r1,%[x])\n\t"
"vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v21,208(%%r1,%[x])\n\t"
"vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v23,240(%%r1,%[x])\n\t"
"vfchdb %%v24,%%v17,%%v16\n\t"
"vfchdb %%v25,%%v19,%%v18\n\t"
"vfchdb %%v26,%%v21,%%v20\n\t"
"vfchdb %%v27,%%v23,%%v22\n\t"
"vsel %%v24,%%v16,%%v17,%%v24\n\t"
"vsel %%v25,%%v18,%%v19,%%v25\n\t"
"vsel %%v26,%%v20,%%v21,%%v26\n\t"
"vsel %%v27,%%v22,%%v23,%%v27\n\t"
"vfchdb %%v28,%%v25,%%v24\n\t"
"vfchdb %%v29,%%v27,%%v26\n\t"
"vsel %%v28,%%v24,%%v25,%%v28\n\t"
"vsel %%v29,%%v26,%%v27,%%v29\n\t"
"vfchdb %%v30,%%v29,%%v28\n\t"
"vsel %%v30,%%v28,%%v29,%%v30\n\t"
"vfchdb %%v31,%%v0,%%v30\n\t"
"vsel %%v0,%%v30,%%v0,%%v31\n\t"
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
"vrepg %%v16,%%v0,1\n\t"
"wfchdb %%v17,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v17\n\t"
"ldr %[min],%%f0"
: [min] "=f"(min),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
"vl %%v16,0(%%r1,%2) \n\t" return min;
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"vfchdb %%v24,%%v17,%%v16 \n\t"
"vfchdb %%v25,%%v19,%%v18 \n\t"
"vfchdb %%v26,%%v21,%%v20 \n\t"
"vfchdb %%v27,%%v23,%%v22 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
"vsel %%v26,%%v20,%%v21,%%v26 \n\t"
"vsel %%v27,%%v22,%%v23,%%v27 \n\t"
"vfchdb %%v28,%%v25,%%v24 \n\t"
"vfchdb %%v29,%%v27,%%v26 \n\t"
"vsel %%v28,%%v24,%%v25,%%v28 \n\t"
"vsel %%v29,%%v26,%%v27,%%v29 \n\t"
"vfchdb %%v30,%%v29,%%v28 \n\t"
"vsel %%v30,%%v28,%%v29,%%v30 \n\t"
"vfchdb %%v31,%%v0,%%v30 \n\t"
"vsel %%v0,%%v30,%%v0,%%v31 \n\t"
"vl %%v16,128(%%r1,%2) \n\t"
"vl %%v17,144(%%r1,%2) \n\t"
"vl %%v18,160(%%r1,%2) \n\t"
"vl %%v19,176(%%r1,%2) \n\t"
"vl %%v20,192(%%r1,%2) \n\t"
"vl %%v21,208(%%r1,%2) \n\t"
"vl %%v22,224(%%r1,%2) \n\t"
"vl %%v23,240(%%r1,%2) \n\t"
"vfchdb %%v24,%%v17,%%v16 \n\t"
"vfchdb %%v25,%%v19,%%v18 \n\t"
"vfchdb %%v26,%%v21,%%v20 \n\t"
"vfchdb %%v27,%%v23,%%v22 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
"vsel %%v26,%%v20,%%v21,%%v26 \n\t"
"vsel %%v27,%%v22,%%v23,%%v27 \n\t"
"vfchdb %%v28,%%v25,%%v24 \n\t"
"vfchdb %%v29,%%v27,%%v26 \n\t"
"vsel %%v28,%%v24,%%v25,%%v28 \n\t"
"vsel %%v29,%%v26,%%v27,%%v29 \n\t"
"vfchdb %%v30,%%v29,%%v28 \n\t"
"vsel %%v30,%%v28,%%v29,%%v30 \n\t"
"vfchdb %%v31,%%v0,%%v30 \n\t"
"vsel %%v0,%%v30,%%v0,%%v31 \n\t"
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"
"vrepg %%v16,%%v0,1 \n\t"
"wfchdb %%v17,%%v16,%%v0 \n\t"
"vsel %%v0,%%v0,%%v16,%%v17 \n\t"
"ldr %0,%%f0 "
:"=f"(min)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return min;
} }
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0; BLASLONG i = 0;
BLASLONG j = 0; BLASLONG j = 0;
FLOAT minf = 0.0; FLOAT minf = 0.0;
if (n <= 0 || inc_x <= 0) return (minf); if (n <= 0 || inc_x <= 0)
return (minf);
if (inc_x == 1) { if (inc_x == 1) {
BLASLONG n1 = n & -32; BLASLONG n1 = n & -32;
if (n1 > 0) { if (n1 > 0) {
minf = dmin_kernel_32(n1, x); minf = dmin_kernel_32(n1, x);
i = n1;
}
else
{
minf=x[0];
i++;
}
while (i < n) {
if (x[i] < minf) {
minf = x[i];
}
i++;
}
return (minf);
i = n1;
} else { } else {
minf = x[0];
minf=x[0]; i++;
BLASLONG n1 = n & -4;
while (j < n1) {
if (x[i] < minf) {
minf = x[i];
}
if (x[i + inc_x] < minf) {
minf = x[i + inc_x];
}
if (x[i + 2 * inc_x] < minf) {
minf = x[i + 2 * inc_x];
}
if (x[i + 3 * inc_x] < minf) {
minf = x[i + 3 * inc_x];
}
i += inc_x * 4;
j += 4;
}
while (j < n) {
if (x[i] < minf) {
minf = x[i];
}
i += inc_x;
j++;
}
return (minf);
} }
while (i < n) {
if (x[i] < minf) {
minf = x[i];
}
i++;
}
return (minf);
} else {
minf = x[0];
BLASLONG n1 = n & -4;
while (j < n1) {
if (x[i] < minf) {
minf = x[i];
}
if (x[i + inc_x] < minf) {
minf = x[i + inc_x];
}
if (x[i + 2 * inc_x] < minf) {
minf = x[i + 2 * inc_x];
}
if (x[i + 3 * inc_x] < minf) {
minf = x[i + 3 * inc_x];
}
i += inc_x * 4;
j += 4;
}
while (j < n) {
if (x[i] < minf) {
minf = x[i];
}
i += inc_x;
j++;
}
return (minf);
}
} }

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -27,220 +27,200 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) {
{ __asm__("vlrepg %%v0,%[c]\n\t"
__asm__ ( "vlrepg %%v1,%[s]\n\t"
"vlrepg %%v0,%3 \n\t" "srlg %[n],%[n],5\n\t"
"vlrepg %%v1,%4 \n\t" "xgr %%r1,%%r1\n\t"
"srlg %%r0,%0,5 \n\t" "0:\n\t"
"xgr %%r1,%%r1 \n\t" "pfd 2, 1024(%%r1,%[x])\n\t"
"0: \n\t" "pfd 2, 1024(%%r1,%[y])\n\t"
"pfd 2, 1024(%%r1,%1) \n\t" "vl %%v24, 0(%%r1,%[x])\n\t"
"pfd 2, 1024(%%r1,%2) \n\t" "vl %%v25, 16(%%r1,%[x])\n\t"
"vl %%v24, 0(%%r1,%1) \n\t" "vl %%v26, 32(%%r1,%[x])\n\t"
"vl %%v25, 16(%%r1,%1) \n\t" "vl %%v27, 48(%%r1,%[x])\n\t"
"vl %%v26, 32(%%r1,%1) \n\t" "vl %%v16, 0(%%r1,%[y])\n\t"
"vl %%v27, 48(%%r1,%1) \n\t" "vl %%v17, 16(%%r1,%[y])\n\t"
"vl %%v16, 0(%%r1,%2) \n\t" "vl %%v18, 32(%%r1,%[y])\n\t"
"vl %%v17, 16(%%r1,%2) \n\t" "vl %%v19, 48(%%r1,%[y])\n\t"
"vl %%v18, 32(%%r1,%2) \n\t" "vfmdb %%v28,%%v24,%%v0\n\t"
"vl %%v19, 48(%%r1,%2) \n\t" "vfmdb %%v29,%%v25,%%v0\n\t"
"vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
"vfmdb %%v28,%%v24,%%v0 \n\t" "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
"vfmdb %%v29,%%v25,%%v0 \n\t" "vfmdb %%v30,%%v26,%%v0\n\t"
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ "vfmdb %%v31,%%v27,%%v0\n\t"
"vfmdb %%v30,%%v26,%%v0 \n\t" "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ /* 2nd parts */
"vfmdb %%v31,%%v27,%%v0 \n\t" "vfmadb %%v28,%%v16,%%v1,%%v28\n\t"
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
/* 2nd parts*/ "vfmadb %%v29,%%v17,%%v1,%%v29\n\t"
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ "vfmadb %%v30,%%v18,%%v1,%%v30\n\t"
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ "vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ "vst %%v28, 0(%%r1,%[x])\n\t"
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" "vst %%v29, 16(%%r1,%[x])\n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ "vst %%v30, 32(%%r1,%[x])\n\t"
"vst %%v31, 48(%%r1,%[x])\n\t"
"vst %%v28, 0(%%r1,%1) \n\t" "vst %%v20, 0(%%r1,%[y])\n\t"
"vst %%v29, 16(%%r1,%1) \n\t" "vst %%v21, 16(%%r1,%[y])\n\t"
"vst %%v30, 32(%%r1,%1) \n\t" "vst %%v22, 32(%%r1,%[y])\n\t"
"vst %%v31, 48(%%r1,%1) \n\t" "vst %%v23, 48(%%r1,%[y])\n\t"
"vst %%v20, 0(%%r1,%2) \n\t" "vl %%v24, 64(%%r1,%[x])\n\t"
"vst %%v21, 16(%%r1,%2) \n\t" "vl %%v25, 80(%%r1,%[x])\n\t"
"vst %%v22, 32(%%r1,%2) \n\t" "vl %%v26, 96(%%r1,%[x])\n\t"
"vst %%v23, 48(%%r1,%2) \n\t" "vl %%v27, 112(%%r1,%[x])\n\t"
"vl %%v16, 64(%%r1,%[y])\n\t"
"vl %%v24, 64(%%r1,%1) \n\t" "vl %%v17, 80(%%r1,%[y])\n\t"
"vl %%v25, 80(%%r1,%1) \n\t" "vl %%v18, 96(%%r1,%[y])\n\t"
"vl %%v26, 96(%%r1,%1) \n\t" "vl %%v19, 112(%%r1,%[y])\n\t"
"vl %%v27, 112(%%r1,%1) \n\t" "vfmdb %%v28,%%v24,%%v0\n\t"
"vl %%v16, 64(%%r1,%2) \n\t" "vfmdb %%v29,%%v25,%%v0\n\t"
"vl %%v17, 80(%%r1,%2) \n\t" "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
"vl %%v18, 96(%%r1,%2) \n\t" "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
"vl %%v19, 112(%%r1,%2) \n\t" "vfmdb %%v30,%%v26,%%v0\n\t"
"vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
"vfmdb %%v28,%%v24,%%v0 \n\t" "vfmdb %%v31,%%v27,%%v0\n\t"
"vfmdb %%v29,%%v25,%%v0 \n\t" "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ /* 2nd parts */
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ "vfmadb %%v28,%%v16,%%v1,%%v28\n\t"
"vfmdb %%v30,%%v26,%%v0 \n\t" "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ "vfmadb %%v29,%%v17,%%v1,%%v29\n\t"
"vfmdb %%v31,%%v27,%%v0 \n\t" "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ "vfmadb %%v30,%%v18,%%v1,%%v30\n\t"
/* 2nd parts*/ "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" "vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" "vst %%v28, 64(%%r1,%[x])\n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ "vst %%v29, 80(%%r1,%[x])\n\t"
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" "vst %%v30, 96(%%r1,%[x])\n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ "vst %%v31, 112(%%r1,%[x])\n\t"
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" "vst %%v20, 64(%%r1,%[y])\n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ "vst %%v21, 80(%%r1,%[y])\n\t"
"vst %%v22, 96(%%r1,%[y])\n\t"
"vst %%v28, 64(%%r1,%1) \n\t" "vst %%v23, 112(%%r1,%[y])\n\t"
"vst %%v29, 80(%%r1,%1) \n\t" "vl %%v24, 128(%%r1,%[x])\n\t"
"vst %%v30, 96(%%r1,%1) \n\t" "vl %%v25, 144(%%r1,%[x])\n\t"
"vst %%v31, 112(%%r1,%1) \n\t" "vl %%v26, 160(%%r1,%[x])\n\t"
"vst %%v20, 64(%%r1,%2) \n\t" "vl %%v27, 176(%%r1,%[x])\n\t"
"vst %%v21, 80(%%r1,%2) \n\t" "vl %%v16, 128(%%r1,%[y])\n\t"
"vst %%v22, 96(%%r1,%2) \n\t" "vl %%v17, 144(%%r1,%[y])\n\t"
"vst %%v23, 112(%%r1,%2) \n\t" "vl %%v18, 160(%%r1,%[y])\n\t"
"vl %%v19, 176(%%r1,%[y])\n\t"
"vl %%v24, 128(%%r1,%1) \n\t" "vfmdb %%v28,%%v24,%%v0\n\t"
"vl %%v25, 144(%%r1,%1) \n\t" "vfmdb %%v29,%%v25,%%v0\n\t"
"vl %%v26, 160(%%r1,%1) \n\t" "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
"vl %%v27, 176(%%r1,%1) \n\t" "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
"vl %%v16, 128(%%r1,%2) \n\t" "vfmdb %%v30,%%v26,%%v0\n\t"
"vl %%v17, 144(%%r1,%2) \n\t" "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
"vl %%v18, 160(%%r1,%2) \n\t" "vfmdb %%v31,%%v27,%%v0\n\t"
"vl %%v19, 176(%%r1,%2) \n\t" "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
/* 2nd parts */
"vfmdb %%v28,%%v24,%%v0 \n\t" "vfmadb %%v28,%%v16,%%v1,%%v28\n\t"
"vfmdb %%v29,%%v25,%%v0 \n\t" "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ "vfmadb %%v29,%%v17,%%v1,%%v29\n\t"
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
"vfmdb %%v30,%%v26,%%v0 \n\t" "vfmadb %%v30,%%v18,%%v1,%%v30\n\t"
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmdb %%v31,%%v27,%%v0 \n\t" "vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
/* 2nd parts*/ "vst %%v28, 128(%%r1,%[x])\n\t"
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" "vst %%v29, 144(%%r1,%[x])\n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ "vst %%v30, 160(%%r1,%[x])\n\t"
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" "vst %%v31, 176(%%r1,%[x])\n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ "vst %%v20, 128(%%r1,%[y])\n\t"
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" "vst %%v21, 144(%%r1,%[y])\n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ "vst %%v22, 160(%%r1,%[y])\n\t"
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" "vst %%v23, 176(%%r1,%[y])\n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ "vl %%v24, 192(%%r1,%[x])\n\t"
"vl %%v25, 208(%%r1,%[x])\n\t"
"vst %%v28, 128(%%r1,%1) \n\t" "vl %%v26, 224(%%r1,%[x])\n\t"
"vst %%v29, 144(%%r1,%1) \n\t" "vl %%v27, 240(%%r1,%[x])\n\t"
"vst %%v30, 160(%%r1,%1) \n\t" "vl %%v16, 192(%%r1,%[y])\n\t"
"vst %%v31, 176(%%r1,%1) \n\t" "vl %%v17, 208(%%r1,%[y])\n\t"
"vst %%v20, 128(%%r1,%2) \n\t" "vl %%v18, 224(%%r1,%[y])\n\t"
"vst %%v21, 144(%%r1,%2) \n\t" "vl %%v19, 240(%%r1,%[y])\n\t"
"vst %%v22, 160(%%r1,%2) \n\t" "vfmdb %%v28,%%v24,%%v0\n\t"
"vst %%v23, 176(%%r1,%2) \n\t" "vfmdb %%v29,%%v25,%%v0\n\t"
"vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
"vl %%v24, 192(%%r1,%1) \n\t" "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
"vl %%v25, 208(%%r1,%1) \n\t" "vfmdb %%v30,%%v26,%%v0\n\t"
"vl %%v26, 224(%%r1,%1) \n\t" "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
"vl %%v27, 240(%%r1,%1) \n\t" "vfmdb %%v31,%%v27,%%v0\n\t"
"vl %%v16, 192(%%r1,%2) \n\t" "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
"vl %%v17, 208(%%r1,%2) \n\t" /* 2nd parts */
"vl %%v18, 224(%%r1,%2) \n\t" "vfmadb %%v28,%%v16,%%v1,%%v28\n\t"
"vl %%v19, 240(%%r1,%2) \n\t" "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29\n\t"
"vfmdb %%v28,%%v24,%%v0 \n\t" "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
"vfmdb %%v29,%%v25,%%v0 \n\t" "vfmadb %%v30,%%v18,%%v1,%%v30\n\t"
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ "vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmdb %%v30,%%v26,%%v0 \n\t" "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ "vst %%v28, 192(%%r1,%[x])\n\t"
"vfmdb %%v31,%%v27,%%v0 \n\t" "vst %%v29, 208(%%r1,%[x])\n\t"
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ "vst %%v30, 224(%%r1,%[x])\n\t"
/* 2nd parts*/ "vst %%v31, 240(%%r1,%[x])\n\t"
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" "vst %%v20, 192(%%r1,%[y])\n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ "vst %%v21, 208(%%r1,%[y])\n\t"
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" "vst %%v22, 224(%%r1,%[y])\n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ "vst %%v23, 240(%%r1,%[y])\n\t"
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" "agfi %%r1,256\n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ "brctg %[n],0b"
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" : "+m"(*(struct { FLOAT x[n]; } *) x), "+m"(*(struct { FLOAT x[n]; } *) y),
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ [n] "+&r"(n)
: [x] "a"(x),[y] "a"(y),[c] "Q"(*c),[s] "Q"(*s)
"vst %%v28, 192(%%r1,%1) \n\t" : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
"vst %%v29, 208(%%r1,%1) \n\t" "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"vst %%v30, 224(%%r1,%1) \n\t" "v31");
"vst %%v31, 240(%%r1,%1) \n\t"
"vst %%v20, 192(%%r1,%2) \n\t"
"vst %%v21, 208(%%r1,%2) \n\t"
"vst %%v22, 224(%%r1,%2) \n\t"
"vst %%v23, 240(%%r1,%2) \n\t"
"agfi %%r1,256 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y),"m"(*c),"m"(*s)
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
} }
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
{ FLOAT c, FLOAT s) {
BLASLONG i=0; BLASLONG i = 0;
BLASLONG ix=0,iy=0; BLASLONG ix = 0, iy = 0;
FLOAT temp;
if ( n <= 0 ) return(0); FLOAT temp;
if ( (inc_x == 1) && (inc_y == 1) ) if (n <= 0)
{ return (0);
BLASLONG n1 = n & -32; if ((inc_x == 1) && (inc_y == 1)) {
if ( n1 > 0 )
{
FLOAT cosa,sina;
cosa=c;
sina=s;
drot_kernel_32(n1, x, y, &cosa, &sina);
i=n1;
}
while(i < n) BLASLONG n1 = n & -32;
{ if (n1 > 0) {
temp = c*x[i] + s*y[i] ; FLOAT cosa, sina;
y[i] = c*y[i] - s*x[i] ; cosa = c;
x[i] = temp ; sina = s;
drot_kernel_32(n1, x, y, &cosa, &sina);
i = n1;
}
i++ ; while (i < n) {
temp = c * x[i] + s * y[i];
} y[i] = c * y[i] - s * x[i];
x[i] = temp;
i++;
} }
else
{
while(i < n) } else {
{
temp = c*x[ix] + s*y[iy] ;
y[iy] = c*y[iy] - s*x[ix] ;
x[ix] = temp ;
ix += inc_x ; while (i < n) {
iy += inc_y ; temp = c * x[ix] + s * y[iy];
i++ ; y[iy] = c * y[iy] - s * x[ix];
x[ix] = temp;
} ix += inc_x;
iy += inc_y;
i++;
} }
return(0);
}
return (0);
} }

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -27,179 +27,151 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
static void dscal_kernel_16(BLASLONG n, FLOAT da, FLOAT *x) static void dscal_kernel_16(BLASLONG n, FLOAT da, FLOAT *x) {
{ __asm__("vlrepg %%v0,%[da]\n\t"
__asm__ volatile ( "srlg %[n],%[n],4\n\t"
"vlrepg %%v0,%1 \n\t" "xgr %%r1,%%r1\n\t"
"srlg %%r0,%0,4 \n\t" "0:\n\t"
"xgr %%r1,%%r1 \n\t" "pfd 2, 1024(%%r1,%[x])\n\t"
"0: \n\t" "vl %%v24,0(%%r1,%[x])\n\t"
"pfd 2, 1024(%%r1,%2) \n\t" "vfmdb %%v24,%%v24,%%v0\n\t"
"vl %%v24, 0(%%r1,%2) \n\t" "vst %%v24,0(%%r1,%[x])\n\t"
"vfmdb %%v24,%%v24,%%v0 \n\t" "vl %%v25,16(%%r1,%[x])\n\t"
"vst %%v24, 0(%%r1,%2) \n\t" "vfmdb %%v25,%%v25,%%v0\n\t"
"vl %%v25, 16(%%r1,%2) \n\t" "vst %%v25,16(%%r1,%[x])\n\t"
"vfmdb %%v25,%%v25,%%v0 \n\t" "vl %%v26,32(%%r1,%[x])\n\t"
"vst %%v25, 16(%%r1,%2) \n\t" "vfmdb %%v26,%%v26,%%v0\n\t"
"vl %%v26, 32(%%r1,%2) \n\t" "vst %%v26,32(%%r1,%[x])\n\t"
"vfmdb %%v26,%%v26,%%v0 \n\t" "vl %%v27,48(%%r1,%[x])\n\t"
"vst %%v26, 32(%%r1,%2) \n\t" "vfmdb %%v27,%%v27,%%v0\n\t"
"vl %%v27, 48(%%r1,%2) \n\t" "vst %%v27,48(%%r1,%[x])\n\t"
"vfmdb %%v27,%%v27,%%v0 \n\t" "vl %%v28,64(%%r1,%[x])\n\t"
"vst %%v27, 48(%%r1,%2) \n\t" "vfmdb %%v28,%%v28,%%v0\n\t"
"vl %%v24, 64(%%r1,%2) \n\t" "vst %%v28,64(%%r1,%[x])\n\t"
"vfmdb %%v24,%%v24,%%v0 \n\t" "vl %%v29,80(%%r1,%[x])\n\t"
"vst %%v24, 64(%%r1,%2) \n\t" "vfmdb %%v29,%%v29,%%v0\n\t"
"vl %%v25, 80(%%r1,%2) \n\t" "vst %%v29,80(%%r1,%[x])\n\t"
"vfmdb %%v25,%%v25,%%v0 \n\t" "vl %%v30,96(%%r1,%[x])\n\t"
"vst %%v25, 80(%%r1,%2) \n\t" "vfmdb %%v30,%%v30,%%v0\n\t"
"vl %%v26, 96(%%r1,%2) \n\t" "vst %%v30,96(%%r1,%[x])\n\t"
"vfmdb %%v26,%%v26,%%v0 \n\t" "vl %%v31,112(%%r1,%[x])\n\t"
"vst %%v26, 96(%%r1,%2) \n\t" "vfmdb %%v31,%%v31,%%v0\n\t"
"vl %%v27, 112(%%r1,%2) \n\t" "vst %%v31,112(%%r1,%[x])\n\t"
"vfmdb %%v27,%%v27,%%v0 \n\t" "agfi %%r1,128\n\t"
"vst %%v27, 112(%%r1,%2) \n\t" "brctg %[n],0b"
"agfi %%r1,128 \n\t" : "+m"(*(struct { FLOAT x[n]; } *) x),[n] "+&r"(n)
"brctg %%r0,0b " : [x] "a"(x),[da] "Q"(da)
: : "cc", "r1", "v0", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
:"r"(n),"m"(da),"ZR"((FLOAT (*)[n])x) "v31");
:"memory","cc","r0","r1","v0","v24","v25","v26","v27"
);
} }
static void dscal_kernel_16_zero(BLASLONG n, FLOAT *x) static void dscal_kernel_16_zero(BLASLONG n, FLOAT *x) {
{ __asm__("vzero %%v0\n\t"
__asm__ volatile( "srlg %[n],%[n],4\n\t"
"vzero %%v24 \n\t" "xgr %%r1,%%r1\n\t"
"vzero %%v25 \n\t" "0:\n\t"
"vzero %%v26 \n\t" "pfd 2, 1024(%%r1,%[x])\n\t"
"vzero %%v27 \n\t" "vst %%v0,0(%%r1,%[x])\n\t"
"srlg %%r0,%0,4 \n\t" "vst %%v0,16(%%r1,%[x])\n\t"
"xgr %%r1,%%r1 \n\t" "vst %%v0,32(%%r1,%[x])\n\t"
"0: \n\t" "vst %%v0,48(%%r1,%[x])\n\t"
"pfd 2, 1024(%%r1,%1) \n\t" "vst %%v0,64(%%r1,%[x])\n\t"
"vst %%v0,80(%%r1,%[x])\n\t"
"vst %%v24,0(%%r1,%1) \n\t" "vst %%v0,96(%%r1,%[x])\n\t"
"vst %%v25,16(%%r1,%1) \n\t" "vst %%v0,112(%%r1,%[x])\n\t"
"vst %%v26,32(%%r1,%1) \n\t" "agfi %%r1,128\n\t"
"vst %%v27,48(%%r1,%1) \n\t" "brctg %[n],0b"
"vst %%v24,64(%%r1,%1) \n\t" : "=m"(*(struct { FLOAT x[n]; } *) x),[n] "+&r"(n)
"vst %%v25,80(%%r1,%1) \n\t" : [x] "a"(x)
"vst %%v26,96(%%r1,%1) \n\t" : "cc", "r1", "v0");
"vst %%v27,112(%%r1,%1) \n\t"
"agfi %%r1,128 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((FLOAT (*)[n])x)
:"memory","cc","r0","r1","v24","v25","v26","v27"
);
} }
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
BLASLONG i=0,j=0;
if ( n <= 0 || inc_x <=0 )
return(0);
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x,
if ( inc_x == 1 ) BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
{ BLASLONG dummy2) {
BLASLONG i = 0, j = 0;
if (n <= 0 || inc_x <= 0)
return (0);
if ( da == 0.0 ) if (inc_x == 1) {
{
BLASLONG n1 = n & -16; if (da == 0.0) {
if ( n1 > 0 )
{
dscal_kernel_16_zero(n1, x);
j=n1;
}
while(j < n) BLASLONG n1 = n & -16;
{ if (n1 > 0) {
x[j]=0.0; dscal_kernel_16_zero(n1, x);
j++; j = n1;
} }
} while (j < n) {
else
{
BLASLONG n1 = n & -16; x[j] = 0.0;
if ( n1 > 0 ) j++;
{ }
dscal_kernel_16(n1, da, x);
j=n1;
}
while(j < n)
{
x[j] = da * x[j] ; } else {
j++;
}
}
BLASLONG n1 = n & -16;
if (n1 > 0) {
dscal_kernel_16(n1, da, x);
j = n1;
}
while (j < n) {
x[j] = da * x[j];
j++;
}
} }
else
{
if ( da == 0.0 ) } else {
{
BLASLONG n1 = n & -4; if (da == 0.0) {
while (j < n1) { BLASLONG n1 = n & -4;
x[i]=0.0; while (j < n1) {
x[i + inc_x]=0.0;
x[i + 2 * inc_x]=0.0;
x[i + 3 * inc_x]=0.0;
i += inc_x * 4; x[i] = 0.0;
j += 4; x[i + inc_x] = 0.0;
x[i + 2 * inc_x] = 0.0;
x[i + 3 * inc_x] = 0.0;
} i += inc_x * 4;
while(j < n) j += 4;
{
x[i]=0.0; }
i += inc_x ; while (j < n) {
j++;
}
} x[i] = 0.0;
else i += inc_x;
{ j++;
BLASLONG n1 = n & -4; }
while (j < n1) { } else {
BLASLONG n1 = n & -4;
x[i] = da * x[i] ; while (j < n1) {
x[i + inc_x] = da * x[i + inc_x];
x[i + 2 * inc_x] = da * x[i + 2 * inc_x];
x[i + 3 * inc_x] = da * x[i + 3 * inc_x];
i += inc_x * 4; x[i] = da * x[i];
j += 4; x[i + inc_x] = da * x[i + inc_x];
x[i + 2 * inc_x] = da * x[i + 2 * inc_x];
x[i + 3 * inc_x] = da * x[i + 3 * inc_x];
} i += inc_x * 4;
j += 4;
while(j < n) }
{
x[i] = da * x[i] ; while (j < n) {
i += inc_x ;
j++;
}
}
x[i] = da * x[i];
i += inc_x;
j++;
}
} }
return 0;
}
return 0;
} }

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2018,The OpenBLAS Project Copyright (c) 2013-2019,The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms,with or without Redistribution and use in source and binary forms,with or without
modification,are permitted provided that the following conditions are modification,are permitted provided that the following conditions are
@ -27,144 +27,147 @@ USE OF THIS SOFTWARE,EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
static double dsdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) static double dsdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) {
{ double dot;
double dot;
__asm__ volatile ( __asm__("vzero %%v0\n\t"
"vzero %%v0 \n\t" "vzero %%v1\n\t"
"srlg %%r0,%1,4 \n\t" "vzero %%v2\n\t"
"xgr %%r1,%%r1 \n\t" "vzero %%v3\n\t"
"0: \n\t" "vzero %%v4\n\t"
"pfd 1,1024(%%r1,%2) \n\t" "vzero %%v5\n\t"
"pfd 1,1024(%%r1,%3) \n\t" "vzero %%v6\n\t"
"vzero %%v7\n\t"
"srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1,1024(%%r1,%[x])\n\t"
"pfd 1,1024(%%r1,%[y])\n\t"
"vlef %%v16,0(%%r1,%[x]),0\n\t"
"vlef %%v16,4(%%r1,%[x]),2\n\t"
"vlef %%v17,8(%%r1,%[x]),0\n\t"
"vlef %%v17,12(%%r1,%[x]),2\n\t"
"vlef %%v18,16(%%r1,%[x]),0\n\t"
"vlef %%v18,20(%%r1,%[x]),2\n\t"
"vlef %%v19,24(%%r1,%[x]),0\n\t"
"vlef %%v19,28(%%r1,%[x]),2\n\t"
"vlef %%v20,32(%%r1,%[x]),0\n\t"
"vlef %%v20,36(%%r1,%[x]),2\n\t"
"vlef %%v21,40(%%r1,%[x]),0\n\t"
"vlef %%v21,44(%%r1,%[x]),2\n\t"
"vlef %%v22,48(%%r1,%[x]),0\n\t"
"vlef %%v22,52(%%r1,%[x]),2\n\t"
"vlef %%v23,56(%%r1,%[x]),0\n\t"
"vlef %%v23,60(%%r1,%[x]),2\n\t"
"vflls %%v16,%%v16\n\t"
"vflls %%v17,%%v17\n\t"
"vflls %%v18,%%v18\n\t"
"vflls %%v19,%%v19\n\t"
"vflls %%v20,%%v20\n\t"
"vflls %%v21,%%v21\n\t"
"vflls %%v22,%%v22\n\t"
"vflls %%v23,%%v23\n\t"
"vlef %%v24,0(%%r1,%[y]),0\n\t"
"vlef %%v24,4(%%r1,%[y]),2\n\t"
"vflls %%v24,%%v24\n\t"
"vfmadb %%v0,%%v16,%%v24,%%v0\n\t"
"vlef %%v25,8(%%r1,%[y]),0\n\t"
"vlef %%v25,12(%%r1,%[y]),2\n\t"
"vflls %%v25,%%v25\n\t"
"vfmadb %%v1,%%v17,%%v25,%%v1\n\t"
"vlef %%v26,16(%%r1,%[y]),0\n\t"
"vlef %%v26,20(%%r1,%[y]),2\n\t"
"vflls %%v26,%%v26\n\t"
"vfmadb %%v2,%%v18,%%v26,%%v2\n\t"
"vlef %%v27,24(%%r1,%[y]),0\n\t"
"vlef %%v27,28(%%r1,%[y]),2\n\t"
"vflls %%v27,%%v27\n\t"
"vfmadb %%v3,%%v19,%%v27,%%v3\n\t"
"vlef %%v28,32(%%r1,%[y]),0\n\t"
"vlef %%v28,36(%%r1,%[y]),2\n\t"
"vflls %%v28,%%v28\n\t"
"vfmadb %%v4,%%v20,%%v28,%%v4\n\t"
"vlef %%v29,40(%%r1,%[y]),0\n\t"
"vlef %%v29,44(%%r1,%[y]),2\n\t"
"vflls %%v29,%%v29\n\t"
"vfmadb %%v5,%%v21,%%v29,%%v5\n\t"
"vlef %%v30,48(%%r1,%[y]),0\n\t"
"vlef %%v30,52(%%r1,%[y]),2\n\t"
"vflls %%v30,%%v30\n\t"
"vfmadb %%v6,%%v22,%%v30,%%v6\n\t"
"vlef %%v31,56(%%r1,%[y]),0\n\t"
"vlef %%v31,60(%%r1,%[y]),2\n\t"
"vflls %%v31,%%v31\n\t"
"vfmadb %%v7,%%v23,%%v31,%%v7\n\t"
"agfi %%r1,64\n\t"
"brctg %[n],0b\n\t"
"vfadb %%v0,%%v0,%%v1\n\t"
"vfadb %%v0,%%v0,%%v2\n\t"
"vfadb %%v0,%%v0,%%v3\n\t"
"vfadb %%v0,%%v0,%%v4\n\t"
"vfadb %%v0,%%v0,%%v5\n\t"
"vfadb %%v0,%%v0,%%v6\n\t"
"vfadb %%v0,%%v0,%%v7\n\t"
"vrepg %%v1,%%v0,1\n\t"
"adbr %%f0,%%f1\n\t"
"ldr %[dot],%%f0"
: [dot] "=f"(dot),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),
"m"(*(const struct { FLOAT x[n]; } *) y),[y] "a"(y)
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");
"vlef %%v16,0(%%r1,%2),0 \n\t" return dot;
"vlef %%v16,4(%%r1,%2),2 \n\t"
"vlef %%v17,8(%%r1,%2),0 \n\t"
"vlef %%v17,12(%%r1,%2),2 \n\t"
"vlef %%v18,16(%%r1,%2),0 \n\t"
"vlef %%v18,20(%%r1,%2),2 \n\t"
"vlef %%v19,24(%%r1,%2),0 \n\t"
"vlef %%v19,28(%%r1,%2),2 \n\t"
"vlef %%v20,32(%%r1,%2),0 \n\t"
"vlef %%v20,36(%%r1,%2),2 \n\t"
"vlef %%v21,40(%%r1,%2),0 \n\t"
"vlef %%v21,44(%%r1,%2),2 \n\t"
"vlef %%v22,48(%%r1,%2),0 \n\t"
"vlef %%v22,52(%%r1,%2),2 \n\t"
"vlef %%v23,56(%%r1,%2),0 \n\t"
"vlef %%v23,60(%%r1,%2),2 \n\t"
"vflls %%v16,%%v16 \n\t"
"vflls %%v17,%%v17 \n\t"
"vflls %%v18,%%v18 \n\t"
"vflls %%v19,%%v19 \n\t"
"vflls %%v20,%%v20 \n\t"
"vflls %%v21,%%v21 \n\t"
"vflls %%v22,%%v22 \n\t"
"vflls %%v23,%%v23 \n\t"
"vlef %%v24,0(%%r1,%3),0 \n\t"
"vlef %%v24,4(%%r1,%3),2 \n\t"
"vflls %%v24,%%v24 \n\t"
"vfmadb %%v0,%%v16,%%v24,%%v0 \n\t"
"vlef %%v25,8(%%r1,%3),0 \n\t"
"vlef %%v25,12(%%r1,%3),2 \n\t"
"vflls %%v25,%%v25 \n\t"
"vfmadb %%v0,%%v17,%%v25,%%v0 \n\t"
"vlef %%v26,16(%%r1,%3),0 \n\t"
"vlef %%v26,20(%%r1,%3),2 \n\t"
"vflls %%v26,%%v26 \n\t"
"vfmadb %%v0,%%v18,%%v26,%%v0 \n\t"
"vlef %%v27,24(%%r1,%3),0 \n\t"
"vlef %%v27,28(%%r1,%3),2 \n\t"
"vflls %%v27,%%v27 \n\t"
"vfmadb %%v0,%%v19,%%v27,%%v0 \n\t"
"vlef %%v28,32(%%r1,%3),0 \n\t"
"vlef %%v28,36(%%r1,%3),2 \n\t"
"vflls %%v28,%%v28 \n\t"
"vfmadb %%v0,%%v20,%%v28,%%v0 \n\t"
"vlef %%v29,40(%%r1,%3),0 \n\t"
"vlef %%v29,44(%%r1,%3),2 \n\t"
"vflls %%v29,%%v29 \n\t"
"vfmadb %%v0,%%v21,%%v29,%%v0 \n\t"
"vlef %%v30,48(%%r1,%3),0 \n\t"
"vlef %%v30,52(%%r1,%3),2 \n\t"
"vflls %%v30,%%v30 \n\t"
"vfmadb %%v0,%%v22,%%v30,%%v0 \n\t"
"vlef %%v31,56(%%r1,%3),0 \n\t"
"vlef %%v31,60(%%r1,%3),2 \n\t"
"vflls %%v31,%%v31 \n\t"
"vfmadb %%v0,%%v23,%%v31,%%v0 \n\t"
"agfi %%r1,64 \n\t"
"brctg %%r0,0b \n\t"
"vrepg %%v1,%%v0,1 \n\t"
"adbr %%f0,%%f1 \n\t"
"ldr %0,%%f0 "
:"=f"(dot)
:"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((const FLOAT (*)[n])y)
:"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return dot;
} }
double CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y) double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
{ BLASLONG i = 0;
BLASLONG i=0; BLASLONG ix = 0, iy = 0;
BLASLONG ix=0,iy=0;
double dot = 0.0 ; double dot = 0.0;
if ( n <= 0 ) return(dot); if (n <= 0)
return (dot);
if ( (inc_x == 1) && (inc_y == 1) ) if ((inc_x == 1) && (inc_y == 1)) {
{
BLASLONG n1 = n & -16; BLASLONG n1 = n & -16;
if ( n1 ) if (n1)
dot = dsdot_kernel_16(n1,x,y); dot = dsdot_kernel_16(n1, x, y);
i = n1; i = n1;
while(i < n) while (i < n) {
{
dot += (double) y[i] * (double) x[i] ; dot += (double) y[i] * (double) x[i];
i++ ; i++;
} }
return(dot); return (dot);
}
} BLASLONG n1 = n & -2;
BLASLONG n1 = n & -2; while (i < n1) {
while(i < n1) dot += (double) y[iy] * (double) x[ix];
{ dot += (double) y[iy + inc_y] * (double) x[ix + inc_x];
ix += inc_x * 2;
iy += inc_y * 2;
i += 2;
dot += (double) y[iy] * (double) x[ix]; }
dot += (double) y[iy+inc_y] * (double) x[ix+inc_x];
ix += inc_x*2 ;
iy += inc_y*2 ;
i+=2 ;
} while (i < n) {
while(i < n) dot += (double) y[iy] * (double) x[ix];
{ ix += inc_x;
iy += inc_y;
i++;
dot += (double) y[iy] * (double) x[ix] ; }
ix += inc_x ; return (dot);
iy += inc_y ;
i++ ;
}
return(dot);
} }

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -27,136 +27,125 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) {
{ __asm__("srlg %[n],%[n],5\n\t"
__asm__ volatile( "xgr %%r1,%%r1\n\t"
"srlg %%r0,%0,5 \n\t" "0:\n\t"
"xgr %%r1,%%r1 \n\t" "pfd 2, 1024(%%r1,%[x])\n\t"
"0: \n\t" "pfd 2, 1024(%%r1,%[y])\n\t"
"pfd 2, 1024(%%r1,%1) \n\t" "vl %%v16, 0(%%r1,%[x])\n\t"
"pfd 2, 1024(%%r1,%2) \n\t" "vl %%v17, 16(%%r1,%[x])\n\t"
"vl %%v18, 32(%%r1,%[x])\n\t"
"vl %%v16, 0(%%r1,%1) \n\t" "vl %%v19, 48(%%r1,%[x])\n\t"
"vl %%v17, 16(%%r1,%1) \n\t" "vl %%v20, 64(%%r1,%[x])\n\t"
"vl %%v18, 32(%%r1,%1) \n\t" "vl %%v21, 80(%%r1,%[x])\n\t"
"vl %%v19, 48(%%r1,%1) \n\t" "vl %%v22, 96(%%r1,%[x])\n\t"
"vl %%v20, 64(%%r1,%1) \n\t" "vl %%v23, 112(%%r1,%[x])\n\t"
"vl %%v21, 80(%%r1,%1) \n\t" "vl %%v24, 128(%%r1,%[x])\n\t"
"vl %%v22, 96(%%r1,%1) \n\t" "vl %%v25, 144(%%r1,%[x])\n\t"
"vl %%v23, 112(%%r1,%1) \n\t" "vl %%v26, 160(%%r1,%[x])\n\t"
"vl %%v24, 128(%%r1,%1) \n\t" "vl %%v27, 176(%%r1,%[x])\n\t"
"vl %%v25, 144(%%r1,%1) \n\t" "vl %%v28, 192(%%r1,%[x])\n\t"
"vl %%v26, 160(%%r1,%1) \n\t" "vl %%v29, 208(%%r1,%[x])\n\t"
"vl %%v27, 176(%%r1,%1) \n\t" "vl %%v30, 224(%%r1,%[x])\n\t"
"vl %%v28, 192(%%r1,%1) \n\t" "vl %%v31, 240(%%r1,%[x])\n\t"
"vl %%v29, 208(%%r1,%1) \n\t" "vl %%v0, 0(%%r1,%[y])\n\t"
"vl %%v30, 224(%%r1,%1) \n\t" "vl %%v1, 16(%%r1,%[y])\n\t"
"vl %%v31, 240(%%r1,%1) \n\t" "vl %%v2, 32(%%r1,%[y])\n\t"
"vl %%v3, 48(%%r1,%[y])\n\t"
"vl %%v0, 0(%%r1,%2) \n\t" "vl %%v4, 64(%%r1,%[y])\n\t"
"vl %%v1, 16(%%r1,%2) \n\t" "vl %%v5, 80(%%r1,%[y])\n\t"
"vl %%v2, 32(%%r1,%2) \n\t" "vl %%v6, 96(%%r1,%[y])\n\t"
"vl %%v3, 48(%%r1,%2) \n\t" "vl %%v7, 112(%%r1,%[y])\n\t"
"vl %%v4, 64(%%r1,%2) \n\t" "vst %%v0, 0(%%r1,%[x])\n\t"
"vl %%v5, 80(%%r1,%2) \n\t" "vst %%v1, 16(%%r1,%[x])\n\t"
"vl %%v6, 96(%%r1,%2) \n\t" "vst %%v2, 32(%%r1,%[x])\n\t"
"vl %%v7, 112(%%r1,%2) \n\t" "vst %%v3, 48(%%r1,%[x])\n\t"
"vst %%v0, 0(%%r1,%1) \n\t" "vst %%v4, 64(%%r1,%[x])\n\t"
"vst %%v1, 16(%%r1,%1) \n\t" "vst %%v5, 80(%%r1,%[x])\n\t"
"vst %%v2, 32(%%r1,%1) \n\t" "vst %%v6, 96(%%r1,%[x])\n\t"
"vst %%v3, 48(%%r1,%1) \n\t" "vst %%v7, 112(%%r1,%[x])\n\t"
"vst %%v4, 64(%%r1,%1) \n\t" "vl %%v0, 128(%%r1,%[y])\n\t"
"vst %%v5, 80(%%r1,%1) \n\t" "vl %%v1, 144(%%r1,%[y])\n\t"
"vst %%v6, 96(%%r1,%1) \n\t" "vl %%v2, 160(%%r1,%[y])\n\t"
"vst %%v7, 112(%%r1,%1) \n\t" "vl %%v3, 176(%%r1,%[y])\n\t"
"vl %%v4, 192(%%r1,%[y])\n\t"
"vl %%v0, 128(%%r1,%2) \n\t" "vl %%v5, 208(%%r1,%[y])\n\t"
"vl %%v1, 144(%%r1,%2) \n\t" "vl %%v6, 224(%%r1,%[y])\n\t"
"vl %%v2, 160(%%r1,%2) \n\t" "vl %%v7, 240(%%r1,%[y])\n\t"
"vl %%v3, 176(%%r1,%2) \n\t" "vst %%v0, 128(%%r1,%[x])\n\t"
"vl %%v4, 192(%%r1,%2) \n\t" "vst %%v1, 144(%%r1,%[x])\n\t"
"vl %%v5, 208(%%r1,%2) \n\t" "vst %%v2, 160(%%r1,%[x])\n\t"
"vl %%v6, 224(%%r1,%2) \n\t" "vst %%v3, 176(%%r1,%[x])\n\t"
"vl %%v7, 240(%%r1,%2) \n\t" "vst %%v4, 192(%%r1,%[x])\n\t"
"vst %%v0, 128(%%r1,%1) \n\t" "vst %%v5, 208(%%r1,%[x])\n\t"
"vst %%v1, 144(%%r1,%1) \n\t" "vst %%v6, 224(%%r1,%[x])\n\t"
"vst %%v2, 160(%%r1,%1) \n\t" "vst %%v7, 240(%%r1,%[x])\n\t"
"vst %%v3, 176(%%r1,%1) \n\t" "vst %%v16, 0(%%r1,%[y])\n\t"
"vst %%v4, 192(%%r1,%1) \n\t" "vst %%v17, 16(%%r1,%[y])\n\t"
"vst %%v5, 208(%%r1,%1) \n\t" "vst %%v18, 32(%%r1,%[y])\n\t"
"vst %%v6, 224(%%r1,%1) \n\t" "vst %%v19, 48(%%r1,%[y])\n\t"
"vst %%v7, 240(%%r1,%1) \n\t" "vst %%v20, 64(%%r1,%[y])\n\t"
"vst %%v21, 80(%%r1,%[y])\n\t"
"vst %%v16, 0(%%r1,%2) \n\t" "vst %%v22, 96(%%r1,%[y])\n\t"
"vst %%v17, 16(%%r1,%2) \n\t" "vst %%v23, 112(%%r1,%[y])\n\t"
"vst %%v18, 32(%%r1,%2) \n\t" "vst %%v24, 128(%%r1,%[y])\n\t"
"vst %%v19, 48(%%r1,%2) \n\t" "vst %%v25, 144(%%r1,%[y])\n\t"
"vst %%v20, 64(%%r1,%2) \n\t" "vst %%v26, 160(%%r1,%[y])\n\t"
"vst %%v21, 80(%%r1,%2) \n\t" "vst %%v27, 176(%%r1,%[y])\n\t"
"vst %%v22, 96(%%r1,%2) \n\t" "vst %%v28, 192(%%r1,%[y])\n\t"
"vst %%v23, 112(%%r1,%2) \n\t" "vst %%v29, 208(%%r1,%[y])\n\t"
"vst %%v24, 128(%%r1,%2) \n\t" "vst %%v30, 224(%%r1,%[y])\n\t"
"vst %%v25, 144(%%r1,%2) \n\t" "vst %%v31, 240(%%r1,%[y])\n\t"
"vst %%v26, 160(%%r1,%2) \n\t" "agfi %%r1,256\n\t"
"vst %%v27, 176(%%r1,%2) \n\t" "brctg %[n],0b"
"vst %%v28, 192(%%r1,%2) \n\t" : "+m"(*(struct { FLOAT x[n]; } *) x), "+m"(*(struct { FLOAT x[n]; } *) y),
"vst %%v29, 208(%%r1,%2) \n\t" [n] "+&r"(n)
"vst %%v30, 224(%%r1,%2) \n\t" : [x] "a"(x),[y] "a"(y)
"vst %%v31, 240(%%r1,%2) \n\t" : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"agfi %%r1,256 \n\t" "v27", "v28", "v29", "v30", "v31");
"brctg %%r0,0b "
:
:"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
} }
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,
{ BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
BLASLONG i=0; BLASLONG dummy2) {
BLASLONG ix=0,iy=0; BLASLONG i = 0;
FLOAT temp; BLASLONG ix = 0, iy = 0;
FLOAT temp;
if ( n <= 0 ) return(0); if (n <= 0)
return (0);
if ( (inc_x == 1) && (inc_y == 1 )) if ((inc_x == 1) && (inc_y == 1)) {
{
BLASLONG n1 = n & -32; BLASLONG n1 = n & -32;
if ( n1 > 0 ) if (n1 > 0) {
{ dswap_kernel_32(n1, x, y);
dswap_kernel_32(n1, x, y); i = n1;
i=n1; }
}
while(i < n)
{
temp = y[i];
y[i] = x[i] ;
x[i] = temp;
i++ ;
}
while (i < n) {
temp = y[i];
y[i] = x[i];
x[i] = temp;
i++;
} }
else
{
while(i < n) } else {
{
temp = y[iy];
y[iy] = x[ix] ;
x[ix] = temp;
ix += inc_x ;
iy += inc_y ;
i++ ;
} while (i < n) {
temp = y[iy];
y[iy] = x[ix];
x[ix] = temp;
ix += inc_x;
iy += inc_y;
i++;
} }
return(0);
}
return (0);
} }

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2017, The OpenBLAS Project Copyright (c) 2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -27,285 +27,276 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))
static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) #define CABS1(x,i) (fabsf(x[i]) + fabsf(x[i + 1]))
{
BLASLONG iamax;
__asm__ volatile ( static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) {
"vlef %%v0,0(%3),0 \n\t" BLASLONG iamax;
"vlef %%v1,4(%3),0 \n\t"
"vlef %%v0,8(%3),1 \n\t"
"vlef %%v1,12(%3),1 \n\t"
"vlef %%v0,16(%3),2 \n\t"
"vlef %%v1,20(%3),2 \n\t"
"vlef %%v0,24(%3),3 \n\t"
"vlef %%v1,28(%3),3 \n\t"
"vflpsb %%v0,%%v0 \n\t"
"vflpsb %%v1,%%v1 \n\t"
"vfasb %%v0,%%v0,%%v1 \n\t"
"vleig %%v1,0,0 \n\t"
"vleig %%v1,2,1 \n\t"
"vleig %%v2,1,0 \n\t"
"vleig %%v2,3,1 \n\t"
"vrepig %%v3,16 \n\t"
"vzero %%v4 \n\t"
"vleib %%v9,0,0 \n\t"
"vleib %%v9,1,1 \n\t"
"vleib %%v9,2,2 \n\t"
"vleib %%v9,3,3 \n\t"
"vleib %%v9,8,4 \n\t"
"vleib %%v9,9,5 \n\t"
"vleib %%v9,10,6 \n\t"
"vleib %%v9,11,7 \n\t"
"vleib %%v9,16,8 \n\t"
"vleib %%v9,17,9 \n\t"
"vleib %%v9,18,10 \n\t"
"vleib %%v9,19,11 \n\t"
"vleib %%v9,24,12 \n\t"
"vleib %%v9,25,13 \n\t"
"vleib %%v9,26,14 \n\t"
"vleib %%v9,27,15 \n\t"
"vleif %%v24,0,0 \n\t"
"vleif %%v24,1,1 \n\t"
"vleif %%v24,2,2 \n\t"
"vleif %%v24,3,3 \n\t"
"vleif %%v25,4,0 \n\t"
"vleif %%v25,5,1 \n\t"
"vleif %%v25,6,2 \n\t"
"vleif %%v25,7,3 \n\t"
"vleif %%v26,8,0 \n\t"
"vleif %%v26,9,1 \n\t"
"vleif %%v26,10,2 \n\t"
"vleif %%v26,11,3 \n\t"
"vleif %%v27,12,0 \n\t"
"vleif %%v27,13,1 \n\t"
"vleif %%v27,14,2 \n\t"
"vleif %%v27,15,3 \n\t"
"srlg %%r0,%2,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%3) \n\t"
"vl %%v16,0(%%r1,%3) \n\t" __asm__("vlef %%v0,0(%[x]),0\n\t"
"vl %%v28,16(%%r1,%3) \n\t" "vlef %%v1,4(%[x]),0\n\t"
"vpkg %%v17,%%v16,%%v28 \n\t" "vlef %%v0,8(%[x]),1\n\t"
"vperm %%v16,%%v16,%%v28,%%v9 \n\t" "vlef %%v1,12(%[x]),1\n\t"
"vlef %%v0,16(%[x]),2\n\t"
"vlef %%v1,20(%[x]),2\n\t"
"vlef %%v0,24(%[x]),3\n\t"
"vlef %%v1,28(%[x]),3\n\t"
"vflpsb %%v0,%%v0\n\t"
"vflpsb %%v1,%%v1\n\t"
"vfasb %%v0,%%v0,%%v1\n\t"
"vleig %%v1,0,0\n\t"
"vleig %%v1,2,1\n\t"
"vleig %%v2,1,0\n\t"
"vleig %%v2,3,1\n\t"
"vrepig %%v3,16\n\t"
"vzero %%v4\n\t"
"vleib %%v9,0,0\n\t"
"vleib %%v9,1,1\n\t"
"vleib %%v9,2,2\n\t"
"vleib %%v9,3,3\n\t"
"vleib %%v9,8,4\n\t"
"vleib %%v9,9,5\n\t"
"vleib %%v9,10,6\n\t"
"vleib %%v9,11,7\n\t"
"vleib %%v9,16,8\n\t"
"vleib %%v9,17,9\n\t"
"vleib %%v9,18,10\n\t"
"vleib %%v9,19,11\n\t"
"vleib %%v9,24,12\n\t"
"vleib %%v9,25,13\n\t"
"vleib %%v9,26,14\n\t"
"vleib %%v9,27,15\n\t"
"vleif %%v24,0,0\n\t"
"vleif %%v24,1,1\n\t"
"vleif %%v24,2,2\n\t"
"vleif %%v24,3,3\n\t"
"vleif %%v25,4,0\n\t"
"vleif %%v25,5,1\n\t"
"vleif %%v25,6,2\n\t"
"vleif %%v25,7,3\n\t"
"vleif %%v26,8,0\n\t"
"vleif %%v26,9,1\n\t"
"vleif %%v26,10,2\n\t"
"vleif %%v26,11,3\n\t"
"vleif %%v27,12,0\n\t"
"vleif %%v27,13,1\n\t"
"vleif %%v27,14,2\n\t"
"vleif %%v27,15,3\n\t"
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v28,16(%%r1,%[x])\n\t"
"vpkg %%v17,%%v16,%%v28\n\t"
"vperm %%v16,%%v16,%%v28,%%v9\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v29,48(%%r1,%[x])\n\t"
"vpkg %%v19,%%v18,%%v29\n\t"
"vperm %%v18,%%v18,%%v29,%%v9\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v30,80(%%r1,%[x])\n\t"
"vpkg %%v21,%%v20,%%v30\n\t"
"vperm %%v20,%%v20,%%v30,%%v9\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v31,112(%%r1,%[x])\n\t"
"vpkg %%v23,%%v22,%%v31\n\t"
"vperm %%v22,%%v22,%%v31,%%v9\n\t"
"vflpsb %%v16, %%v16\n\t"
"vflpsb %%v17, %%v17\n\t"
"vflpsb %%v18, %%v18\n\t"
"vflpsb %%v19, %%v19\n\t"
"vflpsb %%v20, %%v20\n\t"
"vflpsb %%v21, %%v21\n\t"
"vflpsb %%v22, %%v22\n\t"
"vflpsb %%v23, %%v23\n\t"
"vfasb %%v16,%%v16,%%v17\n\t"
"vfasb %%v17,%%v18,%%v19\n\t"
"vfasb %%v18,%%v20,%%v21\n\t"
"vfasb %%v19,%%v22,%%v23\n\t"
"vfchesb %%v5,%%v16,%%v17\n\t"
"vfchesb %%v6,%%v18,%%v19\n\t"
"vsel %%v16,%%v16,%%v17,%%v5\n\t"
"vsel %%v5,%%v24,%%v25,%%v5\n\t"
"vsel %%v17,%%v18,%%v19,%%v6\n\t"
"vsel %%v6,%%v26,%%v27,%%v6\n\t"
"vfchesb %%v18,%%v16,%%v17\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
"vsegf %%v6,%%v5\n\t"
"vesrlg %%v5,%%v5,32\n\t"
"vag %%v5,%%v5,%%v4\n\t"
"vag %%v6,%%v6,%%v4\n\t"
"vfchesb %%v7,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
"vsegf %%v8,%%v7\n\t"
"vesrlg %%v7,%%v7,32\n\t"
"vsegf %%v7,%%v7\n\t"
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v28,144(%%r1,%[x])\n\t"
"vpkg %%v17,%%v16,%%v28\n\t"
"vperm %%v16,%%v16,%%v28,%%v9\n\t"
"vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v29,176(%%r1,%[x])\n\t"
"vpkg %%v19,%%v18,%%v29\n\t"
"vperm %%v18,%%v18,%%v29,%%v9\n\t"
"vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v30,208(%%r1,%[x])\n\t"
"vpkg %%v21,%%v20,%%v30\n\t"
"vperm %%v20,%%v20,%%v30,%%v9\n\t"
"vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v31,240(%%r1,%[x])\n\t"
"vpkg %%v23,%%v22,%%v31\n\t"
"vperm %%v22,%%v22,%%v31,%%v9\n\t"
"vflpsb %%v16, %%v16\n\t"
"vflpsb %%v17, %%v17\n\t"
"vflpsb %%v18, %%v18\n\t"
"vflpsb %%v19, %%v19\n\t"
"vflpsb %%v20, %%v20\n\t"
"vflpsb %%v21, %%v21\n\t"
"vflpsb %%v22, %%v22\n\t"
"vflpsb %%v23, %%v23\n\t"
"vfasb %%v16,%%v16,%%v17\n\t"
"vfasb %%v17,%%v18,%%v19\n\t"
"vfasb %%v18,%%v20,%%v21\n\t"
"vfasb %%v19,%%v22,%%v23\n\t"
"vfchesb %%v5,%%v16,%%v17\n\t"
"vfchesb %%v6,%%v18,%%v19\n\t"
"vsel %%v16,%%v16,%%v17,%%v5\n\t"
"vsel %%v5,%%v24,%%v25,%%v5\n\t"
"vsel %%v17,%%v18,%%v19,%%v6\n\t"
"vsel %%v6,%%v26,%%v27,%%v6\n\t"
"vfchesb %%v18,%%v16,%%v17\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
"vsegf %%v6,%%v5\n\t"
"vesrlg %%v5,%%v5,32\n\t"
"vag %%v5,%%v5,%%v4\n\t"
"vag %%v6,%%v6,%%v4\n\t"
"vfchesb %%v7,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
"vsegf %%v8,%%v7\n\t"
"vesrlg %%v7,%%v7,32\n\t"
"vsegf %%v7,%%v7\n\t"
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
"veslg %%v3,%%v0,32\n\t"
"vfchsb %%v4,%%v0,%%v3\n\t"
"vchlg %%v5,%%v2,%%v1\n\t"
"vfcesb %%v6,%%v0,%%v3\n\t"
"vn %%v5,%%v5,%%v6\n\t"
"vo %%v4,%%v4,%%v5\n\t"
"vsel %%v0,%%v0,%%v3,%%v4\n\t"
"vesrlg %%v4,%%v4,32\n\t"
"vsegf %%v4,%%v4\n\t"
"vsel %%v1,%%v1,%%v2,%%v4\n\t"
"vrepf %%v2,%%v0,2\n\t"
"vrepg %%v3,%%v1,1\n\t"
"wfcsb %%v2,%%v0\n\t"
"jne 1f\n\t"
"vstef %%v0,%[amax],0\n\t"
"vmnlg %%v0,%%v1,%%v3\n\t"
"vlgvg %[iamax],%%v0,0\n\t"
"j 2f\n\t"
"1:\n\t"
"wfchsb %%v4,%%v2,%%v0\n\t"
"vesrlg %%v4,%%v4,32\n\t"
"vsegf %%v4,%%v4\n\t"
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
"ste %%f0,%[amax]\n\t"
"vlgvg %[iamax],%%v1,0\n\t"
"2:\n\t"
"nop"
: [iamax] "=r"(iamax),[amax] "=Q"(*amax),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8",
"v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
"v25", "v26", "v27", "v28", "v29", "v30", "v31");
"vl %%v18,32(%%r1,%3) \n\t" return iamax;
"vl %%v29,48(%%r1,%3) \n\t"
"vpkg %%v19,%%v18,%%v29 \n\t"
"vperm %%v18,%%v18,%%v29,%%v9 \n\t"
"vl %%v20,64(%%r1,%3) \n\t"
"vl %%v30,80(%%r1,%3) \n\t"
"vpkg %%v21,%%v20,%%v30 \n\t"
"vperm %%v20,%%v20,%%v30,%%v9 \n\t"
"vl %%v22,96(%%r1,%3) \n\t"
"vl %%v31,112(%%r1,%3) \n\t"
"vpkg %%v23,%%v22,%%v31 \n\t"
"vperm %%v22,%%v22,%%v31,%%v9 \n\t"
"vflpsb %%v16, %%v16 \n\t"
"vflpsb %%v17, %%v17 \n\t"
"vflpsb %%v18, %%v18 \n\t"
"vflpsb %%v19, %%v19 \n\t"
"vflpsb %%v20, %%v20 \n\t"
"vflpsb %%v21, %%v21 \n\t"
"vflpsb %%v22, %%v22 \n\t"
"vflpsb %%v23, %%v23 \n\t"
"vfasb %%v16,%%v16,%%v17 \n\t"
"vfasb %%v17,%%v18,%%v19 \n\t"
"vfasb %%v18,%%v20,%%v21 \n\t"
"vfasb %%v19,%%v22,%%v23 \n\t"
"vfchesb %%v5,%%v16,%%v17 \n\t"
"vfchesb %%v6,%%v18,%%v19 \n\t"
"vsel %%v16,%%v16,%%v17,%%v5 \n\t"
"vsel %%v5,%%v24,%%v25,%%v5 \n\t"
"vsel %%v17,%%v18,%%v19,%%v6 \n\t"
"vsel %%v6,%%v26,%%v27,%%v6 \n\t"
"vfchesb %%v18,%%v16,%%v17 \n\t"
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vsel %%v5,%%v5,%%v6,%%v18 \n\t"
"vsegf %%v6,%%v5 \n\t"
"vesrlg %%v5,%%v5,32 \n\t"
"vag %%v5,%%v5,%%v4 \n\t"
"vag %%v6,%%v6,%%v4 \n\t"
"vfchesb %%v7,%%v0,%%v16 \n\t"
"vsel %%v0,%%v0,%%v16,%%v7 \n\t"
"vsegf %%v8,%%v7 \n\t"
"vesrlg %%v7,%%v7,32 \n\t"
"vsegf %%v7,%%v7 \n\t"
"vsel %%v1,%%v1,%%v5,%%v7 \n\t"
"vsel %%v2,%%v2,%%v6,%%v8 \n\t"
"vag %%v4,%%v4,%%v3 \n\t"
"vl %%v16,128(%%r1,%3) \n\t"
"vl %%v28,144(%%r1,%3) \n\t"
"vpkg %%v17,%%v16,%%v28 \n\t"
"vperm %%v16,%%v16,%%v28,%%v9 \n\t"
"vl %%v18,160(%%r1,%3) \n\t"
"vl %%v29,176(%%r1,%3) \n\t"
"vpkg %%v19,%%v18,%%v29 \n\t"
"vperm %%v18,%%v18,%%v29,%%v9 \n\t"
"vl %%v20,192(%%r1,%3) \n\t"
"vl %%v30,208(%%r1,%3) \n\t"
"vpkg %%v21,%%v20,%%v30 \n\t"
"vperm %%v20,%%v20,%%v30,%%v9 \n\t"
"vl %%v22,224(%%r1,%3) \n\t"
"vl %%v31,240(%%r1,%3) \n\t"
"vpkg %%v23,%%v22,%%v31 \n\t"
"vperm %%v22,%%v22,%%v31,%%v9 \n\t"
"vflpsb %%v16, %%v16 \n\t"
"vflpsb %%v17, %%v17 \n\t"
"vflpsb %%v18, %%v18 \n\t"
"vflpsb %%v19, %%v19 \n\t"
"vflpsb %%v20, %%v20 \n\t"
"vflpsb %%v21, %%v21 \n\t"
"vflpsb %%v22, %%v22 \n\t"
"vflpsb %%v23, %%v23 \n\t"
"vfasb %%v16,%%v16,%%v17 \n\t"
"vfasb %%v17,%%v18,%%v19 \n\t"
"vfasb %%v18,%%v20,%%v21 \n\t"
"vfasb %%v19,%%v22,%%v23 \n\t"
"vfchesb %%v5,%%v16,%%v17 \n\t"
"vfchesb %%v6,%%v18,%%v19 \n\t"
"vsel %%v16,%%v16,%%v17,%%v5 \n\t"
"vsel %%v5,%%v24,%%v25,%%v5 \n\t"
"vsel %%v17,%%v18,%%v19,%%v6 \n\t"
"vsel %%v6,%%v26,%%v27,%%v6 \n\t"
"vfchesb %%v18,%%v16,%%v17 \n\t"
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vsel %%v5,%%v5,%%v6,%%v18 \n\t"
"vsegf %%v6,%%v5 \n\t"
"vesrlg %%v5,%%v5,32 \n\t"
"vag %%v5,%%v5,%%v4 \n\t"
"vag %%v6,%%v6,%%v4 \n\t"
"vfchesb %%v7,%%v0,%%v16 \n\t"
"vsel %%v0,%%v0,%%v16,%%v7 \n\t"
"vsegf %%v8,%%v7 \n\t"
"vesrlg %%v7,%%v7,32 \n\t"
"vsegf %%v7,%%v7 \n\t"
"vsel %%v1,%%v1,%%v5,%%v7 \n\t"
"vsel %%v2,%%v2,%%v6,%%v8 \n\t"
"vag %%v4,%%v4,%%v3 \n\t"
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"
"veslg %%v3,%%v0,32 \n\t"
"vfchsb %%v4,%%v0,%%v3 \n\t"
"vchlg %%v5,%%v2,%%v1 \n\t"
"vfcesb %%v6,%%v0,%%v3 \n\t"
"vn %%v5,%%v5,%%v6 \n\t"
"vo %%v4,%%v4,%%v5 \n\t"
"vsel %%v0,%%v0,%%v3,%%v4 \n\t"
"vesrlg %%v4,%%v4,32 \n\t"
"vsegf %%v4,%%v4 \n\t"
"vsel %%v1,%%v1,%%v2,%%v4 \n\t"
"vrepf %%v2,%%v0,2 \n\t"
"vrepg %%v3,%%v1,1 \n\t"
"wfcsb %%v2,%%v0 \n\t"
"jne 1f \n\t"
"vstef %%v0,%1,0 \n\t"
"vmnlg %%v0,%%v1,%%v3 \n\t"
"vlgvg %0,%%v0,0 \n\t"
"j 2f \n\t"
"1: \n\t"
"wfchsb %%v4,%%v2,%%v0 \n\t"
"vesrlg %%v4,%%v4,32 \n\t"
"vsegf %%v4,%%v4 \n\t"
"vsel %%v1,%%v3,%%v1,%%v4 \n\t"
"vsel %%v0,%%v2,%%v0,%%v4 \n\t"
"ste %%f0,%1 \n\t"
"vlgvg %0,%%v1,0 \n\t"
"2: \n\t"
"nop "
:"=r"(iamax),"=m"(*amax)
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return iamax;
} }
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
{ BLASLONG i = 0;
BLASLONG i = 0; BLASLONG ix = 0;
BLASLONG ix = 0; FLOAT maxf = 0;
FLOAT maxf = 0; BLASLONG max = 0;
BLASLONG max = 0; BLASLONG inc_x2;
BLASLONG inc_x2;
if (n <= 0 || inc_x <= 0) return(max); if (n <= 0 || inc_x <= 0)
return (max);
if (inc_x == 1) {
BLASLONG n1 = n & -32; if (inc_x == 1) {
if (n1 > 0) {
max = icamax_kernel_32(n1, x, &maxf); BLASLONG n1 = n & -32;
ix = n1 * 2; if (n1 > 0) {
i = n1;
}
else
{
maxf = CABS1(x,0);
ix += 2;
i++;
}
while(i < n)
{
if( CABS1(x,ix) > maxf )
{
max = i;
maxf = CABS1(x,ix);
}
ix += 2;
i++;
}
return (max + 1);
max = icamax_kernel_32(n1, x, &maxf);
ix = n1 * 2;
i = n1;
} else { } else {
maxf = CABS1(x, 0);
ix += 2;
i++;
}
while (i < n) {
if (CABS1(x, ix) > maxf) {
max = i;
maxf = CABS1(x, ix);
}
ix += 2;
i++;
}
return (max + 1);
} else {
max = 0; max = 0;
maxf = CABS1(x,0); maxf = CABS1(x, 0);
inc_x2 = 2 * inc_x; inc_x2 = 2 * inc_x;
ix += inc_x2;
i++;
while(i < n) BLASLONG n1 = n & -4;
{ while (i < n1) {
if( CABS1(x,ix) > maxf )
{ if (CABS1(x, ix) > maxf) {
max = i; max = i;
maxf = CABS1(x,ix); maxf = CABS1(x, ix);
} }
ix += inc_x2; if (CABS1(x, ix + inc_x2) > maxf) {
i++; max = i + 1;
maxf = CABS1(x, ix + inc_x2);
}
if (CABS1(x, ix + 2 * inc_x2) > maxf) {
max = i + 2;
maxf = CABS1(x, ix + 2 * inc_x2);
}
if (CABS1(x, ix + 3 * inc_x2) > maxf) {
max = i + 3;
maxf = CABS1(x, ix + 3 * inc_x2);
}
ix += inc_x2 * 4;
i += 4;
} }
return (max + 1);
while (i < n) {
if (CABS1(x, ix) > maxf) {
max = i;
maxf = CABS1(x, ix);
}
ix += inc_x2;
i++;
} }
return (max + 1);
}
} }

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2017, The OpenBLAS Project Copyright (c) 2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -27,285 +27,276 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))
/*
 * icamin_kernel_32: inline-assembly scan over n single-precision FLOAT
 * pairs of x.  The driver in this file passes n & -32, and the loop count
 * is n >> 5, so n is expected to be a positive multiple of 32.  Stores the
 * smallest |x[2k]| + |x[2k + 1]| through amin and returns the index k it
 * was found at (the driver adds 1 to the result, so k is 0-based).
 * NOTE(review): this text looks like a two-column diff rendering (removed
 * revision on the left, added revision on the right); the instruction
 * sequence therefore appears twice, once with numbered operands (%2, %3)
 * and once with named operands (%[n], %[x]).
 * NOTE(review): the instruction descriptions below follow the usual
 * z/Architecture vector-facility mnemonics - confirm against the ISA manual.
 */
static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) #define CABS1(x,i) (fabsf(x[i]) + fabsf(x[i + 1]))
{
BLASLONG iamin;
__asm__ volatile ( static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) {
"vlef %%v0,0(%3),0 \n\t" BLASLONG iamin;
"vlef %%v1,4(%3),0 \n\t"
"vlef %%v0,8(%3),1 \n\t"
"vlef %%v1,12(%3),1 \n\t"
"vlef %%v0,16(%3),2 \n\t"
"vlef %%v1,20(%3),2 \n\t"
"vlef %%v0,24(%3),3 \n\t"
"vlef %%v1,28(%3),3 \n\t"
"vflpsb %%v0,%%v0 \n\t"
"vflpsb %%v1,%%v1 \n\t"
"vfasb %%v0,%%v0,%%v1 \n\t"
"vleig %%v1,0,0 \n\t"
"vleig %%v1,2,1 \n\t"
"vleig %%v2,1,0 \n\t"
"vleig %%v2,3,1 \n\t"
"vrepig %%v3,16 \n\t"
"vzero %%v4 \n\t"
"vleib %%v9,0,0 \n\t"
"vleib %%v9,1,1 \n\t"
"vleib %%v9,2,2 \n\t"
"vleib %%v9,3,3 \n\t"
"vleib %%v9,8,4 \n\t"
"vleib %%v9,9,5 \n\t"
"vleib %%v9,10,6 \n\t"
"vleib %%v9,11,7 \n\t"
"vleib %%v9,16,8 \n\t"
"vleib %%v9,17,9 \n\t"
"vleib %%v9,18,10 \n\t"
"vleib %%v9,19,11 \n\t"
"vleib %%v9,24,12 \n\t"
"vleib %%v9,25,13 \n\t"
"vleib %%v9,26,14 \n\t"
"vleib %%v9,27,15 \n\t"
"vleif %%v24,0,0 \n\t"
"vleif %%v24,1,1 \n\t"
"vleif %%v24,2,2 \n\t"
"vleif %%v24,3,3 \n\t"
"vleif %%v25,4,0 \n\t"
"vleif %%v25,5,1 \n\t"
"vleif %%v25,6,2 \n\t"
"vleif %%v25,7,3 \n\t"
"vleif %%v26,8,0 \n\t"
"vleif %%v26,9,1 \n\t"
"vleif %%v26,10,2 \n\t"
"vleif %%v26,11,3 \n\t"
"vleif %%v27,12,0 \n\t"
"vleif %%v27,13,1 \n\t"
"vleif %%v27,14,2 \n\t"
"vleif %%v27,15,3 \n\t"
"srlg %%r0,%2,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%3) \n\t"
"vl %%v16,0(%%r1,%3) \n\t" __asm__("vlef %%v0,0(%[x]),0\n\t"
"vl %%v28,16(%%r1,%3) \n\t" "vlef %%v1,4(%[x]),0\n\t"
"vpkg %%v17,%%v16,%%v28 \n\t" "vlef %%v0,8(%[x]),1\n\t"
"vperm %%v16,%%v16,%%v28,%%v9 \n\t" "vlef %%v1,12(%[x]),1\n\t"
"vlef %%v0,16(%[x]),2\n\t"
"vlef %%v1,20(%[x]),2\n\t"
"vlef %%v0,24(%[x]),3\n\t"
"vlef %%v1,28(%[x]),3\n\t"
/* v0 = |even FLOATs| + |odd FLOATs| of the first four pairs: the
   initial running minimum, one value per 32-bit lane. */
"vflpsb %%v0,%%v0\n\t"
"vflpsb %%v1,%%v1\n\t"
"vfasb %%v0,%%v0,%%v1\n\t"
/* v1 = {0, 2} and v2 = {1, 3}: running indices held as 64-bit values;
   v3 = 16 is the index step per 16-pair half, v4 = running base index. */
"vleig %%v1,0,0\n\t"
"vleig %%v1,2,1\n\t"
"vleig %%v2,1,0\n\t"
"vleig %%v2,3,1\n\t"
"vrepig %%v3,16\n\t"
"vzero %%v4\n\t"
/* v9: permute mask with byte numbers 0-3, 8-11, 16-19, 24-27, used by
   the vperm instructions in the loop. */
"vleib %%v9,0,0\n\t"
"vleib %%v9,1,1\n\t"
"vleib %%v9,2,2\n\t"
"vleib %%v9,3,3\n\t"
"vleib %%v9,8,4\n\t"
"vleib %%v9,9,5\n\t"
"vleib %%v9,10,6\n\t"
"vleib %%v9,11,7\n\t"
"vleib %%v9,16,8\n\t"
"vleib %%v9,17,9\n\t"
"vleib %%v9,18,10\n\t"
"vleib %%v9,19,11\n\t"
"vleib %%v9,24,12\n\t"
"vleib %%v9,25,13\n\t"
"vleib %%v9,26,14\n\t"
"vleib %%v9,27,15\n\t"
/* v24..v27: constant lane indices 0..15 for the 16 pairs of one half. */
"vleif %%v24,0,0\n\t"
"vleif %%v24,1,1\n\t"
"vleif %%v24,2,2\n\t"
"vleif %%v24,3,3\n\t"
"vleif %%v25,4,0\n\t"
"vleif %%v25,5,1\n\t"
"vleif %%v25,6,2\n\t"
"vleif %%v25,7,3\n\t"
"vleif %%v26,8,0\n\t"
"vleif %%v26,9,1\n\t"
"vleif %%v26,10,2\n\t"
"vleif %%v26,11,3\n\t"
"vleif %%v27,12,0\n\t"
"vleif %%v27,13,1\n\t"
"vleif %%v27,14,2\n\t"
"vleif %%v27,15,3\n\t"
/* Loop count = n >> 5, computed in place in %[n]; r1 = byte offset. */
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
/* First half of the iteration: bytes 0..127 (16 pairs), split by
   vpkg / vperm into two vectors per 32 bytes. */
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v28,16(%%r1,%[x])\n\t"
"vpkg %%v17,%%v16,%%v28\n\t"
"vperm %%v16,%%v16,%%v28,%%v9\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v29,48(%%r1,%[x])\n\t"
"vpkg %%v19,%%v18,%%v29\n\t"
"vperm %%v18,%%v18,%%v29,%%v9\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v30,80(%%r1,%[x])\n\t"
"vpkg %%v21,%%v20,%%v30\n\t"
"vperm %%v20,%%v20,%%v30,%%v9\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v31,112(%%r1,%[x])\n\t"
"vpkg %%v23,%%v22,%%v31\n\t"
"vperm %%v22,%%v22,%%v31,%%v9\n\t"
/* Absolute values, then pairwise sums: v16..v19 hold 16 CABS1 values. */
"vflpsb %%v16, %%v16\n\t"
"vflpsb %%v17, %%v17\n\t"
"vflpsb %%v18, %%v18\n\t"
"vflpsb %%v19, %%v19\n\t"
"vflpsb %%v20, %%v20\n\t"
"vflpsb %%v21, %%v21\n\t"
"vflpsb %%v22, %%v22\n\t"
"vflpsb %%v23, %%v23\n\t"
"vfasb %%v16,%%v16,%%v17\n\t"
"vfasb %%v17,%%v18,%%v19\n\t"
"vfasb %%v18,%%v20,%%v21\n\t"
"vfasb %%v19,%%v22,%%v23\n\t"
/* Compare/select tree: reduce the four candidate vectors to one (v16)
   while carrying the matching lane indices along (v5). */
"vfchesb %%v5,%%v17,%%v16\n\t"
"vfchesb %%v6,%%v19,%%v18\n\t"
"vsel %%v16,%%v16,%%v17,%%v5\n\t"
"vsel %%v5,%%v24,%%v25,%%v5\n\t"
"vsel %%v17,%%v18,%%v19,%%v6\n\t"
"vsel %%v6,%%v26,%%v27,%%v6\n\t"
"vfchesb %%v18,%%v17,%%v16\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
/* Widen the selected indices to 64 bit, add the base index v4, merge
   into the running minimum v0 and its indices v1 / v2, then bump the
   base index by 16. */
"vsegf %%v6,%%v5\n\t"
"vesrlg %%v5,%%v5,32\n\t"
"vag %%v5,%%v5,%%v4\n\t"
"vag %%v6,%%v6,%%v4\n\t"
"vfchesb %%v7,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
"vsegf %%v8,%%v7\n\t"
"vesrlg %%v7,%%v7,32\n\t"
"vsegf %%v7,%%v7\n\t"
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
"vag %%v4,%%v4,%%v3\n\t"
/* Second half of the iteration: bytes 128..255, same sequence. */
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v28,144(%%r1,%[x])\n\t"
"vpkg %%v17,%%v16,%%v28\n\t"
"vperm %%v16,%%v16,%%v28,%%v9\n\t"
"vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v29,176(%%r1,%[x])\n\t"
"vpkg %%v19,%%v18,%%v29\n\t"
"vperm %%v18,%%v18,%%v29,%%v9\n\t"
"vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v30,208(%%r1,%[x])\n\t"
"vpkg %%v21,%%v20,%%v30\n\t"
"vperm %%v20,%%v20,%%v30,%%v9\n\t"
"vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v31,240(%%r1,%[x])\n\t"
"vpkg %%v23,%%v22,%%v31\n\t"
"vperm %%v22,%%v22,%%v31,%%v9\n\t"
"vflpsb %%v16, %%v16\n\t"
"vflpsb %%v17, %%v17\n\t"
"vflpsb %%v18, %%v18\n\t"
"vflpsb %%v19, %%v19\n\t"
"vflpsb %%v20, %%v20\n\t"
"vflpsb %%v21, %%v21\n\t"
"vflpsb %%v22, %%v22\n\t"
"vflpsb %%v23, %%v23\n\t"
"vfasb %%v16,%%v16,%%v17\n\t"
"vfasb %%v17,%%v18,%%v19\n\t"
"vfasb %%v18,%%v20,%%v21\n\t"
"vfasb %%v19,%%v22,%%v23\n\t"
"vfchesb %%v5,%%v17,%%v16\n\t"
"vfchesb %%v6,%%v19,%%v18\n\t"
"vsel %%v16,%%v16,%%v17,%%v5\n\t"
"vsel %%v5,%%v24,%%v25,%%v5\n\t"
"vsel %%v17,%%v18,%%v19,%%v6\n\t"
"vsel %%v6,%%v26,%%v27,%%v6\n\t"
"vfchesb %%v18,%%v17,%%v16\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
"vsegf %%v6,%%v5\n\t"
"vesrlg %%v5,%%v5,32\n\t"
"vag %%v5,%%v5,%%v4\n\t"
"vag %%v6,%%v6,%%v4\n\t"
"vfchesb %%v7,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
"vsegf %%v8,%%v7\n\t"
"vesrlg %%v7,%%v7,32\n\t"
"vsegf %%v7,%%v7\n\t"
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
"vag %%v4,%%v4,%%v3\n\t"
/* Advance 256 bytes (32 pairs of 4-byte FLOATs) and loop on %[n]. */
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
/* Horizontal reduction of the per-lane results: the value is stored
   through %[amin] and the index is moved into %[iamin]. */
"veslg %%v3,%%v0,32\n\t"
"vfchsb %%v4,%%v3,%%v0\n\t"
"vchlg %%v5,%%v2,%%v1\n\t"
"vfcesb %%v6,%%v0,%%v3\n\t"
"vn %%v5,%%v5,%%v6\n\t"
"vo %%v4,%%v4,%%v5\n\t"
"vsel %%v0,%%v0,%%v3,%%v4\n\t"
"vesrlg %%v4,%%v4,32\n\t"
"vsegf %%v4,%%v4\n\t"
"vsel %%v1,%%v1,%%v2,%%v4\n\t"
"vrepf %%v2,%%v0,2\n\t"
"vrepg %%v3,%%v1,1\n\t"
"wfcsb %%v2,%%v0\n\t"
"jne 1f\n\t"
"vstef %%v0,%[amin],0\n\t"
"vmnlg %%v0,%%v1,%%v3\n\t"
"vlgvg %[iamin],%%v0,0\n\t"
"j 2f\n\t"
"1:\n\t"
"wfchsb %%v4,%%v0,%%v2\n\t"
"vesrlg %%v4,%%v4,32\n\t"
"vsegf %%v4,%%v4\n\t"
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
"ste %%f0,%[amin]\n\t"
"vlgvg %[iamin],%%v1,0\n\t"
"2:\n\t"
"nop"
/* Added revision: n is a read-write early-clobber operand because it is
   shifted and counted down in place; the read of x is described by a
   sized "m" input, and no "memory" or r0 clobber is listed. */
: [iamin] "=r"(iamin),[amin] "=Q"(*amin),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8",
"v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
"v25", "v26", "v27", "v28", "v29", "v30", "v31");
"vl %%v18,32(%%r1,%3) \n\t" return iamin;
"vl %%v29,48(%%r1,%3) \n\t"
"vpkg %%v19,%%v18,%%v29 \n\t"
"vperm %%v18,%%v18,%%v29,%%v9 \n\t"
"vl %%v20,64(%%r1,%3) \n\t"
"vl %%v30,80(%%r1,%3) \n\t"
"vpkg %%v21,%%v20,%%v30 \n\t"
"vperm %%v20,%%v20,%%v30,%%v9 \n\t"
"vl %%v22,96(%%r1,%3) \n\t"
"vl %%v31,112(%%r1,%3) \n\t"
"vpkg %%v23,%%v22,%%v31 \n\t"
"vperm %%v22,%%v22,%%v31,%%v9 \n\t"
"vflpsb %%v16, %%v16 \n\t"
"vflpsb %%v17, %%v17 \n\t"
"vflpsb %%v18, %%v18 \n\t"
"vflpsb %%v19, %%v19 \n\t"
"vflpsb %%v20, %%v20 \n\t"
"vflpsb %%v21, %%v21 \n\t"
"vflpsb %%v22, %%v22 \n\t"
"vflpsb %%v23, %%v23 \n\t"
"vfasb %%v16,%%v16,%%v17 \n\t"
"vfasb %%v17,%%v18,%%v19 \n\t"
"vfasb %%v18,%%v20,%%v21 \n\t"
"vfasb %%v19,%%v22,%%v23 \n\t"
"vfchesb %%v5,%%v17,%%v16 \n\t"
"vfchesb %%v6,%%v19,%%v18 \n\t"
"vsel %%v16,%%v16,%%v17,%%v5 \n\t"
"vsel %%v5,%%v24,%%v25,%%v5 \n\t"
"vsel %%v17,%%v18,%%v19,%%v6 \n\t"
"vsel %%v6,%%v26,%%v27,%%v6 \n\t"
"vfchesb %%v18,%%v17,%%v16 \n\t"
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vsel %%v5,%%v5,%%v6,%%v18 \n\t"
"vsegf %%v6,%%v5 \n\t"
"vesrlg %%v5,%%v5,32 \n\t"
"vag %%v5,%%v5,%%v4 \n\t"
"vag %%v6,%%v6,%%v4 \n\t"
"vfchesb %%v7,%%v16,%%v0 \n\t"
"vsel %%v0,%%v0,%%v16,%%v7 \n\t"
"vsegf %%v8,%%v7 \n\t"
"vesrlg %%v7,%%v7,32 \n\t"
"vsegf %%v7,%%v7 \n\t"
"vsel %%v1,%%v1,%%v5,%%v7 \n\t"
"vsel %%v2,%%v2,%%v6,%%v8 \n\t"
"vag %%v4,%%v4,%%v3 \n\t"
"vl %%v16,128(%%r1,%3) \n\t"
"vl %%v28,144(%%r1,%3) \n\t"
"vpkg %%v17,%%v16,%%v28 \n\t"
"vperm %%v16,%%v16,%%v28,%%v9 \n\t"
"vl %%v18,160(%%r1,%3) \n\t"
"vl %%v29,176(%%r1,%3) \n\t"
"vpkg %%v19,%%v18,%%v29 \n\t"
"vperm %%v18,%%v18,%%v29,%%v9 \n\t"
"vl %%v20,192(%%r1,%3) \n\t"
"vl %%v30,208(%%r1,%3) \n\t"
"vpkg %%v21,%%v20,%%v30 \n\t"
"vperm %%v20,%%v20,%%v30,%%v9 \n\t"
"vl %%v22,224(%%r1,%3) \n\t"
"vl %%v31,240(%%r1,%3) \n\t"
"vpkg %%v23,%%v22,%%v31 \n\t"
"vperm %%v22,%%v22,%%v31,%%v9 \n\t"
"vflpsb %%v16, %%v16 \n\t"
"vflpsb %%v17, %%v17 \n\t"
"vflpsb %%v18, %%v18 \n\t"
"vflpsb %%v19, %%v19 \n\t"
"vflpsb %%v20, %%v20 \n\t"
"vflpsb %%v21, %%v21 \n\t"
"vflpsb %%v22, %%v22 \n\t"
"vflpsb %%v23, %%v23 \n\t"
"vfasb %%v16,%%v16,%%v17 \n\t"
"vfasb %%v17,%%v18,%%v19 \n\t"
"vfasb %%v18,%%v20,%%v21 \n\t"
"vfasb %%v19,%%v22,%%v23 \n\t"
"vfchesb %%v5,%%v17,%%v16 \n\t"
"vfchesb %%v6,%%v19,%%v18 \n\t"
"vsel %%v16,%%v16,%%v17,%%v5 \n\t"
"vsel %%v5,%%v24,%%v25,%%v5 \n\t"
"vsel %%v17,%%v18,%%v19,%%v6 \n\t"
"vsel %%v6,%%v26,%%v27,%%v6 \n\t"
"vfchesb %%v18,%%v17,%%v16 \n\t"
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vsel %%v5,%%v5,%%v6,%%v18 \n\t"
"vsegf %%v6,%%v5 \n\t"
"vesrlg %%v5,%%v5,32 \n\t"
"vag %%v5,%%v5,%%v4 \n\t"
"vag %%v6,%%v6,%%v4 \n\t"
"vfchesb %%v7,%%v16,%%v0 \n\t"
"vsel %%v0,%%v0,%%v16,%%v7 \n\t"
"vsegf %%v8,%%v7 \n\t"
"vesrlg %%v7,%%v7,32 \n\t"
"vsegf %%v7,%%v7 \n\t"
"vsel %%v1,%%v1,%%v5,%%v7 \n\t"
"vsel %%v2,%%v2,%%v6,%%v8 \n\t"
"vag %%v4,%%v4,%%v3 \n\t"
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"
"veslg %%v3,%%v0,32 \n\t"
"vfchsb %%v4,%%v3,%%v0 \n\t"
"vchlg %%v5,%%v2,%%v1 \n\t"
"vfcesb %%v6,%%v0,%%v3 \n\t"
"vn %%v5,%%v5,%%v6 \n\t"
"vo %%v4,%%v4,%%v5 \n\t"
"vsel %%v0,%%v0,%%v3,%%v4 \n\t"
"vesrlg %%v4,%%v4,32 \n\t"
"vsegf %%v4,%%v4 \n\t"
"vsel %%v1,%%v1,%%v2,%%v4 \n\t"
"vrepf %%v2,%%v0,2 \n\t"
"vrepg %%v3,%%v1,1 \n\t"
"wfcsb %%v2,%%v0 \n\t"
"jne 1f \n\t"
"vstef %%v0,%1,0 \n\t"
"vmnlg %%v0,%%v1,%%v3 \n\t"
"vlgvg %0,%%v0,0 \n\t"
"j 2f \n\t"
"1: \n\t"
"wfchsb %%v4,%%v0,%%v2 \n\t"
"vesrlg %%v4,%%v4,32 \n\t"
"vsegf %%v4,%%v4 \n\t"
"vsel %%v1,%%v3,%%v1,%%v4 \n\t"
"vsel %%v0,%%v2,%%v0,%%v4 \n\t"
"ste %%f0,%1 \n\t"
"vlgvg %0,%%v1,0 \n\t"
"2: \n\t"
"nop "
/* Removed revision: numbered operands, r0 as the loop counter and a
   blanket "memory" clobber. */
:"=r"(iamin),"=m"(*amin)
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return iamin;
} }
/*
 * CNAME (icamin driver): returns the 1-based position of the FLOAT pair
 * (x[ix], x[ix + 1]) whose CABS1 value |x[ix]| + |x[ix + 1]| is smallest.
 * Returns 0 when n <= 0 or inc_x <= 0.  inc_x is the stride counted in
 * pairs, so consecutive pairs are 2 * inc_x FLOATs apart.
 * NOTE(review): this text looks like a two-column diff rendering (removed
 * revision on the left, added revision on the right), so most statements
 * appear twice - either on one line or in two consecutive runs.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
{ BLASLONG i = 0;
BLASLONG i = 0; BLASLONG ix = 0;
BLASLONG ix = 0; FLOAT minf = 0;
FLOAT minf = 0; BLASLONG min = 0;
BLASLONG min = 0; BLASLONG inc_x2;
BLASLONG inc_x2;
if (n <= 0 || inc_x <= 0) return(min); if (n <= 0 || inc_x <= 0)
return (min);
if (inc_x == 1) {
BLASLONG n1 = n & -32; if (inc_x == 1) {
if (n1 > 0) {
min = icamin_kernel_32(n1, x, &minf); BLASLONG n1 = n & -32;
ix = n1 * 2; if (n1 > 0) {
i = n1;
}
else
{
minf = CABS1(x,0);
ix += 2;
i++;
}
while(i < n)
{
if( CABS1(x,ix) < minf )
{
min = i;
minf = CABS1(x,ix);
}
ix += 2;
i++;
}
return (min + 1);
/* Contiguous case: the kernel handles the first n1 = n & -32 pairs and
   reports their minimum through minf; the scalar scan resumes at pair n1
   (FLOAT offset 2 * n1). */
min = icamin_kernel_32(n1, x, &minf);
ix = n1 * 2;
i = n1;
} else { } else {
/* Fewer than 32 pairs: seed with pair 0 and scan from pair 1. */
minf = CABS1(x, 0);
ix += 2;
i++;
}
/* Scalar tail; the strict '<' keeps the earliest position on ties. */
while (i < n) {
if (CABS1(x, ix) < minf) {
min = i;
minf = CABS1(x, ix);
}
ix += 2;
i++;
}
return (min + 1);
} else {
/* Strided case: seed with pair 0; the added revision then walks the
   data four pairs per iteration up to n1 = n & -4. */
min = 0; min = 0;
minf = CABS1(x,0); minf = CABS1(x, 0);
inc_x2 = 2 * inc_x; inc_x2 = 2 * inc_x;
ix += inc_x2;
i++;
while(i < n) BLASLONG n1 = n & -4;
{ while (i < n1) {
if( CABS1(x,ix) < minf )
{ if (CABS1(x, ix) < minf) {
min = i; min = i;
minf = CABS1(x,ix); minf = CABS1(x, ix);
} }
ix += inc_x2; if (CABS1(x, ix + inc_x2) < minf) {
i++; min = i + 1;
minf = CABS1(x, ix + inc_x2);
}
if (CABS1(x, ix + 2 * inc_x2) < minf) {
min = i + 2;
minf = CABS1(x, ix + 2 * inc_x2);
}
if (CABS1(x, ix + 3 * inc_x2) < minf) {
min = i + 3;
minf = CABS1(x, ix + 3 * inc_x2);
}
ix += inc_x2 * 4;
i += 4;
} }
return (min + 1);
/* Remaining (n - n1) pairs after the 4-way unrolled loop. */
while (i < n) {
if (CABS1(x, ix) < minf) {
min = i;
minf = CABS1(x, ix);
}
ix += inc_x2;
i++;
} }
return (min + 1);
}
} }

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -28,237 +28,218 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>
#if defined(DOUBLE)
#define ABS fabs #define ABS fabs
#else
#define ABS fabsf
#endif
/*
 * idamax_kernel_32: inline-assembly scan over n double-precision values of
 * x.  The driver in this file passes n & -32, and the loop count is n >> 5,
 * so n is expected to be a positive multiple of 32.  Stores the largest
 * |x[k]| through amax and returns the index k it was found at (the driver
 * adds 1 to the result, so k is 0-based).
 * Register roles visible in the setup: v0 = running maximum (two lanes),
 * v1 = its indices, v2 = 16 (index step per 16-element half), v3 = running
 * base index, v24..v31 = constant lane indices 0..15.
 * NOTE(review): this text looks like a two-column diff rendering (removed
 * revision on the left, added revision on the right); the instruction
 * sequence therefore appears twice, once with numbered operands (%2, %3)
 * and once with named operands (%[n], %[x]).
 * NOTE(review): the instruction descriptions below follow the usual
 * z/Architecture vector-facility mnemonics - confirm against the ISA manual.
 */
static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) {
{ BLASLONG iamax;
BLASLONG iamax;
__asm__ volatile ( __asm__("vl %%v0,0(%[x])\n\t"
"vl %%v0,0(%3) \n\t" "vflpdb %%v0,%%v0\n\t"
"vflpdb %%v0,%%v0 \n\t" "vleig %%v1,0,0\n\t"
"vleig %%v1,0,0 \n\t" "vleig %%v1,1,1\n\t"
"vleig %%v1,1,1 \n\t" "vrepig %%v2,16\n\t"
"vrepig %%v2,16 \n\t" "vzero %%v3\n\t"
"vzero %%v3 \n\t" "vleig %%v24,0,0\n\t"
"vleig %%v24,0,0 \n\t" "vleig %%v24,1,1\n\t"
"vleig %%v24,1,1 \n\t" "vleig %%v25,2,0\n\t"
"vleig %%v25,2,0 \n\t" "vleig %%v25,3,1\n\t"
"vleig %%v25,3,1 \n\t" "vleig %%v26,4,0\n\t"
"vleig %%v26,4,0 \n\t" "vleig %%v26,5,1\n\t"
"vleig %%v26,5,1 \n\t" "vleig %%v27,6,0\n\t"
"vleig %%v27,6,0 \n\t" "vleig %%v27,7,1\n\t"
"vleig %%v27,7,1 \n\t" "vleig %%v28,8,0\n\t"
"vleig %%v28,8,0 \n\t" "vleig %%v28,9,1\n\t"
"vleig %%v28,9,1 \n\t" "vleig %%v29,10,0\n\t"
"vleig %%v29,10,0 \n\t" "vleig %%v29,11,1\n\t"
"vleig %%v29,11,1 \n\t" "vleig %%v30,12,0\n\t"
"vleig %%v30,12,0 \n\t" "vleig %%v30,13,1\n\t"
"vleig %%v30,13,1 \n\t" "vleig %%v31,14,0\n\t"
"vleig %%v31,14,0 \n\t" "vleig %%v31,15,1\n\t"
"vleig %%v31,15,1 \n\t" "srlg %[n],%[n],5\n\t"
"srlg %%r0,%2,5 \n\t" "xgr %%r1,%%r1\n\t"
"xgr %%r1,%%r1 \n\t" "0:\n\t"
"0: \n\t" "pfd 1, 1024(%%r1,%[x])\n\t"
"pfd 1, 1024(%%r1,%3) \n\t" "vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
/* First half of the iteration (bytes 0..127, 16 doubles): take
   absolute values of the eight loaded vectors. */
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
"vflpdb %%v19, %%v19\n\t"
"vflpdb %%v20, %%v20\n\t"
"vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t"
/* Compare/select tree: reduce eight vectors to one (v16) while carrying
   the matching lane indices along (v4). */
"vfchedb %%v4,%%v16,%%v17\n\t"
"vfchedb %%v5,%%v18,%%v19\n\t"
"vfchedb %%v6,%%v20,%%v21\n\t"
"vfchedb %%v7,%%v22,%%v23\n\t"
"vsel %%v16,%%v16,%%v17,%%v4\n\t"
"vsel %%v4,%%v24,%%v25,%%v4\n\t"
"vsel %%v17,%%v18,%%v19,%%v5\n\t"
"vsel %%v5,%%v26,%%v27,%%v5\n\t"
"vsel %%v18,%%v20,%%v21,%%v6\n\t"
"vsel %%v6,%%v28,%%v29,%%v6\n\t"
"vsel %%v19,%%v22,%%v23,%%v7\n\t"
"vsel %%v7,%%v30,%%v31,%%v7\n\t"
"vfchedb %%v20,%%v16,%%v17\n\t"
"vfchedb %%v21,%%v18,%%v19\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v4,%%v4,%%v5,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v5,%%v6,%%v7,%%v21\n\t"
"vfchedb %%v18,%%v16,%%v17\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
/* Add the base index v3, merge into the running maximum v0 and its
   indices v1, then bump the base index by 16. */
"vag %%v4,%%v4,%%v3\n\t"
"vfchedb %%v5,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
"vag %%v3,%%v3,%%v2\n\t"
/* Second half of the iteration: bytes 128..255, same sequence. */
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v17,144(%%r1,%[x])\n\t"
"vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v19,176(%%r1,%[x])\n\t"
"vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v21,208(%%r1,%[x])\n\t"
"vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v23,240(%%r1,%[x])\n\t"
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
"vflpdb %%v19, %%v19\n\t"
"vflpdb %%v20, %%v20\n\t"
"vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t"
"vfchedb %%v4,%%v16,%%v17\n\t"
"vfchedb %%v5,%%v18,%%v19\n\t"
"vfchedb %%v6,%%v20,%%v21\n\t"
"vfchedb %%v7,%%v22,%%v23\n\t"
"vsel %%v16,%%v16,%%v17,%%v4\n\t"
"vsel %%v4,%%v24,%%v25,%%v4\n\t"
"vsel %%v17,%%v18,%%v19,%%v5\n\t"
"vsel %%v5,%%v26,%%v27,%%v5\n\t"
"vsel %%v18,%%v20,%%v21,%%v6\n\t"
"vsel %%v6,%%v28,%%v29,%%v6\n\t"
"vsel %%v19,%%v22,%%v23,%%v7\n\t"
"vsel %%v7,%%v30,%%v31,%%v7\n\t"
"vfchedb %%v20,%%v16,%%v17\n\t"
"vfchedb %%v21,%%v18,%%v19\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v4,%%v4,%%v5,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v5,%%v6,%%v7,%%v21\n\t"
"vfchedb %%v18,%%v16,%%v17\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"vfchedb %%v5,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
"vag %%v3,%%v3,%%v2\n\t"
/* Advance 256 bytes (32 doubles) and loop on %[n]. */
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
/* Combine the two lanes: when both values compare equal, the smaller
   index is taken (vmnlg); otherwise the lane picked by wfchdb wins.
   The value is stored through %[amax], the index goes to %[iamax]. */
"vrepg %%v2,%%v0,1\n\t"
"vrepg %%v3,%%v1,1\n\t"
"wfcdb %%v2,%%v0\n\t"
"jne 1f\n\t"
"vsteg %%v0,%[amax],0\n\t"
"vmnlg %%v0,%%v1,%%v3\n\t"
"vlgvg %[iamax],%%v0,0\n\t"
"j 2f\n\t"
"1:\n\t"
"wfchdb %%v4,%%v2,%%v0\n\t"
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
"std %%f0,%[amax]\n\t"
"vlgvg %[iamax],%%v1,0\n\t"
"2:\n\t"
"nop"
/* Added revision: n is a read-write early-clobber operand because it is
   shifted and counted down in place; the read of x is described by a
   sized "m" input, and no "memory" or r0 clobber is listed. */
: [iamax] "=r"(iamax),[amax] "=Q"(*amax),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");
"vl %%v16,0(%%r1,%3) \n\t" return iamax;
"vl %%v17,16(%%r1,%3) \n\t"
"vl %%v18,32(%%r1,%3) \n\t"
"vl %%v19,48(%%r1,%3) \n\t"
"vl %%v20,64(%%r1,%3) \n\t"
"vl %%v21,80(%%r1,%3) \n\t"
"vl %%v22,96(%%r1,%3) \n\t"
"vl %%v23,112(%%r1,%3) \n\t"
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
"vfchedb %%v4,%%v16,%%v17 \n\t"
"vfchedb %%v5,%%v18,%%v19 \n\t"
"vfchedb %%v6,%%v20,%%v21 \n\t"
"vfchedb %%v7,%%v22,%%v23 \n\t"
"vsel %%v16,%%v16,%%v17,%%v4 \n\t"
"vsel %%v4,%%v24,%%v25,%%v4 \n\t"
"vsel %%v17,%%v18,%%v19,%%v5 \n\t"
"vsel %%v5,%%v26,%%v27,%%v5 \n\t"
"vsel %%v18,%%v20,%%v21,%%v6 \n\t"
"vsel %%v6,%%v28,%%v29,%%v6 \n\t"
"vsel %%v19,%%v22,%%v23,%%v7 \n\t"
"vsel %%v7,%%v30,%%v31,%%v7 \n\t"
"vfchedb %%v20,%%v16,%%v17 \n\t"
"vfchedb %%v21,%%v18,%%v19 \n\t"
"vsel %%v16,%%v16,%%v17,%%v20 \n\t"
"vsel %%v4,%%v4,%%v5,%%v20 \n\t"
"vsel %%v17,%%v18,%%v19,%%v21 \n\t"
"vsel %%v5,%%v6,%%v7,%%v21 \n\t"
"vfchedb %%v18,%%v16,%%v17 \n\t"
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vsel %%v4,%%v4,%%v5,%%v18 \n\t"
"vag %%v4,%%v4,%%v3 \n\t"
"vfchedb %%v5,%%v0,%%v16 \n\t"
"vsel %%v0,%%v0,%%v16,%%v5 \n\t"
"vsel %%v1,%%v1,%%v4,%%v5 \n\t"
"vag %%v3,%%v3,%%v2 \n\t"
"vl %%v16,128(%%r1,%3) \n\t"
"vl %%v17,144(%%r1,%3) \n\t"
"vl %%v18,160(%%r1,%3) \n\t"
"vl %%v19,176(%%r1,%3) \n\t"
"vl %%v20,192(%%r1,%3) \n\t"
"vl %%v21,208(%%r1,%3) \n\t"
"vl %%v22,224(%%r1,%3) \n\t"
"vl %%v23,240(%%r1,%3) \n\t"
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
"vfchedb %%v4,%%v16,%%v17 \n\t"
"vfchedb %%v5,%%v18,%%v19 \n\t"
"vfchedb %%v6,%%v20,%%v21 \n\t"
"vfchedb %%v7,%%v22,%%v23 \n\t"
"vsel %%v16,%%v16,%%v17,%%v4 \n\t"
"vsel %%v4,%%v24,%%v25,%%v4 \n\t"
"vsel %%v17,%%v18,%%v19,%%v5 \n\t"
"vsel %%v5,%%v26,%%v27,%%v5 \n\t"
"vsel %%v18,%%v20,%%v21,%%v6 \n\t"
"vsel %%v6,%%v28,%%v29,%%v6 \n\t"
"vsel %%v19,%%v22,%%v23,%%v7 \n\t"
"vsel %%v7,%%v30,%%v31,%%v7 \n\t"
"vfchedb %%v20,%%v16,%%v17 \n\t"
"vfchedb %%v21,%%v18,%%v19 \n\t"
"vsel %%v16,%%v16,%%v17,%%v20 \n\t"
"vsel %%v4,%%v4,%%v5,%%v20 \n\t"
"vsel %%v17,%%v18,%%v19,%%v21 \n\t"
"vsel %%v5,%%v6,%%v7,%%v21 \n\t"
"vfchedb %%v18,%%v16,%%v17 \n\t"
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vsel %%v4,%%v4,%%v5,%%v18 \n\t"
"vag %%v4,%%v4,%%v3 \n\t"
"vfchedb %%v5,%%v0,%%v16 \n\t"
"vsel %%v0,%%v0,%%v16,%%v5 \n\t"
"vsel %%v1,%%v1,%%v4,%%v5 \n\t"
"vag %%v3,%%v3,%%v2 \n\t"
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"
"vrepg %%v2,%%v0,1 \n\t"
"vrepg %%v3,%%v1,1 \n\t"
"wfcdb %%v2,%%v0 \n\t"
"jne 1f \n\t"
"vsteg %%v0,%1,0 \n\t"
"vmnlg %%v0,%%v1,%%v3 \n\t"
"vlgvg %0,%%v0,0 \n\t"
"j 2f \n\t"
"1: \n\t"
"wfchdb %%v4,%%v2,%%v0 \n\t"
"vsel %%v1,%%v3,%%v1,%%v4 \n\t"
"vsel %%v0,%%v2,%%v0,%%v4 \n\t"
"std %%f0,%1 \n\t"
"vlgvg %0,%%v1,0 \n\t"
"2: \n\t"
"nop "
/* Removed revision: numbered operands, r0 as the loop counter and a
   blanket "memory" clobber. */
:"=r"(iamax),"=m"(*amax)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return iamax;
} }
/*
 * CNAME (idamax driver): returns the 1-based position of the element of x
 * with the largest ABS value, or 0 when n <= 0 or inc_x <= 0.
 * In the strided branch i is the array offset and j the element counter;
 * in the contiguous branch i serves as both.
 * NOTE(review): this text looks like a two-column diff rendering (removed
 * revision on the left, added revision on the right), so most statements
 * appear twice - either on one line or in two consecutive runs.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0; BLASLONG i = 0;
BLASLONG j = 0; BLASLONG j = 0;
FLOAT maxf = 0.0; FLOAT maxf = 0.0;
BLASLONG max = 0; BLASLONG max = 0;
if (n <= 0 || inc_x <= 0) return (max); if (n <= 0 || inc_x <= 0)
return (max);
if (inc_x == 1) { if (inc_x == 1) {
/* Contiguous case: the kernel handles the first n1 = n & -32 elements
   and reports their maximum through maxf. */
BLASLONG n1 = n & -32; BLASLONG n1 = n & -32;
if (n1 > 0) { if (n1 > 0) {
max = idamax_kernel_32(n1, x, &maxf); max = idamax_kernel_32(n1, x, &maxf);
i = n1;
}
else
{
maxf = ABS(x[0]);
i++;
}
while (i < n) {
if (ABS(x[i]) > maxf) {
max = i;
maxf = ABS(x[i]);
}
i++;
}
return (max + 1);
i = n1;
} else { } else {
maxf = ABS(x[0]);
max = 0; i++;
maxf = ABS(x[0]);
BLASLONG n1 = n & -4;
while (j < n1) {
if (ABS(x[i]) > maxf) {
max = j;
maxf = ABS(x[i]);
}
if (ABS(x[i + inc_x]) > maxf) {
max = j + 1;
maxf = ABS(x[i + inc_x]);
}
if (ABS(x[i + 2 * inc_x]) > maxf) {
max = j + 2;
maxf = ABS(x[i + 2 * inc_x]);
}
if (ABS(x[i + 3 * inc_x]) > maxf) {
max = j + 3;
maxf = ABS(x[i + 3 * inc_x]);
}
i += inc_x * 4;
j += 4;
}
while (j < n) {
if (ABS(x[i]) > maxf) {
max = j;
maxf = ABS(x[i]);
}
i += inc_x;
j++;
}
return (max + 1);
} }
/* Scalar tail of the contiguous case; the strict '>' keeps the earliest
   position on ties. */
while (i < n) {
if (ABS(x[i]) > maxf) {
max = i;
maxf = ABS(x[i]);
}
i++;
}
return (max + 1);
} else {
/* Strided case: seed with element 0, then four elements per iteration
   up to n1 = n & -4, then the remaining elements one at a time. */
max = 0;
maxf = ABS(x[0]);
BLASLONG n1 = n & -4;
while (j < n1) {
if (ABS(x[i]) > maxf) {
max = j;
maxf = ABS(x[i]);
}
if (ABS(x[i + inc_x]) > maxf) {
max = j + 1;
maxf = ABS(x[i + inc_x]);
}
if (ABS(x[i + 2 * inc_x]) > maxf) {
max = j + 2;
maxf = ABS(x[i + 2 * inc_x]);
}
if (ABS(x[i + 3 * inc_x]) > maxf) {
max = j + 3;
maxf = ABS(x[i + 3 * inc_x]);
}
i += inc_x * 4;
j += 4;
}
while (j < n) {
if (ABS(x[i]) > maxf) {
max = j;
maxf = ABS(x[i]);
}
i += inc_x;
j++;
}
return (max + 1);
}
} }

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -28,237 +28,218 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>
#if defined(DOUBLE)
#define ABS fabs #define ABS fabs
#else
#define ABS fabsf
#endif
/*
 * idamin_kernel_32: inline-assembly scan over n double-precision values of
 * x.  The driver in this file passes n & -32, and the loop count is n >> 5,
 * so n is expected to be a positive multiple of 32.  Stores the smallest
 * |x[k]| through amin and returns the index k it was found at (the driver
 * adds 1 to the result, so k is 0-based).
 * Register roles visible in the setup: v0 = running minimum (two lanes),
 * v1 = its indices, v2 = 16 (index step per 16-element half), v3 = running
 * base index, v24..v31 = constant lane indices 0..15.
 * NOTE(review): this text looks like a two-column diff rendering (removed
 * revision on the left, added revision on the right); the instruction
 * sequence therefore appears twice, once with numbered operands (%2, %3)
 * and once with named operands (%[n], %[x]).
 * NOTE(review): the instruction descriptions below follow the usual
 * z/Architecture vector-facility mnemonics - confirm against the ISA manual.
 */
static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) {
{ BLASLONG iamin;
BLASLONG iamin;
__asm__ volatile ( __asm__("vl %%v0,0(%[x])\n\t"
"vl %%v0,0(%3) \n\t" "vflpdb %%v0,%%v0\n\t"
"vflpdb %%v0,%%v0 \n\t" "vleig %%v1,0,0\n\t"
"vleig %%v1,0,0 \n\t" "vleig %%v1,1,1\n\t"
"vleig %%v1,1,1 \n\t" "vrepig %%v2,16\n\t"
"vrepig %%v2,16 \n\t" "vzero %%v3\n\t"
"vzero %%v3 \n\t" "vleig %%v24,0,0\n\t"
"vleig %%v24,0,0 \n\t" "vleig %%v24,1,1\n\t"
"vleig %%v24,1,1 \n\t" "vleig %%v25,2,0\n\t"
"vleig %%v25,2,0 \n\t" "vleig %%v25,3,1\n\t"
"vleig %%v25,3,1 \n\t" "vleig %%v26,4,0\n\t"
"vleig %%v26,4,0 \n\t" "vleig %%v26,5,1\n\t"
"vleig %%v26,5,1 \n\t" "vleig %%v27,6,0\n\t"
"vleig %%v27,6,0 \n\t" "vleig %%v27,7,1\n\t"
"vleig %%v27,7,1 \n\t" "vleig %%v28,8,0\n\t"
"vleig %%v28,8,0 \n\t" "vleig %%v28,9,1\n\t"
"vleig %%v28,9,1 \n\t" "vleig %%v29,10,0\n\t"
"vleig %%v29,10,0 \n\t" "vleig %%v29,11,1\n\t"
"vleig %%v29,11,1 \n\t" "vleig %%v30,12,0\n\t"
"vleig %%v30,12,0 \n\t" "vleig %%v30,13,1\n\t"
"vleig %%v30,13,1 \n\t" "vleig %%v31,14,0\n\t"
"vleig %%v31,14,0 \n\t" "vleig %%v31,15,1\n\t"
"vleig %%v31,15,1 \n\t" "srlg %[n],%[n],5\n\t"
"srlg %%r0,%2,5 \n\t" "xgr %%r1,%%r1\n\t"
"xgr %%r1,%%r1 \n\t" "0:\n\t"
"0: \n\t" "pfd 1, 1024(%%r1,%[x])\n\t"
"pfd 1, 1024(%%r1,%3) \n\t" "vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
/* First half of the iteration (bytes 0..127, 16 doubles): take
   absolute values of the eight loaded vectors. */
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
"vflpdb %%v19, %%v19\n\t"
"vflpdb %%v20, %%v20\n\t"
"vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t"
/* Compare/select tree: reduce eight vectors to one (v16) while carrying
   the matching lane indices along (v4).  The compare operands are
   swapped relative to the max kernels so the smaller value is kept. */
"vfchedb %%v4,%%v17,%%v16\n\t"
"vfchedb %%v5,%%v19,%%v18\n\t"
"vfchedb %%v6,%%v21,%%v20\n\t"
"vfchedb %%v7,%%v23,%%v22\n\t"
"vsel %%v16,%%v16,%%v17,%%v4\n\t"
"vsel %%v4,%%v24,%%v25,%%v4\n\t"
"vsel %%v17,%%v18,%%v19,%%v5\n\t"
"vsel %%v5,%%v26,%%v27,%%v5\n\t"
"vsel %%v18,%%v20,%%v21,%%v6\n\t"
"vsel %%v6,%%v28,%%v29,%%v6\n\t"
"vsel %%v19,%%v22,%%v23,%%v7\n\t"
"vsel %%v7,%%v30,%%v31,%%v7\n\t"
"vfchedb %%v20,%%v17,%%v16\n\t"
"vfchedb %%v21,%%v19,%%v18\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v4,%%v4,%%v5,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v5,%%v6,%%v7,%%v21\n\t"
"vfchedb %%v18,%%v17,%%v16\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
/* Add the base index v3, merge into the running minimum v0 and its
   indices v1, then bump the base index by 16. */
"vag %%v4,%%v4,%%v3\n\t"
"vfchedb %%v5,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
"vag %%v3,%%v3,%%v2\n\t"
/* Second half of the iteration: bytes 128..255, same sequence. */
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v17,144(%%r1,%[x])\n\t"
"vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v19,176(%%r1,%[x])\n\t"
"vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v21,208(%%r1,%[x])\n\t"
"vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v23,240(%%r1,%[x])\n\t"
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
"vflpdb %%v19, %%v19\n\t"
"vflpdb %%v20, %%v20\n\t"
"vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t"
"vfchedb %%v4,%%v17,%%v16\n\t"
"vfchedb %%v5,%%v19,%%v18\n\t"
"vfchedb %%v6,%%v21,%%v20\n\t"
"vfchedb %%v7,%%v23,%%v22\n\t"
"vsel %%v16,%%v16,%%v17,%%v4\n\t"
"vsel %%v4,%%v24,%%v25,%%v4\n\t"
"vsel %%v17,%%v18,%%v19,%%v5\n\t"
"vsel %%v5,%%v26,%%v27,%%v5\n\t"
"vsel %%v18,%%v20,%%v21,%%v6\n\t"
"vsel %%v6,%%v28,%%v29,%%v6\n\t"
"vsel %%v19,%%v22,%%v23,%%v7\n\t"
"vsel %%v7,%%v30,%%v31,%%v7\n\t"
"vfchedb %%v20,%%v17,%%v16\n\t"
"vfchedb %%v21,%%v19,%%v18\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v4,%%v4,%%v5,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v5,%%v6,%%v7,%%v21\n\t"
"vfchedb %%v18,%%v17,%%v16\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"vfchedb %%v5,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
"vag %%v3,%%v3,%%v2\n\t"
/* Advance 256 bytes (32 doubles) and loop on %[n]. */
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
/* Combine the two lanes: when both values compare equal, the smaller
   index is taken (vmnlg); otherwise the lane picked by wfchdb wins.
   The value is stored through %[amin], the index goes to %[iamin]. */
"vrepg %%v2,%%v0,1\n\t"
"vrepg %%v3,%%v1,1\n\t"
"wfcdb %%v2,%%v0\n\t"
"jne 1f\n\t"
"vsteg %%v0,%[amin],0\n\t"
"vmnlg %%v0,%%v1,%%v3\n\t"
"vlgvg %[iamin],%%v0,0\n\t"
"j 2f\n\t"
"1:\n\t"
"wfchdb %%v4,%%v0,%%v2\n\t"
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
"std %%f0,%[amin]\n\t"
"vlgvg %[iamin],%%v1,0\n\t"
"2:\n\t"
"nop"
/* Added revision: n is a read-write early-clobber operand because it is
   shifted and counted down in place; the read of x is described by a
   sized "m" input, and no "memory" or r0 clobber is listed. */
: [iamin] "=r"(iamin),[amin] "=Q"(*amin),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");
"vl %%v16,0(%%r1,%3) \n\t" return iamin;
"vl %%v17,16(%%r1,%3) \n\t"
"vl %%v18,32(%%r1,%3) \n\t"
"vl %%v19,48(%%r1,%3) \n\t"
"vl %%v20,64(%%r1,%3) \n\t"
"vl %%v21,80(%%r1,%3) \n\t"
"vl %%v22,96(%%r1,%3) \n\t"
"vl %%v23,112(%%r1,%3) \n\t"
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
"vfchedb %%v4,%%v17,%%v16 \n\t"
"vfchedb %%v5,%%v19,%%v18 \n\t"
"vfchedb %%v6,%%v21,%%v20 \n\t"
"vfchedb %%v7,%%v23,%%v22 \n\t"
"vsel %%v16,%%v16,%%v17,%%v4 \n\t"
"vsel %%v4,%%v24,%%v25,%%v4 \n\t"
"vsel %%v17,%%v18,%%v19,%%v5 \n\t"
"vsel %%v5,%%v26,%%v27,%%v5 \n\t"
"vsel %%v18,%%v20,%%v21,%%v6 \n\t"
"vsel %%v6,%%v28,%%v29,%%v6 \n\t"
"vsel %%v19,%%v22,%%v23,%%v7 \n\t"
"vsel %%v7,%%v30,%%v31,%%v7 \n\t"
"vfchedb %%v20,%%v17,%%v16 \n\t"
"vfchedb %%v21,%%v19,%%v18 \n\t"
"vsel %%v16,%%v16,%%v17,%%v20 \n\t"
"vsel %%v4,%%v4,%%v5,%%v20 \n\t"
"vsel %%v17,%%v18,%%v19,%%v21 \n\t"
"vsel %%v5,%%v6,%%v7,%%v21 \n\t"
"vfchedb %%v18,%%v17,%%v16 \n\t"
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vsel %%v4,%%v4,%%v5,%%v18 \n\t"
"vag %%v4,%%v4,%%v3 \n\t"
"vfchedb %%v5,%%v16,%%v0 \n\t"
"vsel %%v0,%%v0,%%v16,%%v5 \n\t"
"vsel %%v1,%%v1,%%v4,%%v5 \n\t"
"vag %%v3,%%v3,%%v2 \n\t"
"vl %%v16,128(%%r1,%3) \n\t"
"vl %%v17,144(%%r1,%3) \n\t"
"vl %%v18,160(%%r1,%3) \n\t"
"vl %%v19,176(%%r1,%3) \n\t"
"vl %%v20,192(%%r1,%3) \n\t"
"vl %%v21,208(%%r1,%3) \n\t"
"vl %%v22,224(%%r1,%3) \n\t"
"vl %%v23,240(%%r1,%3) \n\t"
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
"vfchedb %%v4,%%v17,%%v16 \n\t"
"vfchedb %%v5,%%v19,%%v18 \n\t"
"vfchedb %%v6,%%v21,%%v20 \n\t"
"vfchedb %%v7,%%v23,%%v22 \n\t"
"vsel %%v16,%%v16,%%v17,%%v4 \n\t"
"vsel %%v4,%%v24,%%v25,%%v4 \n\t"
"vsel %%v17,%%v18,%%v19,%%v5 \n\t"
"vsel %%v5,%%v26,%%v27,%%v5 \n\t"
"vsel %%v18,%%v20,%%v21,%%v6 \n\t"
"vsel %%v6,%%v28,%%v29,%%v6 \n\t"
"vsel %%v19,%%v22,%%v23,%%v7 \n\t"
"vsel %%v7,%%v30,%%v31,%%v7 \n\t"
"vfchedb %%v20,%%v17,%%v16 \n\t"
"vfchedb %%v21,%%v19,%%v18 \n\t"
"vsel %%v16,%%v16,%%v17,%%v20 \n\t"
"vsel %%v4,%%v4,%%v5,%%v20 \n\t"
"vsel %%v17,%%v18,%%v19,%%v21 \n\t"
"vsel %%v5,%%v6,%%v7,%%v21 \n\t"
"vfchedb %%v18,%%v17,%%v16 \n\t"
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vsel %%v4,%%v4,%%v5,%%v18 \n\t"
"vag %%v4,%%v4,%%v3 \n\t"
"vfchedb %%v5,%%v16,%%v0 \n\t"
"vsel %%v0,%%v0,%%v16,%%v5 \n\t"
"vsel %%v1,%%v1,%%v4,%%v5 \n\t"
"vag %%v3,%%v3,%%v2 \n\t"
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"
"vrepg %%v2,%%v0,1 \n\t"
"vrepg %%v3,%%v1,1 \n\t"
"wfcdb %%v2,%%v0 \n\t"
"jne 1f \n\t"
"vsteg %%v0,%1,0 \n\t"
"vmnlg %%v0,%%v1,%%v3 \n\t"
"vlgvg %0,%%v0,0 \n\t"
"j 2f \n\t"
"1: \n\t"
"wfchdb %%v4,%%v0,%%v2 \n\t"
"vsel %%v1,%%v3,%%v1,%%v4 \n\t"
"vsel %%v0,%%v2,%%v0,%%v4 \n\t"
"std %%f0,%1 \n\t"
"vlgvg %0,%%v1,0 \n\t"
"2: \n\t"
"nop "
/* Removed revision: numbered operands, r0 as the loop counter and a
   blanket "memory" clobber. */
:"=r"(iamin),"=m"(*amin)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return iamin;
} }
/*
 * CNAME (idamin driver): returns the 1-based position of the element of x
 * with the smallest ABS value, or 0 when n <= 0 or inc_x <= 0.
 * In the strided branch i is the array offset and j the element counter;
 * in the contiguous branch i serves as both.
 * NOTE(review): this text looks like a two-column diff rendering (removed
 * revision on the left, added revision on the right), so most statements
 * appear twice - either on one line or in two consecutive runs.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0; BLASLONG i = 0;
BLASLONG j = 0; BLASLONG j = 0;
FLOAT minf = 0.0; FLOAT minf = 0.0;
BLASLONG min = 0; BLASLONG min = 0;
if (n <= 0 || inc_x <= 0) return (min); if (n <= 0 || inc_x <= 0)
return (min);
if (inc_x == 1) { if (inc_x == 1) {
/* Contiguous case: the kernel handles the first n1 = n & -32 elements
   and reports their minimum through minf. */
BLASLONG n1 = n & -32; BLASLONG n1 = n & -32;
if (n1 > 0) { if (n1 > 0) {
min = idamin_kernel_32(n1, x, &minf); min = idamin_kernel_32(n1, x, &minf);
i = n1;
}
else
{
minf = ABS(x[0]);
i++;
}
while (i < n) {
if (ABS(x[i]) < minf) {
min = i;
minf = ABS(x[i]);
}
i++;
}
return (min + 1);
i = n1;
} else { } else {
minf = ABS(x[0]);
min = 0; i++;
minf = ABS(x[0]);
BLASLONG n1 = n & -4;
while (j < n1) {
if (ABS(x[i]) < minf) {
min = j;
minf = ABS(x[i]);
}
if (ABS(x[i + inc_x]) < minf) {
min = j + 1;
minf = ABS(x[i + inc_x]);
}
if (ABS(x[i + 2 * inc_x]) < minf) {
min = j + 2;
minf = ABS(x[i + 2 * inc_x]);
}
if (ABS(x[i + 3 * inc_x]) < minf) {
min = j + 3;
minf = ABS(x[i + 3 * inc_x]);
}
i += inc_x * 4;
j += 4;
}
while (j < n) {
if (ABS(x[i]) < minf) {
min = j;
minf = ABS(x[i]);
}
i += inc_x;
j++;
}
return (min + 1);
} }
/* Scalar tail of the contiguous case; the strict '<' keeps the earliest
   position on ties. */
while (i < n) {
if (ABS(x[i]) < minf) {
min = i;
minf = ABS(x[i]);
}
i++;
}
return (min + 1);
} else {
/* Strided case: seed with element 0, then four elements per iteration
   up to n1 = n & -4, then the remaining elements one at a time. */
min = 0;
minf = ABS(x[0]);
BLASLONG n1 = n & -4;
while (j < n1) {
if (ABS(x[i]) < minf) {
min = j;
minf = ABS(x[i]);
}
if (ABS(x[i + inc_x]) < minf) {
min = j + 1;
minf = ABS(x[i + inc_x]);
}
if (ABS(x[i + 2 * inc_x]) < minf) {
min = j + 2;
minf = ABS(x[i + 2 * inc_x]);
}
if (ABS(x[i + 3 * inc_x]) < minf) {
min = j + 3;
minf = ABS(x[i + 3 * inc_x]);
}
i += inc_x * 4;
j += 4;
}
while (j < n) {
if (ABS(x[i]) < minf) {
min = j;
minf = ABS(x[i]);
}
i += inc_x;
j++;
}
return (min + 1);
}
} }

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -27,214 +27,199 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
/*
 * Vector kernel: scan n doubles at x for the maximum value.
 *
 *   n    element count; the loop count is n >> 5, so the caller passes a
 *        positive multiple of 32 (see CNAME)
 *   x    input vector, unit stride
 *   max  receives the maximum value found
 *
 * Returns the index of the maximum as read from the index vector; CNAME adds
 * 1 to it, so it is presumably 0-based.
 *
 * Register roles, as set up below:
 *   v0       running maxima, two lanes
 *   v1       indices belonging to v0
 *   v2       constant 16 in each lane (index step per half iteration)
 *   v3       running index offset
 *   v24-v31  lane indices 0..15 for the eight vectors of a half iteration
 */
static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) {
  BLASLONG imax;

  __asm__("vl %%v0,0(%[x])\n\t"
          "vleig %%v1,0,0\n\t"
          "vleig %%v1,1,1\n\t"
          "vrepig %%v2,16\n\t"
          "vzero %%v3\n\t"
          "vleig %%v24,0,0\n\t"
          "vleig %%v24,1,1\n\t"
          "vleig %%v25,2,0\n\t"
          "vleig %%v25,3,1\n\t"
          "vleig %%v26,4,0\n\t"
          "vleig %%v26,5,1\n\t"
          "vleig %%v27,6,0\n\t"
          "vleig %%v27,7,1\n\t"
          "vleig %%v28,8,0\n\t"
          "vleig %%v28,9,1\n\t"
          "vleig %%v29,10,0\n\t"
          "vleig %%v29,11,1\n\t"
          "vleig %%v30,12,0\n\t"
          "vleig %%v30,13,1\n\t"
          "vleig %%v31,14,0\n\t"
          "vleig %%v31,15,1\n\t"
          /* loop count = n / 32; r1 = byte offset into x */
          "srlg %[n],%[n],5\n\t"
          "xgr %%r1,%%r1\n\t"
          "0:\n\t"
          "pfd 1, 1024(%%r1,%[x])\n\t"
          /* first half: elements 0..15 of this iteration */
          "vl %%v16,0(%%r1,%[x])\n\t"
          "vl %%v17,16(%%r1,%[x])\n\t"
          "vl %%v18,32(%%r1,%[x])\n\t"
          "vl %%v19,48(%%r1,%[x])\n\t"
          "vl %%v20,64(%%r1,%[x])\n\t"
          "vl %%v21,80(%%r1,%[x])\n\t"
          "vl %%v22,96(%%r1,%[x])\n\t"
          "vl %%v23,112(%%r1,%[x])\n\t"
          /* pairwise tournament: keep the larger value and its index */
          "vfchedb %%v4,%%v16,%%v17\n\t"
          "vfchedb %%v5,%%v18,%%v19\n\t"
          "vfchedb %%v6,%%v20,%%v21\n\t"
          "vfchedb %%v7,%%v22,%%v23\n\t"
          "vsel %%v16,%%v16,%%v17,%%v4\n\t"
          "vsel %%v4,%%v24,%%v25,%%v4\n\t"
          "vsel %%v17,%%v18,%%v19,%%v5\n\t"
          "vsel %%v5,%%v26,%%v27,%%v5\n\t"
          "vsel %%v18,%%v20,%%v21,%%v6\n\t"
          "vsel %%v6,%%v28,%%v29,%%v6\n\t"
          "vsel %%v19,%%v22,%%v23,%%v7\n\t"
          "vsel %%v7,%%v30,%%v31,%%v7\n\t"
          "vfchedb %%v20,%%v16,%%v17\n\t"
          "vfchedb %%v21,%%v18,%%v19\n\t"
          "vsel %%v16,%%v16,%%v17,%%v20\n\t"
          "vsel %%v4,%%v4,%%v5,%%v20\n\t"
          "vsel %%v17,%%v18,%%v19,%%v21\n\t"
          "vsel %%v5,%%v6,%%v7,%%v21\n\t"
          "vfchedb %%v18,%%v16,%%v17\n\t"
          "vsel %%v16,%%v16,%%v17,%%v18\n\t"
          "vsel %%v4,%%v4,%%v5,%%v18\n\t"
          /* make the winning index absolute, then merge into v0/v1 */
          "vag %%v4,%%v4,%%v3\n\t"
          "vfchedb %%v5,%%v0,%%v16\n\t"
          "vsel %%v0,%%v0,%%v16,%%v5\n\t"
          "vsel %%v1,%%v1,%%v4,%%v5\n\t"
          "vag %%v3,%%v3,%%v2\n\t"
          /* second half: elements 16..31 of this iteration */
          "vl %%v16,128(%%r1,%[x])\n\t"
          "vl %%v17,144(%%r1,%[x])\n\t"
          "vl %%v18,160(%%r1,%[x])\n\t"
          "vl %%v19,176(%%r1,%[x])\n\t"
          "vl %%v20,192(%%r1,%[x])\n\t"
          "vl %%v21,208(%%r1,%[x])\n\t"
          "vl %%v22,224(%%r1,%[x])\n\t"
          "vl %%v23,240(%%r1,%[x])\n\t"
          "vfchedb %%v4,%%v16,%%v17\n\t"
          "vfchedb %%v5,%%v18,%%v19\n\t"
          "vfchedb %%v6,%%v20,%%v21\n\t"
          "vfchedb %%v7,%%v22,%%v23\n\t"
          "vsel %%v16,%%v16,%%v17,%%v4\n\t"
          "vsel %%v4,%%v24,%%v25,%%v4\n\t"
          "vsel %%v17,%%v18,%%v19,%%v5\n\t"
          "vsel %%v5,%%v26,%%v27,%%v5\n\t"
          "vsel %%v18,%%v20,%%v21,%%v6\n\t"
          "vsel %%v6,%%v28,%%v29,%%v6\n\t"
          "vsel %%v19,%%v22,%%v23,%%v7\n\t"
          "vsel %%v7,%%v30,%%v31,%%v7\n\t"
          "vfchedb %%v20,%%v16,%%v17\n\t"
          "vfchedb %%v21,%%v18,%%v19\n\t"
          "vsel %%v16,%%v16,%%v17,%%v20\n\t"
          "vsel %%v4,%%v4,%%v5,%%v20\n\t"
          "vsel %%v17,%%v18,%%v19,%%v21\n\t"
          "vsel %%v5,%%v6,%%v7,%%v21\n\t"
          "vfchedb %%v18,%%v16,%%v17\n\t"
          "vsel %%v16,%%v16,%%v17,%%v18\n\t"
          "vsel %%v4,%%v4,%%v5,%%v18\n\t"
          "vag %%v4,%%v4,%%v3\n\t"
          "vfchedb %%v5,%%v0,%%v16\n\t"
          "vsel %%v0,%%v0,%%v16,%%v5\n\t"
          "vsel %%v1,%%v1,%%v4,%%v5\n\t"
          "vag %%v3,%%v3,%%v2\n\t"
          "agfi %%r1, 256\n\t"
          "brctg %[n], 0b\n\t"
          /* final reduction of the two lanes of v0/v1 */
          "vrepg %%v2,%%v0,1\n\t"
          "vrepg %%v3,%%v1,1\n\t"
          "wfcdb %%v2,%%v0\n\t"
          "jne 1f\n\t"
          /* lanes hold equal values: store it, take the smaller index */
          "vsteg %%v0,%[max],0\n\t"
          "vmnlg %%v0,%%v1,%%v3\n\t"
          "vlgvg %[imax],%%v0,0\n\t"
          "j 2f\n\t"
          "1:\n\t"
          /* lanes differ: pick the larger value and its index */
          "wfchdb %%v4,%%v2,%%v0\n\t"
          "vsel %%v1,%%v3,%%v1,%%v4\n\t"
          "vsel %%v0,%%v2,%%v0,%%v4\n\t"
          "std %%f0,%[max]\n\t"
          "vlgvg %[imax],%%v1,0\n\t"
          "2:\n\t"
          "nop"
          : [imax] "=r"(imax),[max] "=Q"(*max),[n] "+&r"(n)
          : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
          : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
            "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
            "v27", "v28", "v29", "v30", "v31");

  return imax;
}
/*
 * Return the 1-based position of the largest element of x, or 0 when
 * n <= 0 or inc_x <= 0.
 *
 *   n      number of elements to examine
 *   x      input vector
 *   inc_x  stride between consecutive elements, in units of FLOAT
 *
 * In the scalar loops a strict '>' is used, so the earliest element wins on
 * ties. idmax_kernel_32 is defined earlier in this file.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
  BLASLONG i = 0; /* offset into x */
  BLASLONG j = 0; /* logical element counter (strided path only) */
  FLOAT maxf = 0.0;
  BLASLONG max = 0;

  if (n <= 0 || inc_x <= 0)
    return (max);

  if (inc_x == 1) {
    /* The vector kernel consumes whole groups of 32 elements. */
    BLASLONG n1 = n & -32;

    if (n1 > 0) {
      max = idmax_kernel_32(n1, x, &maxf);
      i = n1;
    } else {
      /* Too short for the kernel: seed the scan with the first element. */
      maxf = x[0];
      i++;
    }

    /* Scalar tail (or the whole vector when n < 32). */
    while (i < n) {
      const FLOAT v = x[i];
      if (v > maxf) {
        max = i;
        maxf = v;
      }
      i++;
    }
    return (max + 1);
  }

  /* Strided path: plain scalar scan, visiting elements in order. */
  maxf = x[0];
  while (j < n) {
    const FLOAT v = x[i];
    if (v > maxf) {
      max = j;
      maxf = v;
    }
    i += inc_x;
    j++;
  }
  return (max + 1);
}

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -27,214 +27,199 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
/*
 * Vector kernel: scan n doubles at x for the minimum value.
 *
 *   n    element count; the loop count is n >> 5, so the caller passes a
 *        positive multiple of 32 (see CNAME)
 *   x    input vector, unit stride
 *   min  receives the minimum value found
 *
 * Returns the index of the minimum as read from the index vector; CNAME adds
 * 1 to it, so it is presumably 0-based.
 *
 * Register roles, as set up below:
 *   v0       running minima, two lanes
 *   v1       indices belonging to v0
 *   v2       constant 16 in each lane (index step per half iteration)
 *   v3       running index offset
 *   v24-v31  lane indices 0..15 for the eight vectors of a half iteration
 */
static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) {
  BLASLONG imin;

  __asm__("vl %%v0,0(%[x])\n\t"
          "vleig %%v1,0,0\n\t"
          "vleig %%v1,1,1\n\t"
          "vrepig %%v2,16\n\t"
          "vzero %%v3\n\t"
          "vleig %%v24,0,0\n\t"
          "vleig %%v24,1,1\n\t"
          "vleig %%v25,2,0\n\t"
          "vleig %%v25,3,1\n\t"
          "vleig %%v26,4,0\n\t"
          "vleig %%v26,5,1\n\t"
          "vleig %%v27,6,0\n\t"
          "vleig %%v27,7,1\n\t"
          "vleig %%v28,8,0\n\t"
          "vleig %%v28,9,1\n\t"
          "vleig %%v29,10,0\n\t"
          "vleig %%v29,11,1\n\t"
          "vleig %%v30,12,0\n\t"
          "vleig %%v30,13,1\n\t"
          "vleig %%v31,14,0\n\t"
          "vleig %%v31,15,1\n\t"
          /* loop count = n / 32; r1 = byte offset into x */
          "srlg %[n],%[n],5\n\t"
          "xgr %%r1,%%r1\n\t"
          "0:\n\t"
          "pfd 1, 1024(%%r1,%[x])\n\t"
          /* first half: elements 0..15 of this iteration */
          "vl %%v16,0(%%r1,%[x])\n\t"
          "vl %%v17,16(%%r1,%[x])\n\t"
          "vl %%v18,32(%%r1,%[x])\n\t"
          "vl %%v19,48(%%r1,%[x])\n\t"
          "vl %%v20,64(%%r1,%[x])\n\t"
          "vl %%v21,80(%%r1,%[x])\n\t"
          "vl %%v22,96(%%r1,%[x])\n\t"
          "vl %%v23,112(%%r1,%[x])\n\t"
          /* pairwise tournament: keep the smaller value and its index */
          "vfchedb %%v4,%%v17,%%v16\n\t"
          "vfchedb %%v5,%%v19,%%v18\n\t"
          "vfchedb %%v6,%%v21,%%v20\n\t"
          "vfchedb %%v7,%%v23,%%v22\n\t"
          "vsel %%v16,%%v16,%%v17,%%v4\n\t"
          "vsel %%v4,%%v24,%%v25,%%v4\n\t"
          "vsel %%v17,%%v18,%%v19,%%v5\n\t"
          "vsel %%v5,%%v26,%%v27,%%v5\n\t"
          "vsel %%v18,%%v20,%%v21,%%v6\n\t"
          "vsel %%v6,%%v28,%%v29,%%v6\n\t"
          "vsel %%v19,%%v22,%%v23,%%v7\n\t"
          "vsel %%v7,%%v30,%%v31,%%v7\n\t"
          "vfchedb %%v20,%%v17,%%v16\n\t"
          "vfchedb %%v21,%%v19,%%v18\n\t"
          "vsel %%v16,%%v16,%%v17,%%v20\n\t"
          "vsel %%v4,%%v4,%%v5,%%v20\n\t"
          "vsel %%v17,%%v18,%%v19,%%v21\n\t"
          "vsel %%v5,%%v6,%%v7,%%v21\n\t"
          "vfchedb %%v18,%%v17,%%v16\n\t"
          "vsel %%v16,%%v16,%%v17,%%v18\n\t"
          "vsel %%v4,%%v4,%%v5,%%v18\n\t"
          /* make the winning index absolute, then merge into v0/v1 */
          "vag %%v4,%%v4,%%v3\n\t"
          "vfchedb %%v5,%%v16,%%v0\n\t"
          "vsel %%v0,%%v0,%%v16,%%v5\n\t"
          "vsel %%v1,%%v1,%%v4,%%v5\n\t"
          "vag %%v3,%%v3,%%v2\n\t"
          /* second half: elements 16..31 of this iteration */
          "vl %%v16,128(%%r1,%[x])\n\t"
          "vl %%v17,144(%%r1,%[x])\n\t"
          "vl %%v18,160(%%r1,%[x])\n\t"
          "vl %%v19,176(%%r1,%[x])\n\t"
          "vl %%v20,192(%%r1,%[x])\n\t"
          "vl %%v21,208(%%r1,%[x])\n\t"
          "vl %%v22,224(%%r1,%[x])\n\t"
          "vl %%v23,240(%%r1,%[x])\n\t"
          "vfchedb %%v4,%%v17,%%v16\n\t"
          "vfchedb %%v5,%%v19,%%v18\n\t"
          "vfchedb %%v6,%%v21,%%v20\n\t"
          "vfchedb %%v7,%%v23,%%v22\n\t"
          "vsel %%v16,%%v16,%%v17,%%v4\n\t"
          "vsel %%v4,%%v24,%%v25,%%v4\n\t"
          "vsel %%v17,%%v18,%%v19,%%v5\n\t"
          "vsel %%v5,%%v26,%%v27,%%v5\n\t"
          "vsel %%v18,%%v20,%%v21,%%v6\n\t"
          "vsel %%v6,%%v28,%%v29,%%v6\n\t"
          "vsel %%v19,%%v22,%%v23,%%v7\n\t"
          "vsel %%v7,%%v30,%%v31,%%v7\n\t"
          "vfchedb %%v20,%%v17,%%v16\n\t"
          "vfchedb %%v21,%%v19,%%v18\n\t"
          "vsel %%v16,%%v16,%%v17,%%v20\n\t"
          "vsel %%v4,%%v4,%%v5,%%v20\n\t"
          "vsel %%v17,%%v18,%%v19,%%v21\n\t"
          "vsel %%v5,%%v6,%%v7,%%v21\n\t"
          "vfchedb %%v18,%%v17,%%v16\n\t"
          "vsel %%v16,%%v16,%%v17,%%v18\n\t"
          "vsel %%v4,%%v4,%%v5,%%v18\n\t"
          "vag %%v4,%%v4,%%v3\n\t"
          "vfchedb %%v5,%%v16,%%v0\n\t"
          "vsel %%v0,%%v0,%%v16,%%v5\n\t"
          "vsel %%v1,%%v1,%%v4,%%v5\n\t"
          "vag %%v3,%%v3,%%v2\n\t"
          "agfi %%r1, 256\n\t"
          "brctg %[n], 0b\n\t"
          /* final reduction of the two lanes of v0/v1 */
          "vrepg %%v2,%%v0,1\n\t"
          "vrepg %%v3,%%v1,1\n\t"
          "wfcdb %%v2,%%v0\n\t"
          "jne 1f\n\t"
          /* lanes hold equal values: store it, take the smaller index */
          "vsteg %%v0,%[min],0\n\t"
          "vmnlg %%v0,%%v1,%%v3\n\t"
          "vlgvg %[imin],%%v0,0\n\t"
          "j 2f\n\t"
          "1:\n\t"
          /* lanes differ: pick the smaller value and its index */
          "wfchdb %%v4,%%v0,%%v2\n\t"
          "vsel %%v1,%%v3,%%v1,%%v4\n\t"
          "vsel %%v0,%%v2,%%v0,%%v4\n\t"
          "std %%f0,%[min]\n\t"
          "vlgvg %[imin],%%v1,0\n\t"
          "2:\n\t"
          "nop"
          : [imin] "=r"(imin),[min] "=Q"(*min),[n] "+&r"(n)
          : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
          : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
            "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
            "v27", "v28", "v29", "v30", "v31");

  return imin;
}
/*
 * Return the 1-based position of the smallest element of x, or 0 when
 * n <= 0 or inc_x <= 0.
 *
 *   n      number of elements to examine
 *   x      input vector
 *   inc_x  stride between consecutive elements, in units of FLOAT
 *
 * In the scalar loops a strict '<' is used, so the earliest element wins on
 * ties. idmin_kernel_32 is defined earlier in this file.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
  BLASLONG i = 0; /* offset into x */
  BLASLONG j = 0; /* logical element counter (strided path only) */
  FLOAT minf = 0.0;
  BLASLONG min = 0;

  if (n <= 0 || inc_x <= 0)
    return (min);

  if (inc_x == 1) {
    /* The vector kernel consumes whole groups of 32 elements. */
    BLASLONG n1 = n & -32;

    if (n1 > 0) {
      min = idmin_kernel_32(n1, x, &minf);
      i = n1;
    } else {
      /* Too short for the kernel: seed the scan with the first element. */
      minf = x[0];
      i++;
    }

    /* Scalar tail (or the whole vector when n < 32). */
    while (i < n) {
      const FLOAT v = x[i];
      if (v < minf) {
        min = i;
        minf = v;
      }
      i++;
    }
    return (min + 1);
  }

  /* Strided path: plain scalar scan, visiting elements in order. */
  minf = x[0];
  while (j < n) {
    const FLOAT v = x[i];
    if (v < minf) {
      min = j;
      minf = v;
    }
    i += inc_x;
    j++;
  }
  return (min + 1);
}

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -28,282 +28,262 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf #define ABS fabsf
#endif
/*
 * Vector kernel: scan n floats at x for the maximum absolute value.
 *
 *   n     element count; the loop count is n >> 6, so the caller passes a
 *         positive multiple of 64 (see CNAME)
 *   x     input vector, unit stride
 *   amax  receives the maximum absolute value found
 *
 * Returns the index of that element as read from the index vector; CNAME
 * adds 1 to it, so it is presumably 0-based.
 *
 * Register roles, as set up below:
 *   v0       running maxima of |x|, four float lanes
 *   v1, v2   64-bit indices for lanes {0,2} and {1,3} of v0
 *   v3       constant 32 in each 64-bit lane (index step per half iteration)
 *   v4       running index offset
 *   v24-v31  32-bit lane indices 0..31 for the eight vectors of a half
 *            iteration
 */
static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax) {
  BLASLONG iamax;

  __asm__("vl %%v0,0(%[x])\n\t"
          "vflpsb %%v0,%%v0\n\t"
          "vleig %%v1,0,0\n\t"
          "vleig %%v1,2,1\n\t"
          "vleig %%v2,1,0\n\t"
          "vleig %%v2,3,1\n\t"
          "vrepig %%v3,32\n\t"
          "vzero %%v4\n\t"
          "vleif %%v24,0,0\n\t"
          "vleif %%v24,1,1\n\t"
          "vleif %%v24,2,2\n\t"
          "vleif %%v24,3,3\n\t"
          "vleif %%v25,4,0\n\t"
          "vleif %%v25,5,1\n\t"
          "vleif %%v25,6,2\n\t"
          "vleif %%v25,7,3\n\t"
          "vleif %%v26,8,0\n\t"
          "vleif %%v26,9,1\n\t"
          "vleif %%v26,10,2\n\t"
          "vleif %%v26,11,3\n\t"
          "vleif %%v27,12,0\n\t"
          "vleif %%v27,13,1\n\t"
          "vleif %%v27,14,2\n\t"
          "vleif %%v27,15,3\n\t"
          "vleif %%v28,16,0\n\t"
          "vleif %%v28,17,1\n\t"
          "vleif %%v28,18,2\n\t"
          "vleif %%v28,19,3\n\t"
          "vleif %%v29,20,0\n\t"
          "vleif %%v29,21,1\n\t"
          "vleif %%v29,22,2\n\t"
          "vleif %%v29,23,3\n\t"
          "vleif %%v30,24,0\n\t"
          "vleif %%v30,25,1\n\t"
          "vleif %%v30,26,2\n\t"
          "vleif %%v30,27,3\n\t"
          "vleif %%v31,28,0\n\t"
          "vleif %%v31,29,1\n\t"
          "vleif %%v31,30,2\n\t"
          "vleif %%v31,31,3\n\t"
          /* loop count = n / 64; r1 = byte offset into x */
          "srlg %[n],%[n],6\n\t"
          "xgr %%r1,%%r1\n\t"
          "0:\n\t"
          "pfd 1, 1024(%%r1,%[x])\n\t"
          /* first half: elements 0..31 of this iteration */
          "vl %%v16,0(%%r1,%[x])\n\t"
          "vl %%v17,16(%%r1,%[x])\n\t"
          "vl %%v18,32(%%r1,%[x])\n\t"
          "vl %%v19,48(%%r1,%[x])\n\t"
          "vl %%v20,64(%%r1,%[x])\n\t"
          "vl %%v21,80(%%r1,%[x])\n\t"
          "vl %%v22,96(%%r1,%[x])\n\t"
          "vl %%v23,112(%%r1,%[x])\n\t"
          /* absolute values */
          "vflpsb %%v16, %%v16\n\t"
          "vflpsb %%v17, %%v17\n\t"
          "vflpsb %%v18, %%v18\n\t"
          "vflpsb %%v19, %%v19\n\t"
          "vflpsb %%v20, %%v20\n\t"
          "vflpsb %%v21, %%v21\n\t"
          "vflpsb %%v22, %%v22\n\t"
          "vflpsb %%v23, %%v23\n\t"
          /* pairwise tournament: keep the larger value and its index */
          "vfchesb %%v5,%%v16,%%v17\n\t"
          "vfchesb %%v6,%%v18,%%v19\n\t"
          "vfchesb %%v7,%%v20,%%v21\n\t"
          "vfchesb %%v8,%%v22,%%v23\n\t"
          "vsel %%v16,%%v16,%%v17,%%v5\n\t"
          "vsel %%v5,%%v24,%%v25,%%v5\n\t"
          "vsel %%v17,%%v18,%%v19,%%v6\n\t"
          "vsel %%v6,%%v26,%%v27,%%v6\n\t"
          "vsel %%v18,%%v20,%%v21,%%v7\n\t"
          "vsel %%v7,%%v28,%%v29,%%v7\n\t"
          "vsel %%v19,%%v22,%%v23,%%v8\n\t"
          "vsel %%v8,%%v30,%%v31,%%v8\n\t"
          "vfchesb %%v20,%%v16,%%v17\n\t"
          "vfchesb %%v21,%%v18,%%v19\n\t"
          "vsel %%v16,%%v16,%%v17,%%v20\n\t"
          "vsel %%v5,%%v5,%%v6,%%v20\n\t"
          "vsel %%v17,%%v18,%%v19,%%v21\n\t"
          "vsel %%v6,%%v7,%%v8,%%v21\n\t"
          "vfchesb %%v18,%%v16,%%v17\n\t"
          "vsel %%v16,%%v16,%%v17,%%v18\n\t"
          "vsel %%v5,%%v5,%%v6,%%v18\n\t"
          /* widen the 32-bit winner indices to 64 bits, add the offset */
          "vsegf %%v6,%%v5\n\t"
          "vesrlg %%v5,%%v5,32\n\t"
          "vag %%v5,%%v5,%%v4\n\t"
          "vag %%v6,%%v6,%%v4\n\t"
          /* merge into the running maxima; widen the mask the same way */
          "vfchesb %%v7,%%v0,%%v16\n\t"
          "vsel %%v0,%%v0,%%v16,%%v7\n\t"
          "vsegf %%v8,%%v7\n\t"
          "vesrlg %%v7,%%v7,32\n\t"
          "vsegf %%v7,%%v7\n\t"
          "vsel %%v1,%%v1,%%v5,%%v7\n\t"
          "vsel %%v2,%%v2,%%v6,%%v8\n\t"
          "vag %%v4,%%v4,%%v3\n\t"
          /* second half: elements 32..63 of this iteration */
          "vl %%v16,128(%%r1,%[x])\n\t"
          "vl %%v17,144(%%r1,%[x])\n\t"
          "vl %%v18,160(%%r1,%[x])\n\t"
          "vl %%v19,176(%%r1,%[x])\n\t"
          "vl %%v20,192(%%r1,%[x])\n\t"
          "vl %%v21,208(%%r1,%[x])\n\t"
          "vl %%v22,224(%%r1,%[x])\n\t"
          "vl %%v23,240(%%r1,%[x])\n\t"
          "vflpsb %%v16, %%v16\n\t"
          "vflpsb %%v17, %%v17\n\t"
          "vflpsb %%v18, %%v18\n\t"
          "vflpsb %%v19, %%v19\n\t"
          "vflpsb %%v20, %%v20\n\t"
          "vflpsb %%v21, %%v21\n\t"
          "vflpsb %%v22, %%v22\n\t"
          "vflpsb %%v23, %%v23\n\t"
          "vfchesb %%v5,%%v16,%%v17\n\t"
          "vfchesb %%v6,%%v18,%%v19\n\t"
          "vfchesb %%v7,%%v20,%%v21\n\t"
          "vfchesb %%v8,%%v22,%%v23\n\t"
          "vsel %%v16,%%v16,%%v17,%%v5\n\t"
          "vsel %%v5,%%v24,%%v25,%%v5\n\t"
          "vsel %%v17,%%v18,%%v19,%%v6\n\t"
          "vsel %%v6,%%v26,%%v27,%%v6\n\t"
          "vsel %%v18,%%v20,%%v21,%%v7\n\t"
          "vsel %%v7,%%v28,%%v29,%%v7\n\t"
          "vsel %%v19,%%v22,%%v23,%%v8\n\t"
          "vsel %%v8,%%v30,%%v31,%%v8\n\t"
          "vfchesb %%v20,%%v16,%%v17\n\t"
          "vfchesb %%v21,%%v18,%%v19\n\t"
          "vsel %%v16,%%v16,%%v17,%%v20\n\t"
          "vsel %%v5,%%v5,%%v6,%%v20\n\t"
          "vsel %%v17,%%v18,%%v19,%%v21\n\t"
          "vsel %%v6,%%v7,%%v8,%%v21\n\t"
          "vfchesb %%v18,%%v16,%%v17\n\t"
          "vsel %%v16,%%v16,%%v17,%%v18\n\t"
          "vsel %%v5,%%v5,%%v6,%%v18\n\t"
          "vsegf %%v6,%%v5\n\t"
          "vesrlg %%v5,%%v5,32\n\t"
          "vag %%v5,%%v5,%%v4\n\t"
          "vag %%v6,%%v6,%%v4\n\t"
          "vfchesb %%v7,%%v0,%%v16\n\t"
          "vsel %%v0,%%v0,%%v16,%%v7\n\t"
          "vsegf %%v8,%%v7\n\t"
          "vesrlg %%v7,%%v7,32\n\t"
          "vsegf %%v7,%%v7\n\t"
          "vsel %%v1,%%v1,%%v5,%%v7\n\t"
          "vsel %%v2,%%v2,%%v6,%%v8\n\t"
          "vag %%v4,%%v4,%%v3\n\t"
          "agfi %%r1, 256\n\t"
          "brctg %[n], 0b\n\t"
          /* reduce four lanes to two; on equal values prefer the smaller
             index */
          "veslg %%v3,%%v0,32\n\t"
          "vfchsb %%v4,%%v0,%%v3\n\t"
          "vchlg %%v5,%%v2,%%v1\n\t"
          "vfcesb %%v6,%%v0,%%v3\n\t"
          "vn %%v5,%%v5,%%v6\n\t"
          "vo %%v4,%%v4,%%v5\n\t"
          "vsel %%v0,%%v0,%%v3,%%v4\n\t"
          "vesrlg %%v4,%%v4,32\n\t"
          "vsegf %%v4,%%v4\n\t"
          "vsel %%v1,%%v1,%%v2,%%v4\n\t"
          /* reduce the remaining two lanes to one */
          "vrepf %%v2,%%v0,2\n\t"
          "vrepg %%v3,%%v1,1\n\t"
          "wfcsb %%v2,%%v0\n\t"
          "jne 1f\n\t"
          /* equal values: store it, take the smaller index */
          "vstef %%v0,%[amax],0\n\t"
          "vmnlg %%v0,%%v1,%%v3\n\t"
          "vlgvg %[iamax],%%v0,0\n\t"
          "j 2f\n\t"
          "1:\n\t"
          /* values differ: pick the larger one and its index */
          "wfchsb %%v4,%%v2,%%v0\n\t"
          "vesrlg %%v4,%%v4,32\n\t"
          "vsegf %%v4,%%v4\n\t"
          "vsel %%v1,%%v3,%%v1,%%v4\n\t"
          "vsel %%v0,%%v2,%%v0,%%v4\n\t"
          "ste %%f0,%[amax]\n\t"
          "vlgvg %[iamax],%%v1,0\n\t"
          "2:\n\t"
          "nop"
          : [iamax] "=r"(iamax),[amax] "=Q"(*amax),[n] "+&r"(n)
          : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
          /* v3 is written above (vrepig/veslg/vrepg), so it must be listed. */
          : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8",
            "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
            "v25", "v26", "v27", "v28", "v29", "v30", "v31");

  return iamax;
}
/*
 * Return the 1-based position of the element of x with the largest
 * absolute value, or 0 when n <= 0 or inc_x <= 0.
 *
 *   n      number of elements to examine
 *   x      input vector
 *   inc_x  stride between consecutive elements, in units of FLOAT
 *
 * In the scalar loops a strict '>' is used, so the earliest element wins on
 * ties. ABS and isamax_kernel_64 are defined earlier in this file.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
  BLASLONG i = 0; /* offset into x */
  BLASLONG j = 0; /* logical element counter (strided path only) */
  FLOAT maxf = 0.0;
  BLASLONG max = 0;

  if (n <= 0 || inc_x <= 0)
    return (max);

  if (inc_x == 1) {
    /* The vector kernel consumes whole groups of 64 elements. */
    BLASLONG n1 = n & -64;

    if (n1 > 0) {
      max = isamax_kernel_64(n1, x, &maxf);
      i = n1;
    } else {
      /* Too short for the kernel: seed the scan with the first element. */
      maxf = ABS(x[0]);
      i++;
    }

    /* Scalar tail (or the whole vector when n < 64). */
    while (i < n) {
      const FLOAT v = ABS(x[i]);
      if (v > maxf) {
        max = i;
        maxf = v;
      }
      i++;
    }
    return (max + 1);
  }

  /* Strided path: plain scalar scan, visiting elements in order. */
  maxf = ABS(x[0]);
  while (j < n) {
    const FLOAT v = ABS(x[i]);
    if (v > maxf) {
      max = j;
      maxf = v;
    }
    i += inc_x;
    j++;
  }
  return (max + 1);
}

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -28,282 +28,262 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf #define ABS fabsf
#endif
static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin) static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin) {
{ BLASLONG iamin;
BLASLONG iamin;
__asm__ volatile ( __asm__("vl %%v0,0(%[x])\n\t"
"vl %%v0,0(%3) \n\t" "vflpsb %%v0,%%v0\n\t"
"vflpsb %%v0,%%v0 \n\t" "vleig %%v1,0,0\n\t"
"vleig %%v1,0,0 \n\t" "vleig %%v1,2,1\n\t"
"vleig %%v1,2,1 \n\t" "vleig %%v2,1,0\n\t"
"vleig %%v2,1,0 \n\t" "vleig %%v2,3,1\n\t"
"vleig %%v2,3,1 \n\t" "vrepig %%v3,32\n\t"
"vrepig %%v3,32 \n\t" "vzero %%v4\n\t"
"vzero %%v4 \n\t" "vleif %%v24,0,0\n\t"
"vleif %%v24,0,0 \n\t" "vleif %%v24,1,1\n\t"
"vleif %%v24,1,1 \n\t" "vleif %%v24,2,2\n\t"
"vleif %%v24,2,2 \n\t" "vleif %%v24,3,3\n\t"
"vleif %%v24,3,3 \n\t" "vleif %%v25,4,0\n\t"
"vleif %%v25,4,0 \n\t" "vleif %%v25,5,1\n\t"
"vleif %%v25,5,1 \n\t" "vleif %%v25,6,2\n\t"
"vleif %%v25,6,2 \n\t" "vleif %%v25,7,3\n\t"
"vleif %%v25,7,3 \n\t" "vleif %%v26,8,0\n\t"
"vleif %%v26,8,0 \n\t" "vleif %%v26,9,1\n\t"
"vleif %%v26,9,1 \n\t" "vleif %%v26,10,2\n\t"
"vleif %%v26,10,2 \n\t" "vleif %%v26,11,3\n\t"
"vleif %%v26,11,3 \n\t" "vleif %%v27,12,0\n\t"
"vleif %%v27,12,0 \n\t" "vleif %%v27,13,1\n\t"
"vleif %%v27,13,1 \n\t" "vleif %%v27,14,2\n\t"
"vleif %%v27,14,2 \n\t" "vleif %%v27,15,3\n\t"
"vleif %%v27,15,3 \n\t" "vleif %%v28,16,0\n\t"
"vleif %%v28,16,0 \n\t" "vleif %%v28,17,1\n\t"
"vleif %%v28,17,1 \n\t" "vleif %%v28,18,2\n\t"
"vleif %%v28,18,2 \n\t" "vleif %%v28,19,3\n\t"
"vleif %%v28,19,3 \n\t" "vleif %%v29,20,0\n\t"
"vleif %%v29,20,0 \n\t" "vleif %%v29,21,1\n\t"
"vleif %%v29,21,1 \n\t" "vleif %%v29,22,2\n\t"
"vleif %%v29,22,2 \n\t" "vleif %%v29,23,3\n\t"
"vleif %%v29,23,3 \n\t" "vleif %%v30,24,0\n\t"
"vleif %%v30,24,0 \n\t" "vleif %%v30,25,1\n\t"
"vleif %%v30,25,1 \n\t" "vleif %%v30,26,2\n\t"
"vleif %%v30,26,2 \n\t" "vleif %%v30,27,3\n\t"
"vleif %%v30,27,3 \n\t" "vleif %%v31,28,0\n\t"
"vleif %%v31,28,0 \n\t" "vleif %%v31,29,1\n\t"
"vleif %%v31,29,1 \n\t" "vleif %%v31,30,2\n\t"
"vleif %%v31,30,2 \n\t" "vleif %%v31,31,3\n\t"
"vleif %%v31,31,3 \n\t" "srlg %[n],%[n],6\n\t"
"srlg %%r0,%2,6 \n\t" "xgr %%r1,%%r1\n\t"
"xgr %%r1,%%r1 \n\t" "0:\n\t"
"0: \n\t" "pfd 1, 1024(%%r1,%[x])\n\t"
"pfd 1, 1024(%%r1,%3) \n\t" "vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vflpsb %%v16, %%v16\n\t"
"vflpsb %%v17, %%v17\n\t"
"vflpsb %%v18, %%v18\n\t"
"vflpsb %%v19, %%v19\n\t"
"vflpsb %%v20, %%v20\n\t"
"vflpsb %%v21, %%v21\n\t"
"vflpsb %%v22, %%v22\n\t"
"vflpsb %%v23, %%v23\n\t"
"vfchesb %%v5,%%v17,%%v16\n\t"
"vfchesb %%v6,%%v19,%%v18\n\t"
"vfchesb %%v7,%%v21,%%v20\n\t"
"vfchesb %%v8,%%v23,%%v22\n\t"
"vsel %%v16,%%v16,%%v17,%%v5\n\t"
"vsel %%v5,%%v24,%%v25,%%v5\n\t"
"vsel %%v17,%%v18,%%v19,%%v6\n\t"
"vsel %%v6,%%v26,%%v27,%%v6\n\t"
"vsel %%v18,%%v20,%%v21,%%v7\n\t"
"vsel %%v7,%%v28,%%v29,%%v7\n\t"
"vsel %%v19,%%v22,%%v23,%%v8\n\t"
"vsel %%v8,%%v30,%%v31,%%v8\n\t"
"vfchesb %%v20,%%v17,%%v16\n\t"
"vfchesb %%v21,%%v19,%%v18\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v5,%%v5,%%v6,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v6,%%v7,%%v8,%%v21\n\t"
"vfchesb %%v18,%%v17,%%v16\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
"vsegf %%v6,%%v5\n\t"
"vesrlg %%v5,%%v5,32\n\t"
"vag %%v5,%%v5,%%v4\n\t"
"vag %%v6,%%v6,%%v4\n\t"
"vfchesb %%v7,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
"vsegf %%v8,%%v7\n\t"
"vesrlg %%v7,%%v7,32\n\t"
"vsegf %%v7,%%v7\n\t"
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v17,144(%%r1,%[x])\n\t"
"vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v19,176(%%r1,%[x])\n\t"
"vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v21,208(%%r1,%[x])\n\t"
"vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v23,240(%%r1,%[x])\n\t"
"vflpsb %%v16, %%v16\n\t"
"vflpsb %%v17, %%v17\n\t"
"vflpsb %%v18, %%v18\n\t"
"vflpsb %%v19, %%v19\n\t"
"vflpsb %%v20, %%v20\n\t"
"vflpsb %%v21, %%v21\n\t"
"vflpsb %%v22, %%v22\n\t"
"vflpsb %%v23, %%v23\n\t"
"vfchesb %%v5,%%v17,%%v16\n\t"
"vfchesb %%v6,%%v19,%%v18\n\t"
"vfchesb %%v7,%%v21,%%v20\n\t"
"vfchesb %%v8,%%v23,%%v22\n\t"
"vsel %%v16,%%v16,%%v17,%%v5\n\t"
"vsel %%v5,%%v24,%%v25,%%v5\n\t"
"vsel %%v17,%%v18,%%v19,%%v6\n\t"
"vsel %%v6,%%v26,%%v27,%%v6\n\t"
"vsel %%v18,%%v20,%%v21,%%v7\n\t"
"vsel %%v7,%%v28,%%v29,%%v7\n\t"
"vsel %%v19,%%v22,%%v23,%%v8\n\t"
"vsel %%v8,%%v30,%%v31,%%v8\n\t"
"vfchesb %%v20,%%v17,%%v16\n\t"
"vfchesb %%v21,%%v19,%%v18\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v5,%%v5,%%v6,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v6,%%v7,%%v8,%%v21\n\t"
"vfchesb %%v18,%%v17,%%v16\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
"vsegf %%v6,%%v5\n\t"
"vesrlg %%v5,%%v5,32\n\t"
"vag %%v5,%%v5,%%v4\n\t"
"vag %%v6,%%v6,%%v4\n\t"
"vfchesb %%v7,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
"vsegf %%v8,%%v7\n\t"
"vesrlg %%v7,%%v7,32\n\t"
"vsegf %%v7,%%v7\n\t"
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
"veslg %%v3,%%v0,32\n\t"
"vfchsb %%v4,%%v3,%%v0\n\t"
"vchlg %%v5,%%v2,%%v1\n\t"
"vfcesb %%v6,%%v0,%%v3\n\t"
"vn %%v5,%%v5,%%v6\n\t"
"vo %%v4,%%v4,%%v5\n\t"
"vsel %%v0,%%v0,%%v3,%%v4\n\t"
"vesrlg %%v4,%%v4,32\n\t"
"vsegf %%v4,%%v4\n\t"
"vsel %%v1,%%v1,%%v2,%%v4\n\t"
"vrepf %%v2,%%v0,2\n\t"
"vrepg %%v3,%%v1,1\n\t"
"wfcsb %%v2,%%v0\n\t"
"jne 1f\n\t"
"vstef %%v0,%[amin],0\n\t"
"vmnlg %%v0,%%v1,%%v3\n\t"
"vlgvg %[iamin],%%v0,0\n\t"
"j 2f\n\t"
"1:\n\t"
"wfchsb %%v4,%%v0,%%v2\n\t"
"vesrlg %%v4,%%v4,32\n\t"
"vsegf %%v4,%%v4\n\t"
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
"ste %%f0,%[amin]\n\t"
"vlgvg %[iamin],%%v1,0\n\t"
"2:\n\t"
"nop"
: [iamin] "=r"(iamin),[amin] "=Q"(*amin),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");
"vl %%v16,0(%%r1,%3) \n\t" return iamin;
"vl %%v17,16(%%r1,%3) \n\t"
"vl %%v18,32(%%r1,%3) \n\t"
"vl %%v19,48(%%r1,%3) \n\t"
"vl %%v20,64(%%r1,%3) \n\t"
"vl %%v21,80(%%r1,%3) \n\t"
"vl %%v22,96(%%r1,%3) \n\t"
"vl %%v23,112(%%r1,%3) \n\t"
"vflpsb %%v16, %%v16 \n\t"
"vflpsb %%v17, %%v17 \n\t"
"vflpsb %%v18, %%v18 \n\t"
"vflpsb %%v19, %%v19 \n\t"
"vflpsb %%v20, %%v20 \n\t"
"vflpsb %%v21, %%v21 \n\t"
"vflpsb %%v22, %%v22 \n\t"
"vflpsb %%v23, %%v23 \n\t"
"vfchesb %%v5,%%v17,%%v16 \n\t"
"vfchesb %%v6,%%v19,%%v18 \n\t"
"vfchesb %%v7,%%v21,%%v20 \n\t"
"vfchesb %%v8,%%v23,%%v22 \n\t"
"vsel %%v16,%%v16,%%v17,%%v5 \n\t"
"vsel %%v5,%%v24,%%v25,%%v5 \n\t"
"vsel %%v17,%%v18,%%v19,%%v6 \n\t"
"vsel %%v6,%%v26,%%v27,%%v6 \n\t"
"vsel %%v18,%%v20,%%v21,%%v7 \n\t"
"vsel %%v7,%%v28,%%v29,%%v7 \n\t"
"vsel %%v19,%%v22,%%v23,%%v8 \n\t"
"vsel %%v8,%%v30,%%v31,%%v8 \n\t"
"vfchesb %%v20,%%v17,%%v16 \n\t"
"vfchesb %%v21,%%v19,%%v18 \n\t"
"vsel %%v16,%%v16,%%v17,%%v20 \n\t"
"vsel %%v5,%%v5,%%v6,%%v20 \n\t"
"vsel %%v17,%%v18,%%v19,%%v21 \n\t"
"vsel %%v6,%%v7,%%v8,%%v21 \n\t"
"vfchesb %%v18,%%v17,%%v16 \n\t"
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vsel %%v5,%%v5,%%v6,%%v18 \n\t"
"vsegf %%v6,%%v5 \n\t"
"vesrlg %%v5,%%v5,32 \n\t"
"vag %%v5,%%v5,%%v4 \n\t"
"vag %%v6,%%v6,%%v4 \n\t"
"vfchesb %%v7,%%v16,%%v0 \n\t"
"vsel %%v0,%%v0,%%v16,%%v7 \n\t"
"vsegf %%v8,%%v7 \n\t"
"vesrlg %%v7,%%v7,32 \n\t"
"vsegf %%v7,%%v7 \n\t"
"vsel %%v1,%%v1,%%v5,%%v7 \n\t"
"vsel %%v2,%%v2,%%v6,%%v8 \n\t"
"vag %%v4,%%v4,%%v3 \n\t"
"vl %%v16,128(%%r1,%3) \n\t"
"vl %%v17,144(%%r1,%3) \n\t"
"vl %%v18,160(%%r1,%3) \n\t"
"vl %%v19,176(%%r1,%3) \n\t"
"vl %%v20,192(%%r1,%3) \n\t"
"vl %%v21,208(%%r1,%3) \n\t"
"vl %%v22,224(%%r1,%3) \n\t"
"vl %%v23,240(%%r1,%3) \n\t"
"vflpsb %%v16, %%v16 \n\t"
"vflpsb %%v17, %%v17 \n\t"
"vflpsb %%v18, %%v18 \n\t"
"vflpsb %%v19, %%v19 \n\t"
"vflpsb %%v20, %%v20 \n\t"
"vflpsb %%v21, %%v21 \n\t"
"vflpsb %%v22, %%v22 \n\t"
"vflpsb %%v23, %%v23 \n\t"
"vfchesb %%v5,%%v17,%%v16 \n\t"
"vfchesb %%v6,%%v19,%%v18 \n\t"
"vfchesb %%v7,%%v21,%%v20 \n\t"
"vfchesb %%v8,%%v23,%%v22 \n\t"
"vsel %%v16,%%v16,%%v17,%%v5 \n\t"
"vsel %%v5,%%v24,%%v25,%%v5 \n\t"
"vsel %%v17,%%v18,%%v19,%%v6 \n\t"
"vsel %%v6,%%v26,%%v27,%%v6 \n\t"
"vsel %%v18,%%v20,%%v21,%%v7 \n\t"
"vsel %%v7,%%v28,%%v29,%%v7 \n\t"
"vsel %%v19,%%v22,%%v23,%%v8 \n\t"
"vsel %%v8,%%v30,%%v31,%%v8 \n\t"
"vfchesb %%v20,%%v17,%%v16 \n\t"
"vfchesb %%v21,%%v19,%%v18 \n\t"
"vsel %%v16,%%v16,%%v17,%%v20 \n\t"
"vsel %%v5,%%v5,%%v6,%%v20 \n\t"
"vsel %%v17,%%v18,%%v19,%%v21 \n\t"
"vsel %%v6,%%v7,%%v8,%%v21 \n\t"
"vfchesb %%v18,%%v17,%%v16 \n\t"
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vsel %%v5,%%v5,%%v6,%%v18 \n\t"
"vsegf %%v6,%%v5 \n\t"
"vesrlg %%v5,%%v5,32 \n\t"
"vag %%v5,%%v5,%%v4 \n\t"
"vag %%v6,%%v6,%%v4 \n\t"
"vfchesb %%v7,%%v16,%%v0 \n\t"
"vsel %%v0,%%v0,%%v16,%%v7 \n\t"
"vsegf %%v8,%%v7 \n\t"
"vesrlg %%v7,%%v7,32 \n\t"
"vsegf %%v7,%%v7 \n\t"
"vsel %%v1,%%v1,%%v5,%%v7 \n\t"
"vsel %%v2,%%v2,%%v6,%%v8 \n\t"
"vag %%v4,%%v4,%%v3 \n\t"
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"
"veslg %%v3,%%v0,32 \n\t"
"vfchsb %%v4,%%v3,%%v0 \n\t"
"vchlg %%v5,%%v2,%%v1 \n\t"
"vfcesb %%v6,%%v0,%%v3 \n\t"
"vn %%v5,%%v5,%%v6 \n\t"
"vo %%v4,%%v4,%%v5 \n\t"
"vsel %%v0,%%v0,%%v3,%%v4 \n\t"
"vesrlg %%v4,%%v4,32 \n\t"
"vsegf %%v4,%%v4 \n\t"
"vsel %%v1,%%v1,%%v2,%%v4 \n\t"
"vrepf %%v2,%%v0,2 \n\t"
"vrepg %%v3,%%v1,1 \n\t"
"wfcsb %%v2,%%v0 \n\t"
"jne 1f \n\t"
"vstef %%v0,%1,0 \n\t"
"vmnlg %%v0,%%v1,%%v3 \n\t"
"vlgvg %0,%%v0,0 \n\t"
"j 2f \n\t"
"1: \n\t"
"wfchsb %%v4,%%v0,%%v2 \n\t"
"vesrlg %%v4,%%v4,32 \n\t"
"vsegf %%v4,%%v4 \n\t"
"vsel %%v1,%%v3,%%v1,%%v4 \n\t"
"vsel %%v0,%%v2,%%v0,%%v4 \n\t"
"ste %%f0,%1 \n\t"
"vlgvg %0,%%v1,0 \n\t"
"2: \n\t"
"nop "
:"=r"(iamin),"=m"(*amin)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v1","v2","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return iamin;
} }
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0; BLASLONG i = 0;
BLASLONG j = 0; BLASLONG j = 0;
FLOAT minf = 0.0; FLOAT minf = 0.0;
BLASLONG min = 0; BLASLONG min = 0;
if (n <= 0 || inc_x <= 0) return (min); if (n <= 0 || inc_x <= 0)
return (min);
if (inc_x == 1) { if (inc_x == 1) {
BLASLONG n1 = n & -64; BLASLONG n1 = n & -64;
if (n1 > 0) { if (n1 > 0) {
min = isamin_kernel_64(n1, x, &minf); min = isamin_kernel_64(n1, x, &minf);
i = n1;
}
else
{
minf = ABS(x[0]);
i++;
}
while (i < n) {
if (ABS(x[i]) < minf) {
min = i;
minf = ABS(x[i]);
}
i++;
}
return (min + 1);
i = n1;
} else { } else {
minf = ABS(x[0]);
min = 0; i++;
minf = ABS(x[0]);
BLASLONG n1 = n & -4;
while (j < n1) {
if (ABS(x[i]) < minf) {
min = j;
minf = ABS(x[i]);
}
if (ABS(x[i + inc_x]) < minf) {
min = j + 1;
minf = ABS(x[i + inc_x]);
}
if (ABS(x[i + 2 * inc_x]) < minf) {
min = j + 2;
minf = ABS(x[i + 2 * inc_x]);
}
if (ABS(x[i + 3 * inc_x]) < minf) {
min = j + 3;
minf = ABS(x[i + 3 * inc_x]);
}
i += inc_x * 4;
j += 4;
}
while (j < n) {
if (ABS(x[i]) < minf) {
min = j;
minf = ABS(x[i]);
}
i += inc_x;
j++;
}
return (min + 1);
} }
while (i < n) {
if (ABS(x[i]) < minf) {
min = i;
minf = ABS(x[i]);
}
i++;
}
return (min + 1);
} else {
min = 0;
minf = ABS(x[0]);
BLASLONG n1 = n & -4;
while (j < n1) {
if (ABS(x[i]) < minf) {
min = j;
minf = ABS(x[i]);
}
if (ABS(x[i + inc_x]) < minf) {
min = j + 1;
minf = ABS(x[i + inc_x]);
}
if (ABS(x[i + 2 * inc_x]) < minf) {
min = j + 2;
minf = ABS(x[i + 2 * inc_x]);
}
if (ABS(x[i + 3 * inc_x]) < minf) {
min = j + 3;
minf = ABS(x[i + 3 * inc_x]);
}
i += inc_x * 4;
j += 4;
}
while (j < n) {
if (ABS(x[i]) < minf) {
min = j;
minf = ABS(x[i]);
}
i += inc_x;
j++;
}
return (min + 1);
}
} }

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -27,259 +27,243 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max) static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max) {
{ BLASLONG imax;
BLASLONG imax;
__asm__ volatile ( __asm__("vl %%v0,0(%[x])\n\t"
"vl %%v0,0(%3) \n\t" "vleig %%v1,0,0\n\t"
"vleig %%v1,0,0 \n\t" "vleig %%v1,2,1\n\t"
"vleig %%v1,2,1 \n\t" "vleig %%v2,1,0\n\t"
"vleig %%v2,1,0 \n\t" "vleig %%v2,3,1\n\t"
"vleig %%v2,3,1 \n\t" "vrepig %%v3,32\n\t"
"vrepig %%v3,32 \n\t" "vzero %%v4\n\t"
"vzero %%v4 \n\t" "vleif %%v24,0,0\n\t"
"vleif %%v24,0,0 \n\t" "vleif %%v24,1,1\n\t"
"vleif %%v24,1,1 \n\t" "vleif %%v24,2,2\n\t"
"vleif %%v24,2,2 \n\t" "vleif %%v24,3,3\n\t"
"vleif %%v24,3,3 \n\t" "vleif %%v25,4,0\n\t"
"vleif %%v25,4,0 \n\t" "vleif %%v25,5,1\n\t"
"vleif %%v25,5,1 \n\t" "vleif %%v25,6,2\n\t"
"vleif %%v25,6,2 \n\t" "vleif %%v25,7,3\n\t"
"vleif %%v25,7,3 \n\t" "vleif %%v26,8,0\n\t"
"vleif %%v26,8,0 \n\t" "vleif %%v26,9,1\n\t"
"vleif %%v26,9,1 \n\t" "vleif %%v26,10,2\n\t"
"vleif %%v26,10,2 \n\t" "vleif %%v26,11,3\n\t"
"vleif %%v26,11,3 \n\t" "vleif %%v27,12,0\n\t"
"vleif %%v27,12,0 \n\t" "vleif %%v27,13,1\n\t"
"vleif %%v27,13,1 \n\t" "vleif %%v27,14,2\n\t"
"vleif %%v27,14,2 \n\t" "vleif %%v27,15,3\n\t"
"vleif %%v27,15,3 \n\t" "vleif %%v28,16,0\n\t"
"vleif %%v28,16,0 \n\t" "vleif %%v28,17,1\n\t"
"vleif %%v28,17,1 \n\t" "vleif %%v28,18,2\n\t"
"vleif %%v28,18,2 \n\t" "vleif %%v28,19,3\n\t"
"vleif %%v28,19,3 \n\t" "vleif %%v29,20,0\n\t"
"vleif %%v29,20,0 \n\t" "vleif %%v29,21,1\n\t"
"vleif %%v29,21,1 \n\t" "vleif %%v29,22,2\n\t"
"vleif %%v29,22,2 \n\t" "vleif %%v29,23,3\n\t"
"vleif %%v29,23,3 \n\t" "vleif %%v30,24,0\n\t"
"vleif %%v30,24,0 \n\t" "vleif %%v30,25,1\n\t"
"vleif %%v30,25,1 \n\t" "vleif %%v30,26,2\n\t"
"vleif %%v30,26,2 \n\t" "vleif %%v30,27,3\n\t"
"vleif %%v30,27,3 \n\t" "vleif %%v31,28,0\n\t"
"vleif %%v31,28,0 \n\t" "vleif %%v31,29,1\n\t"
"vleif %%v31,29,1 \n\t" "vleif %%v31,30,2\n\t"
"vleif %%v31,30,2 \n\t" "vleif %%v31,31,3\n\t"
"vleif %%v31,31,3 \n\t" "srlg %[n],%[n],6\n\t"
"srlg %%r0,%2,6 \n\t" "xgr %%r1,%%r1\n\t"
"xgr %%r1,%%r1 \n\t" "0:\n\t"
"0: \n\t" "pfd 1, 1024(%%r1,%[x])\n\t"
"pfd 1, 1024(%%r1,%3) \n\t" "vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vfchesb %%v5,%%v16,%%v17\n\t"
"vfchesb %%v6,%%v18,%%v19\n\t"
"vfchesb %%v7,%%v20,%%v21\n\t"
"vfchesb %%v8,%%v22,%%v23\n\t"
"vsel %%v16,%%v16,%%v17,%%v5\n\t"
"vsel %%v5,%%v24,%%v25,%%v5\n\t"
"vsel %%v17,%%v18,%%v19,%%v6\n\t"
"vsel %%v6,%%v26,%%v27,%%v6\n\t"
"vsel %%v18,%%v20,%%v21,%%v7\n\t"
"vsel %%v7,%%v28,%%v29,%%v7\n\t"
"vsel %%v19,%%v22,%%v23,%%v8\n\t"
"vsel %%v8,%%v30,%%v31,%%v8\n\t"
"vfchesb %%v20,%%v16,%%v17\n\t"
"vfchesb %%v21,%%v18,%%v19\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v5,%%v5,%%v6,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v6,%%v7,%%v8,%%v21\n\t"
"vfchesb %%v18,%%v16,%%v17\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
"vsegf %%v6,%%v5\n\t"
"vesrlg %%v5,%%v5,32\n\t"
"vag %%v5,%%v5,%%v4\n\t"
"vag %%v6,%%v6,%%v4\n\t"
"vfchesb %%v7,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
"vsegf %%v8,%%v7\n\t"
"vesrlg %%v7,%%v7,32\n\t"
"vsegf %%v7,%%v7\n\t"
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v17,144(%%r1,%[x])\n\t"
"vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v19,176(%%r1,%[x])\n\t"
"vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v21,208(%%r1,%[x])\n\t"
"vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v23,240(%%r1,%[x])\n\t"
"vfchesb %%v5,%%v16,%%v17\n\t"
"vfchesb %%v6,%%v18,%%v19\n\t"
"vfchesb %%v7,%%v20,%%v21\n\t"
"vfchesb %%v8,%%v22,%%v23\n\t"
"vsel %%v16,%%v16,%%v17,%%v5\n\t"
"vsel %%v5,%%v24,%%v25,%%v5\n\t"
"vsel %%v17,%%v18,%%v19,%%v6\n\t"
"vsel %%v6,%%v26,%%v27,%%v6\n\t"
"vsel %%v18,%%v20,%%v21,%%v7\n\t"
"vsel %%v7,%%v28,%%v29,%%v7\n\t"
"vsel %%v19,%%v22,%%v23,%%v8\n\t"
"vsel %%v8,%%v30,%%v31,%%v8\n\t"
"vfchesb %%v20,%%v16,%%v17\n\t"
"vfchesb %%v21,%%v18,%%v19\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v5,%%v5,%%v6,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v6,%%v7,%%v8,%%v21\n\t"
"vfchesb %%v18,%%v16,%%v17\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
"vsegf %%v6,%%v5\n\t"
"vesrlg %%v5,%%v5,32\n\t"
"vag %%v5,%%v5,%%v4\n\t"
"vag %%v6,%%v6,%%v4\n\t"
"vfchesb %%v7,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
"vsegf %%v8,%%v7\n\t"
"vesrlg %%v7,%%v7,32\n\t"
"vsegf %%v7,%%v7\n\t"
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
"veslg %%v3,%%v0,32\n\t"
"vfchsb %%v4,%%v0,%%v3\n\t"
"vchlg %%v5,%%v2,%%v1\n\t"
"vfcesb %%v6,%%v0,%%v3\n\t"
"vn %%v5,%%v5,%%v6\n\t"
"vo %%v4,%%v4,%%v5\n\t"
"vsel %%v0,%%v0,%%v3,%%v4\n\t"
"vesrlg %%v4,%%v4,32\n\t"
"vsegf %%v4,%%v4\n\t"
"vsel %%v1,%%v1,%%v2,%%v4\n\t"
"vrepf %%v2,%%v0,2\n\t"
"vrepg %%v3,%%v1,1\n\t"
"wfcsb %%v2,%%v0\n\t"
"jne 1f\n\t"
"vstef %%v0,%[max],0\n\t"
"vmnlg %%v0,%%v1,%%v3\n\t"
"vlgvg %[imax],%%v0,0\n\t"
"j 2f\n\t"
"1:\n\t"
"wfchsb %%v4,%%v2,%%v0\n\t"
"vesrlg %%v4,%%v4,32\n\t"
"vsegf %%v4,%%v4\n\t"
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
"ste %%f0,%[max]\n\t"
"vlgvg %[imax],%%v1,0\n\t"
"2:\n\t"
"nop"
: [imax] "=r"(imax),[max] "=Q"(*max),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");
"vl %%v16,0(%%r1,%3) \n\t" return imax;
"vl %%v17,16(%%r1,%3) \n\t"
"vl %%v18,32(%%r1,%3) \n\t"
"vl %%v19,48(%%r1,%3) \n\t"
"vl %%v20,64(%%r1,%3) \n\t"
"vl %%v21,80(%%r1,%3) \n\t"
"vl %%v22,96(%%r1,%3) \n\t"
"vl %%v23,112(%%r1,%3) \n\t"
"vfchesb %%v5,%%v16,%%v17 \n\t"
"vfchesb %%v6,%%v18,%%v19 \n\t"
"vfchesb %%v7,%%v20,%%v21 \n\t"
"vfchesb %%v8,%%v22,%%v23 \n\t"
"vsel %%v16,%%v16,%%v17,%%v5 \n\t"
"vsel %%v5,%%v24,%%v25,%%v5 \n\t"
"vsel %%v17,%%v18,%%v19,%%v6 \n\t"
"vsel %%v6,%%v26,%%v27,%%v6 \n\t"
"vsel %%v18,%%v20,%%v21,%%v7 \n\t"
"vsel %%v7,%%v28,%%v29,%%v7 \n\t"
"vsel %%v19,%%v22,%%v23,%%v8 \n\t"
"vsel %%v8,%%v30,%%v31,%%v8 \n\t"
"vfchesb %%v20,%%v16,%%v17 \n\t"
"vfchesb %%v21,%%v18,%%v19 \n\t"
"vsel %%v16,%%v16,%%v17,%%v20 \n\t"
"vsel %%v5,%%v5,%%v6,%%v20 \n\t"
"vsel %%v17,%%v18,%%v19,%%v21 \n\t"
"vsel %%v6,%%v7,%%v8,%%v21 \n\t"
"vfchesb %%v18,%%v16,%%v17 \n\t"
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vsel %%v5,%%v5,%%v6,%%v18 \n\t"
"vsegf %%v6,%%v5 \n\t"
"vesrlg %%v5,%%v5,32 \n\t"
"vag %%v5,%%v5,%%v4 \n\t"
"vag %%v6,%%v6,%%v4 \n\t"
"vfchesb %%v7,%%v0,%%v16 \n\t"
"vsel %%v0,%%v0,%%v16,%%v7 \n\t"
"vsegf %%v8,%%v7 \n\t"
"vesrlg %%v7,%%v7,32 \n\t"
"vsegf %%v7,%%v7 \n\t"
"vsel %%v1,%%v1,%%v5,%%v7 \n\t"
"vsel %%v2,%%v2,%%v6,%%v8 \n\t"
"vag %%v4,%%v4,%%v3 \n\t"
"vl %%v16,128(%%r1,%3) \n\t"
"vl %%v17,144(%%r1,%3) \n\t"
"vl %%v18,160(%%r1,%3) \n\t"
"vl %%v19,176(%%r1,%3) \n\t"
"vl %%v20,192(%%r1,%3) \n\t"
"vl %%v21,208(%%r1,%3) \n\t"
"vl %%v22,224(%%r1,%3) \n\t"
"vl %%v23,240(%%r1,%3) \n\t"
"vfchesb %%v5,%%v16,%%v17 \n\t"
"vfchesb %%v6,%%v18,%%v19 \n\t"
"vfchesb %%v7,%%v20,%%v21 \n\t"
"vfchesb %%v8,%%v22,%%v23 \n\t"
"vsel %%v16,%%v16,%%v17,%%v5 \n\t"
"vsel %%v5,%%v24,%%v25,%%v5 \n\t"
"vsel %%v17,%%v18,%%v19,%%v6 \n\t"
"vsel %%v6,%%v26,%%v27,%%v6 \n\t"
"vsel %%v18,%%v20,%%v21,%%v7 \n\t"
"vsel %%v7,%%v28,%%v29,%%v7 \n\t"
"vsel %%v19,%%v22,%%v23,%%v8 \n\t"
"vsel %%v8,%%v30,%%v31,%%v8 \n\t"
"vfchesb %%v20,%%v16,%%v17 \n\t"
"vfchesb %%v21,%%v18,%%v19 \n\t"
"vsel %%v16,%%v16,%%v17,%%v20 \n\t"
"vsel %%v5,%%v5,%%v6,%%v20 \n\t"
"vsel %%v17,%%v18,%%v19,%%v21 \n\t"
"vsel %%v6,%%v7,%%v8,%%v21 \n\t"
"vfchesb %%v18,%%v16,%%v17 \n\t"
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vsel %%v5,%%v5,%%v6,%%v18 \n\t"
"vsegf %%v6,%%v5 \n\t"
"vesrlg %%v5,%%v5,32 \n\t"
"vag %%v5,%%v5,%%v4 \n\t"
"vag %%v6,%%v6,%%v4 \n\t"
"vfchesb %%v7,%%v0,%%v16 \n\t"
"vsel %%v0,%%v0,%%v16,%%v7 \n\t"
"vsegf %%v8,%%v7 \n\t"
"vesrlg %%v7,%%v7,32 \n\t"
"vsegf %%v7,%%v7 \n\t"
"vsel %%v1,%%v1,%%v5,%%v7 \n\t"
"vsel %%v2,%%v2,%%v6,%%v8 \n\t"
"vag %%v4,%%v4,%%v3 \n\t"
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"
"veslg %%v3,%%v0,32 \n\t"
"vfchsb %%v4,%%v0,%%v3 \n\t"
"vchlg %%v5,%%v2,%%v1 \n\t"
"vfcesb %%v6,%%v0,%%v3 \n\t"
"vn %%v5,%%v5,%%v6 \n\t"
"vo %%v4,%%v4,%%v5 \n\t"
"vsel %%v0,%%v0,%%v3,%%v4 \n\t"
"vesrlg %%v4,%%v4,32 \n\t"
"vsegf %%v4,%%v4 \n\t"
"vsel %%v1,%%v1,%%v2,%%v4 \n\t"
"vrepf %%v2,%%v0,2 \n\t"
"vrepg %%v3,%%v1,1 \n\t"
"wfcsb %%v2,%%v0 \n\t"
"jne 1f \n\t"
"vstef %%v0,%1,0 \n\t"
"vmnlg %%v0,%%v1,%%v3 \n\t"
"vlgvg %0,%%v0,0 \n\t"
"j 2f \n\t"
"1: \n\t"
"wfchsb %%v4,%%v2,%%v0 \n\t"
"vesrlg %%v4,%%v4,32 \n\t"
"vsegf %%v4,%%v4 \n\t"
"vsel %%v1,%%v3,%%v1,%%v4 \n\t"
"vsel %%v0,%%v2,%%v0,%%v4 \n\t"
"ste %%f0,%1 \n\t"
"vlgvg %0,%%v1,0 \n\t"
"2: \n\t"
"nop "
:"=r"(imax),"=m"(*max)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v1","v2","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return imax;
} }
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0; BLASLONG i = 0;
BLASLONG j = 0; BLASLONG j = 0;
FLOAT maxf = 0.0; FLOAT maxf = 0.0;
BLASLONG max = 0; BLASLONG max = 0;
if (n <= 0 || inc_x <= 0) return (max); if (n <= 0 || inc_x <= 0)
return (max);
if (inc_x == 1) { if (inc_x == 1) {
BLASLONG n1 = n & -64; BLASLONG n1 = n & -64;
if (n1 > 0) { if (n1 > 0) {
max = ismax_kernel_64(n1, x, &maxf); max = ismax_kernel_64(n1, x, &maxf);
i = n1;
}
else
{
maxf = x[0];
i++;
}
while (i < n) {
if (x[i] > maxf) {
max = i;
maxf = x[i];
}
i++;
}
return (max + 1);
i = n1;
} else { } else {
maxf = x[0];
max = 0; i++;
maxf = x[0];
BLASLONG n1 = n & -4;
while (j < n1) {
if (x[i] > maxf) {
max = j;
maxf = x[i];
}
if (x[i + inc_x] > maxf) {
max = j + 1;
maxf = x[i + inc_x];
}
if (x[i + 2 * inc_x] > maxf) {
max = j + 2;
maxf = x[i + 2 * inc_x];
}
if (x[i + 3 * inc_x] > maxf) {
max = j + 3;
maxf = x[i + 3 * inc_x];
}
i += inc_x * 4;
j += 4;
}
while (j < n) {
if (x[i] > maxf) {
max = j;
maxf = x[i];
}
i += inc_x;
j++;
}
return (max + 1);
} }
while (i < n) {
if (x[i] > maxf) {
max = i;
maxf = x[i];
}
i++;
}
return (max + 1);
} else {
max = 0;
maxf = x[0];
BLASLONG n1 = n & -4;
while (j < n1) {
if (x[i] > maxf) {
max = j;
maxf = x[i];
}
if (x[i + inc_x] > maxf) {
max = j + 1;
maxf = x[i + inc_x];
}
if (x[i + 2 * inc_x] > maxf) {
max = j + 2;
maxf = x[i + 2 * inc_x];
}
if (x[i + 3 * inc_x] > maxf) {
max = j + 3;
maxf = x[i + 3 * inc_x];
}
i += inc_x * 4;
j += 4;
}
while (j < n) {
if (x[i] > maxf) {
max = j;
maxf = x[i];
}
i += inc_x;
j++;
}
return (max + 1);
}
} }

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -27,259 +27,243 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min) static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min) {
{ BLASLONG imin;
BLASLONG imin;
__asm__ volatile ( __asm__("vl %%v0,0(%[x])\n\t"
"vl %%v0,0(%3) \n\t" "vleig %%v1,0,0\n\t"
"vleig %%v1,0,0 \n\t" "vleig %%v1,2,1\n\t"
"vleig %%v1,2,1 \n\t" "vleig %%v2,1,0\n\t"
"vleig %%v2,1,0 \n\t" "vleig %%v2,3,1\n\t"
"vleig %%v2,3,1 \n\t" "vrepig %%v3,32\n\t"
"vrepig %%v3,32 \n\t" "vzero %%v4\n\t"
"vzero %%v4 \n\t" "vleif %%v24,0,0\n\t"
"vleif %%v24,0,0 \n\t" "vleif %%v24,1,1\n\t"
"vleif %%v24,1,1 \n\t" "vleif %%v24,2,2\n\t"
"vleif %%v24,2,2 \n\t" "vleif %%v24,3,3\n\t"
"vleif %%v24,3,3 \n\t" "vleif %%v25,4,0\n\t"
"vleif %%v25,4,0 \n\t" "vleif %%v25,5,1\n\t"
"vleif %%v25,5,1 \n\t" "vleif %%v25,6,2\n\t"
"vleif %%v25,6,2 \n\t" "vleif %%v25,7,3\n\t"
"vleif %%v25,7,3 \n\t" "vleif %%v26,8,0\n\t"
"vleif %%v26,8,0 \n\t" "vleif %%v26,9,1\n\t"
"vleif %%v26,9,1 \n\t" "vleif %%v26,10,2\n\t"
"vleif %%v26,10,2 \n\t" "vleif %%v26,11,3\n\t"
"vleif %%v26,11,3 \n\t" "vleif %%v27,12,0\n\t"
"vleif %%v27,12,0 \n\t" "vleif %%v27,13,1\n\t"
"vleif %%v27,13,1 \n\t" "vleif %%v27,14,2\n\t"
"vleif %%v27,14,2 \n\t" "vleif %%v27,15,3\n\t"
"vleif %%v27,15,3 \n\t" "vleif %%v28,16,0\n\t"
"vleif %%v28,16,0 \n\t" "vleif %%v28,17,1\n\t"
"vleif %%v28,17,1 \n\t" "vleif %%v28,18,2\n\t"
"vleif %%v28,18,2 \n\t" "vleif %%v28,19,3\n\t"
"vleif %%v28,19,3 \n\t" "vleif %%v29,20,0\n\t"
"vleif %%v29,20,0 \n\t" "vleif %%v29,21,1\n\t"
"vleif %%v29,21,1 \n\t" "vleif %%v29,22,2\n\t"
"vleif %%v29,22,2 \n\t" "vleif %%v29,23,3\n\t"
"vleif %%v29,23,3 \n\t" "vleif %%v30,24,0\n\t"
"vleif %%v30,24,0 \n\t" "vleif %%v30,25,1\n\t"
"vleif %%v30,25,1 \n\t" "vleif %%v30,26,2\n\t"
"vleif %%v30,26,2 \n\t" "vleif %%v30,27,3\n\t"
"vleif %%v30,27,3 \n\t" "vleif %%v31,28,0\n\t"
"vleif %%v31,28,0 \n\t" "vleif %%v31,29,1\n\t"
"vleif %%v31,29,1 \n\t" "vleif %%v31,30,2\n\t"
"vleif %%v31,30,2 \n\t" "vleif %%v31,31,3\n\t"
"vleif %%v31,31,3 \n\t" "srlg %[n],%[n],6\n\t"
"srlg %%r0,%2,6 \n\t" "xgr %%r1,%%r1\n\t"
"xgr %%r1,%%r1 \n\t" "0:\n\t"
"0: \n\t" "pfd 1, 1024(%%r1,%[x])\n\t"
"pfd 1, 1024(%%r1,%3) \n\t" "vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vfchesb %%v5,%%v17,%%v16\n\t"
"vfchesb %%v6,%%v19,%%v18\n\t"
"vfchesb %%v7,%%v21,%%v20\n\t"
"vfchesb %%v8,%%v23,%%v22\n\t"
"vsel %%v16,%%v16,%%v17,%%v5\n\t"
"vsel %%v5,%%v24,%%v25,%%v5\n\t"
"vsel %%v17,%%v18,%%v19,%%v6\n\t"
"vsel %%v6,%%v26,%%v27,%%v6\n\t"
"vsel %%v18,%%v20,%%v21,%%v7\n\t"
"vsel %%v7,%%v28,%%v29,%%v7\n\t"
"vsel %%v19,%%v22,%%v23,%%v8\n\t"
"vsel %%v8,%%v30,%%v31,%%v8\n\t"
"vfchesb %%v20,%%v17,%%v16\n\t"
"vfchesb %%v21,%%v19,%%v18\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v5,%%v5,%%v6,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v6,%%v7,%%v8,%%v21\n\t"
"vfchesb %%v18,%%v17,%%v16\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
"vsegf %%v6,%%v5\n\t"
"vesrlg %%v5,%%v5,32\n\t"
"vag %%v5,%%v5,%%v4\n\t"
"vag %%v6,%%v6,%%v4\n\t"
"vfchesb %%v7,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
"vsegf %%v8,%%v7\n\t"
"vesrlg %%v7,%%v7,32\n\t"
"vsegf %%v7,%%v7\n\t"
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v17,144(%%r1,%[x])\n\t"
"vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v19,176(%%r1,%[x])\n\t"
"vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v21,208(%%r1,%[x])\n\t"
"vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v23,240(%%r1,%[x])\n\t"
"vfchesb %%v5,%%v17,%%v16\n\t"
"vfchesb %%v6,%%v19,%%v18\n\t"
"vfchesb %%v7,%%v21,%%v20\n\t"
"vfchesb %%v8,%%v23,%%v22\n\t"
"vsel %%v16,%%v16,%%v17,%%v5\n\t"
"vsel %%v5,%%v24,%%v25,%%v5\n\t"
"vsel %%v17,%%v18,%%v19,%%v6\n\t"
"vsel %%v6,%%v26,%%v27,%%v6\n\t"
"vsel %%v18,%%v20,%%v21,%%v7\n\t"
"vsel %%v7,%%v28,%%v29,%%v7\n\t"
"vsel %%v19,%%v22,%%v23,%%v8\n\t"
"vsel %%v8,%%v30,%%v31,%%v8\n\t"
"vfchesb %%v20,%%v17,%%v16\n\t"
"vfchesb %%v21,%%v19,%%v18\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v5,%%v5,%%v6,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v6,%%v7,%%v8,%%v21\n\t"
"vfchesb %%v18,%%v17,%%v16\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
"vsegf %%v6,%%v5\n\t"
"vesrlg %%v5,%%v5,32\n\t"
"vag %%v5,%%v5,%%v4\n\t"
"vag %%v6,%%v6,%%v4\n\t"
"vfchesb %%v7,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
"vsegf %%v8,%%v7\n\t"
"vesrlg %%v7,%%v7,32\n\t"
"vsegf %%v7,%%v7\n\t"
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
"vag %%v4,%%v4,%%v3\n\t"
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
"veslg %%v3,%%v0,32\n\t"
"vfchsb %%v4,%%v3,%%v0\n\t"
"vchlg %%v5,%%v2,%%v1\n\t"
"vfcesb %%v6,%%v0,%%v3\n\t"
"vn %%v5,%%v5,%%v6\n\t"
"vo %%v4,%%v4,%%v5\n\t"
"vsel %%v0,%%v0,%%v3,%%v4\n\t"
"vesrlg %%v4,%%v4,32\n\t"
"vsegf %%v4,%%v4\n\t"
"vsel %%v1,%%v1,%%v2,%%v4\n\t"
"vrepf %%v2,%%v0,2\n\t"
"vrepg %%v3,%%v1,1\n\t"
"wfcsb %%v2,%%v0\n\t"
"jne 1f\n\t"
"vstef %%v0,%[min],0\n\t"
"vmnlg %%v0,%%v1,%%v3\n\t"
"vlgvg %[imin],%%v0,0\n\t"
"j 2f\n\t"
"1:\n\t"
"wfchsb %%v4,%%v0,%%v2\n\t"
"vesrlg %%v4,%%v4,32\n\t"
"vsegf %%v4,%%v4\n\t"
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
"ste %%f0,%[min]\n\t"
"vlgvg %[imin],%%v1,0\n\t"
"2:\n\t"
"nop"
: [imin] "=r"(imin),[min] "=Q"(*min),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");
"vl %%v16,0(%%r1,%3) \n\t" return imin;
"vl %%v17,16(%%r1,%3) \n\t"
"vl %%v18,32(%%r1,%3) \n\t"
"vl %%v19,48(%%r1,%3) \n\t"
"vl %%v20,64(%%r1,%3) \n\t"
"vl %%v21,80(%%r1,%3) \n\t"
"vl %%v22,96(%%r1,%3) \n\t"
"vl %%v23,112(%%r1,%3) \n\t"
"vfchesb %%v5,%%v17,%%v16 \n\t"
"vfchesb %%v6,%%v19,%%v18 \n\t"
"vfchesb %%v7,%%v21,%%v20 \n\t"
"vfchesb %%v8,%%v23,%%v22 \n\t"
"vsel %%v16,%%v16,%%v17,%%v5 \n\t"
"vsel %%v5,%%v24,%%v25,%%v5 \n\t"
"vsel %%v17,%%v18,%%v19,%%v6 \n\t"
"vsel %%v6,%%v26,%%v27,%%v6 \n\t"
"vsel %%v18,%%v20,%%v21,%%v7 \n\t"
"vsel %%v7,%%v28,%%v29,%%v7 \n\t"
"vsel %%v19,%%v22,%%v23,%%v8 \n\t"
"vsel %%v8,%%v30,%%v31,%%v8 \n\t"
"vfchesb %%v20,%%v17,%%v16 \n\t"
"vfchesb %%v21,%%v19,%%v18 \n\t"
"vsel %%v16,%%v16,%%v17,%%v20 \n\t"
"vsel %%v5,%%v5,%%v6,%%v20 \n\t"
"vsel %%v17,%%v18,%%v19,%%v21 \n\t"
"vsel %%v6,%%v7,%%v8,%%v21 \n\t"
"vfchesb %%v18,%%v17,%%v16 \n\t"
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vsel %%v5,%%v5,%%v6,%%v18 \n\t"
"vsegf %%v6,%%v5 \n\t"
"vesrlg %%v5,%%v5,32 \n\t"
"vag %%v5,%%v5,%%v4 \n\t"
"vag %%v6,%%v6,%%v4 \n\t"
"vfchesb %%v7,%%v16,%%v0 \n\t"
"vsel %%v0,%%v0,%%v16,%%v7 \n\t"
"vsegf %%v8,%%v7 \n\t"
"vesrlg %%v7,%%v7,32 \n\t"
"vsegf %%v7,%%v7 \n\t"
"vsel %%v1,%%v1,%%v5,%%v7 \n\t"
"vsel %%v2,%%v2,%%v6,%%v8 \n\t"
"vag %%v4,%%v4,%%v3 \n\t"
"vl %%v16,128(%%r1,%3) \n\t"
"vl %%v17,144(%%r1,%3) \n\t"
"vl %%v18,160(%%r1,%3) \n\t"
"vl %%v19,176(%%r1,%3) \n\t"
"vl %%v20,192(%%r1,%3) \n\t"
"vl %%v21,208(%%r1,%3) \n\t"
"vl %%v22,224(%%r1,%3) \n\t"
"vl %%v23,240(%%r1,%3) \n\t"
"vfchesb %%v5,%%v17,%%v16 \n\t"
"vfchesb %%v6,%%v19,%%v18 \n\t"
"vfchesb %%v7,%%v21,%%v20 \n\t"
"vfchesb %%v8,%%v23,%%v22 \n\t"
"vsel %%v16,%%v16,%%v17,%%v5 \n\t"
"vsel %%v5,%%v24,%%v25,%%v5 \n\t"
"vsel %%v17,%%v18,%%v19,%%v6 \n\t"
"vsel %%v6,%%v26,%%v27,%%v6 \n\t"
"vsel %%v18,%%v20,%%v21,%%v7 \n\t"
"vsel %%v7,%%v28,%%v29,%%v7 \n\t"
"vsel %%v19,%%v22,%%v23,%%v8 \n\t"
"vsel %%v8,%%v30,%%v31,%%v8 \n\t"
"vfchesb %%v20,%%v17,%%v16 \n\t"
"vfchesb %%v21,%%v19,%%v18 \n\t"
"vsel %%v16,%%v16,%%v17,%%v20 \n\t"
"vsel %%v5,%%v5,%%v6,%%v20 \n\t"
"vsel %%v17,%%v18,%%v19,%%v21 \n\t"
"vsel %%v6,%%v7,%%v8,%%v21 \n\t"
"vfchesb %%v18,%%v17,%%v16 \n\t"
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vsel %%v5,%%v5,%%v6,%%v18 \n\t"
"vsegf %%v6,%%v5 \n\t"
"vesrlg %%v5,%%v5,32 \n\t"
"vag %%v5,%%v5,%%v4 \n\t"
"vag %%v6,%%v6,%%v4 \n\t"
"vfchesb %%v7,%%v16,%%v0 \n\t"
"vsel %%v0,%%v0,%%v16,%%v7 \n\t"
"vsegf %%v8,%%v7 \n\t"
"vesrlg %%v7,%%v7,32 \n\t"
"vsegf %%v7,%%v7 \n\t"
"vsel %%v1,%%v1,%%v5,%%v7 \n\t"
"vsel %%v2,%%v2,%%v6,%%v8 \n\t"
"vag %%v4,%%v4,%%v3 \n\t"
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"
"veslg %%v3,%%v0,32 \n\t"
"vfchsb %%v4,%%v3,%%v0 \n\t"
"vchlg %%v5,%%v2,%%v1 \n\t"
"vfcesb %%v6,%%v0,%%v3 \n\t"
"vn %%v5,%%v5,%%v6 \n\t"
"vo %%v4,%%v4,%%v5 \n\t"
"vsel %%v0,%%v0,%%v3,%%v4 \n\t"
"vesrlg %%v4,%%v4,32 \n\t"
"vsegf %%v4,%%v4 \n\t"
"vsel %%v1,%%v1,%%v2,%%v4 \n\t"
"vrepf %%v2,%%v0,2 \n\t"
"vrepg %%v3,%%v1,1 \n\t"
"wfcsb %%v2,%%v0 \n\t"
"jne 1f \n\t"
"vstef %%v0,%1,0 \n\t"
"vmnlg %%v0,%%v1,%%v3 \n\t"
"vlgvg %0,%%v0,0 \n\t"
"j 2f \n\t"
"1: \n\t"
"wfchsb %%v4,%%v0,%%v2 \n\t"
"vesrlg %%v4,%%v4,32 \n\t"
"vsegf %%v4,%%v4 \n\t"
"vsel %%v1,%%v3,%%v1,%%v4 \n\t"
"vsel %%v0,%%v2,%%v0,%%v4 \n\t"
"ste %%f0,%1 \n\t"
"vlgvg %0,%%v1,0 \n\t"
"2: \n\t"
"nop "
:"=r"(imin),"=m"(*min)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v1","v2","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return imin;
} }
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0; BLASLONG i = 0;
BLASLONG j = 0; BLASLONG j = 0;
FLOAT minf = 0.0; FLOAT minf = 0.0;
BLASLONG min = 0; BLASLONG min = 0;
if (n <= 0 || inc_x <= 0) return (min); if (n <= 0 || inc_x <= 0)
return (min);
if (inc_x == 1) { if (inc_x == 1) {
BLASLONG n1 = n & -64; BLASLONG n1 = n & -64;
if (n1 > 0) { if (n1 > 0) {
min = ismin_kernel_64(n1, x, &minf); min = ismin_kernel_64(n1, x, &minf);
i = n1;
}
else
{
minf = x[0];
i++;
}
while (i < n) {
if (x[i] < minf) {
min = i;
minf = x[i];
}
i++;
}
return (min + 1);
i = n1;
} else { } else {
minf = x[0];
min = 0; i++;
minf = x[0];
BLASLONG n1 = n & -4;
while (j < n1) {
if (x[i] < minf) {
min = j;
minf = x[i];
}
if (x[i + inc_x] < minf) {
min = j + 1;
minf = x[i + inc_x];
}
if (x[i + 2 * inc_x] < minf) {
min = j + 2;
minf = x[i + 2 * inc_x];
}
if (x[i + 3 * inc_x] < minf) {
min = j + 3;
minf = x[i + 3 * inc_x];
}
i += inc_x * 4;
j += 4;
}
while (j < n) {
if (x[i] < minf) {
min = j;
minf = x[i];
}
i += inc_x;
j++;
}
return (min + 1);
} }
while (i < n) {
if (x[i] < minf) {
min = i;
minf = x[i];
}
i++;
}
return (min + 1);
} else {
min = 0;
minf = x[0];
BLASLONG n1 = n & -4;
while (j < n1) {
if (x[i] < minf) {
min = j;
minf = x[i];
}
if (x[i + inc_x] < minf) {
min = j + 1;
minf = x[i + inc_x];
}
if (x[i + 2 * inc_x] < minf) {
min = j + 2;
minf = x[i + 2 * inc_x];
}
if (x[i + 3 * inc_x] < minf) {
min = j + 3;
minf = x[i + 3 * inc_x];
}
i += inc_x * 4;
j += 4;
}
while (j < n) {
if (x[i] < minf) {
min = j;
minf = x[i];
}
i += inc_x;
j++;
}
return (min + 1);
}
} }

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2017, The OpenBLAS Project Copyright (c) 2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -27,220 +27,219 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>
/* |re| + |im| of the complex element whose real part is x[i]. */
#define CABS1(x,i) (fabs(x[i]) + fabs(x[i + 1]))

/*
 * Vector kernel: find the complex element with the largest |re| + |im|.
 *
 *   n     number of complex elements (x is read as n * 2 FLOATs).  The
 *         loop runs n >> 4 times and brctg decrements before testing,
 *         so n must be a positive multiple of 16.
 *   x     interleaved (re, im) pairs; loaded as 8-byte elements, i.e.
 *         this assumes FLOAT is double -- TODO confirm against build flags
 *   amax  receives the winning |re| + |im| value
 *
 * Returns the index of the winning element; the index vectors start at
 * 0, so the result is 0-based.  The '>=' compares keep the current best
 * and the final reduction takes the smaller index on equality, so ties
 * appear to resolve to the lowest index.
 */
static BLASLONG izamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amax) {
  BLASLONG iamax;

  __asm__(/* v0 = |re|+|im| of elements 0 and 1, v1 = their indices */
          "vleg %%v0,0(%[x]),0\n\t"
          "vleg %%v1,8(%[x]),0\n\t"
          "vleg %%v0,16(%[x]),1\n\t"
          "vleg %%v1,24(%[x]),1\n\t"
          "vflpdb %%v0,%%v0\n\t"
          "vflpdb %%v1,%%v1\n\t"
          "vfadb %%v0,%%v0,%%v1\n\t"
          "vleig %%v1,0,0\n\t"
          "vleig %%v1,1,1\n\t"
          /* v2 = index step per half iteration, v3 = running index base */
          "vrepig %%v2,8\n\t"
          "vzero %%v3\n\t"
          /* v24..v27 = local indices 0..7 within a half iteration */
          "vleig %%v24,0,0\n\t"
          "vleig %%v24,1,1\n\t"
          "vleig %%v25,2,0\n\t"
          "vleig %%v25,3,1\n\t"
          "vleig %%v26,4,0\n\t"
          "vleig %%v26,5,1\n\t"
          "vleig %%v27,6,0\n\t"
          "vleig %%v27,7,1\n\t"
          "srlg %[n],%[n],4\n\t"
          "xgr %%r1,%%r1\n\t"
          "0:\n\t"
          "pfd 1, 1024(%%r1,%[x])\n\t"
          /* first half: 8 complex elements, real parts in even regs */
          "vleg %%v16,0(%%r1,%[x]),0\n\t"
          "vleg %%v17,8(%%r1,%[x]),0\n\t"
          "vleg %%v16,16(%%r1,%[x]),1\n\t"
          "vleg %%v17,24(%%r1,%[x]),1\n\t"
          "vleg %%v18,32(%%r1,%[x]),0\n\t"
          "vleg %%v19,40(%%r1,%[x]),0\n\t"
          "vleg %%v18,48(%%r1,%[x]),1\n\t"
          "vleg %%v19,56(%%r1,%[x]),1\n\t"
          "vleg %%v20,64(%%r1,%[x]),0\n\t"
          "vleg %%v21,72(%%r1,%[x]),0\n\t"
          "vleg %%v20,80(%%r1,%[x]),1\n\t"
          "vleg %%v21,88(%%r1,%[x]),1\n\t"
          "vleg %%v22,96(%%r1,%[x]),0\n\t"
          "vleg %%v23,104(%%r1,%[x]),0\n\t"
          "vleg %%v22,112(%%r1,%[x]),1\n\t"
          "vleg %%v23,120(%%r1,%[x]),1\n\t"
          "vflpdb %%v16, %%v16\n\t"
          "vflpdb %%v17, %%v17\n\t"
          "vflpdb %%v18, %%v18\n\t"
          "vflpdb %%v19, %%v19\n\t"
          "vflpdb %%v20, %%v20\n\t"
          "vflpdb %%v21, %%v21\n\t"
          "vflpdb %%v22, %%v22\n\t"
          "vflpdb %%v23, %%v23\n\t"
          "vfadb %%v16,%%v16,%%v17\n\t"
          "vfadb %%v17,%%v18,%%v19\n\t"
          "vfadb %%v18,%%v20,%%v21\n\t"
          "vfadb %%v19,%%v22,%%v23\n\t"
          /* pairwise reduce values and their indices, then merge into
             the running best (v0 / v1) */
          "vfchedb %%v4,%%v16,%%v17\n\t"
          "vfchedb %%v5,%%v18,%%v19\n\t"
          "vsel %%v16,%%v16,%%v17,%%v4\n\t"
          "vsel %%v4,%%v24,%%v25,%%v4\n\t"
          "vsel %%v17,%%v18,%%v19,%%v5\n\t"
          "vsel %%v5,%%v26,%%v27,%%v5\n\t"
          "vfchedb %%v18,%%v16,%%v17\n\t"
          "vsel %%v16,%%v16,%%v17,%%v18\n\t"
          "vsel %%v4,%%v4,%%v5,%%v18\n\t"
          "vag %%v4,%%v4,%%v3\n\t"
          "vfchedb %%v5,%%v0,%%v16\n\t"
          "vsel %%v0,%%v0,%%v16,%%v5\n\t"
          "vsel %%v1,%%v1,%%v4,%%v5\n\t"
          "vag %%v3,%%v3,%%v2\n\t"
          /* second half: next 8 complex elements */
          "vleg %%v16,128(%%r1,%[x]),0\n\t"
          "vleg %%v17,136(%%r1,%[x]),0\n\t"
          "vleg %%v16,144(%%r1,%[x]),1\n\t"
          "vleg %%v17,152(%%r1,%[x]),1\n\t"
          "vleg %%v18,160(%%r1,%[x]),0\n\t"
          "vleg %%v19,168(%%r1,%[x]),0\n\t"
          "vleg %%v18,176(%%r1,%[x]),1\n\t"
          "vleg %%v19,184(%%r1,%[x]),1\n\t"
          "vleg %%v20,192(%%r1,%[x]),0\n\t"
          "vleg %%v21,200(%%r1,%[x]),0\n\t"
          "vleg %%v20,208(%%r1,%[x]),1\n\t"
          "vleg %%v21,216(%%r1,%[x]),1\n\t"
          "vleg %%v22,224(%%r1,%[x]),0\n\t"
          "vleg %%v23,232(%%r1,%[x]),0\n\t"
          "vleg %%v22,240(%%r1,%[x]),1\n\t"
          "vleg %%v23,248(%%r1,%[x]),1\n\t"
          "vflpdb %%v16, %%v16\n\t"
          "vflpdb %%v17, %%v17\n\t"
          "vflpdb %%v18, %%v18\n\t"
          "vflpdb %%v19, %%v19\n\t"
          "vflpdb %%v20, %%v20\n\t"
          "vflpdb %%v21, %%v21\n\t"
          "vflpdb %%v22, %%v22\n\t"
          "vflpdb %%v23, %%v23\n\t"
          "vfadb %%v16,%%v16,%%v17\n\t"
          "vfadb %%v17,%%v18,%%v19\n\t"
          "vfadb %%v18,%%v20,%%v21\n\t"
          "vfadb %%v19,%%v22,%%v23\n\t"
          "vfchedb %%v4,%%v16,%%v17\n\t"
          "vfchedb %%v5,%%v18,%%v19\n\t"
          "vsel %%v16,%%v16,%%v17,%%v4\n\t"
          "vsel %%v4,%%v24,%%v25,%%v4\n\t"
          "vsel %%v17,%%v18,%%v19,%%v5\n\t"
          "vsel %%v5,%%v26,%%v27,%%v5\n\t"
          "vfchedb %%v18,%%v16,%%v17\n\t"
          "vsel %%v16,%%v16,%%v17,%%v18\n\t"
          "vsel %%v4,%%v4,%%v5,%%v18\n\t"
          "vag %%v4,%%v4,%%v3\n\t"
          "vfchedb %%v5,%%v0,%%v16\n\t"
          "vsel %%v0,%%v0,%%v16,%%v5\n\t"
          "vsel %%v1,%%v1,%%v4,%%v5\n\t"
          "vag %%v3,%%v3,%%v2\n\t"
          /* advance 256 bytes (16 complex elements) and loop */
          "agfi %%r1, 256\n\t"
          "brctg %[n], 0b\n\t"
          /* reduce the two lanes of v0 / v1 to one value and one index */
          "vrepg %%v2,%%v0,1\n\t"
          "vrepg %%v3,%%v1,1\n\t"
          "wfcdb %%v2,%%v0\n\t"
          "jne 1f\n\t"
          /* lanes equal: store the value, take the smaller index */
          "vsteg %%v0,%[amax],0\n\t"
          "vmnlg %%v0,%%v1,%%v3\n\t"
          "vlgvg %[iamax],%%v0,0\n\t"
          "j 2f\n\t"
          "1:\n\t"
          /* lanes differ: pick the lane holding the larger value */
          "wfchdb %%v4,%%v2,%%v0\n\t"
          "vsel %%v1,%%v3,%%v1,%%v4\n\t"
          "vsel %%v0,%%v2,%%v0,%%v4\n\t"
          "std %%f0,%[amax]\n\t"
          "vlgvg %[iamax],%%v1,0\n\t"
          "2:\n\t"
          "nop"
          : [iamax] "=r"(iamax),[amax] "=Q"(*amax),[n] "+&r"(n)
          : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
          : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17",
             "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
             "v27");

  return iamax;
}
/*
 * Return the 1-based position of the complex element of x with the
 * largest |re| + |im| (see CABS1).
 *
 *   n      number of complex elements
 *   x      interleaved (re, im) pairs
 *   inc_x  stride between consecutive elements, in complex elements
 *
 * Returns 0 when n <= 0 or inc_x <= 0.  The scalar loops use a strict
 * '>' comparison, so among equal values the earliest one they see wins.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
  BLASLONG i = 0;
  BLASLONG ix = 0;
  FLOAT maxf = 0;
  BLASLONG max = 0;
  BLASLONG inc_x2;

  if (n <= 0 || inc_x <= 0)
    return (max);

  if (inc_x == 1) {

    /* Largest multiple of 16 not exceeding n goes to the vector kernel. */
    BLASLONG n1 = n & -16;
    if (n1 > 0) {

      max = izamax_kernel_16(n1, x, &maxf);
      /* ix indexes FLOATs (two per element), i indexes elements. */
      ix = n1 * 2;
      i = n1;
    } else {
      /* Fewer than 16 elements: seed the scan with element 0. */
      maxf = CABS1(x, 0);
      ix += 2;
      i++;
    }

    /* Scalar tail for the remaining elements. */
    while (i < n) {
      if (CABS1(x, ix) > maxf) {
        max = i;
        maxf = CABS1(x, ix);
      }
      ix += 2;
      i++;
    }
    return (max + 1);

  } else {

    max = 0;
    maxf = CABS1(x, 0);
    inc_x2 = 2 * inc_x; /* stride expressed in FLOATs */

    /* Main loop unrolled four elements at a time. */
    BLASLONG n1 = n & -4;
    while (i < n1) {

      if (CABS1(x, ix) > maxf) {
        max = i;
        maxf = CABS1(x, ix);
      }
      if (CABS1(x, ix + inc_x2) > maxf) {
        max = i + 1;
        maxf = CABS1(x, ix + inc_x2);
      }
      if (CABS1(x, ix + 2 * inc_x2) > maxf) {
        max = i + 2;
        maxf = CABS1(x, ix + 2 * inc_x2);
      }
      if (CABS1(x, ix + 3 * inc_x2) > maxf) {
        max = i + 3;
        maxf = CABS1(x, ix + 3 * inc_x2);
      }

      ix += inc_x2 * 4;

      i += 4;

    }

    /* Up to three leftover elements. */
    while (i < n) {
      if (CABS1(x, ix) > maxf) {
        max = i;
        maxf = CABS1(x, ix);
      }
      ix += inc_x2;
      i++;
    }
    return (max + 1);
  }
}

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2017, The OpenBLAS Project Copyright (c) 2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -27,220 +27,219 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>
/* |re| + |im| of the complex element whose real part is x[i]. */
#define CABS1(x,i) (fabs(x[i]) + fabs(x[i + 1]))

/*
 * Vector kernel: find the complex element with the smallest |re| + |im|.
 *
 *   n     number of complex elements (x is read as n * 2 FLOATs).  The
 *         loop runs n >> 4 times and brctg decrements before testing,
 *         so n must be a positive multiple of 16.
 *   x     interleaved (re, im) pairs; loaded as 8-byte elements, i.e.
 *         this assumes FLOAT is double -- TODO confirm against build flags
 *   amin  receives the winning |re| + |im| value
 *
 * Returns the index of the winning element; the index vectors start at
 * 0, so the result is 0-based.  The compares keep the current best on
 * equality and the final reduction takes the smaller index when both
 * lanes hold the same value, so ties appear to resolve to the lowest
 * index.
 */
static BLASLONG izamin_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amin) {
  BLASLONG iamin;

  __asm__(/* v0 = |re|+|im| of elements 0 and 1, v1 = their indices */
          "vleg %%v0,0(%[x]),0\n\t"
          "vleg %%v1,8(%[x]),0\n\t"
          "vleg %%v0,16(%[x]),1\n\t"
          "vleg %%v1,24(%[x]),1\n\t"
          "vflpdb %%v0,%%v0\n\t"
          "vflpdb %%v1,%%v1\n\t"
          "vfadb %%v0,%%v0,%%v1\n\t"
          "vleig %%v1,0,0\n\t"
          "vleig %%v1,1,1\n\t"
          /* v2 = index step per half iteration, v3 = running index base */
          "vrepig %%v2,8\n\t"
          "vzero %%v3\n\t"
          /* v24..v27 = local indices 0..7 within a half iteration */
          "vleig %%v24,0,0\n\t"
          "vleig %%v24,1,1\n\t"
          "vleig %%v25,2,0\n\t"
          "vleig %%v25,3,1\n\t"
          "vleig %%v26,4,0\n\t"
          "vleig %%v26,5,1\n\t"
          "vleig %%v27,6,0\n\t"
          "vleig %%v27,7,1\n\t"
          "srlg %[n],%[n],4\n\t"
          "xgr %%r1,%%r1\n\t"
          "0:\n\t"
          "pfd 1, 1024(%%r1,%[x])\n\t"
          /* first half: 8 complex elements, real parts in even regs */
          "vleg %%v16,0(%%r1,%[x]),0\n\t"
          "vleg %%v17,8(%%r1,%[x]),0\n\t"
          "vleg %%v16,16(%%r1,%[x]),1\n\t"
          "vleg %%v17,24(%%r1,%[x]),1\n\t"
          "vleg %%v18,32(%%r1,%[x]),0\n\t"
          "vleg %%v19,40(%%r1,%[x]),0\n\t"
          "vleg %%v18,48(%%r1,%[x]),1\n\t"
          "vleg %%v19,56(%%r1,%[x]),1\n\t"
          "vleg %%v20,64(%%r1,%[x]),0\n\t"
          "vleg %%v21,72(%%r1,%[x]),0\n\t"
          "vleg %%v20,80(%%r1,%[x]),1\n\t"
          "vleg %%v21,88(%%r1,%[x]),1\n\t"
          "vleg %%v22,96(%%r1,%[x]),0\n\t"
          "vleg %%v23,104(%%r1,%[x]),0\n\t"
          "vleg %%v22,112(%%r1,%[x]),1\n\t"
          "vleg %%v23,120(%%r1,%[x]),1\n\t"
          "vflpdb %%v16, %%v16\n\t"
          "vflpdb %%v17, %%v17\n\t"
          "vflpdb %%v18, %%v18\n\t"
          "vflpdb %%v19, %%v19\n\t"
          "vflpdb %%v20, %%v20\n\t"
          "vflpdb %%v21, %%v21\n\t"
          "vflpdb %%v22, %%v22\n\t"
          "vflpdb %%v23, %%v23\n\t"
          "vfadb %%v16,%%v16,%%v17\n\t"
          "vfadb %%v17,%%v18,%%v19\n\t"
          "vfadb %%v18,%%v20,%%v21\n\t"
          "vfadb %%v19,%%v22,%%v23\n\t"
          /* pairwise reduce values and their indices (operands swapped
             relative to the max kernel), then merge into v0 / v1 */
          "vfchedb %%v4,%%v17,%%v16\n\t"
          "vfchedb %%v5,%%v19,%%v18\n\t"
          "vsel %%v16,%%v16,%%v17,%%v4\n\t"
          "vsel %%v4,%%v24,%%v25,%%v4\n\t"
          "vsel %%v17,%%v18,%%v19,%%v5\n\t"
          "vsel %%v5,%%v26,%%v27,%%v5\n\t"
          "vfchedb %%v18,%%v17,%%v16\n\t"
          "vsel %%v16,%%v16,%%v17,%%v18\n\t"
          "vsel %%v4,%%v4,%%v5,%%v18\n\t"
          "vag %%v4,%%v4,%%v3\n\t"
          "vfchedb %%v5,%%v16,%%v0\n\t"
          "vsel %%v0,%%v0,%%v16,%%v5\n\t"
          "vsel %%v1,%%v1,%%v4,%%v5\n\t"
          "vag %%v3,%%v3,%%v2\n\t"
          /* second half: next 8 complex elements */
          "vleg %%v16,128(%%r1,%[x]),0\n\t"
          "vleg %%v17,136(%%r1,%[x]),0\n\t"
          "vleg %%v16,144(%%r1,%[x]),1\n\t"
          "vleg %%v17,152(%%r1,%[x]),1\n\t"
          "vleg %%v18,160(%%r1,%[x]),0\n\t"
          "vleg %%v19,168(%%r1,%[x]),0\n\t"
          "vleg %%v18,176(%%r1,%[x]),1\n\t"
          "vleg %%v19,184(%%r1,%[x]),1\n\t"
          "vleg %%v20,192(%%r1,%[x]),0\n\t"
          "vleg %%v21,200(%%r1,%[x]),0\n\t"
          "vleg %%v20,208(%%r1,%[x]),1\n\t"
          "vleg %%v21,216(%%r1,%[x]),1\n\t"
          "vleg %%v22,224(%%r1,%[x]),0\n\t"
          "vleg %%v23,232(%%r1,%[x]),0\n\t"
          "vleg %%v22,240(%%r1,%[x]),1\n\t"
          "vleg %%v23,248(%%r1,%[x]),1\n\t"
          "vflpdb %%v16, %%v16\n\t"
          "vflpdb %%v17, %%v17\n\t"
          "vflpdb %%v18, %%v18\n\t"
          "vflpdb %%v19, %%v19\n\t"
          "vflpdb %%v20, %%v20\n\t"
          "vflpdb %%v21, %%v21\n\t"
          "vflpdb %%v22, %%v22\n\t"
          "vflpdb %%v23, %%v23\n\t"
          "vfadb %%v16,%%v16,%%v17\n\t"
          "vfadb %%v17,%%v18,%%v19\n\t"
          "vfadb %%v18,%%v20,%%v21\n\t"
          "vfadb %%v19,%%v22,%%v23\n\t"
          "vfchedb %%v4,%%v17,%%v16\n\t"
          "vfchedb %%v5,%%v19,%%v18\n\t"
          "vsel %%v16,%%v16,%%v17,%%v4\n\t"
          "vsel %%v4,%%v24,%%v25,%%v4\n\t"
          "vsel %%v17,%%v18,%%v19,%%v5\n\t"
          "vsel %%v5,%%v26,%%v27,%%v5\n\t"
          "vfchedb %%v18,%%v17,%%v16\n\t"
          "vsel %%v16,%%v16,%%v17,%%v18\n\t"
          "vsel %%v4,%%v4,%%v5,%%v18\n\t"
          "vag %%v4,%%v4,%%v3\n\t"
          "vfchedb %%v5,%%v16,%%v0\n\t"
          "vsel %%v0,%%v0,%%v16,%%v5\n\t"
          "vsel %%v1,%%v1,%%v4,%%v5\n\t"
          "vag %%v3,%%v3,%%v2\n\t"
          /* advance 256 bytes (16 complex elements) and loop */
          "agfi %%r1, 256\n\t"
          "brctg %[n], 0b\n\t"
          /* reduce the two lanes of v0 / v1 to one value and one index */
          "vrepg %%v2,%%v0,1\n\t"
          "vrepg %%v3,%%v1,1\n\t"
          "wfcdb %%v2,%%v0\n\t"
          "jne 1f\n\t"
          /* lanes equal: store the value, take the smaller index */
          "vsteg %%v0,%[amin],0\n\t"
          "vmnlg %%v0,%%v1,%%v3\n\t"
          "vlgvg %[iamin],%%v0,0\n\t"
          "j 2f\n\t"
          "1:\n\t"
          /* lanes differ: pick the lane holding the smaller value */
          "wfchdb %%v4,%%v0,%%v2\n\t"
          "vsel %%v1,%%v3,%%v1,%%v4\n\t"
          "vsel %%v0,%%v2,%%v0,%%v4\n\t"
          "std %%f0,%[amin]\n\t"
          "vlgvg %[iamin],%%v1,0\n\t"
          "2:\n\t"
          "nop"
          : [iamin] "=r"(iamin),[amin] "=Q"(*amin),[n] "+&r"(n)
          : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
          : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17",
             "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
             "v27");

  return iamin;
}
/*
 * Return the 1-based position of the complex element of x with the
 * smallest |re| + |im| (see CABS1).
 *
 *   n      number of complex elements
 *   x      interleaved (re, im) pairs
 *   inc_x  stride between consecutive elements, in complex elements
 *
 * Returns 0 when n <= 0 or inc_x <= 0.  The scalar loops use a strict
 * '<' comparison, so among equal values the earliest one they see wins.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
  BLASLONG i = 0;
  BLASLONG ix = 0;
  FLOAT minf = 0;
  BLASLONG min = 0;
  BLASLONG inc_x2;

  if (n <= 0 || inc_x <= 0)
    return (min);

  if (inc_x == 1) {

    /* Largest multiple of 16 not exceeding n goes to the vector kernel. */
    BLASLONG n1 = n & -16;
    if (n1 > 0) {

      min = izamin_kernel_16(n1, x, &minf);
      /* ix indexes FLOATs (two per element), i indexes elements. */
      ix = n1 * 2;
      i = n1;
    } else {
      /* Fewer than 16 elements: seed the scan with element 0. */
      minf = CABS1(x, 0);
      ix += 2;
      i++;
    }

    /* Scalar tail for the remaining elements. */
    while (i < n) {
      if (CABS1(x, ix) < minf) {
        min = i;
        minf = CABS1(x, ix);
      }
      ix += 2;
      i++;
    }
    return (min + 1);

  } else {

    min = 0;
    minf = CABS1(x, 0);
    inc_x2 = 2 * inc_x; /* stride expressed in FLOATs */

    /* Main loop unrolled four elements at a time. */
    BLASLONG n1 = n & -4;
    while (i < n1) {

      if (CABS1(x, ix) < minf) {
        min = i;
        minf = CABS1(x, ix);
      }
      if (CABS1(x, ix + inc_x2) < minf) {
        min = i + 1;
        minf = CABS1(x, ix + inc_x2);
      }
      if (CABS1(x, ix + 2 * inc_x2) < minf) {
        min = i + 2;
        minf = CABS1(x, ix + 2 * inc_x2);
      }
      if (CABS1(x, ix + 3 * inc_x2) < minf) {
        min = i + 3;
        minf = CABS1(x, ix + 3 * inc_x2);
      }

      ix += inc_x2 * 4;

      i += 4;

    }

    /* Up to three leftover elements. */
    while (i < n) {
      if (CABS1(x, ix) < minf) {
        min = i;
        minf = CABS1(x, ix);
      }
      ix += inc_x2;
      i++;
    }
    return (min + 1);
  }
}

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -28,142 +28,125 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>
/* Absolute value used by the scalar loops of this file. */
#define ABS fabsf

/*
 * Vector kernel: largest absolute value among the first n elements of x.
 *
 *   n  number of elements.  The loop runs n >> 6 times and brctg
 *      decrements before testing, so n must be a positive multiple
 *      of 64.
 *   x  input vector; each iteration consumes 256 bytes, which matches
 *      64 elements only when FLOAT is a 4-byte float -- TODO confirm
 *
 * The vfmaxsb / wfmaxsb instructions are issued with function code 8,
 * presumably the maximum-of-magnitudes variant -- verify against the
 * z/Architecture Principles of Operation.  The final lper loads the
 * positive (absolute) value of the reduced result.
 */
static FLOAT samax_kernel_64(BLASLONG n, FLOAT *x) {
  FLOAT amax;

  __asm__(/* seed the running maximum with the first vector of x */
          "vl %%v0,0(%[x])\n\t"
          "srlg %[n],%[n],6\n\t"
          "xgr %%r1,%%r1\n\t"
          "0:\n\t"
          "pfd 1, 1024(%%r1,%[x])\n\t"
          /* load 16 vectors (256 bytes) */
          "vl %%v16,0(%%r1,%[x])\n\t"
          "vl %%v17,16(%%r1,%[x])\n\t"
          "vl %%v18,32(%%r1,%[x])\n\t"
          "vl %%v19,48(%%r1,%[x])\n\t"
          "vl %%v20,64(%%r1,%[x])\n\t"
          "vl %%v21,80(%%r1,%[x])\n\t"
          "vl %%v22,96(%%r1,%[x])\n\t"
          "vl %%v23,112(%%r1,%[x])\n\t"
          "vl %%v24,128(%%r1,%[x])\n\t"
          "vl %%v25,144(%%r1,%[x])\n\t"
          "vl %%v26,160(%%r1,%[x])\n\t"
          "vl %%v27,176(%%r1,%[x])\n\t"
          "vl %%v28,192(%%r1,%[x])\n\t"
          "vl %%v29,208(%%r1,%[x])\n\t"
          "vl %%v30,224(%%r1,%[x])\n\t"
          "vl %%v31,240(%%r1,%[x])\n\t"
          /* tree reduction 16 -> 8 -> 4 -> 2 -> 1, then merge into v0 */
          "vfmaxsb %%v16,%%v16,%%v24,8\n\t"
          "vfmaxsb %%v17,%%v17,%%v25,8\n\t"
          "vfmaxsb %%v18,%%v18,%%v26,8\n\t"
          "vfmaxsb %%v19,%%v19,%%v27,8\n\t"
          "vfmaxsb %%v20,%%v20,%%v28,8\n\t"
          "vfmaxsb %%v21,%%v21,%%v29,8\n\t"
          "vfmaxsb %%v22,%%v22,%%v30,8\n\t"
          "vfmaxsb %%v23,%%v23,%%v31,8\n\t"
          "vfmaxsb %%v16,%%v16,%%v20,8\n\t"
          "vfmaxsb %%v17,%%v17,%%v21,8\n\t"
          "vfmaxsb %%v18,%%v18,%%v22,8\n\t"
          "vfmaxsb %%v19,%%v19,%%v23,8\n\t"
          "vfmaxsb %%v16,%%v16,%%v18,8\n\t"
          "vfmaxsb %%v17,%%v17,%%v19,8\n\t"
          "vfmaxsb %%v16,%%v16,%%v17,8\n\t"
          "vfmaxsb %%v0,%%v0,%%v16,8\n\t"
          "agfi %%r1, 256\n\t"
          "brctg %[n], 0b\n\t"
          /* horizontal reduction of the four lanes of v0 */
          "veslg %%v16,%%v0,32\n\t"
          "vfmaxsb %%v0,%%v0,%%v16,8\n\t"
          "vrepf %%v16,%%v0,2\n\t"
          "wfmaxsb %%v0,%%v0,%%v16,8\n\t"
          "lper %[amax],%%f0"
          : [amax] "=f"(amax),[n] "+&r"(n)
          : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
          : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21",
             "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
             "v31");

  return amax;
}
/*
 * Return the largest absolute value found in x.
 *
 *   n      number of elements to examine
 *   x      input vector
 *   inc_x  stride between consecutive elements, in units of FLOAT
 *
 * Returns 0.0 when n <= 0 or inc_x <= 0.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
  BLASLONG i = 0;
  BLASLONG j = 0;
  FLOAT maxf = 0.0;

  if (n <= 0 || inc_x <= 0)
    return (maxf);

  if (inc_x == 1) {

    /* Largest multiple of 64 not exceeding n goes to the vector kernel. */
    BLASLONG n1 = n & -64;
    if (n1 > 0) {

      maxf = samax_kernel_64(n1, x);
      i = n1;
    } else {
      /* Fewer than 64 elements: seed the scan with element 0. */
      maxf = ABS(x[0]);
      i++;
    }

    /* Scalar tail for the remaining n - n1 elements. */
    while (i < n) {
      if (ABS(x[i]) > maxf) {
        maxf = ABS(x[i]);
      }
      i++;
    }
    return (maxf);

  } else {

    /* Strided input: i is the offset into x, j counts elements. */
    maxf = ABS(x[0]);

    /* Main loop unrolled four elements at a time. */
    BLASLONG n1 = n & -4;
    while (j < n1) {

      if (ABS(x[i]) > maxf) {
        maxf = ABS(x[i]);
      }
      if (ABS(x[i + inc_x]) > maxf) {
        maxf = ABS(x[i + inc_x]);
      }
      if (ABS(x[i + 2 * inc_x]) > maxf) {
        maxf = ABS(x[i + 2 * inc_x]);
      }
      if (ABS(x[i + 3 * inc_x]) > maxf) {
        maxf = ABS(x[i + 3 * inc_x]);
      }

      i += inc_x * 4;

      j += 4;

    }

    /* Up to three leftover elements. */
    while (j < n) {
      if (ABS(x[i]) > maxf) {
        maxf = ABS(x[i]);
      }
      i += inc_x;
      j++;
    }
    return (maxf);
  }
}

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -28,142 +28,125 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>
/* Absolute value used by the scalar loops of this file. */
#define ABS fabsf

/*
 * Vector kernel: smallest absolute value among the first n elements of x.
 *
 *   n  number of elements.  The loop runs n >> 6 times and brctg
 *      decrements before testing, so n must be a positive multiple
 *      of 64.
 *   x  input vector; each iteration consumes 256 bytes, which matches
 *      64 elements only when FLOAT is a 4-byte float -- TODO confirm
 *
 * The vfminsb / wfminsb instructions are issued with function code 8,
 * presumably the minimum-of-magnitudes variant -- verify against the
 * z/Architecture Principles of Operation.  The final lper loads the
 * positive (absolute) value of the reduced result.
 */
static FLOAT samin_kernel_64(BLASLONG n, FLOAT *x) {
  FLOAT amin;

  __asm__(/* seed the running minimum with the first vector of x */
          "vl %%v0,0(%[x])\n\t"
          "srlg %[n],%[n],6\n\t"
          "xgr %%r1,%%r1\n\t"
          "0:\n\t"
          "pfd 1, 1024(%%r1,%[x])\n\t"
          /* load 16 vectors (256 bytes) */
          "vl %%v16,0(%%r1,%[x])\n\t"
          "vl %%v17,16(%%r1,%[x])\n\t"
          "vl %%v18,32(%%r1,%[x])\n\t"
          "vl %%v19,48(%%r1,%[x])\n\t"
          "vl %%v20,64(%%r1,%[x])\n\t"
          "vl %%v21,80(%%r1,%[x])\n\t"
          "vl %%v22,96(%%r1,%[x])\n\t"
          "vl %%v23,112(%%r1,%[x])\n\t"
          "vl %%v24,128(%%r1,%[x])\n\t"
          "vl %%v25,144(%%r1,%[x])\n\t"
          "vl %%v26,160(%%r1,%[x])\n\t"
          "vl %%v27,176(%%r1,%[x])\n\t"
          "vl %%v28,192(%%r1,%[x])\n\t"
          "vl %%v29,208(%%r1,%[x])\n\t"
          "vl %%v30,224(%%r1,%[x])\n\t"
          "vl %%v31,240(%%r1,%[x])\n\t"
          /* tree reduction 16 -> 8 -> 4 -> 2 -> 1, then merge into v0 */
          "vfminsb %%v16,%%v16,%%v24,8\n\t"
          "vfminsb %%v17,%%v17,%%v25,8\n\t"
          "vfminsb %%v18,%%v18,%%v26,8\n\t"
          "vfminsb %%v19,%%v19,%%v27,8\n\t"
          "vfminsb %%v20,%%v20,%%v28,8\n\t"
          "vfminsb %%v21,%%v21,%%v29,8\n\t"
          "vfminsb %%v22,%%v22,%%v30,8\n\t"
          "vfminsb %%v23,%%v23,%%v31,8\n\t"
          "vfminsb %%v16,%%v16,%%v20,8\n\t"
          "vfminsb %%v17,%%v17,%%v21,8\n\t"
          "vfminsb %%v18,%%v18,%%v22,8\n\t"
          "vfminsb %%v19,%%v19,%%v23,8\n\t"
          "vfminsb %%v16,%%v16,%%v18,8\n\t"
          "vfminsb %%v17,%%v17,%%v19,8\n\t"
          "vfminsb %%v16,%%v16,%%v17,8\n\t"
          "vfminsb %%v0,%%v0,%%v16,8\n\t"
          "agfi %%r1, 256\n\t"
          "brctg %[n], 0b\n\t"
          /* horizontal reduction of the four lanes of v0 */
          "veslg %%v16,%%v0,32\n\t"
          "vfminsb %%v0,%%v0,%%v16,8\n\t"
          "vrepf %%v16,%%v0,2\n\t"
          "wfminsb %%v0,%%v0,%%v16,8\n\t"
          "lper %[amin],%%f0"
          : [amin] "=f"(amin),[n] "+&r"(n)
          : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
          : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21",
             "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
             "v31");

  return amin;
}
/*
 * Return the smallest absolute value found in x.
 *
 *   n      number of elements to examine
 *   x      input vector
 *   inc_x  stride between consecutive elements, in units of FLOAT
 *
 * Returns 0.0 when n <= 0 or inc_x <= 0.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
  BLASLONG i = 0;
  BLASLONG j = 0;
  FLOAT minf = 0.0;

  if (n <= 0 || inc_x <= 0)
    return (minf);

  if (inc_x == 1) {

    /* Largest multiple of 64 not exceeding n goes to the vector kernel. */
    BLASLONG n1 = n & -64;
    if (n1 > 0) {

      minf = samin_kernel_64(n1, x);
      i = n1;
    } else {
      /* Fewer than 64 elements: seed the scan with element 0. */
      minf = ABS(x[0]);
      i++;
    }

    /* Scalar tail for the remaining n - n1 elements. */
    while (i < n) {
      if (ABS(x[i]) < minf) {
        minf = ABS(x[i]);
      }
      i++;
    }
    return (minf);

  } else {

    /* Strided input: i is the offset into x, j counts elements. */
    minf = ABS(x[0]);

    /* Main loop unrolled four elements at a time. */
    BLASLONG n1 = n & -4;
    while (j < n1) {

      if (ABS(x[i]) < minf) {
        minf = ABS(x[i]);
      }
      if (ABS(x[i + inc_x]) < minf) {
        minf = ABS(x[i + inc_x]);
      }
      if (ABS(x[i + 2 * inc_x]) < minf) {
        minf = ABS(x[i + 2 * inc_x]);
      }
      if (ABS(x[i + 3 * inc_x]) < minf) {
        minf = ABS(x[i + 3 * inc_x]);
      }

      i += inc_x * 4;

      j += 4;

    }

    /* Up to three leftover elements. */
    while (j < n) {
      if (ABS(x[i]) < minf) {
        minf = ABS(x[i]);
      }
      i += inc_x;
      j++;
    }
    return (minf);
  }
}

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -28,147 +28,141 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>
#if defined(DOUBLE) #define ABS fabsf
#define ABS fabs
#else
#define ABS fabsf
#endif
static FLOAT sasum_kernel_64(BLASLONG n, FLOAT *x) static FLOAT sasum_kernel_64(BLASLONG n, FLOAT *x) {
{ FLOAT asum;
FLOAT asum;
__asm__ ( __asm__("vzero %%v24\n\t"
"vzero %%v0 \n\t" "vzero %%v25\n\t"
"vzero %%v1 \n\t" "vzero %%v26\n\t"
"vzero %%v2 \n\t" "vzero %%v27\n\t"
"vzero %%v3 \n\t" "vzero %%v28\n\t"
"srlg %%r0,%1,6 \n\t" "vzero %%v29\n\t"
"xgr %%r1,%%r1 \n\t" "vzero %%v30\n\t"
"0: \n\t" "vzero %%v31\n\t"
"pfd 1, 1024(%%r1,%2) \n\t" "srlg %[n],%[n],6\n\t"
"vl %%v16, 0(%%r1,%2) \n\t" "xgr %%r1,%%r1\n\t"
"vl %%v17, 16(%%r1,%2) \n\t" "0:\n\t"
"vl %%v18, 32(%%r1,%2) \n\t" "pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v19, 48(%%r1,%2) \n\t" "vl %%v16, 0(%%r1,%[x])\n\t"
"vl %%v20, 64(%%r1,%2) \n\t" "vl %%v17, 16(%%r1,%[x])\n\t"
"vl %%v21, 80(%%r1,%2) \n\t" "vl %%v18, 32(%%r1,%[x])\n\t"
"vl %%v22, 96(%%r1,%2) \n\t" "vl %%v19, 48(%%r1,%[x])\n\t"
"vl %%v23, 112(%%r1,%2) \n\t" "vl %%v20, 64(%%r1,%[x])\n\t"
"vl %%v21, 80(%%r1,%[x])\n\t"
"vl %%v22, 96(%%r1,%[x])\n\t"
"vl %%v23, 112(%%r1,%[x])\n\t"
"vflpsb %%v16, %%v16\n\t"
"vflpsb %%v17, %%v17\n\t"
"vflpsb %%v18, %%v18\n\t"
"vflpsb %%v19, %%v19\n\t"
"vflpsb %%v20, %%v20\n\t"
"vflpsb %%v21, %%v21\n\t"
"vflpsb %%v22, %%v22\n\t"
"vflpsb %%v23, %%v23\n\t"
"vfasb %%v24,%%v24,%%v16\n\t"
"vfasb %%v25,%%v25,%%v17\n\t"
"vfasb %%v26,%%v26,%%v18\n\t"
"vfasb %%v27,%%v27,%%v19\n\t"
"vfasb %%v28,%%v28,%%v20\n\t"
"vfasb %%v29,%%v29,%%v21\n\t"
"vfasb %%v30,%%v30,%%v22\n\t"
"vfasb %%v31,%%v31,%%v23\n\t"
"vl %%v16, 128(%%r1,%[x])\n\t"
"vl %%v17, 144(%%r1,%[x])\n\t"
"vl %%v18, 160(%%r1,%[x])\n\t"
"vl %%v19, 176(%%r1,%[x])\n\t"
"vl %%v20, 192(%%r1,%[x])\n\t"
"vl %%v21, 208(%%r1,%[x])\n\t"
"vl %%v22, 224(%%r1,%[x])\n\t"
"vl %%v23, 240(%%r1,%[x])\n\t"
"vflpsb %%v16, %%v16\n\t"
"vflpsb %%v17, %%v17\n\t"
"vflpsb %%v18, %%v18\n\t"
"vflpsb %%v19, %%v19\n\t"
"vflpsb %%v20, %%v20\n\t"
"vflpsb %%v21, %%v21\n\t"
"vflpsb %%v22, %%v22\n\t"
"vflpsb %%v23, %%v23\n\t"
"vfasb %%v24,%%v24,%%v16\n\t"
"vfasb %%v25,%%v25,%%v17\n\t"
"vfasb %%v26,%%v26,%%v18\n\t"
"vfasb %%v27,%%v27,%%v19\n\t"
"vfasb %%v28,%%v28,%%v20\n\t"
"vfasb %%v29,%%v29,%%v21\n\t"
"vfasb %%v30,%%v30,%%v22\n\t"
"vfasb %%v31,%%v31,%%v23\n\t"
"agfi %%r1,256\n\t"
"brctg %[n],0b\n\t"
"vfasb %%v24,%%v24,%%v25\n\t"
"vfasb %%v24,%%v24,%%v26\n\t"
"vfasb %%v24,%%v24,%%v27\n\t"
"vfasb %%v24,%%v24,%%v28\n\t"
"vfasb %%v24,%%v24,%%v29\n\t"
"vfasb %%v24,%%v24,%%v30\n\t"
"vfasb %%v24,%%v24,%%v31\n\t"
"veslg %%v25,%%v24,32\n\t"
"vfasb %%v24,%%v24,%%v25\n\t"
"vrepf %%v25,%%v24,2\n\t"
"vfasb %%v24,%%v24,%%v25\n\t"
"vstef %%v24,%[asum],0"
: [asum] "=Q"(asum),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
: "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
"v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
"vflpsb %%v16, %%v16 \n\t" return asum;
"vflpsb %%v17, %%v17 \n\t"
"vflpsb %%v18, %%v18 \n\t"
"vflpsb %%v19, %%v19 \n\t"
"vflpsb %%v20, %%v20 \n\t"
"vflpsb %%v21, %%v21 \n\t"
"vflpsb %%v22, %%v22 \n\t"
"vflpsb %%v23, %%v23 \n\t"
"vfasb %%v0,%%v0,%%v16 \n\t"
"vfasb %%v1,%%v1,%%v17 \n\t"
"vfasb %%v2,%%v2,%%v18 \n\t"
"vfasb %%v3,%%v3,%%v19 \n\t"
"vfasb %%v0,%%v0,%%v20 \n\t"
"vfasb %%v1,%%v1,%%v21 \n\t"
"vfasb %%v2,%%v2,%%v22 \n\t"
"vfasb %%v3,%%v3,%%v23 \n\t"
"vl %%v16, 128(%%r1,%2) \n\t"
"vl %%v17, 144(%%r1,%2) \n\t"
"vl %%v18, 160(%%r1,%2) \n\t"
"vl %%v19, 176(%%r1,%2) \n\t"
"vl %%v20, 192(%%r1,%2) \n\t"
"vl %%v21, 208(%%r1,%2) \n\t"
"vl %%v22, 224(%%r1,%2) \n\t"
"vl %%v23, 240(%%r1,%2) \n\t"
"vflpsb %%v16, %%v16 \n\t"
"vflpsb %%v17, %%v17 \n\t"
"vflpsb %%v18, %%v18 \n\t"
"vflpsb %%v19, %%v19 \n\t"
"vflpsb %%v20, %%v20 \n\t"
"vflpsb %%v21, %%v21 \n\t"
"vflpsb %%v22, %%v22 \n\t"
"vflpsb %%v23, %%v23 \n\t"
"vfasb %%v0,%%v0,%%v16 \n\t"
"vfasb %%v1,%%v1,%%v17 \n\t"
"vfasb %%v2,%%v2,%%v18 \n\t"
"vfasb %%v3,%%v3,%%v19 \n\t"
"vfasb %%v0,%%v0,%%v20 \n\t"
"vfasb %%v1,%%v1,%%v21 \n\t"
"vfasb %%v2,%%v2,%%v22 \n\t"
"vfasb %%v3,%%v3,%%v23 \n\t"
"agfi %%r1,256 \n\t"
"brctg %%r0,0b \n\t"
"vfasb %%v0,%%v0,%%v1 \n\t"
"vfasb %%v0,%%v0,%%v2 \n\t"
"vfasb %%v0,%%v0,%%v3 \n\t"
"veslg %%v1,%%v0,32 \n\t"
"vfasb %%v0,%%v0,%%v1 \n\t"
"vrepf %%v1,%%v0,2 \n\t"
"aebr %%f0,%%f1 \n\t"
"ler %0,%%f0 "
:"=f"(asum)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23"
);
return asum;
} }
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0; BLASLONG i = 0;
BLASLONG j = 0; BLASLONG j = 0;
FLOAT sumf = 0.0; FLOAT sumf = 0.0;
BLASLONG n1; BLASLONG n1;
if (n <= 0 || inc_x <= 0) return sumf; if (n <= 0 || inc_x <= 0)
return sumf;
if (inc_x == 1) { if (inc_x == 1) {
n1 = n & -64; n1 = n & -64;
if (n1 > 0) {
sumf = sasum_kernel_64(n1, x); if (n1 > 0) {
i = n1;
}
while (i < n) { sumf = sasum_kernel_64(n1, x);
sumf += ABS(x[i]); i = n1;
i++; }
}
} else { while (i < n) {
BLASLONG n1 = n & -4; sumf += ABS(x[i]);
register FLOAT sum1, sum2; i++;
sum1 = 0.0; }
sum2 = 0.0;
while (j < n1) {
sum1 += ABS(x[i]); } else {
sum2 += ABS(x[i + inc_x]); BLASLONG n1 = n & -4;
sum1 += ABS(x[i + 2 * inc_x]); register FLOAT sum1, sum2;
sum2 += ABS(x[i + 3 * inc_x]); sum1 = 0.0;
sum2 = 0.0;
while (j < n1) {
i += inc_x * 4; sum1 += ABS(x[i]);
j += 4; sum2 += ABS(x[i + inc_x]);
sum1 += ABS(x[i + 2 * inc_x]);
} sum2 += ABS(x[i + 3 * inc_x]);
sumf = sum1 + sum2;
while (j < n) {
sumf += ABS(x[i]);
i += inc_x;
j++;
}
i += inc_x * 4;
j += 4;
} }
return sumf; sumf = sum1 + sum2;
while (j < n) {
sumf += ABS(x[i]);
i += inc_x;
j++;
}
}
return sumf;
} }

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -27,158 +27,141 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) {
{ __asm__("vlrepf %%v0,%[alpha]\n\t"
__asm__ volatile( "srlg %[n],%[n],6\n\t"
"vlrepf %%v0,%3 \n\t" "xgr %%r1,%%r1\n\t"
"srlg %%r0,%0,6 \n\t" "0:\n\t"
"xgr %%r1,%%r1 \n\t" "pfd 1, 1024(%%r1,%[x])\n\t"
"0: \n\t" "pfd 2, 1024(%%r1,%[y])\n\t"
"pfd 1, 1024(%%r1,%1) \n\t" "vl %%v16,0(%%r1,%[x])\n\t"
"pfd 2, 1024(%%r1,%2) \n\t" "vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%1) \n\t" "vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%1) \n\t" "vl %%v20,0(%%r1,%[y])\n\t"
"vl %%v18,32(%%r1,%1) \n\t" "vl %%v21,16(%%r1,%[y])\n\t"
"vl %%v19,48(%%r1,%1) \n\t" "vl %%v22,32(%%r1,%[y])\n\t"
"vl %%v20,0(%%r1,%2) \n\t" "vl %%v23,48(%%r1,%[y])\n\t"
"vl %%v21,16(%%r1,%2) \n\t" "vl %%v24,64(%%r1,%[x])\n\t"
"vl %%v22,32(%%r1,%2) \n\t" "vl %%v25,80(%%r1,%[x])\n\t"
"vl %%v23,48(%%r1,%2) \n\t" "vl %%v26,96(%%r1,%[x])\n\t"
"vl %%v27,112(%%r1,%[x])\n\t"
"vfmasb %%v16,%%v0,%%v16,%%v20 \n\t" "vl %%v28,64(%%r1,%[y])\n\t"
"vfmasb %%v17,%%v0,%%v17,%%v21 \n\t" "vl %%v29,80(%%r1,%[y])\n\t"
"vfmasb %%v18,%%v0,%%v18,%%v22 \n\t" "vl %%v30,96(%%r1,%[y])\n\t"
"vfmasb %%v19,%%v0,%%v19,%%v23 \n\t" "vl %%v31,112(%%r1,%[y])\n\t"
"vfmasb %%v16,%%v0,%%v16,%%v20\n\t"
"vl %%v24,64(%%r1,%1) \n\t" "vfmasb %%v17,%%v0,%%v17,%%v21\n\t"
"vl %%v25,80(%%r1,%1) \n\t" "vfmasb %%v18,%%v0,%%v18,%%v22\n\t"
"vl %%v26,96(%%r1,%1) \n\t" "vfmasb %%v19,%%v0,%%v19,%%v23\n\t"
"vl %%v27,112(%%r1,%1) \n\t" "vfmasb %%v24,%%v0,%%v24,%%v28\n\t"
"vl %%v28,64(%%r1,%2) \n\t" "vfmasb %%v25,%%v0,%%v25,%%v29\n\t"
"vl %%v29,80(%%r1,%2) \n\t" "vfmasb %%v26,%%v0,%%v26,%%v30\n\t"
"vl %%v30,96(%%r1,%2) \n\t" "vfmasb %%v27,%%v0,%%v27,%%v31\n\t"
"vl %%v31,112(%%r1,%2) \n\t" "vst %%v16,0(%%r1,%[y])\n\t"
"vst %%v17,16(%%r1,%[y])\n\t"
"vfmasb %%v20,%%v0,%%v24,%%v28 \n\t" "vst %%v18,32(%%r1,%[y])\n\t"
"vfmasb %%v21,%%v0,%%v25,%%v29 \n\t" "vst %%v19,48(%%r1,%[y])\n\t"
"vfmasb %%v22,%%v0,%%v26,%%v30 \n\t" "vst %%v24,64(%%r1,%[y])\n\t"
"vfmasb %%v23,%%v0,%%v27,%%v31 \n\t" "vst %%v25,80(%%r1,%[y])\n\t"
"vst %%v26,96(%%r1,%[y])\n\t"
"vst %%v16,0(%%r1,%2) \n\t" "vst %%v27,112(%%r1,%[y])\n\t"
"vst %%v17,16(%%r1,%2) \n\t" "vl %%v16,128(%%r1,%[x])\n\t"
"vst %%v18,32(%%r1,%2) \n\t" "vl %%v17,144(%%r1,%[x])\n\t"
"vst %%v19,48(%%r1,%2) \n\t" "vl %%v18,160(%%r1,%[x])\n\t"
"vst %%v20,64(%%r1,%2) \n\t" "vl %%v19,176(%%r1,%[x])\n\t"
"vst %%v21,80(%%r1,%2) \n\t" "vl %%v20,128(%%r1,%[y])\n\t"
"vst %%v22,96(%%r1,%2) \n\t" "vl %%v21,144(%%r1,%[y])\n\t"
"vst %%v23,112(%%r1,%2) \n\t" "vl %%v22,160(%%r1,%[y])\n\t"
"vl %%v23,176(%%r1,%[y])\n\t"
"vl %%v16,128(%%r1,%1) \n\t" "vl %%v24,192(%%r1,%[x])\n\t"
"vl %%v17,144(%%r1,%1) \n\t" "vl %%v25,208(%%r1,%[x])\n\t"
"vl %%v18,160(%%r1,%1) \n\t" "vl %%v26,224(%%r1,%[x])\n\t"
"vl %%v19,176(%%r1,%1) \n\t" "vl %%v27,240(%%r1,%[x])\n\t"
"vl %%v20,128(%%r1,%2) \n\t" "vl %%v28,192(%%r1,%[y])\n\t"
"vl %%v21,144(%%r1,%2) \n\t" "vl %%v29,208(%%r1,%[y])\n\t"
"vl %%v22,160(%%r1,%2) \n\t" "vl %%v30,224(%%r1,%[y])\n\t"
"vl %%v23,176(%%r1,%2) \n\t" "vl %%v31,240(%%r1,%[y])\n\t"
"vfmasb %%v16,%%v0,%%v16,%%v20\n\t"
"vfmasb %%v16,%%v0,%%v16,%%v20 \n\t" "vfmasb %%v17,%%v0,%%v17,%%v21\n\t"
"vfmasb %%v17,%%v0,%%v17,%%v21 \n\t" "vfmasb %%v18,%%v0,%%v18,%%v22\n\t"
"vfmasb %%v18,%%v0,%%v18,%%v22 \n\t" "vfmasb %%v19,%%v0,%%v19,%%v23\n\t"
"vfmasb %%v19,%%v0,%%v19,%%v23 \n\t" "vfmasb %%v24,%%v0,%%v24,%%v28\n\t"
"vfmasb %%v25,%%v0,%%v25,%%v29\n\t"
"vl %%v24,192(%%r1,%1) \n\t" "vfmasb %%v26,%%v0,%%v26,%%v30\n\t"
"vl %%v25,208(%%r1,%1) \n\t" "vfmasb %%v27,%%v0,%%v27,%%v31\n\t"
"vl %%v26,224(%%r1,%1) \n\t" "vst %%v16,128(%%r1,%[y])\n\t"
"vl %%v27,240(%%r1,%1) \n\t" "vst %%v17,144(%%r1,%[y])\n\t"
"vl %%v28,192(%%r1,%2) \n\t" "vst %%v18,160(%%r1,%[y])\n\t"
"vl %%v29,208(%%r1,%2) \n\t" "vst %%v19,176(%%r1,%[y])\n\t"
"vl %%v30,224(%%r1,%2) \n\t" "vst %%v24,192(%%r1,%[y])\n\t"
"vl %%v31,240(%%r1,%2) \n\t" "vst %%v25,208(%%r1,%[y])\n\t"
"vst %%v26,224(%%r1,%[y])\n\t"
"vfmasb %%v20,%%v0,%%v24,%%v28 \n\t" "vst %%v27,240(%%r1,%[y])\n\t"
"vfmasb %%v21,%%v0,%%v25,%%v29 \n\t" "agfi %%r1,256\n\t"
"vfmasb %%v22,%%v0,%%v26,%%v30 \n\t" "brctg %[n],0b"
"vfmasb %%v23,%%v0,%%v27,%%v31 \n\t" : "+m"(*(struct { FLOAT x[n]; } *) y),[n] "+&r"(n)
: [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),
"vst %%v16,128(%%r1,%2) \n\t" [alpha] "Q"(*alpha)
"vst %%v17,144(%%r1,%2) \n\t" : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"vst %%v18,160(%%r1,%2) \n\t" "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
"vst %%v19,176(%%r1,%2) \n\t"
"vst %%v20,192(%%r1,%2) \n\t"
"vst %%v21,208(%%r1,%2) \n\t"
"vst %%v22,224(%%r1,%2) \n\t"
"vst %%v23,240(%%r1,%2) \n\t"
"agfi %%r1,256 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y),"m"(*alpha)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
} }
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x,
{ BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
BLASLONG i=0; BLASLONG dummy2) {
BLASLONG ix=0,iy=0; BLASLONG i = 0;
BLASLONG ix = 0, iy = 0;
if ( n <= 0 ) return 0 ; if (n <= 0)
return 0;
if ( (inc_x == 1) && (inc_y == 1) ) if ((inc_x == 1) && (inc_y == 1)) {
{
BLASLONG n1 = n & -64; BLASLONG n1 = n & -64;
if ( n1 ) if (n1)
saxpy_kernel_64(n1, x, y , &da); saxpy_kernel_64(n1, x, y, &da);
i = n1; i = n1;
while(i < n) while (i < n) {
{
y[i] += da * x[i] ;
i++ ;
}
return 0 ;
y[i] += da * x[i];
i++;
} }
return 0;
BLASLONG n1 = n & -4; }
while(i < n1) BLASLONG n1 = n & -4;
{
FLOAT m1 = da * x[ix] ; while (i < n1) {
FLOAT m2 = da * x[ix+inc_x] ;
FLOAT m3 = da * x[ix+2*inc_x] ;
FLOAT m4 = da * x[ix+3*inc_x] ;
y[iy] += m1 ; FLOAT m1 = da * x[ix];
y[iy+inc_y] += m2 ; FLOAT m2 = da * x[ix + inc_x];
y[iy+2*inc_y] += m3 ; FLOAT m3 = da * x[ix + 2 * inc_x];
y[iy+3*inc_y] += m4 ; FLOAT m4 = da * x[ix + 3 * inc_x];
ix += inc_x*4 ; y[iy] += m1;
iy += inc_y*4 ; y[iy + inc_y] += m2;
i+=4 ; y[iy + 2 * inc_y] += m3;
y[iy + 3 * inc_y] += m4;
} ix += inc_x * 4;
iy += inc_y * 4;
i += 4;
while(i < n) }
{
y[iy] += da * x[ix] ; while (i < n) {
ix += inc_x ;
iy += inc_y ;
i++ ;
} y[iy] += da * x[ix];
return 0 ; ix += inc_x;
iy += inc_y;
i++;
}
return 0;
} }

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -27,59 +27,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
static void scopy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y) static void scopy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y) {
{ __asm__("srlg %[n],%[n],6\n\t"
__asm__ volatile ( "0:\n\t"
"lgr %%r1,%1 \n\t" "pfd 1, 1024(%[x])\n\t"
"lgr %%r2,%2 \n\t" "pfd 2, 1024(%[y])\n\t"
"srlg %%r0,%0,6 \n\t" "mvc 0(256,%[y]),0(%[x])\n\t"
"0: \n\t" "la %[x],256(%[x])\n\t"
"pfd 1, 1024(%%r1) \n\t" "la %[y],256(%[y])\n\t"
"pfd 2, 1024(%%r2) \n\t" "brctg %[n],0b"
"mvc 0(256,%%r2),0(%%r1) \n\t" : "=m"(*(struct { FLOAT x[n]; } *) y),[x] "+&a"(x),[y] "+&a"(y),[n] "+&r"(n)
"agfi %%r1,256 \n\t" : "m"(*(const struct { FLOAT x[n]; } *) x)
"agfi %%r2,256 \n\t" : "cc");
"brctg %%r0,0b "
:
:"r"(n),"a"((const FLOAT (*)[n])x),"a"((FLOAT (*)[n])y)
:"memory","cc","r0","r1","r2"
);
} }
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
BLASLONG i = 0; BLASLONG i = 0;
BLASLONG ix = 0, iy = 0; BLASLONG ix = 0, iy = 0;
if (n <= 0) return 0; if (n <= 0)
if ((inc_x == 1) && (inc_y == 1)) {
BLASLONG n1 = n & -64;
if (n1 > 0) {
scopy_kernel_64(n1, x, y);
i = n1;
}
while (i < n) {
y[i] = x[i];
i++;
}
} else {
while (i < n) {
y[iy] = x[ix];
ix += inc_x;
iy += inc_y;
i++;
}
}
return 0; return 0;
if ((inc_x == 1) && (inc_y == 1)) {
BLASLONG n1 = n & -64;
if (n1 > 0) {
scopy_kernel_64(n1, x, y);
i = n1;
}
while (i < n) {
y[i] = x[i];
i++;
}
} else {
while (i < n) {
y[iy] = x[ix];
ix += inc_x;
iy += inc_y;
i++;
}
}
return 0;
} }

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2018,The OpenBLAS Project Copyright (c) 2013-2019,The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms,with or without Redistribution and use in source and binary forms,with or without
modification,are permitted provided that the following conditions are modification,are permitted provided that the following conditions are
@ -27,114 +27,118 @@ USE OF THIS SOFTWARE,EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
static FLOAT sdot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) static FLOAT sdot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) {
{ FLOAT dot;
FLOAT dot;
__asm__ volatile ( __asm__("vzero %%v0\n\t"
"vzero %%v0 \n\t" "vzero %%v1\n\t"
"srlg %%r0,%1,5 \n\t" "vzero %%v2\n\t"
"xgr %%r1,%%r1 \n\t" "vzero %%v3\n\t"
"0: \n\t" "vzero %%v4\n\t"
"pfd 1,1024(%%r1,%2) \n\t" "vzero %%v5\n\t"
"pfd 1,1024(%%r1,%3) \n\t" "vzero %%v6\n\t"
"vzero %%v7\n\t"
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1,1024(%%r1,%[x])\n\t"
"pfd 1,1024(%%r1,%[y])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v24,0(%%r1,%[y])\n\t"
"vl %%v25,16(%%r1,%[y])\n\t"
"vl %%v26,32(%%r1,%[y])\n\t"
"vl %%v27,48(%%r1,%[y])\n\t"
"vl %%v28,64(%%r1,%[y])\n\t"
"vl %%v29,80(%%r1,%[y])\n\t"
"vl %%v30,96(%%r1,%[y])\n\t"
"vl %%v31,112(%%r1,%[y])\n\t"
"vfmasb %%v0,%%v16,%%v24,%%v0\n\t"
"vfmasb %%v1,%%v17,%%v25,%%v1\n\t"
"vfmasb %%v2,%%v18,%%v26,%%v2\n\t"
"vfmasb %%v3,%%v19,%%v27,%%v3\n\t"
"vfmasb %%v4,%%v20,%%v28,%%v4\n\t"
"vfmasb %%v5,%%v21,%%v29,%%v5\n\t"
"vfmasb %%v6,%%v22,%%v30,%%v6\n\t"
"vfmasb %%v7,%%v23,%%v31,%%v7\n\t"
"agfi %%r1,128\n\t"
"brctg %[n],0b\n\t"
"vfasb %%v0,%%v0,%%v1\n\t"
"vfasb %%v0,%%v0,%%v2\n\t"
"vfasb %%v0,%%v0,%%v3\n\t"
"vfasb %%v0,%%v0,%%v4\n\t"
"vfasb %%v0,%%v0,%%v5\n\t"
"vfasb %%v0,%%v0,%%v6\n\t"
"vfasb %%v0,%%v0,%%v7\n\t"
"vrepf %%v1,%%v0,1\n\t"
"vrepf %%v2,%%v0,2\n\t"
"vrepf %%v3,%%v0,3\n\t"
"aebr %%f0,%%f1\n\t"
"aebr %%f0,%%f2\n\t"
"aebr %%f0,%%f3\n\t"
"ler %[dot],%%f0"
: [dot] "=f"(dot),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),
"m"(*(const struct { FLOAT x[n]; } *) y),[y] "a"(y)
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");
"vl %%v16,0(%%r1,%2) \n\t" return dot;
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"vl %%v24,0(%%r1,%3) \n\t"
"vfmasb %%v0,%%v16,%%v24,%%v0 \n\t"
"vl %%v25,16(%%r1,%3) \n\t"
"vfmasb %%v0,%%v17,%%v25,%%v0 \n\t"
"vl %%v26,32(%%r1,%3) \n\t"
"vfmasb %%v0,%%v18,%%v26,%%v0 \n\t"
"vl %%v27,48(%%r1,%3) \n\t"
"vfmasb %%v0,%%v19,%%v27,%%v0 \n\t"
"vl %%v28,64(%%r1,%3) \n\t"
"vfmasb %%v0,%%v20,%%v28,%%v0 \n\t"
"vl %%v29,80(%%r1,%3) \n\t"
"vfmasb %%v0,%%v21,%%v29,%%v0 \n\t"
"vl %%v30,96(%%r1,%3) \n\t"
"vfmasb %%v0,%%v22,%%v30,%%v0 \n\t"
"vl %%v31,112(%%r1,%3) \n\t"
"vfmasb %%v0,%%v23,%%v31,%%v0 \n\t"
"agfi %%r1,128 \n\t"
"brctg %%r0,0b \n\t"
"vrepf %%v1,%%v0,1 \n\t"
"vrepf %%v2,%%v0,2 \n\t"
"vrepf %%v3,%%v0,3 \n\t"
"aebr %%f0,%%f1 \n\t"
"aebr %%f0,%%f2 \n\t"
"aebr %%f0,%%f3 \n\t"
"ler %0,%%f0 "
:"=f"(dot)
:"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((const FLOAT (*)[n])y)
:"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return dot;
} }
FLOAT CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y) FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
{ BLASLONG i = 0;
BLASLONG i=0; BLASLONG ix = 0, iy = 0;
BLASLONG ix=0,iy=0;
FLOAT dot = 0.0 ; FLOAT dot = 0.0;
if ( n <= 0 ) return(dot); if (n <= 0)
return (dot);
if ( (inc_x == 1) && (inc_y == 1) ) if ((inc_x == 1) && (inc_y == 1)) {
{
BLASLONG n1 = n & -32; BLASLONG n1 = n & -32;
if ( n1 ) if (n1)
dot = sdot_kernel_32(n1,x,y); dot = sdot_kernel_32(n1, x, y);
i = n1; i = n1;
while(i < n) while (i < n) {
{
dot += y[i] * x[i] ; dot += y[i] * x[i];
i++ ; i++;
} }
return(dot); return (dot);
}
} BLASLONG n1 = n & -2;
BLASLONG n1 = n & -2; while (i < n1) {
while(i < n1) dot += y[iy] * x[ix] + y[iy + inc_y] * x[ix + inc_x];
{ ix += inc_x * 2;
iy += inc_y * 2;
i += 2;
dot += y[iy] * x[ix] + y[iy+inc_y] * x[ix+inc_x]; }
ix += inc_x*2 ;
iy += inc_y*2 ;
i+=2 ;
} while (i < n) {
while(i < n) dot += y[iy] * x[ix];
{ ix += inc_x;
iy += inc_y;
i++;
dot += y[iy] * x[ix] ; }
ix += inc_x ; return (dot);
iy += inc_y ;
i++ ;
}
return(dot);
} }

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -27,136 +27,123 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
static FLOAT smax_kernel_64(BLASLONG n, FLOAT *x) static FLOAT smax_kernel_64(BLASLONG n, FLOAT *x) {
{ FLOAT max;
FLOAT max;
__asm__ volatile ( __asm__("vl %%v0,0(%[x])\n\t"
"vl %%v0,0(%2) \n\t" "srlg %[n],%[n],6\n\t"
"srlg %%r0,%1,6 \n\t" "xgr %%r1,%%r1\n\t"
"xgr %%r1,%%r1 \n\t" "0:\n\t"
"0: \n\t" "pfd 1, 1024(%%r1,%[x])\n\t"
"pfd 1, 1024(%%r1,%2) \n\t" "vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v24,128(%%r1,%[x])\n\t"
"vl %%v25,144(%%r1,%[x])\n\t"
"vl %%v26,160(%%r1,%[x])\n\t"
"vl %%v27,176(%%r1,%[x])\n\t"
"vl %%v28,192(%%r1,%[x])\n\t"
"vl %%v29,208(%%r1,%[x])\n\t"
"vl %%v30,224(%%r1,%[x])\n\t"
"vl %%v31,240(%%r1,%[x])\n\t"
"vfmaxsb %%v16,%%v16,%%v24,0\n\t"
"vfmaxsb %%v17,%%v17,%%v25,0\n\t"
"vfmaxsb %%v18,%%v18,%%v26,0\n\t"
"vfmaxsb %%v19,%%v19,%%v27,0\n\t"
"vfmaxsb %%v20,%%v20,%%v28,0\n\t"
"vfmaxsb %%v21,%%v21,%%v29,0\n\t"
"vfmaxsb %%v22,%%v22,%%v30,0\n\t"
"vfmaxsb %%v23,%%v23,%%v31,0\n\t"
"vfmaxsb %%v16,%%v16,%%v20,0\n\t"
"vfmaxsb %%v17,%%v17,%%v21,0\n\t"
"vfmaxsb %%v18,%%v18,%%v22,0\n\t"
"vfmaxsb %%v19,%%v19,%%v23,0\n\t"
"vfmaxsb %%v16,%%v16,%%v18,0\n\t"
"vfmaxsb %%v17,%%v17,%%v19,0\n\t"
"vfmaxsb %%v16,%%v16,%%v17,0\n\t"
"vfmaxsb %%v0,%%v0,%%v16,0\n\t"
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
"veslg %%v16,%%v0,32\n\t"
"vfmaxsb %%v0,%%v0,%%v16,0\n\t"
"vrepf %%v16,%%v0,2\n\t"
"wfmaxsb %%v0,%%v0,%%v16,0\n\t"
"ler %[max],%%f0"
: [max] "=f"(max),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
"vl %%v16,0(%%r1,%2) \n\t" return max;
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"vl %%v24,128(%%r1,%2) \n\t"
"vl %%v25,144(%%r1,%2) \n\t"
"vl %%v26,160(%%r1,%2) \n\t"
"vl %%v27,176(%%r1,%2) \n\t"
"vl %%v28,192(%%r1,%2) \n\t"
"vl %%v29,208(%%r1,%2) \n\t"
"vl %%v30,224(%%r1,%2) \n\t"
"vl %%v31,240(%%r1,%2) \n\t"
"vfmaxsb %%v16,%%v16,%%v24,0 \n\t"
"vfmaxsb %%v17,%%v17,%%v25,0 \n\t"
"vfmaxsb %%v18,%%v18,%%v26,0 \n\t"
"vfmaxsb %%v19,%%v19,%%v27,0 \n\t"
"vfmaxsb %%v20,%%v20,%%v28,0 \n\t"
"vfmaxsb %%v21,%%v21,%%v29,0 \n\t"
"vfmaxsb %%v22,%%v22,%%v30,0 \n\t"
"vfmaxsb %%v23,%%v23,%%v31,0 \n\t"
"vfmaxsb %%v16,%%v16,%%v20,0 \n\t"
"vfmaxsb %%v17,%%v17,%%v21,0 \n\t"
"vfmaxsb %%v18,%%v18,%%v22,0 \n\t"
"vfmaxsb %%v19,%%v19,%%v23,0 \n\t"
"vfmaxsb %%v16,%%v16,%%v18,0 \n\t"
"vfmaxsb %%v17,%%v17,%%v19,0 \n\t"
"vfmaxsb %%v16,%%v16,%%v17,0 \n\t"
"vfmaxsb %%v0,%%v0,%%v16,0 \n\t"
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"
"veslg %%v16,%%v0,32 \n\t"
"vfmaxsb %%v0,%%v0,%%v16,0 \n\t"
"vrepf %%v16,%%v0,2 \n\t"
"wfmaxsb %%v0,%%v0,%%v16,0 \n\t"
"ler %0,%%f0 "
:"=f"(max)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return max;
} }
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0; BLASLONG i = 0;
BLASLONG j = 0; BLASLONG j = 0;
FLOAT maxf = 0.0; FLOAT maxf = 0.0;
if (n <= 0 || inc_x <= 0) return (maxf); if (n <= 0 || inc_x <= 0)
return (maxf);
if (inc_x == 1) { if (inc_x == 1) {
BLASLONG n1 = n & -64; BLASLONG n1 = n & -64;
if (n1 > 0) { if (n1 > 0) {
maxf = smax_kernel_64(n1, x); maxf = smax_kernel_64(n1, x);
i = n1;
}
else
{
maxf=x[0];
i++;
}
while (i < n) {
if (x[i] > maxf) {
maxf = x[i];
}
i++;
}
return (maxf);
i = n1;
} else { } else {
maxf = x[0];
maxf=x[0]; i++;
BLASLONG n1 = n & -4;
while (j < n1) {
if (x[i] > maxf) {
maxf = x[i];
}
if (x[i + inc_x] > maxf) {
maxf = x[i + inc_x];
}
if (x[i + 2 * inc_x] > maxf) {
maxf = x[i + 2 * inc_x];
}
if (x[i + 3 * inc_x] > maxf) {
maxf = x[i + 3 * inc_x];
}
i += inc_x * 4;
j += 4;
}
while (j < n) {
if (x[i] > maxf) {
maxf = x[i];
}
i += inc_x;
j++;
}
return (maxf);
} }
while (i < n) {
if (x[i] > maxf) {
maxf = x[i];
}
i++;
}
return (maxf);
} else {
maxf = x[0];
BLASLONG n1 = n & -4;
while (j < n1) {
if (x[i] > maxf) {
maxf = x[i];
}
if (x[i + inc_x] > maxf) {
maxf = x[i + inc_x];
}
if (x[i + 2 * inc_x] > maxf) {
maxf = x[i + 2 * inc_x];
}
if (x[i + 3 * inc_x] > maxf) {
maxf = x[i + 3 * inc_x];
}
i += inc_x * 4;
j += 4;
}
while (j < n) {
if (x[i] > maxf) {
maxf = x[i];
}
i += inc_x;
j++;
}
return (maxf);
}
} }

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -27,136 +27,123 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
static FLOAT smin_kernel_64(BLASLONG n, FLOAT *x) static FLOAT smin_kernel_64(BLASLONG n, FLOAT *x) {
{ FLOAT min;
FLOAT min;
__asm__ volatile ( __asm__("vl %%v0,0(%[x])\n\t"
"vl %%v0,0(%2) \n\t" "srlg %[n],%[n],6\n\t"
"srlg %%r0,%1,6 \n\t" "xgr %%r1,%%r1\n\t"
"xgr %%r1,%%r1 \n\t" "0:\n\t"
"0: \n\t" "pfd 1, 1024(%%r1,%[x])\n\t"
"pfd 1, 1024(%%r1,%2) \n\t" "vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v24,128(%%r1,%[x])\n\t"
"vl %%v25,144(%%r1,%[x])\n\t"
"vl %%v26,160(%%r1,%[x])\n\t"
"vl %%v27,176(%%r1,%[x])\n\t"
"vl %%v28,192(%%r1,%[x])\n\t"
"vl %%v29,208(%%r1,%[x])\n\t"
"vl %%v30,224(%%r1,%[x])\n\t"
"vl %%v31,240(%%r1,%[x])\n\t"
"vfminsb %%v16,%%v16,%%v24,0\n\t"
"vfminsb %%v17,%%v17,%%v25,0\n\t"
"vfminsb %%v18,%%v18,%%v26,0\n\t"
"vfminsb %%v19,%%v19,%%v27,0\n\t"
"vfminsb %%v20,%%v20,%%v28,0\n\t"
"vfminsb %%v21,%%v21,%%v29,0\n\t"
"vfminsb %%v22,%%v22,%%v30,0\n\t"
"vfminsb %%v23,%%v23,%%v31,0\n\t"
"vfminsb %%v16,%%v16,%%v20,0\n\t"
"vfminsb %%v17,%%v17,%%v21,0\n\t"
"vfminsb %%v18,%%v18,%%v22,0\n\t"
"vfminsb %%v19,%%v19,%%v23,0\n\t"
"vfminsb %%v16,%%v16,%%v18,0\n\t"
"vfminsb %%v17,%%v17,%%v19,0\n\t"
"vfminsb %%v16,%%v16,%%v17,0\n\t"
"vfminsb %%v0,%%v0,%%v16,0\n\t"
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
"veslg %%v16,%%v0,32\n\t"
"vfminsb %%v0,%%v0,%%v16,0\n\t"
"vrepf %%v16,%%v0,2\n\t"
"wfminsb %%v0,%%v0,%%v16,0\n\t"
"ler %[min],%%f0"
: [min] "=f"(min),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
"vl %%v16,0(%%r1,%2) \n\t" return min;
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"vl %%v24,128(%%r1,%2) \n\t"
"vl %%v25,144(%%r1,%2) \n\t"
"vl %%v26,160(%%r1,%2) \n\t"
"vl %%v27,176(%%r1,%2) \n\t"
"vl %%v28,192(%%r1,%2) \n\t"
"vl %%v29,208(%%r1,%2) \n\t"
"vl %%v30,224(%%r1,%2) \n\t"
"vl %%v31,240(%%r1,%2) \n\t"
"vfminsb %%v16,%%v16,%%v24,0 \n\t"
"vfminsb %%v17,%%v17,%%v25,0 \n\t"
"vfminsb %%v18,%%v18,%%v26,0 \n\t"
"vfminsb %%v19,%%v19,%%v27,0 \n\t"
"vfminsb %%v20,%%v20,%%v28,0 \n\t"
"vfminsb %%v21,%%v21,%%v29,0 \n\t"
"vfminsb %%v22,%%v22,%%v30,0 \n\t"
"vfminsb %%v23,%%v23,%%v31,0 \n\t"
"vfminsb %%v16,%%v16,%%v20,0 \n\t"
"vfminsb %%v17,%%v17,%%v21,0 \n\t"
"vfminsb %%v18,%%v18,%%v22,0 \n\t"
"vfminsb %%v19,%%v19,%%v23,0 \n\t"
"vfminsb %%v16,%%v16,%%v18,0 \n\t"
"vfminsb %%v17,%%v17,%%v19,0 \n\t"
"vfminsb %%v16,%%v16,%%v17,0 \n\t"
"vfminsb %%v0,%%v0,%%v16,0 \n\t"
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"
"veslg %%v16,%%v0,32 \n\t"
"vfminsb %%v0,%%v0,%%v16,0 \n\t"
"vrepf %%v16,%%v0,2 \n\t"
"wfminsb %%v0,%%v0,%%v16,0 \n\t"
"ler %0,%%f0 "
:"=f"(min)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return min;
} }
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0; BLASLONG i = 0;
BLASLONG j = 0; BLASLONG j = 0;
FLOAT minf = 0.0; FLOAT minf = 0.0;
if (n <= 0 || inc_x <= 0) return (minf); if (n <= 0 || inc_x <= 0)
return (minf);
if (inc_x == 1) { if (inc_x == 1) {
BLASLONG n1 = n & -64; BLASLONG n1 = n & -64;
if (n1 > 0) { if (n1 > 0) {
minf = smin_kernel_64(n1, x); minf = smin_kernel_64(n1, x);
i = n1;
}
else
{
minf=x[0];
i++;
}
while (i < n) {
if (x[i] < minf) {
minf = x[i];
}
i++;
}
return (minf);
i = n1;
} else { } else {
minf = x[0];
minf=x[0]; i++;
BLASLONG n1 = n & -4;
while (j < n1) {
if (x[i] < minf) {
minf = x[i];
}
if (x[i + inc_x] < minf) {
minf = x[i + inc_x];
}
if (x[i + 2 * inc_x] < minf) {
minf = x[i + 2 * inc_x];
}
if (x[i + 3 * inc_x] < minf) {
minf = x[i + 3 * inc_x];
}
i += inc_x * 4;
j += 4;
}
while (j < n) {
if (x[i] < minf) {
minf = x[i];
}
i += inc_x;
j++;
}
return (minf);
} }
while (i < n) {
if (x[i] < minf) {
minf = x[i];
}
i++;
}
return (minf);
} else {
minf = x[0];
BLASLONG n1 = n & -4;
while (j < n1) {
if (x[i] < minf) {
minf = x[i];
}
if (x[i + inc_x] < minf) {
minf = x[i + inc_x];
}
if (x[i + 2 * inc_x] < minf) {
minf = x[i + 2 * inc_x];
}
if (x[i + 3 * inc_x] < minf) {
minf = x[i + 3 * inc_x];
}
i += inc_x * 4;
j += 4;
}
while (j < n) {
if (x[i] < minf) {
minf = x[i];
}
i += inc_x;
j++;
}
return (minf);
}
} }

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -27,220 +27,200 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
/*
 * Vectorised plane rotation over contiguous single-precision data:
 *
 *   x[i] <- c * x[i] + s * y[i]
 *   y[i] <- c * y[i] - s * x[i]
 *
 * n     element count. It is shifted right by 6 to form the loop trip count
 *       and the loop is count-controlled (brctg), so the caller must pass a
 *       positive multiple of 64.
 * x, y  vectors, both read and written in place.
 * c, s  pointers to the rotation coefficients; each is broadcast to every
 *       lane of v0 (c) and v1 (s).
 *
 * One trip processes 256 bytes of x and of y as four identical 64-byte
 * groups; r1 holds the running byte offset. The short-format vector
 * instructions used here operate on 4-byte lanes, i.e. FLOAT is expected to
 * be float.
 */
static void srot_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) {
  __asm__("vlrepf %%v0,%[c]\n\t"
          "vlrepf %%v1,%[s]\n\t"
          "srlg %[n],%[n],6\n\t"
          "xgr %%r1,%%r1\n\t"
          "0:\n\t"
          "pfd 2, 1024(%%r1,%[x])\n\t"
          "pfd 2, 1024(%%r1,%[y])\n\t"
          /* group 1: byte offsets 0..63 */
          "vl %%v24, 0(%%r1,%[x])\n\t"
          "vl %%v25, 16(%%r1,%[x])\n\t"
          "vl %%v26, 32(%%r1,%[x])\n\t"
          "vl %%v27, 48(%%r1,%[x])\n\t"
          "vl %%v16, 0(%%r1,%[y])\n\t"
          "vl %%v17, 16(%%r1,%[y])\n\t"
          "vl %%v18, 32(%%r1,%[y])\n\t"
          "vl %%v19, 48(%%r1,%[y])\n\t"
          "vfmsb %%v28,%%v24,%%v0\n\t"
          "vfmsb %%v29,%%v25,%%v0\n\t"
          "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
          "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
          "vfmsb %%v30,%%v26,%%v0\n\t"
          "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
          "vfmsb %%v31,%%v27,%%v0\n\t"
          "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
          /* 2nd parts */
          "vfmasb %%v28,%%v16,%%v1,%%v28\n\t"
          "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
          "vfmasb %%v29,%%v17,%%v1,%%v29\n\t"
          "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
          "vfmasb %%v30,%%v18,%%v1,%%v30\n\t"
          "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
          "vfmasb %%v31,%%v19,%%v1,%%v31\n\t"
          "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
          "vst %%v28, 0(%%r1,%[x])\n\t"
          "vst %%v29, 16(%%r1,%[x])\n\t"
          "vst %%v30, 32(%%r1,%[x])\n\t"
          "vst %%v31, 48(%%r1,%[x])\n\t"
          "vst %%v20, 0(%%r1,%[y])\n\t"
          "vst %%v21, 16(%%r1,%[y])\n\t"
          "vst %%v22, 32(%%r1,%[y])\n\t"
          "vst %%v23, 48(%%r1,%[y])\n\t"
          /* group 2: byte offsets 64..127 */
          "vl %%v24, 64(%%r1,%[x])\n\t"
          "vl %%v25, 80(%%r1,%[x])\n\t"
          "vl %%v26, 96(%%r1,%[x])\n\t"
          "vl %%v27, 112(%%r1,%[x])\n\t"
          "vl %%v16, 64(%%r1,%[y])\n\t"
          "vl %%v17, 80(%%r1,%[y])\n\t"
          "vl %%v18, 96(%%r1,%[y])\n\t"
          "vl %%v19, 112(%%r1,%[y])\n\t"
          "vfmsb %%v28,%%v24,%%v0\n\t"
          "vfmsb %%v29,%%v25,%%v0\n\t"
          "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
          "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
          "vfmsb %%v30,%%v26,%%v0\n\t"
          "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
          "vfmsb %%v31,%%v27,%%v0\n\t"
          "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
          /* 2nd parts */
          "vfmasb %%v28,%%v16,%%v1,%%v28\n\t"
          "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
          "vfmasb %%v29,%%v17,%%v1,%%v29\n\t"
          "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
          "vfmasb %%v30,%%v18,%%v1,%%v30\n\t"
          "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
          "vfmasb %%v31,%%v19,%%v1,%%v31\n\t"
          "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
          "vst %%v28, 64(%%r1,%[x])\n\t"
          "vst %%v29, 80(%%r1,%[x])\n\t"
          "vst %%v30, 96(%%r1,%[x])\n\t"
          "vst %%v31, 112(%%r1,%[x])\n\t"
          "vst %%v20, 64(%%r1,%[y])\n\t"
          "vst %%v21, 80(%%r1,%[y])\n\t"
          "vst %%v22, 96(%%r1,%[y])\n\t"
          "vst %%v23, 112(%%r1,%[y])\n\t"
          /* group 3: byte offsets 128..191 */
          "vl %%v24, 128(%%r1,%[x])\n\t"
          "vl %%v25, 144(%%r1,%[x])\n\t"
          "vl %%v26, 160(%%r1,%[x])\n\t"
          "vl %%v27, 176(%%r1,%[x])\n\t"
          "vl %%v16, 128(%%r1,%[y])\n\t"
          "vl %%v17, 144(%%r1,%[y])\n\t"
          "vl %%v18, 160(%%r1,%[y])\n\t"
          "vl %%v19, 176(%%r1,%[y])\n\t"
          "vfmsb %%v28,%%v24,%%v0\n\t"
          "vfmsb %%v29,%%v25,%%v0\n\t"
          "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
          "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
          "vfmsb %%v30,%%v26,%%v0\n\t"
          "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
          "vfmsb %%v31,%%v27,%%v0\n\t"
          "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
          /* 2nd parts */
          "vfmasb %%v28,%%v16,%%v1,%%v28\n\t"
          "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
          "vfmasb %%v29,%%v17,%%v1,%%v29\n\t"
          "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
          "vfmasb %%v30,%%v18,%%v1,%%v30\n\t"
          "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
          "vfmasb %%v31,%%v19,%%v1,%%v31\n\t"
          "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
          "vst %%v28, 128(%%r1,%[x])\n\t"
          "vst %%v29, 144(%%r1,%[x])\n\t"
          "vst %%v30, 160(%%r1,%[x])\n\t"
          "vst %%v31, 176(%%r1,%[x])\n\t"
          "vst %%v20, 128(%%r1,%[y])\n\t"
          "vst %%v21, 144(%%r1,%[y])\n\t"
          "vst %%v22, 160(%%r1,%[y])\n\t"
          "vst %%v23, 176(%%r1,%[y])\n\t"
          /* group 4: byte offsets 192..255 */
          "vl %%v24, 192(%%r1,%[x])\n\t"
          "vl %%v25, 208(%%r1,%[x])\n\t"
          "vl %%v26, 224(%%r1,%[x])\n\t"
          "vl %%v27, 240(%%r1,%[x])\n\t"
          "vl %%v16, 192(%%r1,%[y])\n\t"
          "vl %%v17, 208(%%r1,%[y])\n\t"
          "vl %%v18, 224(%%r1,%[y])\n\t"
          "vl %%v19, 240(%%r1,%[y])\n\t"
          "vfmsb %%v28,%%v24,%%v0\n\t"
          "vfmsb %%v29,%%v25,%%v0\n\t"
          "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
          "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
          "vfmsb %%v30,%%v26,%%v0\n\t"
          "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
          "vfmsb %%v31,%%v27,%%v0\n\t"
          "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
          /* 2nd parts */
          "vfmasb %%v28,%%v16,%%v1,%%v28\n\t"
          "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
          "vfmasb %%v29,%%v17,%%v1,%%v29\n\t"
          "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
          "vfmasb %%v30,%%v18,%%v1,%%v30\n\t"
          "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
          "vfmasb %%v31,%%v19,%%v1,%%v31\n\t"
          "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
          "vst %%v28, 192(%%r1,%[x])\n\t"
          "vst %%v29, 208(%%r1,%[x])\n\t"
          "vst %%v30, 224(%%r1,%[x])\n\t"
          "vst %%v31, 240(%%r1,%[x])\n\t"
          "vst %%v20, 192(%%r1,%[y])\n\t"
          "vst %%v21, 208(%%r1,%[y])\n\t"
          "vst %%v22, 224(%%r1,%[y])\n\t"
          "vst %%v23, 240(%%r1,%[y])\n\t"
          "agfi %%r1,256\n\t"
          "brctg %[n],0b"
          /* The "+m" operands tell the compiler that the asm reads and
             writes the first n elements of x and y. */
          : "+m"(*(struct { FLOAT x[n]; } *) x), "+m"(*(struct { FLOAT x[n]; } *) y),
            [n] "+&r"(n)
          : [x] "a"(x),[y] "a"(y),[c] "Q"(*c),[s] "Q"(*s)
          : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
            "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
            "v31");
}
/*
 * Plane rotation of two vectors:
 *
 *   x[k] <- c * x[k] + s * y[k]
 *   y[k] <- c * y[k] - s * x[k]
 *
 * n             number of elements; nothing is done when n <= 0.
 * x, inc_x      first vector and its stride in elements.
 * y, inc_y      second vector and its stride in elements.
 * c, s          rotation coefficients.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
          FLOAT c, FLOAT s) {
  BLASLONG i = 0;
  BLASLONG ix = 0, iy = 0;
  FLOAT temp;

  if (n <= 0)
    return (0);

  if ((inc_x == 1) && (inc_y == 1)) {
    /* Contiguous case: the asm kernel handles the largest multiple of 64
       elements, the scalar loop below finishes the remainder. */
    BLASLONG n1 = n & -64;
    if (n1 > 0) {
      /* The kernel takes the coefficients by address. */
      FLOAT cosa, sina;
      cosa = c;
      sina = s;
      srot_kernel_64(n1, x, y, &cosa, &sina);
      i = n1;
    }

    while (i < n) {
      temp = c * x[i] + s * y[i];
      y[i] = c * y[i] - s * x[i];
      x[i] = temp;
      i++;
    }
  } else {
    /* Strided case: plain scalar loop with independent offsets. */
    while (i < n) {
      temp = c * x[ix] + s * y[iy];
      y[iy] = c * y[iy] - s * x[ix];
      x[ix] = temp;
      ix += inc_x;
      iy += inc_y;
      i++;
    }
  }

  return (0);
}

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -27,175 +27,147 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
/*
 * Vectorised in-place scaling of contiguous single-precision data:
 * x[i] <- da * x[i].
 *
 * n   element count. It is shifted right by 5 to form the trip count of a
 *     count-controlled loop (brctg), so the caller must pass a positive
 *     multiple of 32.
 * da  scale factor, broadcast to every lane of v0.
 * x   vector, read and written in place.
 *
 * One trip loads, multiplies and stores eight 16-byte vectors (128 bytes);
 * r1 holds the running byte offset.
 */
static void sscal_kernel_32(BLASLONG n, FLOAT da, FLOAT *x) {
  __asm__("vlrepf %%v0,%[da]\n\t"
          "srlg %[n],%[n],5\n\t"
          "xgr %%r1,%%r1\n\t"
          "0:\n\t"
          "pfd 2, 1024(%%r1,%[x])\n\t"
          "vl %%v24,0(%%r1,%[x])\n\t"
          "vfmsb %%v24,%%v24,%%v0\n\t"
          "vst %%v24,0(%%r1,%[x])\n\t"
          "vl %%v25,16(%%r1,%[x])\n\t"
          "vfmsb %%v25,%%v25,%%v0\n\t"
          "vst %%v25,16(%%r1,%[x])\n\t"
          "vl %%v26,32(%%r1,%[x])\n\t"
          "vfmsb %%v26,%%v26,%%v0\n\t"
          "vst %%v26,32(%%r1,%[x])\n\t"
          "vl %%v27,48(%%r1,%[x])\n\t"
          "vfmsb %%v27,%%v27,%%v0\n\t"
          "vst %%v27,48(%%r1,%[x])\n\t"
          "vl %%v28,64(%%r1,%[x])\n\t"
          "vfmsb %%v28,%%v28,%%v0\n\t"
          "vst %%v28,64(%%r1,%[x])\n\t"
          "vl %%v29,80(%%r1,%[x])\n\t"
          "vfmsb %%v29,%%v29,%%v0\n\t"
          "vst %%v29,80(%%r1,%[x])\n\t"
          "vl %%v30,96(%%r1,%[x])\n\t"
          "vfmsb %%v30,%%v30,%%v0\n\t"
          "vst %%v30,96(%%r1,%[x])\n\t"
          "vl %%v31,112(%%r1,%[x])\n\t"
          "vfmsb %%v31,%%v31,%%v0\n\t"
          "vst %%v31,112(%%r1,%[x])\n\t"
          "agfi %%r1,128\n\t"
          "brctg %[n],0b"
          /* "+m" declares the first n elements of x as read and written. */
          : "+m"(*(struct { FLOAT x[n]; } *) x),[n] "+&r"(n)
          : [x] "a"(x),[da] "Q"(da)
          : "cc", "r1", "v0", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
            "v31");
}
/*
 * Vectorised zero fill of contiguous single-precision data: x[i] <- 0.
 *
 * n  element count. It is shifted right by 5 to form the trip count of a
 *    count-controlled loop (brctg), so the caller must pass a positive
 *    multiple of 32.
 * x  vector to overwrite; its previous contents are never read.
 *
 * One trip stores the zeroed register v0 eight times (128 bytes); r1 holds
 * the running byte offset.
 */
static void sscal_kernel_32_zero(BLASLONG n, FLOAT *x) {
  __asm__("vzero %%v0\n\t"
          "srlg %[n],%[n],5\n\t"
          "xgr %%r1,%%r1\n\t"
          "0:\n\t"
          "pfd 2, 1024(%%r1,%[x])\n\t"
          "vst %%v0,0(%%r1,%[x])\n\t"
          "vst %%v0,16(%%r1,%[x])\n\t"
          "vst %%v0,32(%%r1,%[x])\n\t"
          "vst %%v0,48(%%r1,%[x])\n\t"
          "vst %%v0,64(%%r1,%[x])\n\t"
          "vst %%v0,80(%%r1,%[x])\n\t"
          "vst %%v0,96(%%r1,%[x])\n\t"
          "vst %%v0,112(%%r1,%[x])\n\t"
          "agfi %%r1,128\n\t"
          "brctg %[n],0b"
          /* "=m": the first n elements of x are written only. */
          : "=m"(*(struct { FLOAT x[n]; } *) x),[n] "+&r"(n)
          : [x] "a"(x)
          : "cc", "r1", "v0");
}
/*
 * Scales a vector in place: x[k] <- da * x[k].
 *
 * n         number of elements.
 * da        scale factor.
 * x, inc_x  vector and its stride in elements.
 *
 * dummy0, dummy1, y, inc_y, dummy and dummy2 are not used by this
 * implementation.
 *
 * Nothing is done when n <= 0 or inc_x <= 0. When da == 0.0 the elements are
 * overwritten with 0.0 instead of being multiplied, so NaN or Inf values
 * already in x are replaced by zero in that case.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x,
          BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
          BLASLONG dummy2) {
  BLASLONG i = 0, j = 0; /* i: element offset into x, j: elements done */

  if (n <= 0 || inc_x <= 0)
    return (0);

  if (inc_x == 1) {
    /* Contiguous case: the asm kernels handle the largest multiple of 32
       elements, the scalar loops finish the remainder. */
    if (da == 0.0) {
      BLASLONG n1 = n & -32;
      if (n1 > 0) {
        sscal_kernel_32_zero(n1, x);
        j = n1;
      }

      while (j < n) {
        x[j] = 0.0;
        j++;
      }
    } else {
      BLASLONG n1 = n & -32;
      if (n1 > 0) {
        sscal_kernel_32(n1, da, x);
        j = n1;
      }

      while (j < n) {
        x[j] = da * x[j];
        j++;
      }
    }
  } else {
    /* Strided case: scalar loops unrolled by two, then a remainder loop. */
    if (da == 0.0) {
      BLASLONG n1 = n & -2;

      while (j < n1) {
        x[i] = 0.0;
        x[i + inc_x] = 0.0;
        i += inc_x * 2;
        j += 2;
      }

      while (j < n) {
        x[i] = 0.0;
        i += inc_x;
        j++;
      }
    } else {
      BLASLONG n1 = n & -2;

      while (j < n1) {
        x[i] = da * x[i];
        x[i + inc_x] = da * x[i + inc_x];
        i += inc_x * 2;
        j += 2;
      }

      while (j < n) {
        x[i] = da * x[i];
        i += inc_x;
        j++;
      }
    }
  }

  return 0;
}

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -27,138 +27,125 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
/*
 * Vectorised exchange of two contiguous single-precision vectors:
 * x[i] <-> y[i].
 *
 * n     element count. It is shifted right by 6 to form the trip count of a
 *       count-controlled loop (brctg), so the caller must pass a positive
 *       multiple of 64.
 * x, y  vectors, both read and written in place.
 *
 * One trip moves 256 bytes in each direction: the whole x chunk is first
 * loaded into v16..v31, the y chunk is then copied to x in two 128-byte
 * halves through v0..v7, and finally the saved x chunk is stored to y.
 * r1 holds the running byte offset.
 */
static void sswap_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y) {
  __asm__("srlg %[n],%[n],6\n\t"
          "xgr %%r1,%%r1\n\t"
          "0:\n\t"
          "pfd 2, 1024(%%r1,%[x])\n\t"
          "pfd 2, 1024(%%r1,%[y])\n\t"
          /* save the x chunk */
          "vl %%v16, 0(%%r1,%[x])\n\t"
          "vl %%v17, 16(%%r1,%[x])\n\t"
          "vl %%v18, 32(%%r1,%[x])\n\t"
          "vl %%v19, 48(%%r1,%[x])\n\t"
          "vl %%v20, 64(%%r1,%[x])\n\t"
          "vl %%v21, 80(%%r1,%[x])\n\t"
          "vl %%v22, 96(%%r1,%[x])\n\t"
          "vl %%v23, 112(%%r1,%[x])\n\t"
          "vl %%v24, 128(%%r1,%[x])\n\t"
          "vl %%v25, 144(%%r1,%[x])\n\t"
          "vl %%v26, 160(%%r1,%[x])\n\t"
          "vl %%v27, 176(%%r1,%[x])\n\t"
          "vl %%v28, 192(%%r1,%[x])\n\t"
          "vl %%v29, 208(%%r1,%[x])\n\t"
          "vl %%v30, 224(%%r1,%[x])\n\t"
          "vl %%v31, 240(%%r1,%[x])\n\t"
          /* y -> x, first 128 bytes */
          "vl %%v0, 0(%%r1,%[y])\n\t"
          "vl %%v1, 16(%%r1,%[y])\n\t"
          "vl %%v2, 32(%%r1,%[y])\n\t"
          "vl %%v3, 48(%%r1,%[y])\n\t"
          "vl %%v4, 64(%%r1,%[y])\n\t"
          "vl %%v5, 80(%%r1,%[y])\n\t"
          "vl %%v6, 96(%%r1,%[y])\n\t"
          "vl %%v7, 112(%%r1,%[y])\n\t"
          "vst %%v0, 0(%%r1,%[x])\n\t"
          "vst %%v1, 16(%%r1,%[x])\n\t"
          "vst %%v2, 32(%%r1,%[x])\n\t"
          "vst %%v3, 48(%%r1,%[x])\n\t"
          "vst %%v4, 64(%%r1,%[x])\n\t"
          "vst %%v5, 80(%%r1,%[x])\n\t"
          "vst %%v6, 96(%%r1,%[x])\n\t"
          "vst %%v7, 112(%%r1,%[x])\n\t"
          /* y -> x, second 128 bytes */
          "vl %%v0, 128(%%r1,%[y])\n\t"
          "vl %%v1, 144(%%r1,%[y])\n\t"
          "vl %%v2, 160(%%r1,%[y])\n\t"
          "vl %%v3, 176(%%r1,%[y])\n\t"
          "vl %%v4, 192(%%r1,%[y])\n\t"
          "vl %%v5, 208(%%r1,%[y])\n\t"
          "vl %%v6, 224(%%r1,%[y])\n\t"
          "vl %%v7, 240(%%r1,%[y])\n\t"
          "vst %%v0, 128(%%r1,%[x])\n\t"
          "vst %%v1, 144(%%r1,%[x])\n\t"
          "vst %%v2, 160(%%r1,%[x])\n\t"
          "vst %%v3, 176(%%r1,%[x])\n\t"
          "vst %%v4, 192(%%r1,%[x])\n\t"
          "vst %%v5, 208(%%r1,%[x])\n\t"
          "vst %%v6, 224(%%r1,%[x])\n\t"
          "vst %%v7, 240(%%r1,%[x])\n\t"
          /* saved x chunk -> y */
          "vst %%v16, 0(%%r1,%[y])\n\t"
          "vst %%v17, 16(%%r1,%[y])\n\t"
          "vst %%v18, 32(%%r1,%[y])\n\t"
          "vst %%v19, 48(%%r1,%[y])\n\t"
          "vst %%v20, 64(%%r1,%[y])\n\t"
          "vst %%v21, 80(%%r1,%[y])\n\t"
          "vst %%v22, 96(%%r1,%[y])\n\t"
          "vst %%v23, 112(%%r1,%[y])\n\t"
          "vst %%v24, 128(%%r1,%[y])\n\t"
          "vst %%v25, 144(%%r1,%[y])\n\t"
          "vst %%v26, 160(%%r1,%[y])\n\t"
          "vst %%v27, 176(%%r1,%[y])\n\t"
          "vst %%v28, 192(%%r1,%[y])\n\t"
          "vst %%v29, 208(%%r1,%[y])\n\t"
          "vst %%v30, 224(%%r1,%[y])\n\t"
          "vst %%v31, 240(%%r1,%[y])\n\t"
          "agfi %%r1,256\n\t"
          "brctg %[n],0b"
          /* "+m" declares the first n elements of x and y as read and
             written. */
          : "+m"(*(struct { FLOAT x[n]; } *) x), "+m"(*(struct { FLOAT x[n]; } *) y),
            [n] "+&r"(n)
          : [x] "a"(x),[y] "a"(y)
          : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
            "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
            "v27", "v28", "v29", "v30", "v31");
}
/*
 * Exchanges the contents of two vectors: x[k] <-> y[k].
 *
 * n         number of elements; nothing is done when n <= 0.
 * x, inc_x  first vector and its stride in elements.
 * y, inc_y  second vector and its stride in elements.
 *
 * dummy0, dummy1, dummy3, dummy and dummy2 are not used by this
 * implementation.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,
          BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
          BLASLONG dummy2) {
  BLASLONG i = 0;
  BLASLONG ix = 0, iy = 0;
  FLOAT temp;

  if (n <= 0)
    return (0);

  if ((inc_x == 1) && (inc_y == 1)) {
    /* Contiguous case: the asm kernel handles the largest multiple of 64
       elements, the scalar loop below finishes the remainder. */
    BLASLONG n1 = n & -64;
    if (n1 > 0) {
      sswap_kernel_64(n1, x, y);
      i = n1;
    }

    while (i < n) {
      temp = y[i];
      y[i] = x[i];
      x[i] = temp;
      i++;
    }
  } else {
    /* Strided case: plain scalar loop with independent offsets. */
    while (i < n) {
      temp = y[iy];
      y[iy] = x[ix];
      x[ix] = temp;
      ix += inc_x;
      iy += inc_y;
      i++;
    }
  }

  return (0);
}

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -28,184 +28,165 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>
/* |Re| + |Im| of the complex element whose real part is x[i]. */
#define CABS1(x,i) (fabs(x[i]) + fabs(x[i + 1]))

/*
 * Vectorised maximum of |Re| + |Im| over contiguous complex data stored as
 * interleaved (re, im) pairs of 8-byte values.
 *
 * n  number of complex elements. It is shifted right by 4 to form the trip
 *    count of a count-controlled loop (brctg), so the caller must pass a
 *    positive multiple of 16.
 * x  input data; 2 * n values are declared as read.
 *
 * Returns the largest |Re| + |Im| found.
 *
 * v0 holds the running two-lane maximum and is seeded from the first two
 * complex elements. In each trip (256 bytes, 16 complex elements) the real
 * parts are gathered into the even registers v16..v30 and the imaginary
 * parts into the odd registers v17..v31, absolute values are taken and
 * added, and the eight sums are reduced by pairwise maximum into v0. After
 * the loop the two lanes of v0 are combined and the result is copied to
 * amax.
 */
static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x) {
  FLOAT amax;

  __asm__("vleg %%v0,0(%[x]),0\n\t"
          "vleg %%v16,8(%[x]),0\n\t"
          "vleg %%v0,16(%[x]),1\n\t"
          "vleg %%v16,24(%[x]),1\n\t"
          "vflpdb %%v0,%%v0\n\t"
          "vflpdb %%v16,%%v16\n\t"
          "vfadb %%v0,%%v0,%%v16\n\t"
          "srlg %[n],%[n],4\n\t"
          "xgr %%r1,%%r1\n\t"
          "0:\n\t"
          "pfd 1, 1024(%%r1,%[x])\n\t"
          "vleg %%v16,0(%%r1,%[x]),0\n\t"
          "vleg %%v17,8(%%r1,%[x]),0\n\t"
          "vleg %%v16,16(%%r1,%[x]),1\n\t"
          "vleg %%v17,24(%%r1,%[x]),1\n\t"
          "vleg %%v18,32(%%r1,%[x]),0\n\t"
          "vleg %%v19,40(%%r1,%[x]),0\n\t"
          "vleg %%v18,48(%%r1,%[x]),1\n\t"
          "vleg %%v19,56(%%r1,%[x]),1\n\t"
          "vleg %%v20,64(%%r1,%[x]),0\n\t"
          "vleg %%v21,72(%%r1,%[x]),0\n\t"
          "vleg %%v20,80(%%r1,%[x]),1\n\t"
          "vleg %%v21,88(%%r1,%[x]),1\n\t"
          "vleg %%v22,96(%%r1,%[x]),0\n\t"
          "vleg %%v23,104(%%r1,%[x]),0\n\t"
          "vleg %%v22,112(%%r1,%[x]),1\n\t"
          "vleg %%v23,120(%%r1,%[x]),1\n\t"
          "vleg %%v24,128(%%r1,%[x]),0\n\t"
          "vleg %%v25,136(%%r1,%[x]),0\n\t"
          "vleg %%v24,144(%%r1,%[x]),1\n\t"
          "vleg %%v25,152(%%r1,%[x]),1\n\t"
          "vleg %%v26,160(%%r1,%[x]),0\n\t"
          "vleg %%v27,168(%%r1,%[x]),0\n\t"
          "vleg %%v26,176(%%r1,%[x]),1\n\t"
          "vleg %%v27,184(%%r1,%[x]),1\n\t"
          "vleg %%v28,192(%%r1,%[x]),0\n\t"
          "vleg %%v29,200(%%r1,%[x]),0\n\t"
          "vleg %%v28,208(%%r1,%[x]),1\n\t"
          "vleg %%v29,216(%%r1,%[x]),1\n\t"
          "vleg %%v30,224(%%r1,%[x]),0\n\t"
          "vleg %%v31,232(%%r1,%[x]),0\n\t"
          "vleg %%v30,240(%%r1,%[x]),1\n\t"
          "vleg %%v31,248(%%r1,%[x]),1\n\t"
          "vflpdb %%v16,%%v16\n\t"
          "vflpdb %%v17,%%v17\n\t"
          "vflpdb %%v18,%%v18\n\t"
          "vflpdb %%v19,%%v19\n\t"
          "vflpdb %%v20,%%v20\n\t"
          "vflpdb %%v21,%%v21\n\t"
          "vflpdb %%v22,%%v22\n\t"
          "vflpdb %%v23,%%v23\n\t"
          "vflpdb %%v24,%%v24\n\t"
          "vflpdb %%v25,%%v25\n\t"
          "vflpdb %%v26,%%v26\n\t"
          "vflpdb %%v27,%%v27\n\t"
          "vflpdb %%v28,%%v28\n\t"
          "vflpdb %%v29,%%v29\n\t"
          "vflpdb %%v30,%%v30\n\t"
          "vflpdb %%v31,%%v31\n\t"
          "vfadb %%v16,%%v16,%%v17\n\t"
          "vfadb %%v18,%%v18,%%v19\n\t"
          "vfadb %%v20,%%v20,%%v21\n\t"
          "vfadb %%v22,%%v22,%%v23\n\t"
          "vfadb %%v24,%%v24,%%v25\n\t"
          "vfadb %%v26,%%v26,%%v27\n\t"
          "vfadb %%v28,%%v28,%%v29\n\t"
          "vfadb %%v30,%%v30,%%v31\n\t"
          "vfmaxdb %%v16,%%v16,%%v24,0\n\t"
          "vfmaxdb %%v18,%%v18,%%v26,0\n\t"
          "vfmaxdb %%v20,%%v20,%%v28,0\n\t"
          "vfmaxdb %%v22,%%v22,%%v30,0\n\t"
          "vfmaxdb %%v16,%%v16,%%v20,0\n\t"
          "vfmaxdb %%v18,%%v18,%%v22,0\n\t"
          "vfmaxdb %%v16,%%v16,%%v18,0\n\t"
          "vfmaxdb %%v0,%%v0,%%v16,0\n\t"
          "agfi %%r1, 256\n\t"
          "brctg %[n], 0b\n\t"
          "vrepg %%v16,%%v0,1\n\t"
          "wfmaxdb %%v0,%%v0,%%v16,0\n\t"
          "ldr %[amax],%%f0"
          : [amax] "=f"(amax),[n] "+&r"(n)
          /* "m" declares the 2 * n values of x as read by the asm. */
          : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
          : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
            "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");

  return amax;
}
/*
 * Returns the largest |Re| + |Im| over the complex elements of x.
 *
 * n      number of complex elements.
 * x      interleaved (re, im) data.
 * inc_x  stride between consecutive elements, counted in complex elements.
 *
 * Returns 0.0 when n <= 0 or inc_x <= 0.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
  BLASLONG i = 0;  /* complex elements processed */
  BLASLONG ix = 0; /* index of the current real part in x */
  FLOAT maxf = 0.0;
  BLASLONG inc_x2;

  if (n <= 0 || inc_x <= 0)
    return (maxf);

  if (inc_x == 1) {
    /* Contiguous case: the asm kernel handles the largest multiple of 16
       elements; otherwise the maximum is seeded from the first element. */
    BLASLONG n1 = n & -16;
    if (n1 > 0) {
      maxf = zamax_kernel_16(n1, x);
      ix = n1 * 2;
      i = n1;
    } else {
      maxf = CABS1(x, 0);
      ix += 2;
      i++;
    }

    while (i < n) {
      if (CABS1(x, ix) > maxf) {
        maxf = CABS1(x, ix);
      }
      ix += 2;
      i++;
    }
    return (maxf);
  } else {
    /* Strided case: scalar loop unrolled by four, then a remainder loop.
       Element 0 seeds the maximum and is compared again by the loops. */
    maxf = CABS1(x, 0);
    inc_x2 = 2 * inc_x;

    BLASLONG n1 = n & -4;
    while (i < n1) {
      if (CABS1(x, ix) > maxf) {
        maxf = CABS1(x, ix);
      }
      if (CABS1(x, ix + inc_x2) > maxf) {
        maxf = CABS1(x, ix + inc_x2);
      }
      if (CABS1(x, ix + inc_x2 * 2) > maxf) {
        maxf = CABS1(x, ix + inc_x2 * 2);
      }
      if (CABS1(x, ix + inc_x2 * 3) > maxf) {
        maxf = CABS1(x, ix + inc_x2 * 3);
      }
      ix += inc_x2 * 4;
      i += 4;
    }

    while (i < n) {
      if (CABS1(x, ix) > maxf) {
        maxf = CABS1(x, ix);
      }
      ix += inc_x2;
      i++;
    }
    return (maxf);
  }
}

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -28,194 +28,174 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>
#if defined(DOUBLE) #define CABS1(x,i) (fabs(x[i]) + fabs(x[i + 1]))
#define ABS fabs
#else
#define ABS fabsf
#endif
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x) {
FLOAT amax;
static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x) __asm__("vleg %%v0,0(%[x]),0\n\t"
{ "vleg %%v16,8(%[x]),0\n\t"
FLOAT amax; "vleg %%v0,16(%[x]),1\n\t"
"vleg %%v16,24(%[x]),1\n\t"
"vflpdb %%v0,%%v0\n\t"
"vflpdb %%v16,%%v16\n\t"
"vfadb %%v0,%%v0,%%v16\n\t"
"srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vleg %%v16,0(%%r1,%[x]),0\n\t"
"vleg %%v17,8(%%r1,%[x]),0\n\t"
"vleg %%v16,16(%%r1,%[x]),1\n\t"
"vleg %%v17,24(%%r1,%[x]),1\n\t"
"vleg %%v18,32(%%r1,%[x]),0\n\t"
"vleg %%v19,40(%%r1,%[x]),0\n\t"
"vleg %%v18,48(%%r1,%[x]),1\n\t"
"vleg %%v19,56(%%r1,%[x]),1\n\t"
"vleg %%v20,64(%%r1,%[x]),0\n\t"
"vleg %%v21,72(%%r1,%[x]),0\n\t"
"vleg %%v20,80(%%r1,%[x]),1\n\t"
"vleg %%v21,88(%%r1,%[x]),1\n\t"
"vleg %%v22,96(%%r1,%[x]),0\n\t"
"vleg %%v23,104(%%r1,%[x]),0\n\t"
"vleg %%v22,112(%%r1,%[x]),1\n\t"
"vleg %%v23,120(%%r1,%[x]),1\n\t"
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
"vflpdb %%v19, %%v19\n\t"
"vflpdb %%v20, %%v20\n\t"
"vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t"
"vfadb %%v16,%%v16,%%v17\n\t"
"vfadb %%v17,%%v18,%%v19\n\t"
"vfadb %%v18,%%v20,%%v21\n\t"
"vfadb %%v19,%%v22,%%v23\n\t"
"vfchdb %%v24,%%v16,%%v17\n\t"
"vfchdb %%v25,%%v18,%%v19\n\t"
"vsel %%v24,%%v16,%%v17,%%v24\n\t"
"vsel %%v25,%%v18,%%v19,%%v25\n\t"
"vfchdb %%v26,%%v24,%%v25\n\t"
"vsel %%v26,%%v24,%%v25,%%v26\n\t"
"vfchdb %%v27,%%v26,%%v0\n\t"
"vsel %%v0,%%v26,%%v0,%%v27\n\t"
"vleg %%v16,128(%%r1,%[x]),0\n\t"
"vleg %%v17,136(%%r1,%[x]),0\n\t"
"vleg %%v16,144(%%r1,%[x]),1\n\t"
"vleg %%v17,152(%%r1,%[x]),1\n\t"
"vleg %%v18,160(%%r1,%[x]),0\n\t"
"vleg %%v19,168(%%r1,%[x]),0\n\t"
"vleg %%v18,176(%%r1,%[x]),1\n\t"
"vleg %%v19,184(%%r1,%[x]),1\n\t"
"vleg %%v20,192(%%r1,%[x]),0\n\t"
"vleg %%v21,200(%%r1,%[x]),0\n\t"
"vleg %%v20,208(%%r1,%[x]),1\n\t"
"vleg %%v21,216(%%r1,%[x]),1\n\t"
"vleg %%v22,224(%%r1,%[x]),0\n\t"
"vleg %%v23,232(%%r1,%[x]),0\n\t"
"vleg %%v22,240(%%r1,%[x]),1\n\t"
"vleg %%v23,248(%%r1,%[x]),1\n\t"
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
"vflpdb %%v19, %%v19\n\t"
"vflpdb %%v20, %%v20\n\t"
"vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t"
"vfadb %%v16,%%v16,%%v17\n\t"
"vfadb %%v17,%%v18,%%v19\n\t"
"vfadb %%v18,%%v20,%%v21\n\t"
"vfadb %%v19,%%v22,%%v23\n\t"
"vfchdb %%v24,%%v16,%%v17\n\t"
"vfchdb %%v25,%%v18,%%v19\n\t"
"vsel %%v24,%%v16,%%v17,%%v24\n\t"
"vsel %%v25,%%v18,%%v19,%%v25\n\t"
"vfchdb %%v26,%%v24,%%v25\n\t"
"vsel %%v26,%%v24,%%v25,%%v26\n\t"
"vfchdb %%v27,%%v26,%%v0\n\t"
"vsel %%v0,%%v26,%%v0,%%v27\n\t"
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
"vrepg %%v16,%%v0,1\n\t"
"wfchdb %%v17,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v17\n\t"
"ldr %[amax],%%f0"
: [amax] "=f"(amax),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23", "v24", "v25", "v26", "v27");
__asm__ volatile ( return amax;
"vleg %%v0,0(%2),0 \n\t"
"vleg %%v16,8(%2),0 \n\t"
"vleg %%v0,16(%2),1 \n\t"
"vleg %%v16,24(%2),1 \n\t"
"vflpdb %%v0,%%v0 \n\t"
"vflpdb %%v16,%%v16 \n\t"
"vfadb %%v0,%%v0,%%v16 \n\t"
"srlg %%r0,%1,4 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%2) \n\t"
"vleg %%v16,0(%%r1,%2),0 \n\t"
"vleg %%v17,8(%%r1,%2),0 \n\t"
"vleg %%v16,16(%%r1,%2),1 \n\t"
"vleg %%v17,24(%%r1,%2),1 \n\t"
"vleg %%v18,32(%%r1,%2),0 \n\t"
"vleg %%v19,40(%%r1,%2),0 \n\t"
"vleg %%v18,48(%%r1,%2),1 \n\t"
"vleg %%v19,56(%%r1,%2),1 \n\t"
"vleg %%v20,64(%%r1,%2),0 \n\t"
"vleg %%v21,72(%%r1,%2),0 \n\t"
"vleg %%v20,80(%%r1,%2),1 \n\t"
"vleg %%v21,88(%%r1,%2),1 \n\t"
"vleg %%v22,96(%%r1,%2),0 \n\t"
"vleg %%v23,104(%%r1,%2),0 \n\t"
"vleg %%v22,112(%%r1,%2),1 \n\t"
"vleg %%v23,120(%%r1,%2),1 \n\t"
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
"vfadb %%v16,%%v16,%%v17 \n\t"
"vfadb %%v17,%%v18,%%v19 \n\t"
"vfadb %%v18,%%v20,%%v21 \n\t"
"vfadb %%v19,%%v22,%%v23 \n\t"
"vfchdb %%v24,%%v16,%%v17 \n\t"
"vfchdb %%v25,%%v18,%%v19 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
"vfchdb %%v26,%%v24,%%v25 \n\t"
"vsel %%v26,%%v24,%%v25,%%v26 \n\t"
"vfchdb %%v27,%%v26,%%v0 \n\t"
"vsel %%v0,%%v26,%%v0,%%v27 \n\t"
"vleg %%v16,128(%%r1,%2),0 \n\t"
"vleg %%v17,136(%%r1,%2),0 \n\t"
"vleg %%v16,144(%%r1,%2),1 \n\t"
"vleg %%v17,152(%%r1,%2),1 \n\t"
"vleg %%v18,160(%%r1,%2),0 \n\t"
"vleg %%v19,168(%%r1,%2),0 \n\t"
"vleg %%v18,176(%%r1,%2),1 \n\t"
"vleg %%v19,184(%%r1,%2),1 \n\t"
"vleg %%v20,192(%%r1,%2),0 \n\t"
"vleg %%v21,200(%%r1,%2),0 \n\t"
"vleg %%v20,208(%%r1,%2),1 \n\t"
"vleg %%v21,216(%%r1,%2),1 \n\t"
"vleg %%v22,224(%%r1,%2),0 \n\t"
"vleg %%v23,232(%%r1,%2),0 \n\t"
"vleg %%v22,240(%%r1,%2),1 \n\t"
"vleg %%v23,248(%%r1,%2),1 \n\t"
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
"vfadb %%v16,%%v16,%%v17 \n\t"
"vfadb %%v17,%%v18,%%v19 \n\t"
"vfadb %%v18,%%v20,%%v21 \n\t"
"vfadb %%v19,%%v22,%%v23 \n\t"
"vfchdb %%v24,%%v16,%%v17 \n\t"
"vfchdb %%v25,%%v18,%%v19 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
"vfchdb %%v26,%%v24,%%v25 \n\t"
"vsel %%v26,%%v24,%%v25,%%v26 \n\t"
"vfchdb %%v27,%%v26,%%v0 \n\t"
"vsel %%v0,%%v26,%%v0,%%v27 \n\t"
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"
"vrepg %%v16,%%v0,1 \n\t"
"wfchdb %%v17,%%v0,%%v16 \n\t"
"vsel %%v0,%%v0,%%v16,%%v17 \n\t"
"ldr %0,%%f0 "
:"=f"(amax)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27"
);
return amax;
} }
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0; BLASLONG i = 0;
BLASLONG ix = 0; BLASLONG ix = 0;
FLOAT maxf = 0.0; FLOAT maxf = 0.0;
BLASLONG inc_x2; BLASLONG inc_x2;
if (n <= 0 || inc_x <= 0) return (maxf); if (n <= 0 || inc_x <= 0)
return (maxf);
if (inc_x == 1) { if (inc_x == 1) {
BLASLONG n1 = n & -16; BLASLONG n1 = n & -16;
if (n1 > 0) { if (n1 > 0) {
maxf = zamax_kernel_16(n1, x);
ix = n1 * 2;
i = n1;
}
else
{
maxf=CABS1(x,0);
ix += 2;
i++;
}
while (i < n) {
if (CABS1(x,ix) > maxf) {
maxf = CABS1(x,ix);
}
ix += 2;
i++;
}
return (maxf);
maxf = zamax_kernel_16(n1, x);
ix = n1 * 2;
i = n1;
} else { } else {
maxf = CABS1(x, 0);
maxf=CABS1(x,0); ix += 2;
inc_x2 = 2 * inc_x; i++;
BLASLONG n1 = n & -4;
while (i < n1) {
if (CABS1(x,ix) > maxf) {
maxf = CABS1(x,ix);
}
if (CABS1(x,ix+inc_x2) > maxf) {
maxf = CABS1(x,ix+inc_x2);
}
if (CABS1(x,ix+inc_x2*2) > maxf) {
maxf = CABS1(x,ix+inc_x2*2);
}
if (CABS1(x,ix+inc_x2*3) > maxf) {
maxf = CABS1(x,ix+inc_x2*3);
}
ix += inc_x2 * 4;
i += 4;
}
while (i < n) {
if (CABS1(x,ix) > maxf) {
maxf = CABS1(x,ix);
}
ix += inc_x2;
i++;
}
return (maxf);
} }
while (i < n) {
if (CABS1(x, ix) > maxf) {
maxf = CABS1(x, ix);
}
ix += 2;
i++;
}
return (maxf);
} else {
maxf = CABS1(x, 0);
inc_x2 = 2 * inc_x;
BLASLONG n1 = n & -4;
while (i < n1) {
if (CABS1(x, ix) > maxf) {
maxf = CABS1(x, ix);
}
if (CABS1(x, ix + inc_x2) > maxf) {
maxf = CABS1(x, ix + inc_x2);
}
if (CABS1(x, ix + inc_x2 * 2) > maxf) {
maxf = CABS1(x, ix + inc_x2 * 2);
}
if (CABS1(x, ix + inc_x2 * 3) > maxf) {
maxf = CABS1(x, ix + inc_x2 * 3);
}
ix += inc_x2 * 4;
i += 4;
}
while (i < n) {
if (CABS1(x, ix) > maxf) {
maxf = CABS1(x, ix);
}
ix += inc_x2;
i++;
}
return (maxf);
}
} }

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -28,184 +28,165 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>
#if defined(DOUBLE) #define CABS1(x,i) (fabs(x[i]) + fabs(x[i + 1]))
#define ABS fabs
#else
#define ABS fabsf
#endif
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x) {
FLOAT amin;
static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x) __asm__("vleg %%v0,0(%[x]),0\n\t"
{ "vleg %%v16,8(%[x]),0\n\t"
FLOAT amin; "vleg %%v0,16(%[x]),1\n\t"
"vleg %%v16,24(%[x]),1\n\t"
"vflpdb %%v0,%%v0\n\t"
"vflpdb %%v16,%%v16\n\t"
"vfadb %%v0,%%v0,%%v16\n\t"
"srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vleg %%v16,0(%%r1,%[x]),0\n\t"
"vleg %%v17,8(%%r1,%[x]),0\n\t"
"vleg %%v16,16(%%r1,%[x]),1\n\t"
"vleg %%v17,24(%%r1,%[x]),1\n\t"
"vleg %%v18,32(%%r1,%[x]),0\n\t"
"vleg %%v19,40(%%r1,%[x]),0\n\t"
"vleg %%v18,48(%%r1,%[x]),1\n\t"
"vleg %%v19,56(%%r1,%[x]),1\n\t"
"vleg %%v20,64(%%r1,%[x]),0\n\t"
"vleg %%v21,72(%%r1,%[x]),0\n\t"
"vleg %%v20,80(%%r1,%[x]),1\n\t"
"vleg %%v21,88(%%r1,%[x]),1\n\t"
"vleg %%v22,96(%%r1,%[x]),0\n\t"
"vleg %%v23,104(%%r1,%[x]),0\n\t"
"vleg %%v22,112(%%r1,%[x]),1\n\t"
"vleg %%v23,120(%%r1,%[x]),1\n\t"
"vleg %%v24,128(%%r1,%[x]),0\n\t"
"vleg %%v25,136(%%r1,%[x]),0\n\t"
"vleg %%v24,144(%%r1,%[x]),1\n\t"
"vleg %%v25,152(%%r1,%[x]),1\n\t"
"vleg %%v26,160(%%r1,%[x]),0\n\t"
"vleg %%v27,168(%%r1,%[x]),0\n\t"
"vleg %%v26,176(%%r1,%[x]),1\n\t"
"vleg %%v27,184(%%r1,%[x]),1\n\t"
"vleg %%v28,192(%%r1,%[x]),0\n\t"
"vleg %%v29,200(%%r1,%[x]),0\n\t"
"vleg %%v28,208(%%r1,%[x]),1\n\t"
"vleg %%v29,216(%%r1,%[x]),1\n\t"
"vleg %%v30,224(%%r1,%[x]),0\n\t"
"vleg %%v31,232(%%r1,%[x]),0\n\t"
"vleg %%v30,240(%%r1,%[x]),1\n\t"
"vleg %%v31,248(%%r1,%[x]),1\n\t"
"vflpdb %%v16,%%v16\n\t"
"vflpdb %%v17,%%v17\n\t"
"vflpdb %%v18,%%v18\n\t"
"vflpdb %%v19,%%v19\n\t"
"vflpdb %%v20,%%v20\n\t"
"vflpdb %%v21,%%v21\n\t"
"vflpdb %%v22,%%v22\n\t"
"vflpdb %%v23,%%v23\n\t"
"vflpdb %%v24,%%v24\n\t"
"vflpdb %%v25,%%v25\n\t"
"vflpdb %%v26,%%v26\n\t"
"vflpdb %%v27,%%v27\n\t"
"vflpdb %%v28,%%v28\n\t"
"vflpdb %%v29,%%v29\n\t"
"vflpdb %%v30,%%v30\n\t"
"vflpdb %%v31,%%v31\n\t"
"vfadb %%v16,%%v16,%%v17\n\t"
"vfadb %%v18,%%v18,%%v19\n\t"
"vfadb %%v20,%%v20,%%v21\n\t"
"vfadb %%v22,%%v22,%%v23\n\t"
"vfadb %%v24,%%v24,%%v25\n\t"
"vfadb %%v26,%%v26,%%v27\n\t"
"vfadb %%v28,%%v28,%%v29\n\t"
"vfadb %%v30,%%v30,%%v31\n\t"
"vfmindb %%v16,%%v16,%%v24,0\n\t"
"vfmindb %%v18,%%v18,%%v26,0\n\t"
"vfmindb %%v20,%%v20,%%v28,0\n\t"
"vfmindb %%v22,%%v22,%%v30,0\n\t"
"vfmindb %%v16,%%v16,%%v20,0\n\t"
"vfmindb %%v18,%%v18,%%v22,0\n\t"
"vfmindb %%v16,%%v16,%%v18,0\n\t"
"vfmindb %%v0,%%v0,%%v16,0\n\t"
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
"vrepg %%v16,%%v0,1\n\t"
"wfmindb %%v0,%%v0,%%v16,0\n\t"
"ldr %[amin],%%f0"
: [amin] "=f"(amin),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
__asm__ volatile ( return amin;
"vleg %%v0,0(%2),0 \n\t"
"vleg %%v16,8(%2),0 \n\t"
"vleg %%v0,16(%2),1 \n\t"
"vleg %%v16,24(%2),1 \n\t"
"vflpdb %%v0,%%v0 \n\t"
"vflpdb %%v16,%%v16 \n\t"
"vfadb %%v0,%%v0,%%v16 \n\t"
"srlg %%r0,%1,4 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%2) \n\t"
"vleg %%v16,0(%%r1,%2),0 \n\t"
"vleg %%v17,8(%%r1,%2),0 \n\t"
"vleg %%v16,16(%%r1,%2),1 \n\t"
"vleg %%v17,24(%%r1,%2),1 \n\t"
"vleg %%v18,32(%%r1,%2),0 \n\t"
"vleg %%v19,40(%%r1,%2),0 \n\t"
"vleg %%v18,48(%%r1,%2),1 \n\t"
"vleg %%v19,56(%%r1,%2),1 \n\t"
"vleg %%v20,64(%%r1,%2),0 \n\t"
"vleg %%v21,72(%%r1,%2),0 \n\t"
"vleg %%v20,80(%%r1,%2),1 \n\t"
"vleg %%v21,88(%%r1,%2),1 \n\t"
"vleg %%v22,96(%%r1,%2),0 \n\t"
"vleg %%v23,104(%%r1,%2),0 \n\t"
"vleg %%v22,112(%%r1,%2),1 \n\t"
"vleg %%v23,120(%%r1,%2),1 \n\t"
"vleg %%v24,128(%%r1,%2),0 \n\t"
"vleg %%v25,136(%%r1,%2),0 \n\t"
"vleg %%v24,144(%%r1,%2),1 \n\t"
"vleg %%v25,152(%%r1,%2),1 \n\t"
"vleg %%v26,160(%%r1,%2),0 \n\t"
"vleg %%v27,168(%%r1,%2),0 \n\t"
"vleg %%v26,176(%%r1,%2),1 \n\t"
"vleg %%v27,184(%%r1,%2),1 \n\t"
"vleg %%v28,192(%%r1,%2),0 \n\t"
"vleg %%v29,200(%%r1,%2),0 \n\t"
"vleg %%v28,208(%%r1,%2),1 \n\t"
"vleg %%v29,216(%%r1,%2),1 \n\t"
"vleg %%v30,224(%%r1,%2),0 \n\t"
"vleg %%v31,232(%%r1,%2),0 \n\t"
"vleg %%v30,240(%%r1,%2),1 \n\t"
"vleg %%v31,248(%%r1,%2),1 \n\t"
"vflpdb %%v16,%%v16 \n\t"
"vflpdb %%v17,%%v17 \n\t"
"vflpdb %%v18,%%v18 \n\t"
"vflpdb %%v19,%%v19 \n\t"
"vflpdb %%v20,%%v20 \n\t"
"vflpdb %%v21,%%v21 \n\t"
"vflpdb %%v22,%%v22 \n\t"
"vflpdb %%v23,%%v23 \n\t"
"vflpdb %%v24,%%v24 \n\t"
"vflpdb %%v25,%%v25 \n\t"
"vflpdb %%v26,%%v26 \n\t"
"vflpdb %%v27,%%v27 \n\t"
"vflpdb %%v28,%%v28 \n\t"
"vflpdb %%v29,%%v29 \n\t"
"vflpdb %%v30,%%v30 \n\t"
"vflpdb %%v31,%%v31 \n\t"
"vfadb %%v16,%%v16,%%v17 \n\t"
"vfadb %%v18,%%v18,%%v19 \n\t"
"vfadb %%v20,%%v20,%%v21 \n\t"
"vfadb %%v22,%%v22,%%v23 \n\t"
"vfadb %%v24,%%v24,%%v25 \n\t"
"vfadb %%v26,%%v26,%%v27 \n\t"
"vfadb %%v28,%%v28,%%v29 \n\t"
"vfadb %%v30,%%v30,%%v31 \n\t"
"vfmindb %%v16,%%v16,%%v24,0 \n\t"
"vfmindb %%v18,%%v18,%%v26,0 \n\t"
"vfmindb %%v20,%%v20,%%v28,0 \n\t"
"vfmindb %%v22,%%v22,%%v30,0 \n\t"
"vfmindb %%v16,%%v16,%%v20,0 \n\t"
"vfmindb %%v18,%%v18,%%v22,0 \n\t"
"vfmindb %%v16,%%v16,%%v18,0 \n\t"
"vfmindb %%v0,%%v0,%%v16,0 \n\t"
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"
"vrepg %%v16,%%v0,1 \n\t"
"wfmindb %%v0,%%v0,%%v16,0 \n\t"
"ldr %0,%%f0 "
:"=f"(amin)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return amin;
} }
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0; BLASLONG i = 0;
BLASLONG ix = 0; BLASLONG ix = 0;
FLOAT minf = 0.0; FLOAT minf = 0.0;
BLASLONG inc_x2; BLASLONG inc_x2;
if (n <= 0 || inc_x <= 0) return (minf); if (n <= 0 || inc_x <= 0)
return (minf);
if (inc_x == 1) { if (inc_x == 1) {
BLASLONG n1 = n & -16; BLASLONG n1 = n & -16;
if (n1 > 0) { if (n1 > 0) {
minf = zamin_kernel_16(n1, x);
ix = n1 * 2;
i = n1;
}
else
{
minf=CABS1(x,0);
ix += 2;
i++;
}
while (i < n) {
if (CABS1(x,ix) < minf) {
minf = CABS1(x,ix);
}
ix += 2;
i++;
}
return (minf);
minf = zamin_kernel_16(n1, x);
ix = n1 * 2;
i = n1;
} else { } else {
minf = CABS1(x, 0);
minf=CABS1(x,0); ix += 2;
inc_x2 = 2 * inc_x; i++;
BLASLONG n1 = n & -4;
while (i < n1) {
if (CABS1(x,ix) < minf) {
minf = CABS1(x,ix);
}
if (CABS1(x,ix+inc_x2) < minf) {
minf = CABS1(x,ix+inc_x2);
}
if (CABS1(x,ix+inc_x2*2) < minf) {
minf = CABS1(x,ix+inc_x2*2);
}
if (CABS1(x,ix+inc_x2*3) < minf) {
minf = CABS1(x,ix+inc_x2*3);
}
ix += inc_x2 * 4;
i += 4;
}
while (i < n) {
if (CABS1(x,ix) < minf) {
minf = CABS1(x,ix);
}
ix += inc_x2;
i++;
}
return (minf);
} }
while (i < n) {
if (CABS1(x, ix) < minf) {
minf = CABS1(x, ix);
}
ix += 2;
i++;
}
return (minf);
} else {
minf = CABS1(x, 0);
inc_x2 = 2 * inc_x;
BLASLONG n1 = n & -4;
while (i < n1) {
if (CABS1(x, ix) < minf) {
minf = CABS1(x, ix);
}
if (CABS1(x, ix + inc_x2) < minf) {
minf = CABS1(x, ix + inc_x2);
}
if (CABS1(x, ix + inc_x2 * 2) < minf) {
minf = CABS1(x, ix + inc_x2 * 2);
}
if (CABS1(x, ix + inc_x2 * 3) < minf) {
minf = CABS1(x, ix + inc_x2 * 3);
}
ix += inc_x2 * 4;
i += 4;
}
while (i < n) {
if (CABS1(x, ix) < minf) {
minf = CABS1(x, ix);
}
ix += inc_x2;
i++;
}
return (minf);
}
} }

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -28,194 +28,174 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>
#if defined(DOUBLE) #define CABS1(x,i) (fabs(x[i]) + fabs(x[i + 1]))
#define ABS fabs
#else
#define ABS fabsf
#endif
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x) {
FLOAT amin;
static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x) __asm__("vleg %%v0,0(%[x]),0\n\t"
{ "vleg %%v16,8(%[x]),0\n\t"
FLOAT amin; "vleg %%v0,16(%[x]),1\n\t"
"vleg %%v16,24(%[x]),1\n\t"
"vflpdb %%v0,%%v0\n\t"
"vflpdb %%v16,%%v16\n\t"
"vfadb %%v0,%%v0,%%v16\n\t"
"srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vleg %%v16,0(%%r1,%[x]),0\n\t"
"vleg %%v17,8(%%r1,%[x]),0\n\t"
"vleg %%v16,16(%%r1,%[x]),1\n\t"
"vleg %%v17,24(%%r1,%[x]),1\n\t"
"vleg %%v18,32(%%r1,%[x]),0\n\t"
"vleg %%v19,40(%%r1,%[x]),0\n\t"
"vleg %%v18,48(%%r1,%[x]),1\n\t"
"vleg %%v19,56(%%r1,%[x]),1\n\t"
"vleg %%v20,64(%%r1,%[x]),0\n\t"
"vleg %%v21,72(%%r1,%[x]),0\n\t"
"vleg %%v20,80(%%r1,%[x]),1\n\t"
"vleg %%v21,88(%%r1,%[x]),1\n\t"
"vleg %%v22,96(%%r1,%[x]),0\n\t"
"vleg %%v23,104(%%r1,%[x]),0\n\t"
"vleg %%v22,112(%%r1,%[x]),1\n\t"
"vleg %%v23,120(%%r1,%[x]),1\n\t"
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
"vflpdb %%v19, %%v19\n\t"
"vflpdb %%v20, %%v20\n\t"
"vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t"
"vfadb %%v16,%%v16,%%v17\n\t"
"vfadb %%v17,%%v18,%%v19\n\t"
"vfadb %%v18,%%v20,%%v21\n\t"
"vfadb %%v19,%%v22,%%v23\n\t"
"vfchdb %%v24,%%v17,%%v16\n\t"
"vfchdb %%v25,%%v19,%%v18\n\t"
"vsel %%v24,%%v16,%%v17,%%v24\n\t"
"vsel %%v25,%%v18,%%v19,%%v25\n\t"
"vfchdb %%v26,%%v25,%%v24\n\t"
"vsel %%v26,%%v24,%%v25,%%v26\n\t"
"vfchdb %%v27,%%v0,%%v26\n\t"
"vsel %%v0,%%v26,%%v0,%%v27\n\t"
"vleg %%v16,128(%%r1,%[x]),0\n\t"
"vleg %%v17,136(%%r1,%[x]),0\n\t"
"vleg %%v16,144(%%r1,%[x]),1\n\t"
"vleg %%v17,152(%%r1,%[x]),1\n\t"
"vleg %%v18,160(%%r1,%[x]),0\n\t"
"vleg %%v19,168(%%r1,%[x]),0\n\t"
"vleg %%v18,176(%%r1,%[x]),1\n\t"
"vleg %%v19,184(%%r1,%[x]),1\n\t"
"vleg %%v20,192(%%r1,%[x]),0\n\t"
"vleg %%v21,200(%%r1,%[x]),0\n\t"
"vleg %%v20,208(%%r1,%[x]),1\n\t"
"vleg %%v21,216(%%r1,%[x]),1\n\t"
"vleg %%v22,224(%%r1,%[x]),0\n\t"
"vleg %%v23,232(%%r1,%[x]),0\n\t"
"vleg %%v22,240(%%r1,%[x]),1\n\t"
"vleg %%v23,248(%%r1,%[x]),1\n\t"
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
"vflpdb %%v19, %%v19\n\t"
"vflpdb %%v20, %%v20\n\t"
"vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t"
"vfadb %%v16,%%v16,%%v17\n\t"
"vfadb %%v17,%%v18,%%v19\n\t"
"vfadb %%v18,%%v20,%%v21\n\t"
"vfadb %%v19,%%v22,%%v23\n\t"
"vfchdb %%v24,%%v17,%%v16\n\t"
"vfchdb %%v25,%%v19,%%v18\n\t"
"vsel %%v24,%%v16,%%v17,%%v24\n\t"
"vsel %%v25,%%v18,%%v19,%%v25\n\t"
"vfchdb %%v26,%%v25,%%v24\n\t"
"vsel %%v26,%%v24,%%v25,%%v26\n\t"
"vfchdb %%v27,%%v0,%%v26\n\t"
"vsel %%v0,%%v26,%%v0,%%v27\n\t"
"agfi %%r1, 256\n\t"
"brctg %[n], 0b\n\t"
"vrepg %%v16,%%v0,1\n\t"
"wfchdb %%v17,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v17\n\t"
"ldr %[amin],%%f0"
: [amin] "=f"(amin),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23", "v24", "v25", "v26", "v27");
__asm__ volatile ( return amin;
"vleg %%v0,0(%2),0 \n\t"
"vleg %%v16,8(%2),0 \n\t"
"vleg %%v0,16(%2),1 \n\t"
"vleg %%v16,24(%2),1 \n\t"
"vflpdb %%v0,%%v0 \n\t"
"vflpdb %%v16,%%v16 \n\t"
"vfadb %%v0,%%v0,%%v16 \n\t"
"srlg %%r0,%1,4 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%2) \n\t"
"vleg %%v16,0(%%r1,%2),0 \n\t"
"vleg %%v17,8(%%r1,%2),0 \n\t"
"vleg %%v16,16(%%r1,%2),1 \n\t"
"vleg %%v17,24(%%r1,%2),1 \n\t"
"vleg %%v18,32(%%r1,%2),0 \n\t"
"vleg %%v19,40(%%r1,%2),0 \n\t"
"vleg %%v18,48(%%r1,%2),1 \n\t"
"vleg %%v19,56(%%r1,%2),1 \n\t"
"vleg %%v20,64(%%r1,%2),0 \n\t"
"vleg %%v21,72(%%r1,%2),0 \n\t"
"vleg %%v20,80(%%r1,%2),1 \n\t"
"vleg %%v21,88(%%r1,%2),1 \n\t"
"vleg %%v22,96(%%r1,%2),0 \n\t"
"vleg %%v23,104(%%r1,%2),0 \n\t"
"vleg %%v22,112(%%r1,%2),1 \n\t"
"vleg %%v23,120(%%r1,%2),1 \n\t"
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
"vfadb %%v16,%%v16,%%v17 \n\t"
"vfadb %%v17,%%v18,%%v19 \n\t"
"vfadb %%v18,%%v20,%%v21 \n\t"
"vfadb %%v19,%%v22,%%v23 \n\t"
"vfchdb %%v24,%%v17,%%v16 \n\t"
"vfchdb %%v25,%%v19,%%v18 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
"vfchdb %%v26,%%v25,%%v24 \n\t"
"vsel %%v26,%%v24,%%v25,%%v26 \n\t"
"vfchdb %%v27,%%v0,%%v26 \n\t"
"vsel %%v0,%%v26,%%v0,%%v27 \n\t"
"vleg %%v16,128(%%r1,%2),0 \n\t"
"vleg %%v17,136(%%r1,%2),0 \n\t"
"vleg %%v16,144(%%r1,%2),1 \n\t"
"vleg %%v17,152(%%r1,%2),1 \n\t"
"vleg %%v18,160(%%r1,%2),0 \n\t"
"vleg %%v19,168(%%r1,%2),0 \n\t"
"vleg %%v18,176(%%r1,%2),1 \n\t"
"vleg %%v19,184(%%r1,%2),1 \n\t"
"vleg %%v20,192(%%r1,%2),0 \n\t"
"vleg %%v21,200(%%r1,%2),0 \n\t"
"vleg %%v20,208(%%r1,%2),1 \n\t"
"vleg %%v21,216(%%r1,%2),1 \n\t"
"vleg %%v22,224(%%r1,%2),0 \n\t"
"vleg %%v23,232(%%r1,%2),0 \n\t"
"vleg %%v22,240(%%r1,%2),1 \n\t"
"vleg %%v23,248(%%r1,%2),1 \n\t"
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
"vfadb %%v16,%%v16,%%v17 \n\t"
"vfadb %%v17,%%v18,%%v19 \n\t"
"vfadb %%v18,%%v20,%%v21 \n\t"
"vfadb %%v19,%%v22,%%v23 \n\t"
"vfchdb %%v24,%%v17,%%v16 \n\t"
"vfchdb %%v25,%%v19,%%v18 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
"vfchdb %%v26,%%v25,%%v24 \n\t"
"vsel %%v26,%%v24,%%v25,%%v26 \n\t"
"vfchdb %%v27,%%v0,%%v26 \n\t"
"vsel %%v0,%%v26,%%v0,%%v27 \n\t"
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"
"vrepg %%v16,%%v0,1 \n\t"
"wfchdb %%v17,%%v16,%%v0 \n\t"
"vsel %%v0,%%v0,%%v16,%%v17 \n\t"
"ldr %0,%%f0 "
:"=f"(amin)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27"
);
return amin;
} }
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0; BLASLONG i = 0;
BLASLONG ix = 0; BLASLONG ix = 0;
FLOAT minf = 0.0; FLOAT minf = 0.0;
BLASLONG inc_x2; BLASLONG inc_x2;
if (n <= 0 || inc_x <= 0) return (minf); if (n <= 0 || inc_x <= 0)
return (minf);
if (inc_x == 1) { if (inc_x == 1) {
BLASLONG n1 = n & -16; BLASLONG n1 = n & -16;
if (n1 > 0) { if (n1 > 0) {
minf = zamin_kernel_16(n1, x);
ix = n1 * 2;
i = n1;
}
else
{
minf=CABS1(x,0);
ix += 2;
i++;
}
while (i < n) {
if (CABS1(x,ix) < minf) {
minf = CABS1(x,ix);
}
ix += 2;
i++;
}
return (minf);
minf = zamin_kernel_16(n1, x);
ix = n1 * 2;
i = n1;
} else { } else {
minf = CABS1(x, 0);
minf=CABS1(x,0); ix += 2;
inc_x2 = 2 * inc_x; i++;
BLASLONG n1 = n & -4;
while (i < n1) {
if (CABS1(x,ix) < minf) {
minf = CABS1(x,ix);
}
if (CABS1(x,ix+inc_x2) < minf) {
minf = CABS1(x,ix+inc_x2);
}
if (CABS1(x,ix+inc_x2*2) < minf) {
minf = CABS1(x,ix+inc_x2*2);
}
if (CABS1(x,ix+inc_x2*3) < minf) {
minf = CABS1(x,ix+inc_x2*3);
}
ix += inc_x2 * 4;
i += 4;
}
while (i < n) {
if (CABS1(x,ix) < minf) {
minf = CABS1(x,ix);
}
ix += inc_x2;
i++;
}
return (minf);
} }
while (i < n) {
if (CABS1(x, ix) < minf) {
minf = CABS1(x, ix);
}
ix += 2;
i++;
}
return (minf);
} else {
minf = CABS1(x, 0);
inc_x2 = 2 * inc_x;
BLASLONG n1 = n & -4;
while (i < n1) {
if (CABS1(x, ix) < minf) {
minf = CABS1(x, ix);
}
if (CABS1(x, ix + inc_x2) < minf) {
minf = CABS1(x, ix + inc_x2);
}
if (CABS1(x, ix + inc_x2 * 2) < minf) {
minf = CABS1(x, ix + inc_x2 * 2);
}
if (CABS1(x, ix + inc_x2 * 3) < minf) {
minf = CABS1(x, ix + inc_x2 * 3);
}
ix += inc_x2 * 4;
i += 4;
}
while (i < n) {
if (CABS1(x, ix) < minf) {
minf = CABS1(x, ix);
}
ix += inc_x2;
i++;
}
return (minf);
}
} }

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -28,138 +28,126 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>
#if defined(DOUBLE)
#define ABS fabs #define ABS fabs
#else
#define ABS fabsf
#endif
static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x) static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x) {
{ FLOAT asum;
FLOAT asum;
__asm__ (
"vzero %%v0 \n\t"
"vzero %%v1 \n\t"
"vzero %%v2 \n\t"
"vzero %%v3 \n\t"
"srlg %%r0,%1,4 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%2) \n\t"
"vl %%v16, 0(%%r1,%2) \n\t"
"vl %%v17, 16(%%r1,%2) \n\t"
"vl %%v18, 32(%%r1,%2) \n\t"
"vl %%v19, 48(%%r1,%2) \n\t"
"vl %%v20, 64(%%r1,%2) \n\t"
"vl %%v21, 80(%%r1,%2) \n\t"
"vl %%v22, 96(%%r1,%2) \n\t"
"vl %%v23, 112(%%r1,%2) \n\t"
"vflpdb %%v16, %%v16 \n\t" __asm__("vzero %%v24\n\t"
"vflpdb %%v17, %%v17 \n\t" "vzero %%v25\n\t"
"vflpdb %%v18, %%v18 \n\t" "vzero %%v26\n\t"
"vflpdb %%v19, %%v19 \n\t" "vzero %%v27\n\t"
"vflpdb %%v20, %%v20 \n\t" "vzero %%v28\n\t"
"vflpdb %%v21, %%v21 \n\t" "vzero %%v29\n\t"
"vflpdb %%v22, %%v22 \n\t" "vzero %%v30\n\t"
"vflpdb %%v23, %%v23 \n\t" "vzero %%v31\n\t"
"srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1\n\t"
"0:\n\t"
"pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16, 0(%%r1,%[x])\n\t"
"vl %%v17, 16(%%r1,%[x])\n\t"
"vl %%v18, 32(%%r1,%[x])\n\t"
"vl %%v19, 48(%%r1,%[x])\n\t"
"vl %%v20, 64(%%r1,%[x])\n\t"
"vl %%v21, 80(%%r1,%[x])\n\t"
"vl %%v22, 96(%%r1,%[x])\n\t"
"vl %%v23, 112(%%r1,%[x])\n\t"
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
"vflpdb %%v19, %%v19\n\t"
"vflpdb %%v20, %%v20\n\t"
"vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t"
"vfadb %%v24,%%v24,%%v16\n\t"
"vfadb %%v25,%%v25,%%v17\n\t"
"vfadb %%v26,%%v26,%%v18\n\t"
"vfadb %%v27,%%v27,%%v19\n\t"
"vfadb %%v28,%%v28,%%v20\n\t"
"vfadb %%v29,%%v29,%%v21\n\t"
"vfadb %%v30,%%v30,%%v22\n\t"
"vfadb %%v31,%%v31,%%v23\n\t"
"vl %%v16, 128(%%r1,%[x])\n\t"
"vl %%v17, 144(%%r1,%[x])\n\t"
"vl %%v18, 160(%%r1,%[x])\n\t"
"vl %%v19, 176(%%r1,%[x])\n\t"
"vl %%v20, 192(%%r1,%[x])\n\t"
"vl %%v21, 208(%%r1,%[x])\n\t"
"vl %%v22, 224(%%r1,%[x])\n\t"
"vl %%v23, 240(%%r1,%[x])\n\t"
"vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t"
"vflpdb %%v19, %%v19\n\t"
"vflpdb %%v20, %%v20\n\t"
"vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t"
"vfadb %%v24,%%v24,%%v16\n\t"
"vfadb %%v25,%%v25,%%v17\n\t"
"vfadb %%v26,%%v26,%%v18\n\t"
"vfadb %%v27,%%v27,%%v19\n\t"
"vfadb %%v28,%%v28,%%v20\n\t"
"vfadb %%v29,%%v29,%%v21\n\t"
"vfadb %%v30,%%v30,%%v22\n\t"
"vfadb %%v31,%%v31,%%v23\n\t"
"agfi %%r1,256\n\t"
"brctg %[n],0b\n\t"
"vfadb %%v24,%%v24,%%v25\n\t"
"vfadb %%v24,%%v24,%%v26\n\t"
"vfadb %%v24,%%v24,%%v27\n\t"
"vfadb %%v24,%%v24,%%v28\n\t"
"vfadb %%v24,%%v24,%%v29\n\t"
"vfadb %%v24,%%v24,%%v30\n\t"
"vfadb %%v24,%%v24,%%v31\n\t"
"vrepg %%v25,%%v24,1\n\t"
"vfadb %%v24,%%v24,%%v25\n\t"
"vsteg %%v24,%[asum],0"
: [asum] "=Q"(asum),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
: "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
"v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
"vfadb %%v0,%%v0,%%v16 \n\t" return asum;
"vfadb %%v1,%%v1,%%v17 \n\t"
"vfadb %%v2,%%v2,%%v18 \n\t"
"vfadb %%v3,%%v3,%%v19 \n\t"
"vfadb %%v0,%%v0,%%v20 \n\t"
"vfadb %%v1,%%v1,%%v21 \n\t"
"vfadb %%v2,%%v2,%%v22 \n\t"
"vfadb %%v3,%%v3,%%v23 \n\t"
"vl %%v16, 128(%%r1,%2) \n\t"
"vl %%v17, 144(%%r1,%2) \n\t"
"vl %%v18, 160(%%r1,%2) \n\t"
"vl %%v19, 176(%%r1,%2) \n\t"
"vl %%v20, 192(%%r1,%2) \n\t"
"vl %%v21, 208(%%r1,%2) \n\t"
"vl %%v22, 224(%%r1,%2) \n\t"
"vl %%v23, 240(%%r1,%2) \n\t"
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
"vfadb %%v0,%%v0,%%v16 \n\t"
"vfadb %%v1,%%v1,%%v17 \n\t"
"vfadb %%v2,%%v2,%%v18 \n\t"
"vfadb %%v3,%%v3,%%v19 \n\t"
"vfadb %%v0,%%v0,%%v20 \n\t"
"vfadb %%v1,%%v1,%%v21 \n\t"
"vfadb %%v2,%%v2,%%v22 \n\t"
"vfadb %%v3,%%v3,%%v23 \n\t"
"agfi %%r1,256 \n\t"
"brctg %%r0,0b \n\t"
"vfadb %%v0,%%v0,%%v1 \n\t"
"vfadb %%v0,%%v0,%%v2 \n\t"
"vfadb %%v0,%%v0,%%v3 \n\t"
"vrepg %%v1,%%v0,1 \n\t"
"adbr %%f0,%%f1 \n\t"
"ldr %0,%%f0 "
:"=f"(asum)
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x)
:"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23"
);
return asum;
} }
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
{ BLASLONG i = 0;
BLASLONG i=0; BLASLONG ip = 0;
BLASLONG ip=0; FLOAT sumf = 0.0;
FLOAT sumf = 0.0; BLASLONG n1;
BLASLONG n1; BLASLONG inc_x2;
BLASLONG inc_x2;
if (n <= 0 || inc_x <= 0) return(sumf); if (n <= 0 || inc_x <= 0)
return (sumf);
if ( inc_x == 1 ) if (inc_x == 1) {
{
n1 = n & -16; n1 = n & -16;
if ( n1 > 0 ) if (n1 > 0) {
{
sumf = zasum_kernel_16(n1, x);
i=n1;
ip=2*n1;
}
while(i < n)
{
sumf += ABS(x[ip]) + ABS(x[ip+1]);
i++;
ip+=2;
}
sumf = zasum_kernel_16(n1, x);
i = n1;
ip = 2 * n1;
} }
else
{
inc_x2 = 2* inc_x;
while(i < n)
{
sumf += ABS(x[ip]) + ABS(x[ip+1]);
ip+=inc_x2;
i++;
}
while (i < n) {
sumf += ABS(x[ip]) + ABS(x[ip + 1]);
i++;
ip += 2;
} }
return(sumf);
} else {
inc_x2 = 2 * inc_x;
while (i < n) {
sumf += ABS(x[ip]) + ABS(x[ip + 1]);
ip += inc_x2;
i++;
}
}
return (sumf);
} }

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -27,144 +27,136 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) {
{ __asm__(
__asm__ volatile(
#if !defined(CONJ) #if !defined(CONJ)
"vlrepg %%v0,0(%3) \n\t" "vlrepg %%v0,0(%[alpha])\n\t"
"vleg %%v1,8(%3),0 \n\t" "vleg %%v1,8(%[alpha]),0\n\t"
"wflcdb %%v1,%%v1 \n\t" "wflcdb %%v1,%%v1\n\t"
"vleg %%v1,8(%3),1 \n\t" "vleg %%v1,8(%[alpha]),1\n\t"
#else #else
"vleg %%v0,0(%3),1 \n\t" "vleg %%v0,0(%[alpha]),1\n\t"
"vflcdb %%v0,%%v0 \n\t" "vflcdb %%v0,%%v0\n\t"
"vleg %%v0,0(%3),0 \n\t" "vleg %%v0,0(%[alpha]),0\n\t"
"vlrepg %%v1,8(%3) \n\t" "vlrepg %%v1,8(%[alpha])\n\t"
#endif #endif
"srlg %%r0,%0,3 \n\t" "srlg %[n],%[n],3\n\t"
"xgr %%r1,%%r1 \n\t" "xgr %%r1,%%r1\n\t"
"0: \n\t" "0:\n\t"
"pfd 1, 1024(%%r1,%1) \n\t" "pfd 1, 1024(%%r1,%[x])\n\t"
"pfd 2, 1024(%%r1,%2) \n\t" "pfd 2, 1024(%%r1,%[y])\n\t"
"vl %%v8,0(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%1) \n\t" "vl %%v9,16(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%1) \n\t" "vl %%v10,32(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%1) \n\t" "vl %%v11,48(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%1) \n\t" "vl %%v12,0(%%r1,%[y])\n\t"
"vl %%v20,0(%%r1,%2) \n\t" "vl %%v13,16(%%r1,%[y])\n\t"
"vl %%v21,16(%%r1,%2) \n\t" "vl %%v14,32(%%r1,%[y])\n\t"
"vl %%v22,32(%%r1,%2) \n\t" "vl %%v15,48(%%r1,%[y])\n\t"
"vl %%v23,48(%%r1,%2) \n\t" "vl %%v16,64(%%r1,%[x])\n\t"
"vpdi %%v24,%%v16,%%v16,4 \n\t" "vl %%v17,80(%%r1,%[x])\n\t"
"vpdi %%v25,%%v17,%%v17,4 \n\t" "vl %%v18,96(%%r1,%[x])\n\t"
"vpdi %%v26,%%v18,%%v18,4 \n\t" "vl %%v19,112(%%r1,%[x])\n\t"
"vpdi %%v27,%%v19,%%v19,4 \n\t" "vl %%v20,64(%%r1,%[y])\n\t"
"vl %%v21,80(%%r1,%[y])\n\t"
"vfmadb %%v28,%%v16,%%v0,%%v20 \n\t" "vl %%v22,96(%%r1,%[y])\n\t"
"vfmadb %%v29,%%v17,%%v0,%%v21 \n\t" "vl %%v23,112(%%r1,%[y])\n\t"
"vfmadb %%v30,%%v18,%%v0,%%v22 \n\t" "vpdi %%v24,%%v8,%%v8,4\n\t"
"vfmadb %%v31,%%v19,%%v0,%%v23 \n\t" "vpdi %%v25,%%v9,%%v9,4\n\t"
"vpdi %%v26,%%v10,%%v10,4\n\t"
"vfmadb %%v28,%%v24,%%v1,%%v28 \n\t" "vpdi %%v27,%%v11,%%v11,4\n\t"
"vfmadb %%v29,%%v25,%%v1,%%v29 \n\t" "vpdi %%v28,%%v16,%%v16,4\n\t"
"vfmadb %%v30,%%v26,%%v1,%%v30 \n\t" "vpdi %%v29,%%v17,%%v17,4\n\t"
"vfmadb %%v31,%%v27,%%v1,%%v31 \n\t" "vpdi %%v30,%%v18,%%v18,4\n\t"
"vpdi %%v31,%%v19,%%v19,4\n\t"
"vst %%v28,0(%%r1,%2) \n\t" "vfmadb %%v8,%%v8,%%v0,%%v12\n\t"
"vst %%v29,16(%%r1,%2) \n\t" "vfmadb %%v9,%%v9,%%v0,%%v13\n\t"
"vst %%v30,32(%%r1,%2) \n\t" "vfmadb %%v10,%%v10,%%v0,%%v14\n\t"
"vst %%v31,48(%%r1,%2) \n\t" "vfmadb %%v11,%%v11,%%v0,%%v15\n\t"
"vfmadb %%v16,%%v16,%%v0,%%v20\n\t"
"vl %%v16,64(%%r1,%1) \n\t" "vfmadb %%v17,%%v17,%%v0,%%v21\n\t"
"vl %%v17,80(%%r1,%1) \n\t" "vfmadb %%v18,%%v18,%%v0,%%v22\n\t"
"vl %%v18,96(%%r1,%1) \n\t" "vfmadb %%v19,%%v19,%%v0,%%v23\n\t"
"vl %%v19,112(%%r1,%1) \n\t" "vfmadb %%v8,%%v24,%%v1,%%v8\n\t"
"vl %%v20,64(%%r1,%2) \n\t" "vfmadb %%v9,%%v25,%%v1,%%v9\n\t"
"vl %%v21,80(%%r1,%2) \n\t" "vfmadb %%v10,%%v26,%%v1,%%v10\n\t"
"vl %%v22,96(%%r1,%2) \n\t" "vfmadb %%v11,%%v27,%%v1,%%v11\n\t"
"vl %%v23,112(%%r1,%2) \n\t" "vfmadb %%v16,%%v28,%%v1,%%v16\n\t"
"vpdi %%v24,%%v16,%%v16,4 \n\t" "vfmadb %%v17,%%v29,%%v1,%%v17\n\t"
"vpdi %%v25,%%v17,%%v17,4 \n\t" "vfmadb %%v18,%%v30,%%v1,%%v18\n\t"
"vpdi %%v26,%%v18,%%v18,4 \n\t" "vfmadb %%v19,%%v31,%%v1,%%v19\n\t"
"vpdi %%v27,%%v19,%%v19,4 \n\t" "vst %%v8,0(%%r1,%[y])\n\t"
"vst %%v9,16(%%r1,%[y])\n\t"
"vfmadb %%v28,%%v16,%%v0,%%v20 \n\t" "vst %%v10,32(%%r1,%[y])\n\t"
"vfmadb %%v29,%%v17,%%v0,%%v21 \n\t" "vst %%v11,48(%%r1,%[y])\n\t"
"vfmadb %%v30,%%v18,%%v0,%%v22 \n\t" "vst %%v16,64(%%r1,%[y])\n\t"
"vfmadb %%v31,%%v19,%%v0,%%v23 \n\t" "vst %%v17,80(%%r1,%[y])\n\t"
"vst %%v18,96(%%r1,%[y])\n\t"
"vfmadb %%v28,%%v24,%%v1,%%v28 \n\t" "vst %%v19,112(%%r1,%[y])\n\t"
"vfmadb %%v29,%%v25,%%v1,%%v29 \n\t" "agfi %%r1,128\n\t"
"vfmadb %%v30,%%v26,%%v1,%%v30 \n\t" "brctg %[n],0b"
"vfmadb %%v31,%%v27,%%v1,%%v31 \n\t" : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n)
: [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x),
"vst %%v28,64(%%r1,%2) \n\t" "m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha)
"vst %%v29,80(%%r1,%2) \n\t" : "cc", "r1", "v0", "v1", "v8", "v9", "v10", "v11", "v12", "v13",
"vst %%v30,96(%%r1,%2) \n\t" "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
"vst %%v31,112(%%r1,%2) \n\t" "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
"agfi %%r1,128 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"ZQ"((const FLOAT (*)[2])alpha)
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
} }
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
BLASLONG i = 0; FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
BLASLONG ix = 0, iy = 0; BLASLONG dummy2) {
FLOAT da[2] __attribute__ ((aligned(16))); BLASLONG i = 0;
BLASLONG ix = 0, iy = 0;
FLOAT da[2] __attribute__ ((aligned(16)));
if (n <= 0) return (0); if (n <= 0)
return (0);
if ((inc_x == 1) && (inc_y == 1)) { if ((inc_x == 1) && (inc_y == 1)) {
BLASLONG n1 = n & -8;
if (n1) {
da[0] = da_r;
da[1] = da_i;
zaxpy_kernel_8(n1, x, y, da);
ix = 2 * n1;
}
i = n1;
while (i < n) {
#if !defined(CONJ)
y[ix] += (da_r * x[ix] - da_i * x[ix + 1]);
y[ix + 1] += (da_r * x[ix + 1] + da_i * x[ix]);
#else
y[ix] += (da_r * x[ix] + da_i * x[ix + 1]);
y[ix + 1] -= (da_r * x[ix + 1] - da_i * x[ix]);
#endif
i++;
ix += 2;
}
return (0);
BLASLONG n1 = n & -8;
if (n1) {
da[0] = da_r;
da[1] = da_i;
zaxpy_kernel_8(n1, x, y, da);
ix = 2 * n1;
} }
i = n1;
inc_x *= 2;
inc_y *= 2;
while (i < n) { while (i < n) {
#if !defined(CONJ) #if !defined(CONJ)
y[iy] += (da_r * x[ix] - da_i * x[ix + 1]); y[ix] += (da_r * x[ix] - da_i * x[ix + 1]);
y[iy + 1] += (da_r * x[ix + 1] + da_i * x[ix]); y[ix + 1] += (da_r * x[ix + 1] + da_i * x[ix]);
#else #else
y[iy] += (da_r * x[ix] + da_i * x[ix + 1]); y[ix] += (da_r * x[ix] + da_i * x[ix + 1]);
y[iy + 1] -= (da_r * x[ix + 1] - da_i * x[ix]); y[ix + 1] -= (da_r * x[ix + 1] - da_i * x[ix]);
#endif #endif
ix += inc_x; i++;
iy += inc_y; ix += 2;
i++;
} }
return (0); return (0);
}
inc_x *= 2;
inc_y *= 2;
while (i < n) {
#if !defined(CONJ)
y[iy] += (da_r * x[ix] - da_i * x[ix + 1]);
y[iy + 1] += (da_r * x[ix + 1] + da_i * x[ix]);
#else
y[iy] += (da_r * x[ix] + da_i * x[ix + 1]);
y[iy + 1] -= (da_r * x[ix + 1] - da_i * x[ix]);
#endif
ix += inc_x;
iy += inc_y;
i++;
}
return (0);
} }

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -27,73 +27,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
static void zcopy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) static void zcopy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) {
{ __asm__("srlg %[n],%[n],4\n\t"
__asm__ volatile ( "0:\n\t"
"lgr %%r1,%1 \n\t" "pfd 1, 1024(%[x])\n\t"
"lgr %%r2,%2 \n\t" "pfd 2, 1024(%[y])\n\t"
"srlg %%r0,%0,4 \n\t" "mvc 0(256,%[y]),0(%[x])\n\t"
"0: \n\t" "la %[x],256(%[x])\n\t"
"pfd 1, 1024(%%r1) \n\t" "la %[y],256(%[y])\n\t"
"pfd 2, 1024(%%r2) \n\t" "brctg %[n],0b"
"mvc 0(256,%%r2),0(%%r1) \n\t" : "=m"(*(struct { FLOAT x[n * 2]; } *) y),[x] "+&a"(x),[y] "+&a"(y),
"agfi %%r1,256 \n\t" [n] "+&r"(n)
"agfi %%r2,256 \n\t" : "m"(*(const struct { FLOAT x[n * 2]; } *) x)
"brctg %%r0,0b " : "cc");
:
:"r"(n),"a"((const FLOAT (*)[n * 2])x),"a"((FLOAT (*)[n * 2])y)
:"memory","cc","r0","r1","r2"
);
} }
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
{ BLASLONG i = 0;
BLASLONG i=0; BLASLONG ix = 0, iy = 0;
BLASLONG ix=0,iy=0;
if ( n <= 0 ) return(0); if (n <= 0)
return (0);
if ( (inc_x == 1) && (inc_y == 1 )) if ((inc_x == 1) && (inc_y == 1)) {
{
BLASLONG n1 = n & -16; BLASLONG n1 = n & -16;
if ( n1 > 0 ) if (n1 > 0) {
{ zcopy_kernel_16(n1, x, y);
zcopy_kernel_16(n1, x, y); i = n1;
i=n1; ix = n1 * 2;
ix=n1*2; iy = n1 * 2;
iy=n1*2; }
}
while(i < n)
{
y[iy] = x[iy] ;
y[iy+1] = x[ix+1] ;
ix+=2;
iy+=2;
i++ ;
}
while (i < n) {
y[iy] = x[iy];
y[iy + 1] = x[ix + 1];
ix += 2;
iy += 2;
i++;
} }
else
{
BLASLONG inc_x2 = 2 * inc_x; } else {
BLASLONG inc_y2 = 2 * inc_y;
while(i < n) BLASLONG inc_x2 = 2 * inc_x;
{ BLASLONG inc_y2 = 2 * inc_y;
y[iy] = x[ix] ;
y[iy+1] = x[ix+1] ;
ix += inc_x2 ;
iy += inc_y2 ;
i++ ;
} while (i < n) {
y[iy] = x[ix];
y[iy + 1] = x[ix + 1];
ix += inc_x2;
iy += inc_y2;
i++;
} }
return(0); }
return (0);
} }

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -27,152 +27,146 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) {
{ __asm__("vzero %%v24\n\t"
__asm__ volatile( "vzero %%v25\n\t"
"vzero %%v24 \n\t" "vzero %%v26\n\t"
"vzero %%v25 \n\t" "vzero %%v27\n\t"
"vzero %%v26 \n\t" "vzero %%v28\n\t"
"vzero %%v27 \n\t" "vzero %%v29\n\t"
"vzero %%v28 \n\t" "vzero %%v30\n\t"
"vzero %%v29 \n\t" "vzero %%v31\n\t"
"vzero %%v30 \n\t" "srlg %[n],%[n],3\n\t"
"vzero %%v31 \n\t" "xgr %%r1,%%r1\n\t"
"srlg %%r0,%0,3 \n\t" "0:\n\t"
"xgr %%r1,%%r1 \n\t" "pfd 1, 1024(%%r1,%[x])\n\t"
"0: \n\t" "pfd 1, 1024(%%r1,%[y])\n\t"
"pfd 1, 1024(%%r1,%1) \n\t" "vl %%v16, 0(%%r1,%[x])\n\t"
"pfd 1, 1024(%%r1,%2) \n\t" "vl %%v17, 16(%%r1,%[x])\n\t"
"vl %%v18, 32(%%r1,%[x])\n\t"
"vl %%v16, 0(%%r1,%1) \n\t" "vl %%v19, 48(%%r1,%[x])\n\t"
"vl %%v17, 16(%%r1,%1) \n\t" "vl %%v0, 0(%%r1,%[y])\n\t"
"vl %%v18, 32(%%r1,%1) \n\t" "vl %%v1, 16(%%r1,%[y])\n\t"
"vl %%v19, 48(%%r1,%1) \n\t" "vl %%v2, 32(%%r1,%[y])\n\t"
"vl %%v0, 0(%%r1,%2) \n\t" "vl %%v3, 48(%%r1,%[y])\n\t"
"vl %%v1, 16(%%r1,%2) \n\t" "vpdi %%v20,%%v16,%%v16,4\n\t"
"vl %%v2, 32(%%r1,%2) \n\t" "vpdi %%v21,%%v17,%%v17,4\n\t"
"vl %%v3, 48(%%r1,%2) \n\t" "vpdi %%v22,%%v18,%%v18,4\n\t"
"vpdi %%v20,%%v16,%%v16,4 \n\t" "vpdi %%v23,%%v19,%%v19,4\n\t"
"vpdi %%v21,%%v17,%%v17,4 \n\t" "vfmadb %%v24,%%v16,%%v0,%%v24\n\t"
"vpdi %%v22,%%v18,%%v18,4 \n\t" "vfmadb %%v25,%%v20,%%v0,%%v25\n\t"
"vpdi %%v23,%%v19,%%v19,4 \n\t" "vfmadb %%v26,%%v17,%%v1,%%v26\n\t"
"vfmadb %%v27,%%v21,%%v1,%%v27\n\t"
"vfmadb %%v24,%%v16,%%v0,%%v24 \n\t" "vfmadb %%v28,%%v18,%%v2,%%v28\n\t"
"vfmadb %%v25,%%v20,%%v0,%%v25 \n\t" "vfmadb %%v29,%%v22,%%v2,%%v29\n\t"
"vfmadb %%v26,%%v17,%%v1,%%v26 \n\t" "vfmadb %%v30,%%v19,%%v3,%%v30\n\t"
"vfmadb %%v27,%%v21,%%v1,%%v27 \n\t" "vfmadb %%v31,%%v23,%%v3,%%v31\n\t"
"vfmadb %%v28,%%v18,%%v2,%%v28 \n\t" "vl %%v16, 64(%%r1,%[x])\n\t"
"vfmadb %%v29,%%v22,%%v2,%%v29 \n\t" "vl %%v17, 80(%%r1,%[x])\n\t"
"vfmadb %%v30,%%v19,%%v3,%%v30 \n\t" "vl %%v18, 96(%%r1,%[x])\n\t"
"vfmadb %%v31,%%v23,%%v3,%%v31 \n\t" "vl %%v19, 112(%%r1,%[x])\n\t"
"vl %%v0, 64(%%r1,%[y])\n\t"
"vl %%v16, 64(%%r1,%1) \n\t" "vl %%v1, 80(%%r1,%[y])\n\t"
"vl %%v17, 80(%%r1,%1) \n\t" "vl %%v2, 96(%%r1,%[y])\n\t"
"vl %%v18, 96(%%r1,%1) \n\t" "vl %%v3, 112(%%r1,%[y])\n\t"
"vl %%v19, 112(%%r1,%1) \n\t" "vpdi %%v20,%%v16,%%v16,4\n\t"
"vl %%v0, 64(%%r1,%2) \n\t" "vpdi %%v21,%%v17,%%v17,4\n\t"
"vl %%v1, 80(%%r1,%2) \n\t" "vpdi %%v22,%%v18,%%v18,4\n\t"
"vl %%v2, 96(%%r1,%2) \n\t" "vpdi %%v23,%%v19,%%v19,4\n\t"
"vl %%v3, 112(%%r1,%2) \n\t" "vfmadb %%v24,%%v16,%%v0,%%v24\n\t"
"vpdi %%v20,%%v16,%%v16,4 \n\t" "vfmadb %%v25,%%v20,%%v0,%%v25\n\t"
"vpdi %%v21,%%v17,%%v17,4 \n\t" "vfmadb %%v26,%%v17,%%v1,%%v26\n\t"
"vpdi %%v22,%%v18,%%v18,4 \n\t" "vfmadb %%v27,%%v21,%%v1,%%v27\n\t"
"vpdi %%v23,%%v19,%%v19,4 \n\t" "vfmadb %%v28,%%v18,%%v2,%%v28\n\t"
"vfmadb %%v29,%%v22,%%v2,%%v29\n\t"
"vfmadb %%v24,%%v16,%%v0,%%v24 \n\t" "vfmadb %%v30,%%v19,%%v3,%%v30\n\t"
"vfmadb %%v25,%%v20,%%v0,%%v25 \n\t" "vfmadb %%v31,%%v23,%%v3,%%v31\n\t"
"vfmadb %%v26,%%v17,%%v1,%%v26 \n\t" "agfi %%r1,128\n\t"
"vfmadb %%v27,%%v21,%%v1,%%v27 \n\t" "brctg %[n],0b\n\t"
"vfmadb %%v28,%%v18,%%v2,%%v28 \n\t" "vfadb %%v24,%%v24,%%v26\n\t"
"vfmadb %%v29,%%v22,%%v2,%%v29 \n\t" "vfadb %%v24,%%v24,%%v28\n\t"
"vfmadb %%v30,%%v19,%%v3,%%v30 \n\t" "vfadb %%v24,%%v24,%%v30\n\t"
"vfmadb %%v31,%%v23,%%v3,%%v31 \n\t" "vfadb %%v25,%%v25,%%v27\n\t"
"vfadb %%v25,%%v25,%%v29\n\t"
"agfi %%r1,128 \n\t" "vfadb %%v25,%%v25,%%v31\n\t"
"brctg %%r0,0b \n\t" "vsteg %%v24,0(%[d]),0\n\t"
"vfadb %%v24,%%v24,%%v26 \n\t" "vsteg %%v24,8(%[d]),1\n\t"
"vfadb %%v24,%%v24,%%v28 \n\t" "vsteg %%v25,16(%[d]),1\n\t"
"vfadb %%v24,%%v24,%%v30 \n\t" "vsteg %%v25,24(%[d]),0"
"vfadb %%v25,%%v25,%%v27 \n\t" : "=m"(*(struct { FLOAT x[4]; } *) d),[n] "+&r"(n)
"vfadb %%v25,%%v25,%%v29 \n\t" : [d] "a"(d), "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x),
"vfadb %%v25,%%v25,%%v31 \n\t" "m"(*(const struct { FLOAT x[n * 2]; } *) y),[y] "a"(y)
"vsteg %%v24,0(%3),0 \n\t" : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20",
"vsteg %%v24,8(%3),1 \n\t" "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"vsteg %%v25,16(%3),1 \n\t" "v31");
"vsteg %%v25,24(%3),0 "
:
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((const FLOAT (*)[n * 2])y),"ZQ"((FLOAT (*)[4])d)
:"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
} }
OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y,
BLASLONG i; BLASLONG inc_y) {
BLASLONG ix, iy; BLASLONG i;
OPENBLAS_COMPLEX_FLOAT result; BLASLONG ix, iy;
FLOAT dot[4] __attribute__ ((aligned(16))) = {0.0, 0.0, 0.0, 0.0}; OPENBLAS_COMPLEX_FLOAT result;
FLOAT dot[4] __attribute__ ((aligned(16))) = {
0.0, 0.0, 0.0, 0.0};
if (n <= 0) { if (n <= 0) {
CREAL(result) = 0.0; CREAL(result) = 0.0;
CIMAG(result) = 0.0; CIMAG(result) = 0.0;
return (result); return (result);
}
if ((inc_x == 1) && (inc_y == 1)) {
BLASLONG n1 = n & -8;
if (n1)
zdot_kernel_8(n1, x, y, dot);
i = n1;
BLASLONG j = i * 2;
while (i < n) {
dot[0] += x[j] * y[j];
dot[1] += x[j + 1] * y[j + 1];
dot[2] += x[j] * y[j + 1];
dot[3] += x[j + 1] * y[j];
j += 2;
i++;
} }
if ((inc_x == 1) && (inc_y == 1)) { } else {
i = 0;
ix = 0;
iy = 0;
inc_x <<= 1;
inc_y <<= 1;
while (i < n) {
BLASLONG n1 = n & -8; dot[0] += x[ix] * y[iy];
dot[1] += x[ix + 1] * y[iy + 1];
dot[2] += x[ix] * y[iy + 1];
dot[3] += x[ix + 1] * y[iy];
if (n1) ix += inc_x;
zdot_kernel_8(n1, x, y, dot); iy += inc_y;
i++;
i = n1;
BLASLONG j = i * 2;
while (i < n) {
dot[0] += x[j] * y[j];
dot[1] += x[j + 1] * y[j + 1];
dot[2] += x[j] * y[j + 1];
dot[3] += x[j + 1] * y[j];
j += 2;
i++;
}
} else {
i = 0;
ix = 0;
iy = 0;
inc_x <<= 1;
inc_y <<= 1;
while (i < n) {
dot[0] += x[ix] * y[iy];
dot[1] += x[ix + 1] * y[iy + 1];
dot[2] += x[ix] * y[iy + 1];
dot[3] += x[ix + 1] * y[iy];
ix += inc_x;
iy += inc_y;
i++;
}
} }
}
#if !defined(CONJ) #if !defined(CONJ)
CREAL(result) = dot[0] - dot[1]; CREAL(result) = dot[0] - dot[1];
CIMAG(result) = dot[2] + dot[3]; CIMAG(result) = dot[2] + dot[3];
#else #else
CREAL(result) = dot[0] + dot[1]; CREAL(result) = dot[0] + dot[1];
CIMAG(result) = dot[2] - dot[3]; CIMAG(result) = dot[2] - dot[3];
#endif #endif
return (result); return (result);
} }

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -27,230 +27,210 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) {
{ __asm__("vlrepg %%v0,%[c]\n\t"
__asm__ ( "vlrepg %%v1,%[s]\n\t"
"vlrepg %%v0,%3 \n\t" "srlg %[n],%[n],4\n\t"
"vlrepg %%v1,%4 \n\t" "xgr %%r1,%%r1\n\t"
"srlg %%r0,%0,4 \n\t" "0:\n\t"
"xgr %%r1,%%r1 \n\t" "pfd 2, 1024(%%r1,%[x])\n\t"
"0: \n\t" "pfd 2, 1024(%%r1,%[y])\n\t"
"pfd 2, 1024(%%r1,%1) \n\t" "vl %%v24, 0(%%r1,%[x])\n\t"
"pfd 2, 1024(%%r1,%2) \n\t" "vl %%v25, 16(%%r1,%[x])\n\t"
"vl %%v24, 0(%%r1,%1) \n\t" "vl %%v26, 32(%%r1,%[x])\n\t"
"vl %%v25, 16(%%r1,%1) \n\t" "vl %%v27, 48(%%r1,%[x])\n\t"
"vl %%v26, 32(%%r1,%1) \n\t" "vl %%v16, 0(%%r1,%[y])\n\t"
"vl %%v27, 48(%%r1,%1) \n\t" "vl %%v17, 16(%%r1,%[y])\n\t"
"vl %%v16, 0(%%r1,%2) \n\t" "vl %%v18, 32(%%r1,%[y])\n\t"
"vl %%v17, 16(%%r1,%2) \n\t" "vl %%v19, 48(%%r1,%[y])\n\t"
"vl %%v18, 32(%%r1,%2) \n\t" "vfmdb %%v28,%%v24,%%v0\n\t"
"vl %%v19, 48(%%r1,%2) \n\t" "vfmdb %%v29,%%v25,%%v0\n\t"
"vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
"vfmdb %%v28,%%v24,%%v0 \n\t" "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
"vfmdb %%v29,%%v25,%%v0 \n\t" "vfmdb %%v30,%%v26,%%v0\n\t"
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ "vfmdb %%v31,%%v27,%%v0\n\t"
"vfmdb %%v30,%%v26,%%v0 \n\t" "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ /* 2nd parts */
"vfmdb %%v31,%%v27,%%v0 \n\t" "vfmadb %%v28,%%v16,%%v1,%%v28\n\t"
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
/* 2nd parts*/ "vfmadb %%v29,%%v17,%%v1,%%v29\n\t"
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ "vfmadb %%v30,%%v18,%%v1,%%v30\n\t"
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ "vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ "vst %%v28, 0(%%r1,%[x])\n\t"
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" "vst %%v29, 16(%%r1,%[x])\n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ "vst %%v30, 32(%%r1,%[x])\n\t"
"vst %%v31, 48(%%r1,%[x])\n\t"
"vst %%v28, 0(%%r1,%1) \n\t" "vst %%v20, 0(%%r1,%[y])\n\t"
"vst %%v29, 16(%%r1,%1) \n\t" "vst %%v21, 16(%%r1,%[y])\n\t"
"vst %%v30, 32(%%r1,%1) \n\t" "vst %%v22, 32(%%r1,%[y])\n\t"
"vst %%v31, 48(%%r1,%1) \n\t" "vst %%v23, 48(%%r1,%[y])\n\t"
"vst %%v20, 0(%%r1,%2) \n\t" "vl %%v24, 64(%%r1,%[x])\n\t"
"vst %%v21, 16(%%r1,%2) \n\t" "vl %%v25, 80(%%r1,%[x])\n\t"
"vst %%v22, 32(%%r1,%2) \n\t" "vl %%v26, 96(%%r1,%[x])\n\t"
"vst %%v23, 48(%%r1,%2) \n\t" "vl %%v27, 112(%%r1,%[x])\n\t"
"vl %%v16, 64(%%r1,%[y])\n\t"
"vl %%v24, 64(%%r1,%1) \n\t" "vl %%v17, 80(%%r1,%[y])\n\t"
"vl %%v25, 80(%%r1,%1) \n\t" "vl %%v18, 96(%%r1,%[y])\n\t"
"vl %%v26, 96(%%r1,%1) \n\t" "vl %%v19, 112(%%r1,%[y])\n\t"
"vl %%v27, 112(%%r1,%1) \n\t" "vfmdb %%v28,%%v24,%%v0\n\t"
"vl %%v16, 64(%%r1,%2) \n\t" "vfmdb %%v29,%%v25,%%v0\n\t"
"vl %%v17, 80(%%r1,%2) \n\t" "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
"vl %%v18, 96(%%r1,%2) \n\t" "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
"vl %%v19, 112(%%r1,%2) \n\t" "vfmdb %%v30,%%v26,%%v0\n\t"
"vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
"vfmdb %%v28,%%v24,%%v0 \n\t" "vfmdb %%v31,%%v27,%%v0\n\t"
"vfmdb %%v29,%%v25,%%v0 \n\t" "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ /* 2nd parts */
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ "vfmadb %%v28,%%v16,%%v1,%%v28\n\t"
"vfmdb %%v30,%%v26,%%v0 \n\t" "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ "vfmadb %%v29,%%v17,%%v1,%%v29\n\t"
"vfmdb %%v31,%%v27,%%v0 \n\t" "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ "vfmadb %%v30,%%v18,%%v1,%%v30\n\t"
/* 2nd parts*/ "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" "vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" "vst %%v28, 64(%%r1,%[x])\n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ "vst %%v29, 80(%%r1,%[x])\n\t"
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" "vst %%v30, 96(%%r1,%[x])\n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ "vst %%v31, 112(%%r1,%[x])\n\t"
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" "vst %%v20, 64(%%r1,%[y])\n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ "vst %%v21, 80(%%r1,%[y])\n\t"
"vst %%v22, 96(%%r1,%[y])\n\t"
"vst %%v28, 64(%%r1,%1) \n\t" "vst %%v23, 112(%%r1,%[y])\n\t"
"vst %%v29, 80(%%r1,%1) \n\t" "vl %%v24, 128(%%r1,%[x])\n\t"
"vst %%v30, 96(%%r1,%1) \n\t" "vl %%v25, 144(%%r1,%[x])\n\t"
"vst %%v31, 112(%%r1,%1) \n\t" "vl %%v26, 160(%%r1,%[x])\n\t"
"vst %%v20, 64(%%r1,%2) \n\t" "vl %%v27, 176(%%r1,%[x])\n\t"
"vst %%v21, 80(%%r1,%2) \n\t" "vl %%v16, 128(%%r1,%[y])\n\t"
"vst %%v22, 96(%%r1,%2) \n\t" "vl %%v17, 144(%%r1,%[y])\n\t"
"vst %%v23, 112(%%r1,%2) \n\t" "vl %%v18, 160(%%r1,%[y])\n\t"
"vl %%v19, 176(%%r1,%[y])\n\t"
"vl %%v24, 128(%%r1,%1) \n\t" "vfmdb %%v28,%%v24,%%v0\n\t"
"vl %%v25, 144(%%r1,%1) \n\t" "vfmdb %%v29,%%v25,%%v0\n\t"
"vl %%v26, 160(%%r1,%1) \n\t" "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
"vl %%v27, 176(%%r1,%1) \n\t" "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
"vl %%v16, 128(%%r1,%2) \n\t" "vfmdb %%v30,%%v26,%%v0\n\t"
"vl %%v17, 144(%%r1,%2) \n\t" "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
"vl %%v18, 160(%%r1,%2) \n\t" "vfmdb %%v31,%%v27,%%v0\n\t"
"vl %%v19, 176(%%r1,%2) \n\t" "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
/* 2nd parts */
"vfmdb %%v28,%%v24,%%v0 \n\t" "vfmadb %%v28,%%v16,%%v1,%%v28\n\t"
"vfmdb %%v29,%%v25,%%v0 \n\t" "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ "vfmadb %%v29,%%v17,%%v1,%%v29\n\t"
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
"vfmdb %%v30,%%v26,%%v0 \n\t" "vfmadb %%v30,%%v18,%%v1,%%v30\n\t"
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmdb %%v31,%%v27,%%v0 \n\t" "vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
/* 2nd parts*/ "vst %%v28, 128(%%r1,%[x])\n\t"
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" "vst %%v29, 144(%%r1,%[x])\n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ "vst %%v30, 160(%%r1,%[x])\n\t"
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" "vst %%v31, 176(%%r1,%[x])\n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ "vst %%v20, 128(%%r1,%[y])\n\t"
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" "vst %%v21, 144(%%r1,%[y])\n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ "vst %%v22, 160(%%r1,%[y])\n\t"
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" "vst %%v23, 176(%%r1,%[y])\n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ "vl %%v24, 192(%%r1,%[x])\n\t"
"vl %%v25, 208(%%r1,%[x])\n\t"
"vst %%v28, 128(%%r1,%1) \n\t" "vl %%v26, 224(%%r1,%[x])\n\t"
"vst %%v29, 144(%%r1,%1) \n\t" "vl %%v27, 240(%%r1,%[x])\n\t"
"vst %%v30, 160(%%r1,%1) \n\t" "vl %%v16, 192(%%r1,%[y])\n\t"
"vst %%v31, 176(%%r1,%1) \n\t" "vl %%v17, 208(%%r1,%[y])\n\t"
"vst %%v20, 128(%%r1,%2) \n\t" "vl %%v18, 224(%%r1,%[y])\n\t"
"vst %%v21, 144(%%r1,%2) \n\t" "vl %%v19, 240(%%r1,%[y])\n\t"
"vst %%v22, 160(%%r1,%2) \n\t" "vfmdb %%v28,%%v24,%%v0\n\t"
"vst %%v23, 176(%%r1,%2) \n\t" "vfmdb %%v29,%%v25,%%v0\n\t"
"vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
"vl %%v24, 192(%%r1,%1) \n\t" "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */
"vl %%v25, 208(%%r1,%1) \n\t" "vfmdb %%v30,%%v26,%%v0\n\t"
"vl %%v26, 224(%%r1,%1) \n\t" "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */
"vl %%v27, 240(%%r1,%1) \n\t" "vfmdb %%v31,%%v27,%%v0\n\t"
"vl %%v16, 192(%%r1,%2) \n\t" "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */
"vl %%v17, 208(%%r1,%2) \n\t" /* 2nd parts */
"vl %%v18, 224(%%r1,%2) \n\t" "vfmadb %%v28,%%v16,%%v1,%%v28\n\t"
"vl %%v19, 240(%%r1,%2) \n\t" "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29\n\t"
"vfmdb %%v28,%%v24,%%v0 \n\t" "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */
"vfmdb %%v29,%%v25,%%v0 \n\t" "vfmadb %%v30,%%v18,%%v1,%%v30\n\t"
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ "vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmdb %%v30,%%v26,%%v0 \n\t" "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ "vst %%v28, 192(%%r1,%[x])\n\t"
"vfmdb %%v31,%%v27,%%v0 \n\t" "vst %%v29, 208(%%r1,%[x])\n\t"
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ "vst %%v30, 224(%%r1,%[x])\n\t"
/* 2nd parts*/ "vst %%v31, 240(%%r1,%[x])\n\t"
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" "vst %%v20, 192(%%r1,%[y])\n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ "vst %%v21, 208(%%r1,%[y])\n\t"
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" "vst %%v22, 224(%%r1,%[y])\n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ "vst %%v23, 240(%%r1,%[y])\n\t"
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" "agfi %%r1,256\n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ "brctg %[n],0b"
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" : "+m"(*(struct { FLOAT x[n * 2]; } *) x),
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n)
: [x] "a"(x),[y] "a"(y),[c] "Q"(*c),[s] "Q"(*s)
"vst %%v28, 192(%%r1,%1) \n\t" : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
"vst %%v29, 208(%%r1,%1) \n\t" "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"vst %%v30, 224(%%r1,%1) \n\t" "v31");
"vst %%v31, 240(%%r1,%1) \n\t"
"vst %%v20, 192(%%r1,%2) \n\t"
"vst %%v21, 208(%%r1,%2) \n\t"
"vst %%v22, 224(%%r1,%2) \n\t"
"vst %%v23, 240(%%r1,%2) \n\t"
"agfi %%r1,256 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"m"(*c),"m"(*s)
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
} }
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
{ FLOAT c, FLOAT s) {
BLASLONG i=0; BLASLONG i = 0;
BLASLONG ix=0,iy=0; BLASLONG ix = 0, iy = 0;
FLOAT temp[2]; FLOAT temp[2];
BLASLONG inc_x2; BLASLONG inc_x2;
BLASLONG inc_y2; BLASLONG inc_y2;
if ( n <= 0 ) return(0); if (n <= 0)
return (0);
if ( (inc_x == 1) && (inc_y == 1) ) if ((inc_x == 1) && (inc_y == 1)) {
{
BLASLONG n1 = n & -16; BLASLONG n1 = n & -16;
if ( n1 > 0 ) if (n1 > 0) {
{ FLOAT cosa, sina;
FLOAT cosa,sina; cosa = c;
cosa=c; sina = s;
sina=s; zrot_kernel_16(n1, x, y, &cosa, &sina);
zrot_kernel_16(n1, x, y, &cosa, &sina); i = n1;
i=n1; ix = 2 * n1;
ix=2*n1; }
}
while(i < n) while (i < n) {
{ temp[0] = c * x[ix] + s * y[ix];
temp[0] = c*x[ix] + s*y[ix] ; temp[1] = c * x[ix + 1] + s * y[ix + 1];
temp[1] = c*x[ix+1] + s*y[ix+1] ; y[ix] = c * y[ix] - s * x[ix];
y[ix] = c*y[ix] - s*x[ix] ; y[ix + 1] = c * y[ix + 1] - s * x[ix + 1];
y[ix+1] = c*y[ix+1] - s*x[ix+1] ; x[ix] = temp[0];
x[ix] = temp[0] ; x[ix + 1] = temp[1];
x[ix+1] = temp[1] ;
ix += 2 ;
i++ ;
}
ix += 2;
i++;
} }
else
{
inc_x2 = 2 * inc_x ;
inc_y2 = 2 * inc_y ;
while(i < n)
{
temp[0] = c*x[ix] + s*y[iy] ;
temp[1] = c*x[ix+1] + s*y[iy+1] ;
y[iy] = c*y[iy] - s*x[ix] ;
y[iy+1] = c*y[iy+1] - s*x[ix+1] ;
x[ix] = temp[0] ;
x[ix+1] = temp[1] ;
ix += inc_x2 ; } else {
iy += inc_y2 ; inc_x2 = 2 * inc_x;
i++ ; inc_y2 = 2 * inc_y;
while (i < n) {
temp[0] = c * x[ix] + s * y[iy];
temp[1] = c * x[ix + 1] + s * y[iy + 1];
y[iy] = c * y[iy] - s * x[ix];
y[iy + 1] = c * y[iy + 1] - s * x[ix + 1];
x[ix] = temp[0];
x[ix + 1] = temp[1];
} ix += inc_x2;
iy += inc_y2;
i++;
} }
return(0);
}
return (0);
} }

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013 - 2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -27,426 +27,399 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
static void zscal_kernel_8(BLASLONG n, FLOAT *alpha, FLOAT *x) static void zscal_kernel_8(BLASLONG n, FLOAT *alpha, FLOAT *x) {
{ __asm__("vlrepg %%v0,0(%[alpha])\n\t"
__asm__ volatile( "vleg %%v1,8(%[alpha]),0\n\t"
"vlrepg %%v0,0(%1) \n\t" "wflcdb %%v1,%%v1\n\t"
"vleg %%v1,8(%1),0 \n\t" "vleg %%v1,8(%[alpha]),1\n\t"
"wflcdb %%v1,%%v1 \n\t" "srlg %[n],%[n],3\n\t"
"vleg %%v1,8(%1),1 \n\t" "xgr %%r1,%%r1\n\t"
"srlg %%r0,%0,3 \n\t" "0:\n\t"
"xgr %%r1,%%r1 \n\t" "pfd 2, 1024(%%r1,%[x])\n\t"
"0: \n\t" "vl %%v16,0(%%r1,%[x])\n\t"
"pfd 2, 1024(%%r1,%2) \n\t" "vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%2) \n\t" "vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%2) \n\t" "vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%2) \n\t" "vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%2) \n\t" "vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%2) \n\t" "vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%2) \n\t" "vpdi %%v24,%%v16,%%v16,4\n\t"
"vl %%v22,96(%%r1,%2) \n\t" "vpdi %%v25,%%v17,%%v17,4\n\t"
"vl %%v23,112(%%r1,%2) \n\t" "vpdi %%v26,%%v18,%%v18,4\n\t"
"vpdi %%v24,%%v16,%%v16,4 \n\t" "vpdi %%v27,%%v19,%%v19,4\n\t"
"vpdi %%v25,%%v17,%%v17,4 \n\t" "vpdi %%v28,%%v20,%%v20,4\n\t"
"vpdi %%v26,%%v18,%%v18,4 \n\t" "vpdi %%v29,%%v21,%%v21,4\n\t"
"vpdi %%v27,%%v19,%%v19,4 \n\t" "vpdi %%v30,%%v22,%%v22,4\n\t"
"vpdi %%v28,%%v20,%%v20,4 \n\t" "vpdi %%v31,%%v23,%%v23,4\n\t"
"vpdi %%v29,%%v21,%%v21,4 \n\t" "vfmdb %%v16,%%v16,%%v0\n\t"
"vpdi %%v30,%%v22,%%v22,4 \n\t" "vfmdb %%v17,%%v17,%%v0\n\t"
"vpdi %%v31,%%v23,%%v23,4 \n\t" "vfmdb %%v18,%%v18,%%v0\n\t"
"vfmdb %%v19,%%v19,%%v0\n\t"
"vfmdb %%v16,%%v16,%%v0 \n\t" "vfmdb %%v20,%%v20,%%v0\n\t"
"vfmdb %%v17,%%v17,%%v0 \n\t" "vfmdb %%v21,%%v21,%%v0\n\t"
"vfmdb %%v18,%%v18,%%v0 \n\t" "vfmdb %%v22,%%v22,%%v0\n\t"
"vfmdb %%v19,%%v19,%%v0 \n\t" "vfmdb %%v23,%%v23,%%v0\n\t"
"vfmdb %%v20,%%v20,%%v0 \n\t" "vfmadb %%v16,%%v24,%%v1,%%v16\n\t"
"vfmdb %%v21,%%v21,%%v0 \n\t" "vfmadb %%v17,%%v25,%%v1,%%v17\n\t"
"vfmdb %%v22,%%v22,%%v0 \n\t" "vfmadb %%v18,%%v26,%%v1,%%v18\n\t"
"vfmdb %%v23,%%v23,%%v0 \n\t" "vfmadb %%v19,%%v27,%%v1,%%v19\n\t"
"vfmadb %%v16,%%v24,%%v1,%%v16 \n\t" "vfmadb %%v20,%%v28,%%v1,%%v20\n\t"
"vfmadb %%v17,%%v25,%%v1,%%v17 \n\t" "vfmadb %%v21,%%v29,%%v1,%%v21\n\t"
"vfmadb %%v18,%%v26,%%v1,%%v18 \n\t" "vfmadb %%v22,%%v30,%%v1,%%v22\n\t"
"vfmadb %%v19,%%v27,%%v1,%%v19 \n\t" "vfmadb %%v23,%%v31,%%v1,%%v23\n\t"
"vfmadb %%v20,%%v28,%%v1,%%v20 \n\t" "vst %%v16,0(%%r1,%[x])\n\t"
"vfmadb %%v21,%%v29,%%v1,%%v21 \n\t" "vst %%v17,16(%%r1,%[x])\n\t"
"vfmadb %%v22,%%v30,%%v1,%%v22 \n\t" "vst %%v18,32(%%r1,%[x])\n\t"
"vfmadb %%v23,%%v31,%%v1,%%v23 \n\t" "vst %%v19,48(%%r1,%[x])\n\t"
"vst %%v20,64(%%r1,%[x])\n\t"
"vst %%v16,0(%%r1,%2) \n\t" "vst %%v21,80(%%r1,%[x])\n\t"
"vst %%v17,16(%%r1,%2) \n\t" "vst %%v22,96(%%r1,%[x])\n\t"
"vst %%v18,32(%%r1,%2) \n\t" "vst %%v23,112(%%r1,%[x])\n\t"
"vst %%v19,48(%%r1,%2) \n\t" "agfi %%r1,128\n\t"
"vst %%v20,64(%%r1,%2) \n\t" "brctg %[n],0b"
"vst %%v21,80(%%r1,%2) \n\t" : "+m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n)
"vst %%v22,96(%%r1,%2) \n\t" : [x] "a"(x), "m"(*(const struct { FLOAT x[2]; } *) alpha),
"vst %%v23,112(%%r1,%2) \n\t" [alpha] "a"(alpha)
: "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
"agfi %%r1,128 \n\t" "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"brctg %%r0,0b " "v31");
:
:"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x)
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x)
{
__asm__ volatile(
"vleg %%v0,8(%1),0 \n\t"
"wflcdb %%v0,%%v0 \n\t"
"vleg %%v0,8(%1),1 \n\t"
"srlg %%r0,%0,3 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"vpdi %%v16,%%v16,%%v16,4 \n\t"
"vpdi %%v17,%%v17,%%v17,4 \n\t"
"vpdi %%v18,%%v18,%%v18,4 \n\t"
"vpdi %%v19,%%v19,%%v19,4 \n\t"
"vpdi %%v20,%%v20,%%v20,4 \n\t"
"vpdi %%v21,%%v21,%%v21,4 \n\t"
"vpdi %%v22,%%v22,%%v22,4 \n\t"
"vpdi %%v23,%%v23,%%v23,4 \n\t"
"vfmdb %%v16,%%v16,%%v0 \n\t"
"vfmdb %%v17,%%v17,%%v0 \n\t"
"vfmdb %%v18,%%v18,%%v0 \n\t"
"vfmdb %%v19,%%v19,%%v0 \n\t"
"vfmdb %%v20,%%v20,%%v0 \n\t"
"vfmdb %%v21,%%v21,%%v0 \n\t"
"vfmdb %%v22,%%v22,%%v0 \n\t"
"vfmdb %%v23,%%v23,%%v0 \n\t"
"vst %%v16,0(%%r1,%2) \n\t"
"vst %%v17,16(%%r1,%2) \n\t"
"vst %%v18,32(%%r1,%2) \n\t"
"vst %%v19,48(%%r1,%2) \n\t"
"vst %%v20,64(%%r1,%2) \n\t"
"vst %%v21,80(%%r1,%2) \n\t"
"vst %%v22,96(%%r1,%2) \n\t"
"vst %%v23,112(%%r1,%2) \n\t"
"agfi %%r1,128 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23"
);
} }
static void zscal_kernel_8_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) {
{ __asm__("vleg %%v0,8(%[alpha]),0\n\t"
__asm__ volatile( "wflcdb %%v0,%%v0\n\t"
"vlrepg %%v0,0(%1) \n\t" "vleg %%v0,8(%[alpha]),1\n\t"
"srlg %%r0,%0,3 \n\t" "srlg %[n],%[n],3\n\t"
"xgr %%r1,%%r1 \n\t" "xgr %%r1,%%r1\n\t"
"0: \n\t" "0:\n\t"
"pfd 2, 1024(%%r1,%2) \n\t" "pfd 2, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%2) \n\t" "vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%2) \n\t" "vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%2) \n\t" "vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%2) \n\t" "vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%2) \n\t" "vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%2) \n\t" "vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%2) \n\t" "vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%2) \n\t" "vpdi %%v16,%%v16,%%v16,4\n\t"
"vpdi %%v17,%%v17,%%v17,4\n\t"
"vfmdb %%v16,%%v16,%%v0 \n\t" "vpdi %%v18,%%v18,%%v18,4\n\t"
"vfmdb %%v17,%%v17,%%v0 \n\t" "vpdi %%v19,%%v19,%%v19,4\n\t"
"vfmdb %%v18,%%v18,%%v0 \n\t" "vpdi %%v20,%%v20,%%v20,4\n\t"
"vfmdb %%v19,%%v19,%%v0 \n\t" "vpdi %%v21,%%v21,%%v21,4\n\t"
"vfmdb %%v20,%%v20,%%v0 \n\t" "vpdi %%v22,%%v22,%%v22,4\n\t"
"vfmdb %%v21,%%v21,%%v0 \n\t" "vpdi %%v23,%%v23,%%v23,4\n\t"
"vfmdb %%v22,%%v22,%%v0 \n\t" "vfmdb %%v16,%%v16,%%v0\n\t"
"vfmdb %%v23,%%v23,%%v0 \n\t" "vfmdb %%v17,%%v17,%%v0\n\t"
"vfmdb %%v18,%%v18,%%v0\n\t"
"vst %%v16,0(%%r1,%2) \n\t" "vfmdb %%v19,%%v19,%%v0\n\t"
"vst %%v17,16(%%r1,%2) \n\t" "vfmdb %%v20,%%v20,%%v0\n\t"
"vst %%v18,32(%%r1,%2) \n\t" "vfmdb %%v21,%%v21,%%v0\n\t"
"vst %%v19,48(%%r1,%2) \n\t" "vfmdb %%v22,%%v22,%%v0\n\t"
"vst %%v20,64(%%r1,%2) \n\t" "vfmdb %%v23,%%v23,%%v0\n\t"
"vst %%v21,80(%%r1,%2) \n\t" "vst %%v16,0(%%r1,%[x])\n\t"
"vst %%v22,96(%%r1,%2) \n\t" "vst %%v17,16(%%r1,%[x])\n\t"
"vst %%v23,112(%%r1,%2) \n\t" "vst %%v18,32(%%r1,%[x])\n\t"
"vst %%v19,48(%%r1,%[x])\n\t"
"agfi %%r1,128 \n\t" "vst %%v20,64(%%r1,%[x])\n\t"
"brctg %%r0,0b " "vst %%v21,80(%%r1,%[x])\n\t"
: "vst %%v22,96(%%r1,%[x])\n\t"
:"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x) "vst %%v23,112(%%r1,%[x])\n\t"
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23" "agfi %%r1,128\n\t"
); "brctg %[n],0b"
: "+m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n)
: [x] "a"(x), "m"(*(const struct { FLOAT x[2]; } *) alpha),
[alpha] "a"(alpha)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23");
} }
static void zscal_kernel_8_zero(BLASLONG n, FLOAT *x) static void zscal_kernel_8_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) {
{ __asm__("vlrepg %%v0,0(%[alpha])\n\t"
__asm__ volatile( "srlg %[n],%[n],3\n\t"
"vzero %%v24 \n\t" "xgr %%r1,%%r1\n\t"
"vzero %%v25 \n\t" "0:\n\t"
"vzero %%v26 \n\t" "pfd 2, 1024(%%r1,%[x])\n\t"
"vzero %%v27 \n\t" "vl %%v16,0(%%r1,%[x])\n\t"
"srlg %%r0,%0,3 \n\t" "vl %%v17,16(%%r1,%[x])\n\t"
"xgr %%r1,%%r1 \n\t" "vl %%v18,32(%%r1,%[x])\n\t"
"0: \n\t" "vl %%v19,48(%%r1,%[x])\n\t"
"pfd 2, 1024(%%r1,%1) \n\t" "vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%[x])\n\t"
"vst %%v24,0(%%r1,%1) \n\t" "vl %%v22,96(%%r1,%[x])\n\t"
"vst %%v25,16(%%r1,%1) \n\t" "vl %%v23,112(%%r1,%[x])\n\t"
"vst %%v26,32(%%r1,%1) \n\t" "vfmdb %%v16,%%v16,%%v0\n\t"
"vst %%v27,48(%%r1,%1) \n\t" "vfmdb %%v17,%%v17,%%v0\n\t"
"vst %%v24,64(%%r1,%1) \n\t" "vfmdb %%v18,%%v18,%%v0\n\t"
"vst %%v25,80(%%r1,%1) \n\t" "vfmdb %%v19,%%v19,%%v0\n\t"
"vst %%v26,96(%%r1,%1) \n\t" "vfmdb %%v20,%%v20,%%v0\n\t"
"vst %%v27,112(%%r1,%1) \n\t" "vfmdb %%v21,%%v21,%%v0\n\t"
"vfmdb %%v22,%%v22,%%v0\n\t"
"agfi %%r1,128 \n\t" "vfmdb %%v23,%%v23,%%v0\n\t"
"brctg %%r0,0b " "vst %%v16,0(%%r1,%[x])\n\t"
: "vst %%v17,16(%%r1,%[x])\n\t"
:"r"(n),"ZR"((FLOAT (*)[n * 2])x) "vst %%v18,32(%%r1,%[x])\n\t"
:"memory","cc","r0","r1","v24","v25","v26","v27" "vst %%v19,48(%%r1,%[x])\n\t"
); "vst %%v20,64(%%r1,%[x])\n\t"
"vst %%v21,80(%%r1,%[x])\n\t"
"vst %%v22,96(%%r1,%[x])\n\t"
"vst %%v23,112(%%r1,%[x])\n\t"
"agfi %%r1,128\n\t"
"brctg %[n],0b"
: "+m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n)
: [x] "a"(x), "m"(*(const struct { FLOAT x[2]; } *) alpha),
[alpha] "a"(alpha)
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23");
} }
static void zscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_x) static void zscal_kernel_8_zero(BLASLONG n, FLOAT *x) {
{ __asm__("vzero %%v0\n\t"
BLASLONG i; "srlg %[n],%[n],3\n\t"
BLASLONG inc_x2 = 2 * inc_x; "xgr %%r1,%%r1\n\t"
BLASLONG inc_x3 = inc_x2 + inc_x; "0:\n\t"
FLOAT t0, t1, t2, t3; "pfd 2, 1024(%%r1,%[x])\n\t"
FLOAT da_r = alpha[0]; "vst %%v0,0(%%r1,%[x])\n\t"
FLOAT da_i = alpha[1]; "vst %%v0,16(%%r1,%[x])\n\t"
"vst %%v0,32(%%r1,%[x])\n\t"
for (i = 0; i < n; i += 4) "vst %%v0,48(%%r1,%[x])\n\t"
{ "vst %%v0,64(%%r1,%[x])\n\t"
t0 = da_r * x[0] - da_i * x[1]; "vst %%v0,80(%%r1,%[x])\n\t"
t1 = da_r * x[inc_x] - da_i * x[inc_x + 1]; "vst %%v0,96(%%r1,%[x])\n\t"
t2 = da_r * x[inc_x2] - da_i * x[inc_x2 + 1]; "vst %%v0,112(%%r1,%[x])\n\t"
t3 = da_r * x[inc_x3] - da_i * x[inc_x3 + 1]; "agfi %%r1,128\n\t"
"brctg %[n],0b"
x[1] = da_i * x[0] + da_r * x[1]; : "=m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n)
x[inc_x + 1] = da_i * x[inc_x] + da_r * x[inc_x + 1]; : [x] "a"(x)
x[inc_x2 + 1] = da_i * x[inc_x2] + da_r * x[inc_x2 + 1]; : "cc", "r1", "v0");
x[inc_x3 + 1] = da_i * x[inc_x3] + da_r * x[inc_x3 + 1];
x[0] = t0;
x[inc_x] = t1;
x[inc_x2] = t2;
x[inc_x3] = t3;
x += 4 * inc_x;
}
} }
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { static void zscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x,
BLASLONG i = 0, j = 0; BLASLONG inc_x) {
FLOAT temp0; BLASLONG i;
FLOAT temp1; BLASLONG inc_x2 = 2 * inc_x;
FLOAT alpha[2] __attribute__ ((aligned(16))); BLASLONG inc_x3 = inc_x2 + inc_x;
FLOAT t0, t1, t2, t3;
FLOAT da_r = alpha[0];
FLOAT da_i = alpha[1];
if (inc_x != 1) { for (i = 0; i < n; i += 4) {
inc_x <<= 1; t0 = da_r * x[0] - da_i * x[1];
t1 = da_r * x[inc_x] - da_i * x[inc_x + 1];
t2 = da_r * x[inc_x2] - da_i * x[inc_x2 + 1];
t3 = da_r * x[inc_x3] - da_i * x[inc_x3 + 1];
if (da_r == 0.0) { x[1] = da_i * x[0] + da_r * x[1];
x[inc_x + 1] = da_i * x[inc_x] + da_r * x[inc_x + 1];
x[inc_x2 + 1] = da_i * x[inc_x2] + da_r * x[inc_x2 + 1];
x[inc_x3 + 1] = da_i * x[inc_x3] + da_r * x[inc_x3 + 1];
BLASLONG n1 = n & -2; x[0] = t0;
x[inc_x] = t1;
x[inc_x2] = t2;
x[inc_x3] = t3;
if (da_i == 0.0) { x += 4 * inc_x;
}
}
while (j < n1) { int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
x[i] = 0.0; BLASLONG dummy2) {
x[i + 1] = 0.0; BLASLONG i = 0, j = 0;
x[i + inc_x] = 0.0; FLOAT temp0;
x[i + 1 + inc_x] = 0.0; FLOAT temp1;
i += 2 * inc_x; FLOAT alpha[2] __attribute__ ((aligned(16)));
j += 2;
}
while (j < n) {
x[i] = 0.0;
x[i + 1] = 0.0;
i += inc_x;
j++;
}
} else {
while (j < n1) {
temp0 = -da_i * x[i + 1];
x[i + 1] = da_i * x[i];
x[i] = temp0;
temp1 = -da_i * x[i + 1 + inc_x];
x[i + 1 + inc_x] = da_i * x[i + inc_x];
x[i + inc_x] = temp1;
i += 2 * inc_x;
j += 2;
}
while (j < n) {
temp0 = -da_i * x[i + 1];
x[i + 1] = da_i * x[i];
x[i] = temp0;
i += inc_x;
j++;
}
}
} else {
if (da_i == 0.0) {
BLASLONG n1 = n & -2;
while (j < n1) {
temp0 = da_r * x[i];
x[i + 1] = da_r * x[i + 1];
x[i] = temp0;
temp1 = da_r * x[i + inc_x];
x[i + 1 + inc_x] = da_r * x[i + 1 + inc_x];
x[i + inc_x] = temp1;
i += 2 * inc_x;
j += 2;
}
while (j < n) {
temp0 = da_r * x[i];
x[i + 1] = da_r * x[i + 1];
x[i] = temp0;
i += inc_x;
j++;
}
} else {
BLASLONG n1 = n & -8;
if (n1 > 0) {
alpha[0] = da_r;
alpha[1] = da_i;
zscal_kernel_inc_8(n1, alpha, x, inc_x);
j = n1;
i = n1 * inc_x;
}
while (j < n) {
temp0 = da_r * x[i] - da_i * x[i + 1];
x[i + 1] = da_r * x[i + 1] + da_i * x[i];
x[i] = temp0;
i += inc_x;
j++;
}
}
}
return (0);
}
BLASLONG n1 = n & -8;
if (n1 > 0) {
alpha[0] = da_r;
alpha[1] = da_i;
if (da_r == 0.0)
if (da_i == 0)
zscal_kernel_8_zero(n1, x);
else
zscal_kernel_8_zero_r(n1, alpha, x);
else
if (da_i == 0)
zscal_kernel_8_zero_i(n1, alpha, x);
else
zscal_kernel_8(n1, alpha, x);
i = n1 << 1;
j = n1;
}
if (inc_x != 1) {
inc_x <<= 1;
if (da_r == 0.0) { if (da_r == 0.0) {
if (da_i == 0.0) { BLASLONG n1 = n & -2;
while (j < n) { if (da_i == 0.0) {
x[i] = 0.0; while (j < n1) {
x[i + 1] = 0.0;
i += 2;
j++;
} x[i] = 0.0;
x[i + 1] = 0.0;
} else { x[i + inc_x] = 0.0;
x[i + 1 + inc_x] = 0.0;
while (j < n) { i += 2 * inc_x;
j += 2;
temp0 = -da_i * x[i + 1];
x[i + 1] = da_i * x[i];
x[i] = temp0;
i += 2;
j++;
}
} }
while (j < n) {
x[i] = 0.0;
x[i + 1] = 0.0;
i += inc_x;
j++;
}
} else {
while (j < n1) {
temp0 = -da_i * x[i + 1];
x[i + 1] = da_i * x[i];
x[i] = temp0;
temp1 = -da_i * x[i + 1 + inc_x];
x[i + 1 + inc_x] = da_i * x[i + inc_x];
x[i + inc_x] = temp1;
i += 2 * inc_x;
j += 2;
}
while (j < n) {
temp0 = -da_i * x[i + 1];
x[i + 1] = da_i * x[i];
x[i] = temp0;
i += inc_x;
j++;
}
}
} else { } else {
if (da_i == 0.0) { if (da_i == 0.0) {
BLASLONG n1 = n & -2;
while (j < n) { while (j < n1) {
temp0 = da_r * x[i]; temp0 = da_r * x[i];
x[i + 1] = da_r * x[i + 1]; x[i + 1] = da_r * x[i + 1];
x[i] = temp0; x[i] = temp0;
i += 2; temp1 = da_r * x[i + inc_x];
j++; x[i + 1 + inc_x] = da_r * x[i + 1 + inc_x];
x[i + inc_x] = temp1;
} i += 2 * inc_x;
j += 2;
} else {
while (j < n) {
temp0 = da_r * x[i] - da_i * x[i + 1];
x[i + 1] = da_r * x[i + 1] + da_i * x[i];
x[i] = temp0;
i += 2;
j++;
}
} }
while (j < n) {
temp0 = da_r * x[i];
x[i + 1] = da_r * x[i + 1];
x[i] = temp0;
i += inc_x;
j++;
}
} else {
BLASLONG n1 = n & -8;
if (n1 > 0) {
alpha[0] = da_r;
alpha[1] = da_i;
zscal_kernel_inc_8(n1, alpha, x, inc_x);
j = n1;
i = n1 * inc_x;
}
while (j < n) {
temp0 = da_r * x[i] - da_i * x[i + 1];
x[i + 1] = da_r * x[i + 1] + da_i * x[i];
x[i] = temp0;
i += inc_x;
j++;
}
}
} }
return (0); return (0);
}
BLASLONG n1 = n & -8;
if (n1 > 0) {
alpha[0] = da_r;
alpha[1] = da_i;
if (da_r == 0.0)
if (da_i == 0)
zscal_kernel_8_zero(n1, x);
else
zscal_kernel_8_zero_r(n1, alpha, x);
else if (da_i == 0)
zscal_kernel_8_zero_i(n1, alpha, x);
else
zscal_kernel_8(n1, alpha, x);
i = n1 << 1;
j = n1;
}
if (da_r == 0.0) {
if (da_i == 0.0) {
while (j < n) {
x[i] = 0.0;
x[i + 1] = 0.0;
i += 2;
j++;
}
} else {
while (j < n) {
temp0 = -da_i * x[i + 1];
x[i + 1] = da_i * x[i];
x[i] = temp0;
i += 2;
j++;
}
}
} else {
if (da_i == 0.0) {
while (j < n) {
temp0 = da_r * x[i];
x[i + 1] = da_r * x[i + 1];
x[i] = temp0;
i += 2;
j++;
}
} else {
while (j < n) {
temp0 = da_r * x[i] - da_i * x[i + 1];
x[i + 1] = da_r * x[i + 1] + da_i * x[i];
x[i] = temp0;
i += 2;
j++;
}
}
}
return (0);
} }

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -27,157 +27,143 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) {
{ __asm__("srlg %[n],%[n],4\n\t"
__asm__ volatile( "xgr %%r1,%%r1\n\t"
"srlg %%r0,%0,4 \n\t" "0:\n\t"
"xgr %%r1,%%r1 \n\t" "pfd 2, 1024(%%r1,%[x])\n\t"
"0: \n\t" "pfd 2, 1024(%%r1,%[y])\n\t"
"pfd 2, 1024(%%r1,%1) \n\t" "vl %%v16, 0(%%r1,%[x])\n\t"
"pfd 2, 1024(%%r1,%2) \n\t" "vl %%v17, 16(%%r1,%[x])\n\t"
"vl %%v18, 32(%%r1,%[x])\n\t"
"vl %%v16, 0(%%r1,%1) \n\t" "vl %%v19, 48(%%r1,%[x])\n\t"
"vl %%v17, 16(%%r1,%1) \n\t" "vl %%v20, 64(%%r1,%[x])\n\t"
"vl %%v18, 32(%%r1,%1) \n\t" "vl %%v21, 80(%%r1,%[x])\n\t"
"vl %%v19, 48(%%r1,%1) \n\t" "vl %%v22, 96(%%r1,%[x])\n\t"
"vl %%v20, 64(%%r1,%1) \n\t" "vl %%v23, 112(%%r1,%[x])\n\t"
"vl %%v21, 80(%%r1,%1) \n\t" "vl %%v24, 128(%%r1,%[x])\n\t"
"vl %%v22, 96(%%r1,%1) \n\t" "vl %%v25, 144(%%r1,%[x])\n\t"
"vl %%v23, 112(%%r1,%1) \n\t" "vl %%v26, 160(%%r1,%[x])\n\t"
"vl %%v24, 128(%%r1,%1) \n\t" "vl %%v27, 176(%%r1,%[x])\n\t"
"vl %%v25, 144(%%r1,%1) \n\t" "vl %%v28, 192(%%r1,%[x])\n\t"
"vl %%v26, 160(%%r1,%1) \n\t" "vl %%v29, 208(%%r1,%[x])\n\t"
"vl %%v27, 176(%%r1,%1) \n\t" "vl %%v30, 224(%%r1,%[x])\n\t"
"vl %%v28, 192(%%r1,%1) \n\t" "vl %%v31, 240(%%r1,%[x])\n\t"
"vl %%v29, 208(%%r1,%1) \n\t" "vl %%v0, 0(%%r1,%[y])\n\t"
"vl %%v30, 224(%%r1,%1) \n\t" "vl %%v1, 16(%%r1,%[y])\n\t"
"vl %%v31, 240(%%r1,%1) \n\t" "vl %%v2, 32(%%r1,%[y])\n\t"
"vl %%v3, 48(%%r1,%[y])\n\t"
"vl %%v0, 0(%%r1,%2) \n\t" "vl %%v4, 64(%%r1,%[y])\n\t"
"vl %%v1, 16(%%r1,%2) \n\t" "vl %%v5, 80(%%r1,%[y])\n\t"
"vl %%v2, 32(%%r1,%2) \n\t" "vl %%v6, 96(%%r1,%[y])\n\t"
"vl %%v3, 48(%%r1,%2) \n\t" "vl %%v7, 112(%%r1,%[y])\n\t"
"vl %%v4, 64(%%r1,%2) \n\t" "vst %%v0, 0(%%r1,%[x])\n\t"
"vl %%v5, 80(%%r1,%2) \n\t" "vst %%v1, 16(%%r1,%[x])\n\t"
"vl %%v6, 96(%%r1,%2) \n\t" "vst %%v2, 32(%%r1,%[x])\n\t"
"vl %%v7, 112(%%r1,%2) \n\t" "vst %%v3, 48(%%r1,%[x])\n\t"
"vst %%v0, 0(%%r1,%1) \n\t" "vst %%v4, 64(%%r1,%[x])\n\t"
"vst %%v1, 16(%%r1,%1) \n\t" "vst %%v5, 80(%%r1,%[x])\n\t"
"vst %%v2, 32(%%r1,%1) \n\t" "vst %%v6, 96(%%r1,%[x])\n\t"
"vst %%v3, 48(%%r1,%1) \n\t" "vst %%v7, 112(%%r1,%[x])\n\t"
"vst %%v4, 64(%%r1,%1) \n\t" "vl %%v0, 128(%%r1,%[y])\n\t"
"vst %%v5, 80(%%r1,%1) \n\t" "vl %%v1, 144(%%r1,%[y])\n\t"
"vst %%v6, 96(%%r1,%1) \n\t" "vl %%v2, 160(%%r1,%[y])\n\t"
"vst %%v7, 112(%%r1,%1) \n\t" "vl %%v3, 176(%%r1,%[y])\n\t"
"vl %%v4, 192(%%r1,%[y])\n\t"
"vl %%v0, 128(%%r1,%2) \n\t" "vl %%v5, 208(%%r1,%[y])\n\t"
"vl %%v1, 144(%%r1,%2) \n\t" "vl %%v6, 224(%%r1,%[y])\n\t"
"vl %%v2, 160(%%r1,%2) \n\t" "vl %%v7, 240(%%r1,%[y])\n\t"
"vl %%v3, 176(%%r1,%2) \n\t" "vst %%v0, 128(%%r1,%[x])\n\t"
"vl %%v4, 192(%%r1,%2) \n\t" "vst %%v1, 144(%%r1,%[x])\n\t"
"vl %%v5, 208(%%r1,%2) \n\t" "vst %%v2, 160(%%r1,%[x])\n\t"
"vl %%v6, 224(%%r1,%2) \n\t" "vst %%v3, 176(%%r1,%[x])\n\t"
"vl %%v7, 240(%%r1,%2) \n\t" "vst %%v4, 192(%%r1,%[x])\n\t"
"vst %%v0, 128(%%r1,%1) \n\t" "vst %%v5, 208(%%r1,%[x])\n\t"
"vst %%v1, 144(%%r1,%1) \n\t" "vst %%v6, 224(%%r1,%[x])\n\t"
"vst %%v2, 160(%%r1,%1) \n\t" "vst %%v7, 240(%%r1,%[x])\n\t"
"vst %%v3, 176(%%r1,%1) \n\t" "vst %%v16, 0(%%r1,%[y])\n\t"
"vst %%v4, 192(%%r1,%1) \n\t" "vst %%v17, 16(%%r1,%[y])\n\t"
"vst %%v5, 208(%%r1,%1) \n\t" "vst %%v18, 32(%%r1,%[y])\n\t"
"vst %%v6, 224(%%r1,%1) \n\t" "vst %%v19, 48(%%r1,%[y])\n\t"
"vst %%v7, 240(%%r1,%1) \n\t" "vst %%v20, 64(%%r1,%[y])\n\t"
"vst %%v21, 80(%%r1,%[y])\n\t"
"vst %%v16, 0(%%r1,%2) \n\t" "vst %%v22, 96(%%r1,%[y])\n\t"
"vst %%v17, 16(%%r1,%2) \n\t" "vst %%v23, 112(%%r1,%[y])\n\t"
"vst %%v18, 32(%%r1,%2) \n\t" "vst %%v24, 128(%%r1,%[y])\n\t"
"vst %%v19, 48(%%r1,%2) \n\t" "vst %%v25, 144(%%r1,%[y])\n\t"
"vst %%v20, 64(%%r1,%2) \n\t" "vst %%v26, 160(%%r1,%[y])\n\t"
"vst %%v21, 80(%%r1,%2) \n\t" "vst %%v27, 176(%%r1,%[y])\n\t"
"vst %%v22, 96(%%r1,%2) \n\t" "vst %%v28, 192(%%r1,%[y])\n\t"
"vst %%v23, 112(%%r1,%2) \n\t" "vst %%v29, 208(%%r1,%[y])\n\t"
"vst %%v24, 128(%%r1,%2) \n\t" "vst %%v30, 224(%%r1,%[y])\n\t"
"vst %%v25, 144(%%r1,%2) \n\t" "vst %%v31, 240(%%r1,%[y])\n\t"
"vst %%v26, 160(%%r1,%2) \n\t" "agfi %%r1,256\n\t"
"vst %%v27, 176(%%r1,%2) \n\t" "brctg %[n],0b"
"vst %%v28, 192(%%r1,%2) \n\t" : "+m"(*(struct { FLOAT x[n * 2]; } *) x),
"vst %%v29, 208(%%r1,%2) \n\t" "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n)
"vst %%v30, 224(%%r1,%2) \n\t" : [x] "a"(x),[y] "a"(y)
"vst %%v31, 240(%%r1,%2) \n\t" : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"agfi %%r1,256 \n\t" "v27", "v28", "v29", "v30", "v31");
"brctg %%r0,0b "
:
:"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
} }
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3,
{ FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
BLASLONG i=0; FLOAT *dummy, BLASLONG dummy2) {
BLASLONG ix=0,iy=0; BLASLONG i = 0;
FLOAT temp[2]; BLASLONG ix = 0, iy = 0;
BLASLONG inc_x2, inc_y2; FLOAT temp[2];
BLASLONG inc_x2, inc_y2;
if ( n <= 0 ) return(0); if (n <= 0)
return (0);
if ( (inc_x == 1) && (inc_y == 1 )) if ((inc_x == 1) && (inc_y == 1)) {
{
BLASLONG n1 = n & -16; BLASLONG n1 = n & -16;
if ( n1 > 0 ) if (n1 > 0) {
{ zswap_kernel_16(n1, x, y);
zswap_kernel_16(n1, x, y); i = n1;
i=n1; ix = 2 * n1;
ix = 2* n1; iy = 2 * n1;
iy = 2* n1; }
}
while(i < n) while (i < n) {
{
temp[0] = x[ix] ; temp[0] = x[ix];
temp[1] = x[ix+1] ; temp[1] = x[ix + 1];
x[ix] = y[iy] ; x[ix] = y[iy];
x[ix+1] = y[iy+1] ; x[ix + 1] = y[iy + 1];
y[iy] = temp[0] ; y[iy] = temp[0];
y[iy+1] = temp[1] ; y[iy + 1] = temp[1];
ix += 2 ;
iy += 2 ;
i++ ;
}
ix += 2;
iy += 2;
i++;
} }
else
{
inc_x2 = 2 * inc_x; } else {
inc_y2 = 2 * inc_y;
while(i < n) inc_x2 = 2 * inc_x;
{ inc_y2 = 2 * inc_y;
temp[0] = x[ix] ; while (i < n) {
temp[1] = x[ix+1] ;
x[ix] = y[iy] ;
x[ix+1] = y[iy+1] ;
y[iy] = temp[0] ;
y[iy+1] = temp[1] ;
ix += inc_x2 ; temp[0] = x[ix];
iy += inc_y2 ; temp[1] = x[ix + 1];
i++ ; x[ix] = y[iy];
x[ix + 1] = y[iy + 1];
y[iy] = temp[0];
y[iy + 1] = temp[1];
} ix += inc_x2;
iy += inc_y2;
i++;
} }
return(0);
}
return (0);
} }