Merge pull request #2012 from maamountki/z14

[ZARCH] Many improvements
This commit is contained in:
Martin Kroeker 2019-02-13 20:15:56 +01:00 committed by GitHub
commit 76bb74fcd4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
67 changed files with 13503 additions and 14618 deletions

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -28,27 +28,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>
#if defined(DOUBLE) #define CABS1(x,i) (fabsf(x[i]) + fabsf(x[i + 1]))
#define ABS fabs
#else
#define ABS fabsf
#endif
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x) {
static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x)
{
FLOAT amax; FLOAT amax;
__asm__ volatile ( __asm__("vlef %%v0,0(%[x]),0\n\t"
"vlef %%v0,0(%2),0 \n\t" "vlef %%v16,4(%[x]),0\n\t"
"vlef %%v16,4(%2),0 \n\t" "vlef %%v0,8(%[x]),1\n\t"
"vlef %%v0,8(%2),1 \n\t" "vlef %%v16,12(%[x]),1\n\t"
"vlef %%v16,12(%2),1 \n\t" "vlef %%v0,16(%[x]),2\n\t"
"vlef %%v0,16(%2),2 \n\t" "vlef %%v16,20(%[x]),2\n\t"
"vlef %%v16,20(%2),2 \n\t" "vlef %%v0,24(%[x]),3\n\t"
"vlef %%v0,24(%2),3 \n\t" "vlef %%v16,28(%[x]),3\n\t"
"vlef %%v16,28(%2),3 \n\t"
"vflpsb %%v0,%%v0\n\t" "vflpsb %%v0,%%v0\n\t"
"vflpsb %%v16,%%v16\n\t" "vflpsb %%v16,%%v16\n\t"
"vfasb %%v0,%%v0,%%v16\n\t" "vfasb %%v0,%%v0,%%v16\n\t"
@ -68,51 +60,42 @@ static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x)
"vleib %%v1,25,13\n\t" "vleib %%v1,25,13\n\t"
"vleib %%v1,26,14\n\t" "vleib %%v1,26,14\n\t"
"vleib %%v1,27,15\n\t" "vleib %%v1,27,15\n\t"
"srlg %%r0,%1,5 \n\t" "srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"0:\n\t" "0:\n\t"
"pfd 1, 1024(%%r1,%2) \n\t" "pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%2) \n\t" "vl %%v2,16(%%r1,%[x])\n\t"
"vl %%v2,16(%%r1,%2) \n\t"
"vpkg %%v17,%%v16,%%v2\n\t" "vpkg %%v17,%%v16,%%v2\n\t"
"vperm %%v16,%%v16,%%v2,%%v1\n\t" "vperm %%v16,%%v16,%%v2,%%v1\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%2) \n\t" "vl %%v2,48(%%r1,%[x])\n\t"
"vl %%v2,48(%%r1,%2) \n\t"
"vpkg %%v19,%%v18,%%v2\n\t" "vpkg %%v19,%%v18,%%v2\n\t"
"vperm %%v18,%%v18,%%v2,%%v1\n\t" "vperm %%v18,%%v18,%%v2,%%v1\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%2) \n\t" "vl %%v2,80(%%r1,%[x])\n\t"
"vl %%v2,80(%%r1,%2) \n\t"
"vpkg %%v21,%%v20,%%v2\n\t" "vpkg %%v21,%%v20,%%v2\n\t"
"vperm %%v20,%%v20,%%v2,%%v1\n\t" "vperm %%v20,%%v20,%%v2,%%v1\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%2) \n\t" "vl %%v2,112(%%r1,%[x])\n\t"
"vl %%v2,112(%%r1,%2) \n\t"
"vpkg %%v23,%%v22,%%v2\n\t" "vpkg %%v23,%%v22,%%v2\n\t"
"vperm %%v22,%%v22,%%v2,%%v1\n\t" "vperm %%v22,%%v22,%%v2,%%v1\n\t"
"vl %%v24,128(%%r1,%[x])\n\t"
"vl %%v24,128(%%r1,%2) \n\t" "vl %%v2,144(%%r1,%[x])\n\t"
"vl %%v2,144(%%r1,%2) \n\t"
"vpkg %%v25,%%v24,%%v2\n\t" "vpkg %%v25,%%v24,%%v2\n\t"
"vperm %%v24,%%v24,%%v2,%%v1\n\t" "vperm %%v24,%%v24,%%v2,%%v1\n\t"
"vl %%v26,160(%%r1,%[x])\n\t"
"vl %%v26,160(%%r1,%2) \n\t" "vl %%v2,176(%%r1,%[x])\n\t"
"vl %%v2,176(%%r1,%2) \n\t"
"vpkg %%v27,%%v26,%%v2\n\t" "vpkg %%v27,%%v26,%%v2\n\t"
"vperm %%v26,%%v26,%%v2,%%v1\n\t" "vperm %%v26,%%v26,%%v2,%%v1\n\t"
"vl %%v28,192(%%r1,%[x])\n\t"
"vl %%v28,192(%%r1,%2) \n\t" "vl %%v2,208(%%r1,%[x])\n\t"
"vl %%v2,208(%%r1,%2) \n\t"
"vpkg %%v29,%%v28,%%v2\n\t" "vpkg %%v29,%%v28,%%v2\n\t"
"vperm %%v28,%%v28,%%v2,%%v1\n\t" "vperm %%v28,%%v28,%%v2,%%v1\n\t"
"vl %%v30,224(%%r1,%[x])\n\t"
"vl %%v30,224(%%r1,%2) \n\t" "vl %%v2,240(%%r1,%[x])\n\t"
"vl %%v2,240(%%r1,%2) \n\t"
"vpkg %%v31,%%v30,%%v2\n\t" "vpkg %%v31,%%v30,%%v2\n\t"
"vperm %%v30,%%v30,%%v2,%%v1\n\t" "vperm %%v30,%%v30,%%v2,%%v1\n\t"
"vflpsb %%v16,%%v16\n\t" "vflpsb %%v16,%%v16\n\t"
"vflpsb %%v17,%%v17\n\t" "vflpsb %%v17,%%v17\n\t"
"vflpsb %%v18,%%v18\n\t" "vflpsb %%v18,%%v18\n\t"
@ -129,7 +112,6 @@ static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x)
"vflpsb %%v29,%%v29\n\t" "vflpsb %%v29,%%v29\n\t"
"vflpsb %%v30,%%v30\n\t" "vflpsb %%v30,%%v30\n\t"
"vflpsb %%v31,%%v31\n\t" "vflpsb %%v31,%%v31\n\t"
"vfasb %%v16,%%v16,%%v17\n\t" "vfasb %%v16,%%v16,%%v17\n\t"
"vfasb %%v18,%%v18,%%v19\n\t" "vfasb %%v18,%%v18,%%v19\n\t"
"vfasb %%v20,%%v20,%%v21\n\t" "vfasb %%v20,%%v20,%%v21\n\t"
@ -138,32 +120,26 @@ static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x)
"vfasb %%v26,%%v26,%%v27\n\t" "vfasb %%v26,%%v26,%%v27\n\t"
"vfasb %%v28,%%v28,%%v29\n\t" "vfasb %%v28,%%v28,%%v29\n\t"
"vfasb %%v30,%%v30,%%v31\n\t" "vfasb %%v30,%%v30,%%v31\n\t"
"vfmaxsb %%v16,%%v16,%%v24,0\n\t" "vfmaxsb %%v16,%%v16,%%v24,0\n\t"
"vfmaxsb %%v18,%%v18,%%v26,0\n\t" "vfmaxsb %%v18,%%v18,%%v26,0\n\t"
"vfmaxsb %%v20,%%v20,%%v28,0\n\t" "vfmaxsb %%v20,%%v20,%%v28,0\n\t"
"vfmaxsb %%v22,%%v22,%%v30,0\n\t" "vfmaxsb %%v22,%%v22,%%v30,0\n\t"
"vfmaxsb %%v16,%%v16,%%v20,0\n\t" "vfmaxsb %%v16,%%v16,%%v20,0\n\t"
"vfmaxsb %%v18,%%v18,%%v22,0\n\t" "vfmaxsb %%v18,%%v18,%%v22,0\n\t"
"vfmaxsb %%v16,%%v16,%%v18,0\n\t" "vfmaxsb %%v16,%%v16,%%v18,0\n\t"
"vfmaxsb %%v0,%%v0,%%v16,0\n\t" "vfmaxsb %%v0,%%v0,%%v16,0\n\t"
"agfi %%r1, 256\n\t" "agfi %%r1, 256\n\t"
"brctg %%r0, 0b \n\t" "brctg %[n], 0b\n\t"
"veslg %%v16,%%v0,32\n\t" "veslg %%v16,%%v0,32\n\t"
"vfmaxsb %%v0,%%v0,%%v16,0\n\t" "vfmaxsb %%v0,%%v0,%%v16,0\n\t"
"vrepf %%v16,%%v0,2\n\t" "vrepf %%v16,%%v0,2\n\t"
"wfmaxsb %%v0,%%v0,%%v16,0\n\t" "wfmaxsb %%v0,%%v0,%%v16,0\n\t"
"ler %0,%%f0 " "ler %[amax],%%f0"
:"=f"(amax) : [amax] "=f"(amax),[n] "+&r"(n)
:"r"(n),"ZR"((const FLOAT (*)[n])x) : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
:"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20",
); "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
return amax; return amax;
} }
@ -174,7 +150,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
FLOAT maxf = 0.0; FLOAT maxf = 0.0;
BLASLONG inc_x2; BLASLONG inc_x2;
if (n <= 0 || inc_x <= 0) return (maxf); if (n <= 0 || inc_x <= 0)
return (maxf);
if (inc_x == 1) { if (inc_x == 1) {
@ -184,9 +161,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
maxf = camax_kernel_32(n1, x); maxf = camax_kernel_32(n1, x);
ix = n1 * 2; ix = n1 * 2;
i = n1; i = n1;
} } else {
else
{
maxf = CABS1(x, 0); maxf = CABS1(x, 0);
ix += 2; ix += 2;
i++; i++;
@ -228,7 +203,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
} }
while (i < n) { while (i < n) {
if (CABS1(x, ix) > maxf) { if (CABS1(x, ix) > maxf) {
maxf = CABS1(x, ix); maxf = CABS1(x, ix);

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -28,27 +28,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>
#if defined(DOUBLE) #define CABS1(x,i) (fabsf(x[i]) + fabsf(x[i + 1]))
#define ABS fabs
#else
#define ABS fabsf
#endif
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x) {
static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x)
{
FLOAT amin; FLOAT amin;
__asm__ volatile ( __asm__("vlef %%v0,0(%[x]),0\n\t"
"vlef %%v0,0(%2),0 \n\t" "vlef %%v16,4(%[x]),0\n\t"
"vlef %%v16,4(%2),0 \n\t" "vlef %%v0,8(%[x]),1\n\t"
"vlef %%v0,8(%2),1 \n\t" "vlef %%v16,12(%[x]),1\n\t"
"vlef %%v16,12(%2),1 \n\t" "vlef %%v0,16(%[x]),2\n\t"
"vlef %%v0,16(%2),2 \n\t" "vlef %%v16,20(%[x]),2\n\t"
"vlef %%v16,20(%2),2 \n\t" "vlef %%v0,24(%[x]),3\n\t"
"vlef %%v0,24(%2),3 \n\t" "vlef %%v16,28(%[x]),3\n\t"
"vlef %%v16,28(%2),3 \n\t"
"vflpsb %%v0,%%v0\n\t" "vflpsb %%v0,%%v0\n\t"
"vflpsb %%v16,%%v16\n\t" "vflpsb %%v16,%%v16\n\t"
"vfasb %%v0,%%v0,%%v16\n\t" "vfasb %%v0,%%v0,%%v16\n\t"
@ -68,51 +60,42 @@ static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x)
"vleib %%v1,25,13\n\t" "vleib %%v1,25,13\n\t"
"vleib %%v1,26,14\n\t" "vleib %%v1,26,14\n\t"
"vleib %%v1,27,15\n\t" "vleib %%v1,27,15\n\t"
"srlg %%r0,%1,5 \n\t" "srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"0:\n\t" "0:\n\t"
"pfd 1, 1024(%%r1,%2) \n\t" "pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%2) \n\t" "vl %%v2,16(%%r1,%[x])\n\t"
"vl %%v2,16(%%r1,%2) \n\t"
"vpkg %%v17,%%v16,%%v2\n\t" "vpkg %%v17,%%v16,%%v2\n\t"
"vperm %%v16,%%v16,%%v2,%%v1\n\t" "vperm %%v16,%%v16,%%v2,%%v1\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%2) \n\t" "vl %%v2,48(%%r1,%[x])\n\t"
"vl %%v2,48(%%r1,%2) \n\t"
"vpkg %%v19,%%v18,%%v2\n\t" "vpkg %%v19,%%v18,%%v2\n\t"
"vperm %%v18,%%v18,%%v2,%%v1\n\t" "vperm %%v18,%%v18,%%v2,%%v1\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%2) \n\t" "vl %%v2,80(%%r1,%[x])\n\t"
"vl %%v2,80(%%r1,%2) \n\t"
"vpkg %%v21,%%v20,%%v2\n\t" "vpkg %%v21,%%v20,%%v2\n\t"
"vperm %%v20,%%v20,%%v2,%%v1\n\t" "vperm %%v20,%%v20,%%v2,%%v1\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%2) \n\t" "vl %%v2,112(%%r1,%[x])\n\t"
"vl %%v2,112(%%r1,%2) \n\t"
"vpkg %%v23,%%v22,%%v2\n\t" "vpkg %%v23,%%v22,%%v2\n\t"
"vperm %%v22,%%v22,%%v2,%%v1\n\t" "vperm %%v22,%%v22,%%v2,%%v1\n\t"
"vl %%v24,128(%%r1,%[x])\n\t"
"vl %%v24,128(%%r1,%2) \n\t" "vl %%v2,144(%%r1,%[x])\n\t"
"vl %%v2,144(%%r1,%2) \n\t"
"vpkg %%v25,%%v24,%%v2\n\t" "vpkg %%v25,%%v24,%%v2\n\t"
"vperm %%v24,%%v24,%%v2,%%v1\n\t" "vperm %%v24,%%v24,%%v2,%%v1\n\t"
"vl %%v26,160(%%r1,%[x])\n\t"
"vl %%v26,160(%%r1,%2) \n\t" "vl %%v2,176(%%r1,%[x])\n\t"
"vl %%v2,176(%%r1,%2) \n\t"
"vpkg %%v27,%%v26,%%v2\n\t" "vpkg %%v27,%%v26,%%v2\n\t"
"vperm %%v26,%%v26,%%v2,%%v1\n\t" "vperm %%v26,%%v26,%%v2,%%v1\n\t"
"vl %%v28,192(%%r1,%[x])\n\t"
"vl %%v28,192(%%r1,%2) \n\t" "vl %%v2,208(%%r1,%[x])\n\t"
"vl %%v2,208(%%r1,%2) \n\t"
"vpkg %%v29,%%v28,%%v2\n\t" "vpkg %%v29,%%v28,%%v2\n\t"
"vperm %%v28,%%v28,%%v2,%%v1\n\t" "vperm %%v28,%%v28,%%v2,%%v1\n\t"
"vl %%v30,224(%%r1,%[x])\n\t"
"vl %%v30,224(%%r1,%2) \n\t" "vl %%v2,240(%%r1,%[x])\n\t"
"vl %%v2,240(%%r1,%2) \n\t"
"vpkg %%v31,%%v30,%%v2\n\t" "vpkg %%v31,%%v30,%%v2\n\t"
"vperm %%v30,%%v30,%%v2,%%v1\n\t" "vperm %%v30,%%v30,%%v2,%%v1\n\t"
"vflpsb %%v16,%%v16\n\t" "vflpsb %%v16,%%v16\n\t"
"vflpsb %%v17,%%v17\n\t" "vflpsb %%v17,%%v17\n\t"
"vflpsb %%v18,%%v18\n\t" "vflpsb %%v18,%%v18\n\t"
@ -129,7 +112,6 @@ static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x)
"vflpsb %%v29,%%v29\n\t" "vflpsb %%v29,%%v29\n\t"
"vflpsb %%v30,%%v30\n\t" "vflpsb %%v30,%%v30\n\t"
"vflpsb %%v31,%%v31\n\t" "vflpsb %%v31,%%v31\n\t"
"vfasb %%v16,%%v16,%%v17\n\t" "vfasb %%v16,%%v16,%%v17\n\t"
"vfasb %%v18,%%v18,%%v19\n\t" "vfasb %%v18,%%v18,%%v19\n\t"
"vfasb %%v20,%%v20,%%v21\n\t" "vfasb %%v20,%%v20,%%v21\n\t"
@ -138,32 +120,26 @@ static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x)
"vfasb %%v26,%%v26,%%v27\n\t" "vfasb %%v26,%%v26,%%v27\n\t"
"vfasb %%v28,%%v28,%%v29\n\t" "vfasb %%v28,%%v28,%%v29\n\t"
"vfasb %%v30,%%v30,%%v31\n\t" "vfasb %%v30,%%v30,%%v31\n\t"
"vfminsb %%v16,%%v16,%%v24,0\n\t" "vfminsb %%v16,%%v16,%%v24,0\n\t"
"vfminsb %%v18,%%v18,%%v26,0\n\t" "vfminsb %%v18,%%v18,%%v26,0\n\t"
"vfminsb %%v20,%%v20,%%v28,0\n\t" "vfminsb %%v20,%%v20,%%v28,0\n\t"
"vfminsb %%v22,%%v22,%%v30,0\n\t" "vfminsb %%v22,%%v22,%%v30,0\n\t"
"vfminsb %%v16,%%v16,%%v20,0\n\t" "vfminsb %%v16,%%v16,%%v20,0\n\t"
"vfminsb %%v18,%%v18,%%v22,0\n\t" "vfminsb %%v18,%%v18,%%v22,0\n\t"
"vfminsb %%v16,%%v16,%%v18,0\n\t" "vfminsb %%v16,%%v16,%%v18,0\n\t"
"vfminsb %%v0,%%v0,%%v16,0\n\t" "vfminsb %%v0,%%v0,%%v16,0\n\t"
"agfi %%r1, 256\n\t" "agfi %%r1, 256\n\t"
"brctg %%r0, 0b \n\t" "brctg %[n], 0b\n\t"
"veslg %%v16,%%v0,32\n\t" "veslg %%v16,%%v0,32\n\t"
"vfminsb %%v0,%%v0,%%v16,0\n\t" "vfminsb %%v0,%%v0,%%v16,0\n\t"
"vrepf %%v16,%%v0,2\n\t" "vrepf %%v16,%%v0,2\n\t"
"wfminsb %%v0,%%v0,%%v16,0\n\t" "wfminsb %%v0,%%v0,%%v16,0\n\t"
"ler %0,%%f0 " "ler %[amin],%%f0"
:"=f"(amin) : [amin] "=f"(amin),[n] "+&r"(n)
:"r"(n),"ZR"((const FLOAT (*)[n])x) : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
:"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20",
); "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
return amin; return amin;
} }
@ -174,7 +150,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
FLOAT minf = 0.0; FLOAT minf = 0.0;
BLASLONG inc_x2; BLASLONG inc_x2;
if (n <= 0 || inc_x <= 0) return (minf); if (n <= 0 || inc_x <= 0)
return (minf);
if (inc_x == 1) { if (inc_x == 1) {
@ -184,9 +161,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
minf = camin_kernel_32(n1, x); minf = camin_kernel_32(n1, x);
ix = n1 * 2; ix = n1 * 2;
i = n1; i = n1;
} } else {
else
{
minf = CABS1(x, 0); minf = CABS1(x, 0);
ix += 2; ix += 2;
i++; i++;
@ -228,7 +203,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
} }
while (i < n) { while (i < n) {
if (CABS1(x, ix) < minf) { if (CABS1(x, ix) < minf) {
minf = CABS1(x, ix); minf = CABS1(x, ix);

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -28,34 +28,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf #define ABS fabsf
#endif
static FLOAT casum_kernel_32(BLASLONG n, FLOAT *x) static FLOAT casum_kernel_32(BLASLONG n, FLOAT *x) {
{
FLOAT asum; FLOAT asum;
__asm__ ( __asm__("vzero %%v24\n\t"
"vzero %%v0 \n\t" "vzero %%v25\n\t"
"vzero %%v1 \n\t" "vzero %%v26\n\t"
"vzero %%v2 \n\t" "vzero %%v27\n\t"
"vzero %%v3 \n\t" "vzero %%v28\n\t"
"srlg %%r0,%1,5 \n\t" "vzero %%v29\n\t"
"vzero %%v30\n\t"
"vzero %%v31\n\t"
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"0:\n\t" "0:\n\t"
"pfd 1, 1024(%%r1,%2) \n\t" "pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16, 0(%%r1,%2) \n\t" "vl %%v16, 0(%%r1,%[x])\n\t"
"vl %%v17, 16(%%r1,%2) \n\t" "vl %%v17, 16(%%r1,%[x])\n\t"
"vl %%v18, 32(%%r1,%2) \n\t" "vl %%v18, 32(%%r1,%[x])\n\t"
"vl %%v19, 48(%%r1,%2) \n\t" "vl %%v19, 48(%%r1,%[x])\n\t"
"vl %%v20, 64(%%r1,%2) \n\t" "vl %%v20, 64(%%r1,%[x])\n\t"
"vl %%v21, 80(%%r1,%2) \n\t" "vl %%v21, 80(%%r1,%[x])\n\t"
"vl %%v22, 96(%%r1,%2) \n\t" "vl %%v22, 96(%%r1,%[x])\n\t"
"vl %%v23, 112(%%r1,%2) \n\t" "vl %%v23, 112(%%r1,%[x])\n\t"
"vflpsb %%v16, %%v16\n\t" "vflpsb %%v16, %%v16\n\t"
"vflpsb %%v17, %%v17\n\t" "vflpsb %%v17, %%v17\n\t"
"vflpsb %%v18, %%v18\n\t" "vflpsb %%v18, %%v18\n\t"
@ -64,25 +61,22 @@ static FLOAT casum_kernel_32(BLASLONG n, FLOAT *x)
"vflpsb %%v21, %%v21\n\t" "vflpsb %%v21, %%v21\n\t"
"vflpsb %%v22, %%v22\n\t" "vflpsb %%v22, %%v22\n\t"
"vflpsb %%v23, %%v23\n\t" "vflpsb %%v23, %%v23\n\t"
"vfasb %%v24,%%v24,%%v16\n\t"
"vfasb %%v0,%%v0,%%v16 \n\t" "vfasb %%v25,%%v25,%%v17\n\t"
"vfasb %%v1,%%v1,%%v17 \n\t" "vfasb %%v26,%%v26,%%v18\n\t"
"vfasb %%v2,%%v2,%%v18 \n\t" "vfasb %%v27,%%v27,%%v19\n\t"
"vfasb %%v3,%%v3,%%v19 \n\t" "vfasb %%v28,%%v28,%%v20\n\t"
"vfasb %%v0,%%v0,%%v20 \n\t" "vfasb %%v29,%%v29,%%v21\n\t"
"vfasb %%v1,%%v1,%%v21 \n\t" "vfasb %%v30,%%v30,%%v22\n\t"
"vfasb %%v2,%%v2,%%v22 \n\t" "vfasb %%v31,%%v31,%%v23\n\t"
"vfasb %%v3,%%v3,%%v23 \n\t" "vl %%v16, 128(%%r1,%[x])\n\t"
"vl %%v17, 144(%%r1,%[x])\n\t"
"vl %%v16, 128(%%r1,%2) \n\t" "vl %%v18, 160(%%r1,%[x])\n\t"
"vl %%v17, 144(%%r1,%2) \n\t" "vl %%v19, 176(%%r1,%[x])\n\t"
"vl %%v18, 160(%%r1,%2) \n\t" "vl %%v20, 192(%%r1,%[x])\n\t"
"vl %%v19, 176(%%r1,%2) \n\t" "vl %%v21, 208(%%r1,%[x])\n\t"
"vl %%v20, 192(%%r1,%2) \n\t" "vl %%v22, 224(%%r1,%[x])\n\t"
"vl %%v21, 208(%%r1,%2) \n\t" "vl %%v23, 240(%%r1,%[x])\n\t"
"vl %%v22, 224(%%r1,%2) \n\t"
"vl %%v23, 240(%%r1,%2) \n\t"
"vflpsb %%v16, %%v16\n\t" "vflpsb %%v16, %%v16\n\t"
"vflpsb %%v17, %%v17\n\t" "vflpsb %%v17, %%v17\n\t"
"vflpsb %%v18, %%v18\n\t" "vflpsb %%v18, %%v18\n\t"
@ -91,70 +85,66 @@ static FLOAT casum_kernel_32(BLASLONG n, FLOAT *x)
"vflpsb %%v21, %%v21\n\t" "vflpsb %%v21, %%v21\n\t"
"vflpsb %%v22, %%v22\n\t" "vflpsb %%v22, %%v22\n\t"
"vflpsb %%v23, %%v23\n\t" "vflpsb %%v23, %%v23\n\t"
"vfasb %%v24,%%v24,%%v16\n\t"
"vfasb %%v0,%%v0,%%v16 \n\t" "vfasb %%v25,%%v25,%%v17\n\t"
"vfasb %%v1,%%v1,%%v17 \n\t" "vfasb %%v26,%%v26,%%v18\n\t"
"vfasb %%v2,%%v2,%%v18 \n\t" "vfasb %%v27,%%v27,%%v19\n\t"
"vfasb %%v3,%%v3,%%v19 \n\t" "vfasb %%v28,%%v28,%%v20\n\t"
"vfasb %%v0,%%v0,%%v20 \n\t" "vfasb %%v29,%%v29,%%v21\n\t"
"vfasb %%v1,%%v1,%%v21 \n\t" "vfasb %%v30,%%v30,%%v22\n\t"
"vfasb %%v2,%%v2,%%v22 \n\t" "vfasb %%v31,%%v31,%%v23\n\t"
"vfasb %%v3,%%v3,%%v23 \n\t"
"agfi %%r1,256\n\t" "agfi %%r1,256\n\t"
"brctg %%r0,0b \n\t" "brctg %[n],0b\n\t"
"vfasb %%v0,%%v0,%%v1 \n\t" "vfasb %%v24,%%v24,%%v25\n\t"
"vfasb %%v0,%%v0,%%v2 \n\t" "vfasb %%v24,%%v24,%%v26\n\t"
"vfasb %%v0,%%v0,%%v3 \n\t" "vfasb %%v24,%%v24,%%v27\n\t"
"veslg %%v1,%%v0,32 \n\t" "vfasb %%v24,%%v24,%%v28\n\t"
"vfasb %%v0,%%v0,%%v1 \n\t" "vfasb %%v24,%%v24,%%v29\n\t"
"vrepf %%v1,%%v0,2 \n\t" "vfasb %%v24,%%v24,%%v30\n\t"
"aebr %%f0,%%f1 \n\t" "vfasb %%v24,%%v24,%%v31\n\t"
"ler %0,%%f0 " "veslg %%v25,%%v24,32\n\t"
:"=f"(asum) "vfasb %%v24,%%v24,%%v25\n\t"
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x) "vrepf %%v25,%%v24,2\n\t"
:"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23" "vfasb %%v24,%%v24,%%v25\n\t"
); "vstef %%v24,%[asum],0"
: [asum] "=Q"(asum),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
: "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
"v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
return asum; return asum;
} }
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
{
BLASLONG i = 0; BLASLONG i = 0;
BLASLONG ip = 0; BLASLONG ip = 0;
FLOAT sumf = 0.0; FLOAT sumf = 0.0;
BLASLONG n1; BLASLONG n1;
BLASLONG inc_x2; BLASLONG inc_x2;
if (n <= 0 || inc_x <= 0) return(sumf); if (n <= 0 || inc_x <= 0)
return (sumf);
if ( inc_x == 1 ) if (inc_x == 1) {
{
n1 = n & -32; n1 = n & -32;
if ( n1 > 0 ) if (n1 > 0) {
{
sumf = casum_kernel_32(n1, x); sumf = casum_kernel_32(n1, x);
i = n1; i = n1;
ip = 2 * n1; ip = 2 * n1;
} }
while(i < n) while (i < n) {
{
sumf += ABS(x[ip]) + ABS(x[ip + 1]); sumf += ABS(x[ip]) + ABS(x[ip + 1]);
i++; i++;
ip += 2; ip += 2;
} }
} } else {
else
{
inc_x2 = 2 * inc_x; inc_x2 = 2 * inc_x;
while(i < n) while (i < n) {
{
sumf += ABS(x[ip]) + ABS(x[ip + 1]); sumf += ABS(x[ip]) + ABS(x[ip + 1]);
ip += inc_x2; ip += inc_x2;
i++; i++;
@ -163,5 +153,3 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
} }
return (sumf); return (sumf);
} }

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -27,100 +27,95 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
/*
 * caxpy_kernel_16: y += alpha * x over interleaved (re, im) pairs,
 * 16 pairs (128 bytes) per loop pass.
 *
 * alpha points at two FLOATs {ar, ai}. Two multiplier vectors are built
 * before the loop:
 *   without CONJ: v0 = { ar,  ar,  ar,  ar },  v1 = { -ai, ai, -ai, ai }
 *   with CONJ:    v0 = { ar, -ar,  ar, -ar },  v1 = {  ai, ai,  ai, ai }
 * The sign flip (vflcsb) negates the whole register after only the lanes
 * that must end up negative have been loaded; the other lanes are loaded
 * afterwards and therefore keep their sign.
 *
 * Inside the loop, "verllg ..,32" rotates every 64-bit pair by 32 bits,
 * i.e. swaps re and im of each x element. Two multiply-adds (vfmasb) then
 * compute  y = x * v0 + y  followed by  y = swap(x) * v1 + y, giving
 *   without CONJ: yr += ar*xr - ai*xi,  yi += ar*xi + ai*xr
 *   with CONJ:    yr += ar*xr + ai*xi,  yi += ai*xr - ar*xi
 *
 * n is shifted right by 4 to obtain the pass count and %r1 is the running
 * byte offset (advanced by 128). The counter is tested at the bottom of the
 * loop (brctg), so the body always runs at least once; n is presumably a
 * non-zero multiple of 16 -- the caller is not fully visible here, confirm.
 *
 * The "sb" (short) vector instructions and 4-byte lane loads imply FLOAT is
 * a 32-bit float in this build.
 */
static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) {
{ __asm__(
__asm__ volatile(
#if !defined(CONJ) #if !defined(CONJ)
"vlrepf %%v0,0(%3) \n\t" "vlrepf %%v0,0(%[alpha])\n\t"
"vlef %%v1,4(%3),0 \n\t" "vlef %%v1,4(%[alpha]),0\n\t"
"vlef %%v1,4(%3),2 \n\t" "vlef %%v1,4(%[alpha]),2\n\t"
"vflcsb %%v1,%%v1\n\t" "vflcsb %%v1,%%v1\n\t"
"vlef %%v1,4(%3),1 \n\t" "vlef %%v1,4(%[alpha]),1\n\t"
"vlef %%v1,4(%3),3 \n\t" "vlef %%v1,4(%[alpha]),3\n\t"
#else #else
"vlef %%v0,0(%3),1 \n\t" "vlef %%v0,0(%[alpha]),1\n\t"
"vlef %%v0,0(%3),3 \n\t" "vlef %%v0,0(%[alpha]),3\n\t"
"vflcsb %%v0,%%v0\n\t" "vflcsb %%v0,%%v0\n\t"
"vlef %%v0,0(%3),0 \n\t" "vlef %%v0,0(%[alpha]),0\n\t"
"vlef %%v0,0(%3),2 \n\t" "vlef %%v0,0(%[alpha]),2\n\t"
"vlrepf %%v1,4(%3) \n\t" "vlrepf %%v1,4(%[alpha])\n\t"
#endif #endif
"srlg %%r0,%0,4 \n\t" "srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
/* Loop head: one pass handles the 128 bytes of x and y at offset %r1. */
"0:\n\t" "0:\n\t"
"pfd 1, 1024(%%r1,%1) \n\t" "pfd 1, 1024(%%r1,%[x])\n\t"
"pfd 2, 1024(%%r1,%2) \n\t" "pfd 2, 1024(%%r1,%[y])\n\t"
"vl %%v8,0(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%1) \n\t" "vl %%v9,16(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%1) \n\t" "vl %%v10,32(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%1) \n\t" "vl %%v11,48(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%1) \n\t" "vl %%v12,0(%%r1,%[y])\n\t"
"vl %%v20,0(%%r1,%2) \n\t" "vl %%v13,16(%%r1,%[y])\n\t"
"vl %%v21,16(%%r1,%2) \n\t" "vl %%v14,32(%%r1,%[y])\n\t"
"vl %%v22,32(%%r1,%2) \n\t" "vl %%v15,48(%%r1,%[y])\n\t"
"vl %%v23,48(%%r1,%2) \n\t" "vl %%v16,64(%%r1,%[x])\n\t"
"verllg %%v24,%%v16,32 \n\t" "vl %%v17,80(%%r1,%[x])\n\t"
"verllg %%v25,%%v17,32 \n\t" "vl %%v18,96(%%r1,%[x])\n\t"
"verllg %%v26,%%v18,32 \n\t" "vl %%v19,112(%%r1,%[x])\n\t"
"verllg %%v27,%%v19,32 \n\t" "vl %%v20,64(%%r1,%[y])\n\t"
"vl %%v21,80(%%r1,%[y])\n\t"
"vfmasb %%v28,%%v16,%%v0,%%v20 \n\t" "vl %%v22,96(%%r1,%[y])\n\t"
"vfmasb %%v29,%%v17,%%v0,%%v21 \n\t" "vl %%v23,112(%%r1,%[y])\n\t"
"vfmasb %%v30,%%v18,%%v0,%%v22 \n\t" "verllg %%v24,%%v8,32\n\t"
"vfmasb %%v31,%%v19,%%v0,%%v23 \n\t" "verllg %%v25,%%v9,32\n\t"
"verllg %%v26,%%v10,32\n\t"
"vfmasb %%v28,%%v24,%%v1,%%v28 \n\t" "verllg %%v27,%%v11,32\n\t"
"vfmasb %%v29,%%v25,%%v1,%%v29 \n\t" "verllg %%v28,%%v16,32\n\t"
"vfmasb %%v30,%%v26,%%v1,%%v30 \n\t" "verllg %%v29,%%v17,32\n\t"
"vfmasb %%v31,%%v27,%%v1,%%v31 \n\t" "verllg %%v30,%%v18,32\n\t"
"verllg %%v31,%%v19,32\n\t"
"vst %%v28,0(%%r1,%2) \n\t" "vfmasb %%v8,%%v8,%%v0,%%v12\n\t"
"vst %%v29,16(%%r1,%2) \n\t" "vfmasb %%v9,%%v9,%%v0,%%v13\n\t"
"vst %%v30,32(%%r1,%2) \n\t" "vfmasb %%v10,%%v10,%%v0,%%v14\n\t"
"vst %%v31,48(%%r1,%2) \n\t" "vfmasb %%v11,%%v11,%%v0,%%v15\n\t"
"vfmasb %%v16,%%v16,%%v0,%%v20\n\t"
"vl %%v16,64(%%r1,%1) \n\t" "vfmasb %%v17,%%v17,%%v0,%%v21\n\t"
"vl %%v17,80(%%r1,%1) \n\t" "vfmasb %%v18,%%v18,%%v0,%%v22\n\t"
"vl %%v18,96(%%r1,%1) \n\t" "vfmasb %%v19,%%v19,%%v0,%%v23\n\t"
"vl %%v19,112(%%r1,%1) \n\t" "vfmasb %%v8,%%v24,%%v1,%%v8\n\t"
"vl %%v20,64(%%r1,%2) \n\t" "vfmasb %%v9,%%v25,%%v1,%%v9\n\t"
"vl %%v21,80(%%r1,%2) \n\t" "vfmasb %%v10,%%v26,%%v1,%%v10\n\t"
"vl %%v22,96(%%r1,%2) \n\t" "vfmasb %%v11,%%v27,%%v1,%%v11\n\t"
"vl %%v23,112(%%r1,%2) \n\t" "vfmasb %%v16,%%v28,%%v1,%%v16\n\t"
"verllg %%v24,%%v16,32 \n\t" "vfmasb %%v17,%%v29,%%v1,%%v17\n\t"
"verllg %%v25,%%v17,32 \n\t" "vfmasb %%v18,%%v30,%%v1,%%v18\n\t"
"verllg %%v26,%%v18,32 \n\t" "vfmasb %%v19,%%v31,%%v1,%%v19\n\t"
"verllg %%v27,%%v19,32 \n\t" "vst %%v8,0(%%r1,%[y])\n\t"
"vst %%v9,16(%%r1,%[y])\n\t"
"vfmasb %%v28,%%v16,%%v0,%%v20 \n\t" "vst %%v10,32(%%r1,%[y])\n\t"
"vfmasb %%v29,%%v17,%%v0,%%v21 \n\t" "vst %%v11,48(%%r1,%[y])\n\t"
"vfmasb %%v30,%%v18,%%v0,%%v22 \n\t" "vst %%v16,64(%%r1,%[y])\n\t"
"vfmasb %%v31,%%v19,%%v0,%%v23 \n\t" "vst %%v17,80(%%r1,%[y])\n\t"
"vst %%v18,96(%%r1,%[y])\n\t"
"vfmasb %%v28,%%v24,%%v1,%%v28 \n\t" "vst %%v19,112(%%r1,%[y])\n\t"
"vfmasb %%v29,%%v25,%%v1,%%v29 \n\t"
"vfmasb %%v30,%%v26,%%v1,%%v30 \n\t"
"vfmasb %%v31,%%v27,%%v1,%%v31 \n\t"
"vst %%v28,64(%%r1,%2) \n\t"
"vst %%v29,80(%%r1,%2) \n\t"
"vst %%v30,96(%%r1,%2) \n\t"
"vst %%v31,112(%%r1,%2) \n\t"
"agfi %%r1,128\n\t" "agfi %%r1,128\n\t"
"brctg %%r0,0b " "brctg %[n],0b"
: : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n)
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"ZQ"((const FLOAT (*)[2])alpha) : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x),
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" "m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha)
); : "cc", "r1", "v0", "v1", "v8", "v9", "v10", "v11", "v12", "v13",
"v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
"v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
} }
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
BLASLONG dummy2) {
BLASLONG i = 0; BLASLONG i = 0;
BLASLONG ix = 0, iy = 0; BLASLONG ix = 0, iy = 0;
FLOAT da[2] __attribute__ ((aligned(16))); FLOAT da[2] __attribute__ ((aligned(16)));
if (n <= 0) return (0); if (n <= 0)
return (0);
if ((inc_x == 1) && (inc_y == 1)) { if ((inc_x == 1) && (inc_y == 1)) {
@ -147,7 +142,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
} }
return (0); return (0);
} }
inc_x *= 2; inc_x *= 2;
@ -170,5 +164,3 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
return (0); return (0);
} }

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -27,46 +27,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
/*
 * ccopy_kernel_32: bulk copy from x to y in 256-byte steps.
 *
 * n is shifted right by 5 to obtain the loop count; every pass copies 256
 * bytes with a single mvc and then advances both addresses by 256. The
 * memory operands describe n * 2 FLOATs (n (re, im) pairs), so one pass
 * covers 32 pairs -- which implies an 8-byte pair, i.e. presumably a
 * 4-byte FLOAT in this build (confirm against common.h).
 *
 * The counter is tested at the bottom of the loop (brctg), so the body
 * always runs at least once: n must be at least 32. CNAME below calls this
 * with n & -32 and only when that value is > 0.
 *
 * x is the source, y the destination.
 */
static void ccopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) static void ccopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) {
{ __asm__("srlg %[n],%[n],5\n\t"
__asm__ volatile (
"lgr %%r1,%1 \n\t"
"lgr %%r2,%2 \n\t"
"srlg %%r0,%0,5 \n\t"
/* Loop head: prefetch 1024 bytes ahead, copy 256 bytes, bump both pointers. */
"0:\n\t" "0:\n\t"
"pfd 1, 1024(%%r1) \n\t" "pfd 1, 1024(%[x])\n\t"
"pfd 2, 1024(%%r2) \n\t" "pfd 2, 1024(%[y])\n\t"
"mvc 0(256,%%r2),0(%%r1) \n\t" "mvc 0(256,%[y]),0(%[x])\n\t"
"agfi %%r1,256 \n\t" "la %[x],256(%[x])\n\t"
"agfi %%r2,256 \n\t" "la %[y],256(%[y])\n\t"
"brctg %%r0,0b " "brctg %[n],0b"
: : "=m"(*(struct { FLOAT x[n * 2]; } *) y),[x] "+&a"(x),[y] "+&a"(y),
:"r"(n),"a"((const FLOAT (*)[n * 2])x),"a"((FLOAT (*)[n * 2])y) [n] "+&r"(n)
:"memory","cc","r0","r1","r2" : "m"(*(const struct { FLOAT x[n * 2]; } *) x)
); : "cc");
} }
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
{
BLASLONG i = 0; BLASLONG i = 0;
BLASLONG ix = 0, iy = 0; BLASLONG ix = 0, iy = 0;
if ( n <= 0 ) return(0); if (n <= 0)
return (0);
if ( (inc_x == 1) && (inc_y == 1 )) if ((inc_x == 1) && (inc_y == 1)) {
{
BLASLONG n1 = n & -32; BLASLONG n1 = n & -32;
if ( n1 > 0 ) if (n1 > 0) {
{
ccopy_kernel_32(n1, x, y); ccopy_kernel_32(n1, x, y);
i = n1; i = n1;
ix = n1 * 2; ix = n1 * 2;
iy = n1 * 2; iy = n1 * 2;
} }
while(i < n) while (i < n) {
{
y[iy] = x[iy]; y[iy] = x[iy];
y[iy + 1] = x[ix + 1]; y[iy + 1] = x[ix + 1];
ix += 2; ix += 2;
@ -75,16 +68,12 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
} }
} else {
}
else
{
BLASLONG inc_x2 = 2 * inc_x; BLASLONG inc_x2 = 2 * inc_x;
BLASLONG inc_y2 = 2 * inc_y; BLASLONG inc_y2 = 2 * inc_y;
while(i < n) while (i < n) {
{
y[iy] = x[ix]; y[iy] = x[ix];
y[iy + 1] = x[ix + 1]; y[iy + 1] = x[ix + 1];
ix += inc_x2; ix += inc_x2;

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -27,10 +27,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
static void cdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) static void cdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) {
{ __asm__("vzero %%v24\n\t"
__asm__ volatile(
"vzero %%v24 \n\t"
"vzero %%v25\n\t" "vzero %%v25\n\t"
"vzero %%v26\n\t" "vzero %%v26\n\t"
"vzero %%v27\n\t" "vzero %%v27\n\t"
@ -38,25 +36,23 @@ static void cdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
"vzero %%v29\n\t" "vzero %%v29\n\t"
"vzero %%v30\n\t" "vzero %%v30\n\t"
"vzero %%v31\n\t" "vzero %%v31\n\t"
"srlg %%r0,%0,4 \n\t" "srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"0:\n\t" "0:\n\t"
"pfd 1, 1024(%%r1,%1) \n\t" "pfd 1, 1024(%%r1,%[x])\n\t"
"pfd 1, 1024(%%r1,%2) \n\t" "pfd 1, 1024(%%r1,%[y])\n\t"
"vl %%v16, 0(%%r1,%[x])\n\t"
"vl %%v16, 0(%%r1,%1) \n\t" "vl %%v17, 16(%%r1,%[x])\n\t"
"vl %%v17, 16(%%r1,%1) \n\t" "vl %%v18, 32(%%r1,%[x])\n\t"
"vl %%v18, 32(%%r1,%1) \n\t" "vl %%v19, 48(%%r1,%[x])\n\t"
"vl %%v19, 48(%%r1,%1) \n\t" "vl %%v0, 0(%%r1,%[y])\n\t"
"vl %%v0, 0(%%r1,%2) \n\t" "vl %%v1, 16(%%r1,%[y])\n\t"
"vl %%v1, 16(%%r1,%2) \n\t" "vl %%v2, 32(%%r1,%[y])\n\t"
"vl %%v2, 32(%%r1,%2) \n\t" "vl %%v3, 48(%%r1,%[y])\n\t"
"vl %%v3, 48(%%r1,%2) \n\t"
"verllg %%v20,%%v16,32\n\t" "verllg %%v20,%%v16,32\n\t"
"verllg %%v21,%%v17,32\n\t" "verllg %%v21,%%v17,32\n\t"
"verllg %%v22,%%v18,32\n\t" "verllg %%v22,%%v18,32\n\t"
"verllg %%v23,%%v19,32\n\t" "verllg %%v23,%%v19,32\n\t"
"vfmasb %%v24,%%v16,%%v0,%%v24\n\t" "vfmasb %%v24,%%v16,%%v0,%%v24\n\t"
"vfmasb %%v25,%%v20,%%v0,%%v25\n\t" "vfmasb %%v25,%%v20,%%v0,%%v25\n\t"
"vfmasb %%v26,%%v17,%%v1,%%v26\n\t" "vfmasb %%v26,%%v17,%%v1,%%v26\n\t"
@ -65,20 +61,18 @@ static void cdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
"vfmasb %%v29,%%v22,%%v2,%%v29\n\t" "vfmasb %%v29,%%v22,%%v2,%%v29\n\t"
"vfmasb %%v30,%%v19,%%v3,%%v30\n\t" "vfmasb %%v30,%%v19,%%v3,%%v30\n\t"
"vfmasb %%v31,%%v23,%%v3,%%v31\n\t" "vfmasb %%v31,%%v23,%%v3,%%v31\n\t"
"vl %%v16, 64(%%r1,%[x])\n\t"
"vl %%v16, 64(%%r1,%1) \n\t" "vl %%v17, 80(%%r1,%[x])\n\t"
"vl %%v17, 80(%%r1,%1) \n\t" "vl %%v18, 96(%%r1,%[x])\n\t"
"vl %%v18, 96(%%r1,%1) \n\t" "vl %%v19, 112(%%r1,%[x])\n\t"
"vl %%v19, 112(%%r1,%1) \n\t" "vl %%v0, 64(%%r1,%[y])\n\t"
"vl %%v0, 64(%%r1,%2) \n\t" "vl %%v1, 80(%%r1,%[y])\n\t"
"vl %%v1, 80(%%r1,%2) \n\t" "vl %%v2, 96(%%r1,%[y])\n\t"
"vl %%v2, 96(%%r1,%2) \n\t" "vl %%v3, 112(%%r1,%[y])\n\t"
"vl %%v3, 112(%%r1,%2) \n\t"
"verllg %%v20,%%v16,32\n\t" "verllg %%v20,%%v16,32\n\t"
"verllg %%v21,%%v17,32\n\t" "verllg %%v21,%%v17,32\n\t"
"verllg %%v22,%%v18,32\n\t" "verllg %%v22,%%v18,32\n\t"
"verllg %%v23,%%v19,32\n\t" "verllg %%v23,%%v19,32\n\t"
"vfmasb %%v24,%%v16,%%v0,%%v24\n\t" "vfmasb %%v24,%%v16,%%v0,%%v24\n\t"
"vfmasb %%v25,%%v20,%%v0,%%v25\n\t" "vfmasb %%v25,%%v20,%%v0,%%v25\n\t"
"vfmasb %%v26,%%v17,%%v1,%%v26\n\t" "vfmasb %%v26,%%v17,%%v1,%%v26\n\t"
@ -87,9 +81,8 @@ static void cdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
"vfmasb %%v29,%%v22,%%v2,%%v29\n\t" "vfmasb %%v29,%%v22,%%v2,%%v29\n\t"
"vfmasb %%v30,%%v19,%%v3,%%v30\n\t" "vfmasb %%v30,%%v19,%%v3,%%v30\n\t"
"vfmasb %%v31,%%v23,%%v3,%%v31\n\t" "vfmasb %%v31,%%v23,%%v3,%%v31\n\t"
"agfi %%r1,128\n\t" "agfi %%r1,128\n\t"
"brctg %%r0,0b \n\t" "brctg %[n],0b\n\t"
"vfasb %%v24,%%v24,%%v26\n\t" "vfasb %%v24,%%v24,%%v26\n\t"
"vfasb %%v24,%%v24,%%v28\n\t" "vfasb %%v24,%%v24,%%v28\n\t"
"vfasb %%v24,%%v24,%%v30\n\t" "vfasb %%v24,%%v24,%%v30\n\t"
@ -100,21 +93,25 @@ static void cdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
"vfasb %%v25,%%v25,%%v31\n\t" "vfasb %%v25,%%v25,%%v31\n\t"
"vrepg %%v27,%%v25,1\n\t" "vrepg %%v27,%%v25,1\n\t"
"vfasb %%v25,%%v25,%%v27\n\t" "vfasb %%v25,%%v25,%%v27\n\t"
"vstef %%v24,0(%3),0 \n\t" "vstef %%v24,0(%[d]),0\n\t"
"vstef %%v24,4(%3),1 \n\t" "vstef %%v24,4(%[d]),1\n\t"
"vstef %%v25,8(%3),1 \n\t" "vstef %%v25,8(%[d]),1\n\t"
"vstef %%v25,12(%3),0 " "vstef %%v25,12(%[d]),0"
: : "=m"(*(struct { FLOAT x[4]; } *) d),[n] "+&r"(n)
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((const FLOAT (*)[n * 2])y),"ZQ"((FLOAT (*)[4])d) : [d] "a"(d), "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x),
:"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" "m"(*(const struct { FLOAT x[n * 2]; } *) y),[y] "a"(y)
); : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
} }
OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y,
BLASLONG inc_y) {
BLASLONG i; BLASLONG i;
BLASLONG ix, iy; BLASLONG ix, iy;
OPENBLAS_COMPLEX_FLOAT result; OPENBLAS_COMPLEX_FLOAT result;
FLOAT dot[4] __attribute__ ((aligned(16))) = {0.0, 0.0, 0.0, 0.0}; FLOAT dot[4] __attribute__ ((aligned(16))) = {
0.0, 0.0, 0.0, 0.0};
if (n <= 0) { if (n <= 0) {
CREAL(result) = 0.0; CREAL(result) = 0.0;
@ -145,7 +142,6 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
} }
} else { } else {
i = 0; i = 0;
ix = 0; ix = 0;
@ -178,5 +174,3 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
return (result); return (result);
} }

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2014, The OpenBLAS Project Copyright (c) 2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -25,304 +25,347 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#include <stdlib.h>
#include <stdio.h>
#include "common.h" #include "common.h"
#define NBMAX 2048 #define NBMAX 2048
static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) {
{ register FLOAT *ap0 = ap[0];
__asm__ volatile ( register FLOAT *ap1 = ap[1];
"vlrepg %%v16,0(%5) \n\t" register FLOAT *ap2 = ap[2];
"vlrepg %%v17,8(%5) \n\t" register FLOAT *ap3 = ap[3];
"vlrepg %%v18,16(%5) \n\t"
"vlrepg %%v19,24(%5) \n\t" __asm__("vlrepg %%v16,0(%[x])\n\t"
"vlrepg %%v17,8(%[x])\n\t"
"vlrepg %%v18,16(%[x])\n\t"
"vlrepg %%v19,24(%[x])\n\t"
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
"vlef %%v20,4(%5),0 \n\t" "vlef %%v20,4(%[x]),0\n\t"
"vlef %%v20,4(%5),2 \n\t" "vlef %%v20,4(%[x]),2\n\t"
"vflcsb %%v20,%%v20\n\t" "vflcsb %%v20,%%v20\n\t"
"vlef %%v20,0(%5),1 \n\t" "vlef %%v20,0(%[x]),1\n\t"
"vlef %%v20,0(%5),3 \n\t" "vlef %%v20,0(%[x]),3\n\t"
"vlef %%v21,12(%[x]),0\n\t"
"vlef %%v21,12(%5),0 \n\t" "vlef %%v21,12(%[x]),2\n\t"
"vlef %%v21,12(%5),2 \n\t"
"vflcsb %%v21,%%v21\n\t" "vflcsb %%v21,%%v21\n\t"
"vlef %%v21,8(%5),1 \n\t" "vlef %%v21,8(%[x]),1\n\t"
"vlef %%v21,8(%5),3 \n\t" "vlef %%v21,8(%[x]),3\n\t"
"vlef %%v22,20(%[x]),0\n\t"
"vlef %%v22,20(%5),0 \n\t" "vlef %%v22,20(%[x]),2\n\t"
"vlef %%v22,20(%5),2 \n\t"
"vflcsb %%v22,%%v22\n\t" "vflcsb %%v22,%%v22\n\t"
"vlef %%v22,16(%5),1 \n\t" "vlef %%v22,16(%[x]),1\n\t"
"vlef %%v22,16(%5),3 \n\t" "vlef %%v22,16(%[x]),3\n\t"
"vlef %%v23,28(%[x]),0\n\t"
"vlef %%v23,28(%5),0 \n\t" "vlef %%v23,28(%[x]),2\n\t"
"vlef %%v23,28(%5),2 \n\t"
"vflcsb %%v23,%%v23\n\t" "vflcsb %%v23,%%v23\n\t"
"vlef %%v23,24(%5),1 \n\t" "vlef %%v23,24(%[x]),1\n\t"
"vlef %%v23,24(%5),3 \n\t" "vlef %%v23,24(%[x]),3\n\t"
#else #else
"vlef %%v20,0(%5),1 \n\t" "vlef %%v20,0(%[x]),1\n\t"
"vlef %%v20,0(%5),3 \n\t" "vlef %%v20,0(%[x]),3\n\t"
"vflcsb %%v20,%%v20\n\t" "vflcsb %%v20,%%v20\n\t"
"vlef %%v20,4(%5),0 \n\t" "vlef %%v20,4(%[x]),0\n\t"
"vlef %%v20,4(%5),2 \n\t" "vlef %%v20,4(%[x]),2\n\t"
"vlef %%v21,8(%[x]),1\n\t"
"vlef %%v21,8(%5),1 \n\t" "vlef %%v21,8(%[x]),3\n\t"
"vlef %%v21,8(%5),3 \n\t"
"vflcsb %%v21,%%v21\n\t" "vflcsb %%v21,%%v21\n\t"
"vlef %%v21,12(%5),0 \n\t" "vlef %%v21,12(%[x]),0\n\t"
"vlef %%v21,12(%5),2 \n\t" "vlef %%v21,12(%[x]),2\n\t"
"vlef %%v22,16(%[x]),1\n\t"
"vlef %%v22,16(%5),1 \n\t" "vlef %%v22,16(%[x]),3\n\t"
"vlef %%v22,16(%5),3 \n\t"
"vflcsb %%v22,%%v22\n\t" "vflcsb %%v22,%%v22\n\t"
"vlef %%v22,20(%5),0 \n\t" "vlef %%v22,20(%[x]),0\n\t"
"vlef %%v22,20(%5),2 \n\t" "vlef %%v22,20(%[x]),2\n\t"
"vlef %%v23,24(%[x]),1\n\t"
"vlef %%v23,24(%5),1 \n\t" "vlef %%v23,24(%[x]),3\n\t"
"vlef %%v23,24(%5),3 \n\t"
"vflcsb %%v23,%%v23\n\t" "vflcsb %%v23,%%v23\n\t"
"vlef %%v23,28(%5),0 \n\t" "vlef %%v23,28(%[x]),0\n\t"
"vlef %%v23,28(%5),2 \n\t" "vlef %%v23,28(%[x]),2\n\t"
#endif #endif
"vleib %%v1,0,0\n\t"
"vleib %%v1,1,1\n\t"
"vleib %%v1,2,2\n\t"
"vleib %%v1,3,3\n\t"
"vleib %%v1,0,4\n\t"
"vleib %%v1,1,5\n\t"
"vleib %%v1,2,6\n\t"
"vleib %%v1,3,7\n\t"
"vleib %%v1,8,8\n\t"
"vleib %%v1,9,9\n\t"
"vleib %%v1,10,10\n\t"
"vleib %%v1,11,11\n\t"
"vleib %%v1,8,12\n\t"
"vleib %%v1,9,13\n\t"
"vleib %%v1,10,14\n\t"
"vleib %%v1,11,15\n\t"
"vleib %%v2,4,0\n\t"
"vleib %%v2,5,1\n\t"
"vleib %%v2,6,2\n\t"
"vleib %%v2,7,3\n\t"
"vleib %%v2,4,4\n\t"
"vleib %%v2,5,5\n\t"
"vleib %%v2,6,6\n\t"
"vleib %%v2,7,7\n\t"
"vleib %%v2,12,8\n\t"
"vleib %%v2,13,9\n\t"
"vleib %%v2,14,10\n\t"
"vleib %%v2,15,11\n\t"
"vleib %%v2,12,12\n\t"
"vleib %%v2,13,13\n\t"
"vleib %%v2,14,14\n\t"
"vleib %%v2,15,15\n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"srlg %%r0,%0,1 \n\t" "srlg %[n],%[n],1\n\t"
"0:\n\t" "0:\n\t"
"pfd 1,1024(%%r1,%1) \n\t" "pfd 1,1024(%%r1,%[ap0])\n\t"
"pfd 1,1024(%%r1,%2) \n\t" "pfd 1,1024(%%r1,%[ap1])\n\t"
"pfd 1,1024(%%r1,%3) \n\t" "pfd 1,1024(%%r1,%[ap2])\n\t"
"pfd 1,1024(%%r1,%4) \n\t" "pfd 1,1024(%%r1,%[ap3])\n\t"
"pfd 2,1024(%%r1,%6) \n\t" "pfd 2,1024(%%r1,%[y])\n\t"
"vl %%v24,0(%%r1,%[ap0])\n\t"
"vlef %%v24,0(%%r1,%1),0 \n\t" "vperm %%v25,%%v24,%%v24,%%v2\n\t"
"vlef %%v24,0(%%r1,%1),1 \n\t" "vperm %%v24,%%v24,%%v24,%%v1\n\t"
"vlef %%v24,8(%%r1,%1),2 \n\t" "vl %%v26,0(%%r1,%[ap1])\n\t"
"vlef %%v24,8(%%r1,%1),3 \n\t" "vperm %%v27,%%v26,%%v26,%%v2\n\t"
"vlef %%v25,4(%%r1,%1),0 \n\t" "vperm %%v26,%%v26,%%v26,%%v1\n\t"
"vlef %%v25,4(%%r1,%1),1 \n\t" "vl %%v0,0(%%r1,%[y])\n\t"
"vlef %%v25,12(%%r1,%1),2 \n\t"
"vlef %%v25,12(%%r1,%1),3 \n\t"
"vlef %%v26,0(%%r1,%2),0 \n\t"
"vlef %%v26,0(%%r1,%2),1 \n\t"
"vlef %%v26,8(%%r1,%2),2 \n\t"
"vlef %%v26,8(%%r1,%2),3 \n\t"
"vlef %%v27,4(%%r1,%2),0 \n\t"
"vlef %%v27,4(%%r1,%2),1 \n\t"
"vlef %%v27,12(%%r1,%2),2 \n\t"
"vlef %%v27,12(%%r1,%2),3 \n\t"
"vl %%v0,0(%%r1,%6) \n\t"
"vfmasb %%v0,%%v24,%%v16,%%v0\n\t" "vfmasb %%v0,%%v24,%%v16,%%v0\n\t"
"vfmasb %%v0,%%v25,%%v20,%%v0\n\t" "vfmasb %%v0,%%v25,%%v20,%%v0\n\t"
"vfmasb %%v0,%%v26,%%v17,%%v0\n\t" "vfmasb %%v0,%%v26,%%v17,%%v0\n\t"
"vfmasb %%v0,%%v27,%%v21,%%v0\n\t" "vfmasb %%v0,%%v27,%%v21,%%v0\n\t"
"vl %%v28,0(%%r1,%[ap2])\n\t"
"vlef %%v28,0(%%r1,%3),0 \n\t" "vperm %%v29,%%v28,%%v28,%%v2\n\t"
"vlef %%v28,0(%%r1,%3),1 \n\t" "vperm %%v28,%%v28,%%v28,%%v1\n\t"
"vlef %%v28,8(%%r1,%3),2 \n\t" "vl %%v30,0(%%r1,%[ap3])\n\t"
"vlef %%v28,8(%%r1,%3),3 \n\t" "vperm %%v31,%%v30,%%v30,%%v2\n\t"
"vlef %%v29,4(%%r1,%3),0 \n\t" "vperm %%v30,%%v30,%%v30,%%v1\n\t"
"vlef %%v29,4(%%r1,%3),1 \n\t"
"vlef %%v29,12(%%r1,%3),2 \n\t"
"vlef %%v29,12(%%r1,%3),3 \n\t"
"vlef %%v30,0(%%r1,%4),0 \n\t"
"vlef %%v30,0(%%r1,%4),1 \n\t"
"vlef %%v30,8(%%r1,%4),2 \n\t"
"vlef %%v30,8(%%r1,%4),3 \n\t"
"vlef %%v31,4(%%r1,%4),0 \n\t"
"vlef %%v31,4(%%r1,%4),1 \n\t"
"vlef %%v31,12(%%r1,%4),2 \n\t"
"vlef %%v31,12(%%r1,%4),3 \n\t"
"vfmasb %%v0,%%v28,%%v18,%%v0\n\t" "vfmasb %%v0,%%v28,%%v18,%%v0\n\t"
"vfmasb %%v0,%%v29,%%v22,%%v0\n\t" "vfmasb %%v0,%%v29,%%v22,%%v0\n\t"
"vfmasb %%v0,%%v30,%%v19,%%v0\n\t" "vfmasb %%v0,%%v30,%%v19,%%v0\n\t"
"vfmasb %%v0,%%v31,%%v23,%%v0\n\t" "vfmasb %%v0,%%v31,%%v23,%%v0\n\t"
"vst %%v0,0(%%r1,%6) \n\t" "vst %%v0,0(%%r1,%[y])\n\t"
"agfi %%r1,16\n\t" "agfi %%r1,16\n\t"
"brctg %%r0,0b \n\t" "brctg %[n],0b\n\t"
: : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n)
:"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZR"((const FLOAT (*)[n * 2])ap[2]),"ZR"((const FLOAT (*)[n * 2])ap[3]),"ZQ"((const FLOAT (*)[8])x),"ZR"((FLOAT (*)[n * 2])y) : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0),
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" "m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1),
); "m"(*(const struct { FLOAT x[n * 2]; } *) ap2),[ap2] "a"(ap2),
"m"(*(const struct { FLOAT x[n * 2]; } *) ap3),[ap3] "a"(ap3),
"m"(*(const struct { FLOAT x[8]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
} }
static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) {
{ register FLOAT *ap0 = ap[0];
__asm__ volatile ( register FLOAT *ap1 = ap[1];
"vlrepg %%v16,0(%3) \n\t"
"vlrepg %%v17,8(%3) \n\t" __asm__("vlrepg %%v16,0(%[x])\n\t"
"vlrepg %%v17,8(%[x])\n\t"
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
"vlef %%v18,4(%3),0 \n\t" "vlef %%v18,4(%[x]),0\n\t"
"vlef %%v18,4(%3),2 \n\t" "vlef %%v18,4(%[x]),2\n\t"
"vflcsb %%v18,%%v18\n\t" "vflcsb %%v18,%%v18\n\t"
"vlef %%v18,0(%3),1 \n\t" "vlef %%v18,0(%[x]),1\n\t"
"vlef %%v18,0(%3),3 \n\t" "vlef %%v18,0(%[x]),3\n\t"
"vlef %%v19,12(%[x]),0\n\t"
"vlef %%v19,12(%3),0 \n\t" "vlef %%v19,12(%[x]),2\n\t"
"vlef %%v19,12(%3),2 \n\t"
"vflcsb %%v19,%%v19\n\t" "vflcsb %%v19,%%v19\n\t"
"vlef %%v19,8(%3),1 \n\t" "vlef %%v19,8(%[x]),1\n\t"
"vlef %%v19,8(%3),3 \n\t" "vlef %%v19,8(%[x]),3\n\t"
#else #else
"vlef %%v18,0(%3),1 \n\t" "vlef %%v18,0(%[x]),1\n\t"
"vlef %%v18,0(%3),3 \n\t" "vlef %%v18,0(%[x]),3\n\t"
"vflcsb %%v18,%%v18\n\t" "vflcsb %%v18,%%v18\n\t"
"vlef %%v18,4(%3),0 \n\t" "vlef %%v18,4(%[x]),0\n\t"
"vlef %%v18,4(%3),2 \n\t" "vlef %%v18,4(%[x]),2\n\t"
"vlef %%v19,8(%[x]),1\n\t"
"vlef %%v19,8(%3),1 \n\t" "vlef %%v19,8(%[x]),3\n\t"
"vlef %%v19,8(%3),3 \n\t"
"vflcsb %%v19,%%v19\n\t" "vflcsb %%v19,%%v19\n\t"
"vlef %%v19,12(%3),0 \n\t" "vlef %%v19,12(%[x]),0\n\t"
"vlef %%v19,12(%3),2 \n\t" "vlef %%v19,12(%[x]),2\n\t"
#endif #endif
"vleib %%v1,0,0\n\t"
"vleib %%v1,1,1\n\t"
"vleib %%v1,2,2\n\t"
"vleib %%v1,3,3\n\t"
"vleib %%v1,0,4\n\t"
"vleib %%v1,1,5\n\t"
"vleib %%v1,2,6\n\t"
"vleib %%v1,3,7\n\t"
"vleib %%v1,8,8\n\t"
"vleib %%v1,9,9\n\t"
"vleib %%v1,10,10\n\t"
"vleib %%v1,11,11\n\t"
"vleib %%v1,8,12\n\t"
"vleib %%v1,9,13\n\t"
"vleib %%v1,10,14\n\t"
"vleib %%v1,11,15\n\t"
"vleib %%v2,4,0\n\t"
"vleib %%v2,5,1\n\t"
"vleib %%v2,6,2\n\t"
"vleib %%v2,7,3\n\t"
"vleib %%v2,4,4\n\t"
"vleib %%v2,5,5\n\t"
"vleib %%v2,6,6\n\t"
"vleib %%v2,7,7\n\t"
"vleib %%v2,12,8\n\t"
"vleib %%v2,13,9\n\t"
"vleib %%v2,14,10\n\t"
"vleib %%v2,15,11\n\t"
"vleib %%v2,12,12\n\t"
"vleib %%v2,13,13\n\t"
"vleib %%v2,14,14\n\t"
"vleib %%v2,15,15\n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"srlg %%r0,%0,1 \n\t" "srlg %[n],%[n],1\n\t"
"0:\n\t" "0:\n\t"
"pfd 1,1024(%%r1,%1) \n\t" "pfd 1,1024(%%r1,%[ap0])\n\t"
"pfd 1,1024(%%r1,%2) \n\t" "pfd 1,1024(%%r1,%[ap1])\n\t"
"pfd 2,1024(%%r1,%4) \n\t" "pfd 2,1024(%%r1,%[y])\n\t"
"vl %%v20,0(%%r1,%[ap0])\n\t"
"vlef %%v20,0(%%r1,%1),0 \n\t" "vperm %%v21,%%v20,%%v20,%%v2\n\t"
"vlef %%v20,0(%%r1,%1),1 \n\t" "vperm %%v20,%%v20,%%v20,%%v1\n\t"
"vlef %%v20,8(%%r1,%1),2 \n\t" "vl %%v22,0(%%r1,%[ap1])\n\t"
"vlef %%v20,8(%%r1,%1),3 \n\t" "vperm %%v23,%%v22,%%v22,%%v2\n\t"
"vlef %%v21,4(%%r1,%1),0 \n\t" "vperm %%v22,%%v22,%%v22,%%v1\n\t"
"vlef %%v21,4(%%r1,%1),1 \n\t" "vl %%v0,0(%%r1,%[y])\n\t"
"vlef %%v21,12(%%r1,%1),2 \n\t"
"vlef %%v21,12(%%r1,%1),3 \n\t"
"vlef %%v22,0(%%r1,%2),0 \n\t"
"vlef %%v22,0(%%r1,%2),1 \n\t"
"vlef %%v22,8(%%r1,%2),2 \n\t"
"vlef %%v22,8(%%r1,%2),3 \n\t"
"vlef %%v23,4(%%r1,%2),0 \n\t"
"vlef %%v23,4(%%r1,%2),1 \n\t"
"vlef %%v23,12(%%r1,%2),2 \n\t"
"vlef %%v23,12(%%r1,%2),3 \n\t"
"vl %%v0,0(%%r1,%4) \n\t"
"vfmasb %%v0,%%v20,%%v16,%%v0\n\t" "vfmasb %%v0,%%v20,%%v16,%%v0\n\t"
"vfmasb %%v0,%%v21,%%v18,%%v0\n\t" "vfmasb %%v0,%%v21,%%v18,%%v0\n\t"
"vfmasb %%v0,%%v22,%%v17,%%v0\n\t" "vfmasb %%v0,%%v22,%%v17,%%v0\n\t"
"vfmasb %%v0,%%v23,%%v19,%%v0\n\t" "vfmasb %%v0,%%v23,%%v19,%%v0\n\t"
"vst %%v0,0(%%r1,%4) \n\t" "vst %%v0,0(%%r1,%[y])\n\t"
"agfi %%r1,16\n\t" "agfi %%r1,16\n\t"
"brctg %%r0,0b \n\t" "brctg %[n],0b\n\t"
: : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n)
:"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZQ"((const FLOAT (*)[4])x),"ZR"((FLOAT (*)[n * 2])y) : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0),
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23" "m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1),
); "m"(*(const struct { FLOAT x[4]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20",
"v21", "v22", "v23");
} }
static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) {
{ __asm__("vlrepg %%v16,0(%[x])\n\t"
__asm__ volatile (
"vlrepg %%v16,0(%2) \n\t"
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
"vlef %%v17,4(%2),0 \n\t" "vlef %%v17,4(%[x]),0\n\t"
"vlef %%v17,4(%2),2 \n\t" "vlef %%v17,4(%[x]),2\n\t"
"vflcsb %%v17,%%v17\n\t" "vflcsb %%v17,%%v17\n\t"
"vlef %%v17,0(%2),1 \n\t" "vlef %%v17,0(%[x]),1\n\t"
"vlef %%v17,0(%2),3 \n\t" "vlef %%v17,0(%[x]),3\n\t"
#else #else
"vlef %%v17,0(%2),1 \n\t" "vlef %%v17,0(%[x]),1\n\t"
"vlef %%v17,0(%2),3 \n\t" "vlef %%v17,0(%[x]),3\n\t"
"vflcsb %%v17,%%v17\n\t" "vflcsb %%v17,%%v17\n\t"
"vlef %%v17,4(%2),0 \n\t" "vlef %%v17,4(%[x]),0\n\t"
"vlef %%v17,4(%2),2 \n\t" "vlef %%v17,4(%[x]),2\n\t"
#endif #endif
"vleib %%v1,0,0\n\t"
"vleib %%v1,1,1\n\t"
"vleib %%v1,2,2\n\t"
"vleib %%v1,3,3\n\t"
"vleib %%v1,0,4\n\t"
"vleib %%v1,1,5\n\t"
"vleib %%v1,2,6\n\t"
"vleib %%v1,3,7\n\t"
"vleib %%v1,8,8\n\t"
"vleib %%v1,9,9\n\t"
"vleib %%v1,10,10\n\t"
"vleib %%v1,11,11\n\t"
"vleib %%v1,8,12\n\t"
"vleib %%v1,9,13\n\t"
"vleib %%v1,10,14\n\t"
"vleib %%v1,11,15\n\t"
"vleib %%v2,4,0\n\t"
"vleib %%v2,5,1\n\t"
"vleib %%v2,6,2\n\t"
"vleib %%v2,7,3\n\t"
"vleib %%v2,4,4\n\t"
"vleib %%v2,5,5\n\t"
"vleib %%v2,6,6\n\t"
"vleib %%v2,7,7\n\t"
"vleib %%v2,12,8\n\t"
"vleib %%v2,13,9\n\t"
"vleib %%v2,14,10\n\t"
"vleib %%v2,15,11\n\t"
"vleib %%v2,12,12\n\t"
"vleib %%v2,13,13\n\t"
"vleib %%v2,14,14\n\t"
"vleib %%v2,15,15\n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"srlg %%r0,%0,1 \n\t" "srlg %[n],%[n],1\n\t"
"0:\n\t" "0:\n\t"
"pfd 1,1024(%%r1,%1) \n\t" "pfd 1,1024(%%r1,%[ap])\n\t"
"pfd 2,1024(%%r1,%3) \n\t" "pfd 2,1024(%%r1,%[y])\n\t"
"vl %%v18,0(%%r1,%[ap])\n\t"
"vlef %%v18,0(%%r1,%1),0 \n\t" "vperm %%v19,%%v18,%%v18,%%v2\n\t"
"vlef %%v18,0(%%r1,%1),1 \n\t" "vperm %%v18,%%v18,%%v18,%%v1\n\t"
"vlef %%v18,8(%%r1,%1),2 \n\t" "vl %%v0,0(%%r1,%[y])\n\t"
"vlef %%v18,8(%%r1,%1),3 \n\t"
"vlef %%v19,4(%%r1,%1),0 \n\t"
"vlef %%v19,4(%%r1,%1),1 \n\t"
"vlef %%v19,12(%%r1,%1),2 \n\t"
"vlef %%v19,12(%%r1,%1),3 \n\t"
"vl %%v0,0(%%r1,%3) \n\t"
"vfmasb %%v0,%%v18,%%v16,%%v0\n\t" "vfmasb %%v0,%%v18,%%v16,%%v0\n\t"
"vfmasb %%v0,%%v19,%%v17,%%v0\n\t" "vfmasb %%v0,%%v19,%%v17,%%v0\n\t"
"vst %%v0,0(%%r1,%3) \n\t" "vst %%v0,0(%%r1,%[y])\n\t"
"agfi %%r1,16\n\t" "agfi %%r1,16\n\t"
"brctg %%r0,0b \n\t" "brctg %[n],0b\n\t"
: : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n)
:"r"(n),"ZR"((const FLOAT (*)[n * 2])ap),"ZQ"((const FLOAT (*)[2])x),"ZR"((FLOAT (*)[n * 2])y) : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap),[ap] "a"(ap),
:"memory","cc","r0","r1","v0","v16","v17","v18","v19" "m"(*(const struct { FLOAT x[2]; } *) x),[x] "a"(x)
); : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19");
} }
static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, FLOAT alpha_i) static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r,
{ FLOAT alpha_i) {
__asm__ volatile ( __asm__(
#if !defined(XCONJ) #if !defined(XCONJ)
"vlrepf %%v0,%3 \n\t" "vlrepf %%v0,%[alpha_r]\n\t"
"vlef %%v1,%4,0 \n\t" "vlef %%v1,%[alpha_i],0\n\t"
"vlef %%v1,%4,2 \n\t" "vlef %%v1,%[alpha_i],2\n\t"
"vflcsb %%v1,%%v1\n\t" "vflcsb %%v1,%%v1\n\t"
"vlef %%v1,%4,1 \n\t" "vlef %%v1,%[alpha_i],1\n\t"
"vlef %%v1,%4,3 \n\t" "vlef %%v1,%[alpha_i],3\n\t"
#else #else
"vlef %%v0,%3,1 \n\t" "vlef %%v0,%[alpha_r],1\n\t"
"vlef %%v0,%3,3 \n\t" "vlef %%v0,%[alpha_r],3\n\t"
"vflcsb %%v0,%%v0\n\t" "vflcsb %%v0,%%v0\n\t"
"vlef %%v0,%3,0 \n\t" "vlef %%v0,%[alpha_r],0\n\t"
"vlef %%v0,%3,2 \n\t" "vlef %%v0,%[alpha_r],2\n\t"
"vlrepf %%v1,%4 \n\t" "vlrepf %%v1,%[alpha_i]\n\t"
#endif #endif
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"srlg %%r0,%0,2 \n\t" "srlg %[n],%[n],2\n\t"
"0:\n\t" "0:\n\t"
"pfd 1,1024(%%r1,%1) \n\t" "pfd 1,1024(%%r1,%[src])\n\t"
"pfd 2,1024(%%r1,%2) \n\t" "pfd 2,1024(%%r1,%[dest])\n\t"
"vl %%v16,0(%%r1,%[src])\n\t"
"vl %%v16,0(%%r1,%1) \n\t" "vl %%v17,16(%%r1,%[src])\n\t"
"vl %%v17,16(%%r1,%1) \n\t" "vl %%v18,0(%%r1,%[dest])\n\t"
"vl %%v18,0(%%r1,%2) \n\t" "vl %%v19,16(%%r1,%[dest])\n\t"
"vl %%v19,16(%%r1,%2) \n\t"
"verllg %%v20,%%v16,32\n\t" "verllg %%v20,%%v16,32\n\t"
"verllg %%v21,%%v17,32\n\t" "verllg %%v21,%%v17,32\n\t"
"vfmasb %%v22,%%v16,%%v0,%%v18\n\t" "vfmasb %%v22,%%v16,%%v0,%%v18\n\t"
"vfmasb %%v23,%%v17,%%v0,%%v19\n\t" "vfmasb %%v23,%%v17,%%v0,%%v19\n\t"
"vfmasb %%v22,%%v20,%%v1,%%v22\n\t" "vfmasb %%v22,%%v20,%%v1,%%v22\n\t"
"vfmasb %%v23,%%v21,%%v1,%%v23\n\t" "vfmasb %%v23,%%v21,%%v1,%%v23\n\t"
"vst %%v22,0(%%r1,%[dest])\n\t"
"vst %%v22,0(%%r1,%2) \n\t" "vst %%v23,16(%%r1,%[dest])\n\t"
"vst %%v23,16(%%r1,%2) \n\t"
"agfi %%r1,32\n\t" "agfi %%r1,32\n\t"
"brctg %%r0,0b " "brctg %[n],0b"
: : "+m"(*(struct { FLOAT x[n * 2]; } *) dest),[n] "+&r"(n)
:"r"(n),"ZR"((const FLOAT (*)[n * 2])src),"ZR"((FLOAT (*)[n * 2])dest),"m"(alpha_r),"m"(alpha_i) : [dest] "a"(dest), "m"(*(const struct { FLOAT x[n * 2]; } *) src),
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23" [src] "a"(src),[alpha_r] "Q"(alpha_r),[alpha_i] "Q"(alpha_i)
); : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
"v22", "v23");
} }
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,
{ FLOAT alpha_r, FLOAT alpha_i) {
BLASLONG i; BLASLONG i;
if ( inc_dest != 2 ) if (inc_dest != 2) {
{
FLOAT temp_r; FLOAT temp_r;
FLOAT temp_i; FLOAT temp_i;
for ( i=0; i<n; i++ ) for (i = 0; i < n; i++) {
{
#if !defined(XCONJ) #if !defined(XCONJ)
temp_r = alpha_r * src[0] - alpha_i * src[1]; temp_r = alpha_r * src[0] - alpha_i * src[1];
temp_i = alpha_r * src[1] + alpha_i * src[0]; temp_i = alpha_r * src[1] + alpha_i * src[0];
@ -343,8 +386,9 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT
add_y_4(n, src, dest, alpha_r, alpha_i); add_y_4(n, src, dest, alpha_r, alpha_i);
} }
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
{ FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y,
BLASLONG inc_y, FLOAT *buffer) {
BLASLONG i; BLASLONG i;
FLOAT *a_ptr; FLOAT *a_ptr;
FLOAT *x_ptr; FLOAT *x_ptr;
@ -358,8 +402,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
BLASLONG lda4; BLASLONG lda4;
FLOAT xbuffer[8], *ybuffer; FLOAT xbuffer[8], *ybuffer;
if ( m < 1 ) return(0); if (m < 1)
if ( n < 1 ) return(0); return (0);
if (n < 1)
return (0);
ybuffer = buffer; ybuffer = buffer;
@ -379,13 +425,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
BLASLONG NB = NBMAX; BLASLONG NB = NBMAX;
while ( NB == NBMAX ) while (NB == NBMAX) {
{
m1 -= NB; m1 -= NB;
if ( m1 < 0) if (m1 < 0) {
{ if (m2 == 0)
if ( m2 == 0 ) break; break;
NB = m2; NB = m2;
} }
@ -398,11 +443,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
//zero_y(NB,ybuffer); //zero_y(NB,ybuffer);
memset(ybuffer, 0, NB * 8); memset(ybuffer, 0, NB * 8);
if ( inc_x == 2 ) if (inc_x == 2) {
{
for( i = 0; i < n1 ; i++) for (i = 0; i < n1; i++) {
{
cgemv_kernel_4x4(NB, ap, x_ptr, ybuffer); cgemv_kernel_4x4(NB, ap, x_ptr, ybuffer);
ap[0] += lda4; ap[0] += lda4;
ap[1] += lda4; ap[1] += lda4;
@ -412,27 +455,22 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
x_ptr += 8; x_ptr += 8;
} }
if ( n2 & 2 ) if (n2 & 2) {
{
cgemv_kernel_4x2(NB, ap, x_ptr, ybuffer); cgemv_kernel_4x2(NB, ap, x_ptr, ybuffer);
x_ptr += 4; x_ptr += 4;
a_ptr += 2 * lda; a_ptr += 2 * lda;
} }
if ( n2 & 1 ) if (n2 & 1) {
{
cgemv_kernel_4x1(NB, a_ptr, x_ptr, ybuffer); cgemv_kernel_4x1(NB, a_ptr, x_ptr, ybuffer);
/* x_ptr += 2; /* x_ptr += 2;
a_ptr += lda; */ a_ptr += lda; */
} }
} } else {
else
{
for( i = 0; i < n1 ; i++) for (i = 0; i < n1; i++) {
{
xbuffer[0] = x_ptr[0]; xbuffer[0] = x_ptr[0];
xbuffer[1] = x_ptr[1]; xbuffer[1] = x_ptr[1];
@ -455,8 +493,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
a_ptr += lda4; a_ptr += lda4;
} }
for( i = 0; i < n2 ; i++) for (i = 0; i < n2; i++) {
{
xbuffer[0] = x_ptr[0]; xbuffer[0] = x_ptr[0];
xbuffer[1] = x_ptr[1]; xbuffer[1] = x_ptr[1];
x_ptr += inc_x; x_ptr += inc_x;
@ -472,21 +509,18 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
y_ptr += NB * inc_y; y_ptr += NB * inc_y;
} }
if ( m3 == 0 ) return(0); if (m3 == 0)
return (0);
if ( m3 == 1 ) if (m3 == 1) {
{
a_ptr = a; a_ptr = a;
x_ptr = x; x_ptr = x;
FLOAT temp_r = 0.0; FLOAT temp_r = 0.0;
FLOAT temp_i = 0.0; FLOAT temp_i = 0.0;
if ( lda == 2 && inc_x == 2 ) if (lda == 2 && inc_x == 2) {
{
for (i = 0; i < (n & -2); i += 2) {
for( i=0 ; i < (n & -2); i+=2 )
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
@ -503,10 +537,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
x_ptr += 4; x_ptr += 4;
} }
for (; i < n; i++) {
for( ; i < n; i++ )
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
@ -519,13 +550,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
x_ptr += 2; x_ptr += 2;
} }
} else {
} for (i = 0; i < n; i++) {
else
{
for( i = 0; i < n; i++ )
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
@ -549,8 +576,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
return (0); return (0);
} }
if ( m3 == 2 ) if (m3 == 2) {
{
a_ptr = a; a_ptr = a;
x_ptr = x; x_ptr = x;
FLOAT temp_r0 = 0.0; FLOAT temp_r0 = 0.0;
@ -558,11 +584,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
FLOAT temp_r1 = 0.0; FLOAT temp_r1 = 0.0;
FLOAT temp_i1 = 0.0; FLOAT temp_i1 = 0.0;
if ( lda == 4 && inc_x == 2 ) if (lda == 4 && inc_x == 2) {
{
for( i = 0; i < (n & -2); i+=2 ) for (i = 0; i < (n & -2); i += 2) {
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
@ -592,9 +616,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
x_ptr += 4; x_ptr += 4;
} }
for (; i < n; i++) {
for( ; i < n; i++ )
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
@ -611,13 +633,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
x_ptr += 2; x_ptr += 2;
} }
} else {
} for (i = 0; i < n; i++) {
else
{
for( i=0 ; i < n; i++ )
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
@ -634,7 +652,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
x_ptr += inc_x; x_ptr += inc_x;
} }
} }
#if !defined(XCONJ) #if !defined(XCONJ)
y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0; y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
@ -652,9 +669,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
return (0); return (0);
} }
if (m3 == 3) {
if ( m3 == 3 )
{
a_ptr = a; a_ptr = a;
x_ptr = x; x_ptr = x;
FLOAT temp_r0 = 0.0; FLOAT temp_r0 = 0.0;
@ -664,11 +679,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
FLOAT temp_r2 = 0.0; FLOAT temp_r2 = 0.0;
FLOAT temp_i2 = 0.0; FLOAT temp_i2 = 0.0;
if ( lda == 6 && inc_x == 2 ) if (lda == 6 && inc_x == 2) {
{
for( i=0 ; i < n; i++ ) for (i = 0; i < n; i++) {
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
@ -689,13 +702,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
x_ptr += 2; x_ptr += 2;
} }
} else {
} for (i = 0; i < n; i++) {
else
{
for( i = 0; i < n; i++ )
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2014, The OpenBLAS Project Copyright (c) 2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -29,84 +29,101 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define NBMAX 2048 #define NBMAX 2048
static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y,
{ FLOAT *alpha) {
__asm__ volatile ( register FLOAT *ap0 = ap[0];
"vzero %%v16 \n\t" register FLOAT *ap1 = ap[1];
register FLOAT *ap2 = ap[2];
register FLOAT *ap3 = ap[3];
__asm__("vzero %%v16\n\t"
"vzero %%v17\n\t" "vzero %%v17\n\t"
"vzero %%v18\n\t" "vzero %%v18\n\t"
"vzero %%v19\n\t" "vzero %%v19\n\t"
"vzero %%v20\n\t"
"vzero %%v21\n\t"
"vzero %%v22\n\t"
"vzero %%v23\n\t"
"vleib %%v2,0,0\n\t"
"vleib %%v2,1,1\n\t"
"vleib %%v2,2,2\n\t"
"vleib %%v2,3,3\n\t"
"vleib %%v2,0,4\n\t"
"vleib %%v2,1,5\n\t"
"vleib %%v2,2,6\n\t"
"vleib %%v2,3,7\n\t"
"vleib %%v2,8,8\n\t"
"vleib %%v2,9,9\n\t"
"vleib %%v2,10,10\n\t"
"vleib %%v2,11,11\n\t"
"vleib %%v2,8,12\n\t"
"vleib %%v2,9,13\n\t"
"vleib %%v2,10,14\n\t"
"vleib %%v2,11,15\n\t"
"vleib %%v3,4,0\n\t"
"vleib %%v3,5,1\n\t"
"vleib %%v3,6,2\n\t"
"vleib %%v3,7,3\n\t"
"vleib %%v3,4,4\n\t"
"vleib %%v3,5,5\n\t"
"vleib %%v3,6,6\n\t"
"vleib %%v3,7,7\n\t"
"vleib %%v3,12,8\n\t"
"vleib %%v3,13,9\n\t"
"vleib %%v3,14,10\n\t"
"vleib %%v3,15,11\n\t"
"vleib %%v3,12,12\n\t"
"vleib %%v3,13,13\n\t"
"vleib %%v3,14,14\n\t"
"vleib %%v3,15,15\n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"srlg %%r0,%0,1 \n\t" "srlg %[n],%[n],1\n\t"
"0:\n\t" "0:\n\t"
"pfd 1,1024(%%r1,%1) \n\t" "pfd 1,1024(%%r1,%[ap0])\n\t"
"pfd 1,1024(%%r1,%2) \n\t" "pfd 1,1024(%%r1,%[ap1])\n\t"
"pfd 1,1024(%%r1,%3) \n\t" "pfd 1,1024(%%r1,%[ap2])\n\t"
"pfd 1,1024(%%r1,%4) \n\t" "pfd 1,1024(%%r1,%[ap3])\n\t"
"pfd 1,1024(%%r1,%5) \n\t" "pfd 1,1024(%%r1,%[x])\n\t"
"vl %%v0,0(%%r1,%[x])\n\t"
"vl %%v20,0(%%r1,%5) \n\t"
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
"vlef %%v21,4(%%r1,%5),0 \n\t" "vlef %%v1,4(%%r1,%[x]),0\n\t"
"vlef %%v21,12(%%r1,%5),2 \n\t" "vlef %%v1,12(%%r1,%[x]),2\n\t"
"vflcsb %%v21,%%v21 \n\t" "vflcsb %%v1,%%v1\n\t"
"vlef %%v21,0(%%r1,%5),1 \n\t" "vlef %%v1,0(%%r1,%[x]),1\n\t"
"vlef %%v21,8(%%r1,%5),3 \n\t" "vlef %%v1,8(%%r1,%[x]),3\n\t"
#else #else
"vlef %%v21,0(%%r1,%5),1 \n\t" "vlef %%v1,0(%%r1,%[x]),1\n\t"
"vlef %%v21,8(%%r1,%5),3 \n\t" "vlef %%v1,8(%%r1,%[x]),3\n\t"
"vflcsb %%v21,%%v21 \n\t" "vflcsb %%v1,%%v1\n\t"
"vlef %%v21,4(%%r1,%5),0 \n\t" "vlef %%v1,4(%%r1,%[x]),0\n\t"
"vlef %%v21,12(%%r1,%5),2 \n\t" "vlef %%v1,12(%%r1,%[x]),2\n\t"
#endif #endif
"vl %%v24,0(%%r1,%[ap0])\n\t"
"vlef %%v22,0(%%r1,%1),0 \n\t" "vperm %%v25,%%v24,%%v24,%%v3\n\t"
"vlef %%v22,0(%%r1,%1),1 \n\t" "vperm %%v24,%%v24,%%v24,%%v2\n\t"
"vlef %%v22,8(%%r1,%1),2 \n\t" "vl %%v26,0(%%r1,%[ap1])\n\t"
"vlef %%v22,8(%%r1,%1),3 \n\t" "vperm %%v27,%%v26,%%v26,%%v3\n\t"
"vlef %%v23,4(%%r1,%1),0 \n\t" "vperm %%v26,%%v26,%%v26,%%v2\n\t"
"vlef %%v23,4(%%r1,%1),1 \n\t" "vl %%v28,0(%%r1,%[ap2])\n\t"
"vlef %%v23,12(%%r1,%1),2 \n\t" "vperm %%v29,%%v28,%%v28,%%v3\n\t"
"vlef %%v23,12(%%r1,%1),3 \n\t" "vperm %%v28,%%v28,%%v28,%%v2\n\t"
"vlef %%v24,0(%%r1,%2),0 \n\t" "vl %%v30,0(%%r1,%[ap3])\n\t"
"vlef %%v24,0(%%r1,%2),1 \n\t" "vperm %%v31,%%v30,%%v30,%%v3\n\t"
"vlef %%v24,8(%%r1,%2),2 \n\t" "vperm %%v30,%%v30,%%v30,%%v2\n\t"
"vlef %%v24,8(%%r1,%2),3 \n\t" "vfmasb %%v16,%%v24,%%v0,%%v16\n\t"
"vlef %%v25,4(%%r1,%2),0 \n\t" "vfmasb %%v20,%%v25,%%v1,%%v20\n\t"
"vlef %%v25,4(%%r1,%2),1 \n\t" "vfmasb %%v17,%%v26,%%v0,%%v17\n\t"
"vlef %%v25,12(%%r1,%2),2 \n\t" "vfmasb %%v21,%%v27,%%v1,%%v21\n\t"
"vlef %%v25,12(%%r1,%2),3 \n\t" "vfmasb %%v18,%%v28,%%v0,%%v18\n\t"
"vfmasb %%v22,%%v29,%%v1,%%v22\n\t"
"vfmasb %%v16,%%v22,%%v20,%%v16 \n\t" "vfmasb %%v19,%%v30,%%v0,%%v19\n\t"
"vfmasb %%v16,%%v23,%%v21,%%v16 \n\t" "vfmasb %%v23,%%v31,%%v1,%%v23\n\t"
"vfmasb %%v17,%%v24,%%v20,%%v17 \n\t"
"vfmasb %%v17,%%v25,%%v21,%%v17 \n\t"
"vlef %%v26,0(%%r1,%3),0 \n\t"
"vlef %%v26,0(%%r1,%3),1 \n\t"
"vlef %%v26,8(%%r1,%3),2 \n\t"
"vlef %%v26,8(%%r1,%3),3 \n\t"
"vlef %%v27,4(%%r1,%3),0 \n\t"
"vlef %%v27,4(%%r1,%3),1 \n\t"
"vlef %%v27,12(%%r1,%3),2 \n\t"
"vlef %%v27,12(%%r1,%3),3 \n\t"
"vlef %%v28,0(%%r1,%4),0 \n\t"
"vlef %%v28,0(%%r1,%4),1 \n\t"
"vlef %%v28,8(%%r1,%4),2 \n\t"
"vlef %%v28,8(%%r1,%4),3 \n\t"
"vlef %%v29,4(%%r1,%4),0 \n\t"
"vlef %%v29,4(%%r1,%4),1 \n\t"
"vlef %%v29,12(%%r1,%4),2 \n\t"
"vlef %%v29,12(%%r1,%4),3 \n\t"
"vfmasb %%v18,%%v26,%%v20,%%v18 \n\t"
"vfmasb %%v18,%%v27,%%v21,%%v18 \n\t"
"vfmasb %%v19,%%v28,%%v20,%%v19 \n\t"
"vfmasb %%v19,%%v29,%%v21,%%v19 \n\t"
"agfi %%r1,16\n\t" "agfi %%r1,16\n\t"
"brctg %%r0,0b \n\t" "brctg %[n],0b\n\t"
"vfasb %%v16,%%v16,%%v20\n\t"
"vfasb %%v17,%%v17,%%v21\n\t"
"vfasb %%v18,%%v18,%%v22\n\t"
"vfasb %%v19,%%v19,%%v23\n\t"
"vrepg %%v20,%%v16,1\n\t" "vrepg %%v20,%%v16,1\n\t"
"vrepg %%v21,%%v17,1\n\t" "vrepg %%v21,%%v17,1\n\t"
"vrepg %%v22,%%v18,1\n\t" "vrepg %%v22,%%v18,1\n\t"
@ -120,86 +137,115 @@ static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *
"verllg %%v18,%%v16,32\n\t" "verllg %%v18,%%v16,32\n\t"
"verllg %%v19,%%v17,32\n\t" "verllg %%v19,%%v17,32\n\t"
#if !defined(XCONJ) #if !defined(XCONJ)
"vlrepf %%v20,0(%7) \n\t" "vlrepf %%v20,0(%[alpha])\n\t"
"vlef %%v21,4(%7),0 \n\t" "vlef %%v21,4(%[alpha]),0\n\t"
"vlef %%v21,4(%7),2 \n\t" "vlef %%v21,4(%[alpha]),2\n\t"
"vflcsb %%v21,%%v21\n\t" "vflcsb %%v21,%%v21\n\t"
"vlef %%v21,4(%7),1 \n\t" "vlef %%v21,4(%[alpha]),1\n\t"
"vlef %%v21,4(%7),3 \n\t" "vlef %%v21,4(%[alpha]),3\n\t"
#else #else
"vlef %%v20,0(%7),1 \n\t" "vlef %%v20,0(%[alpha]),1\n\t"
"vlef %%v20,0(%7),3 \n\t" "vlef %%v20,0(%[alpha]),3\n\t"
"vflcsb %%v20,%%v20\n\t" "vflcsb %%v20,%%v20\n\t"
"vlef %%v20,0(%7),0 \n\t" "vlef %%v20,0(%[alpha]),0\n\t"
"vlef %%v20,0(%7),2 \n\t" "vlef %%v20,0(%[alpha]),2\n\t"
"vlrepf %%v21,4(%7) \n\t" "vlrepf %%v21,4(%[alpha])\n\t"
#endif #endif
"vl %%v22,0(%6) \n\t" "vl %%v22,0(%[y])\n\t"
"vl %%v23,16(%6) \n\t" "vl %%v23,16(%[y])\n\t"
"vfmasb %%v22,%%v16,%%v20,%%v22\n\t" "vfmasb %%v22,%%v16,%%v20,%%v22\n\t"
"vfmasb %%v22,%%v18,%%v21,%%v22\n\t" "vfmasb %%v22,%%v18,%%v21,%%v22\n\t"
"vfmasb %%v23,%%v17,%%v20,%%v23\n\t" "vfmasb %%v23,%%v17,%%v20,%%v23\n\t"
"vfmasb %%v23,%%v19,%%v21,%%v23\n\t" "vfmasb %%v23,%%v19,%%v21,%%v23\n\t"
"vst %%v22,0(%6) \n\t" "vst %%v22,0(%[y])\n\t"
"vst %%v23,16(%6) " "vst %%v23,16(%[y])"
: : "+m"(*(struct { FLOAT x[8]; } *) y),[n] "+&r"(n)
:"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZR"((const FLOAT (*)[n * 2])ap[2]),"ZR"((const FLOAT (*)[n * 2])ap[3]),"ZR"((const FLOAT (*)[n * 2])x),"ZQ"((FLOAT (*)[8])y),"ZQ"((const FLOAT (*)[2])alpha) : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0),
:"memory","cc","r0","r1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29" "m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1),
); "m"(*(const struct { FLOAT x[n * 2]; } *) ap2),[ap2] "a"(ap2),
"m"(*(const struct { FLOAT x[n * 2]; } *) ap3),[ap3] "a"(ap3),
"m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x),
"m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha)
: "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
} }
static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y,
{ FLOAT *alpha) {
__asm__ volatile ( register FLOAT *ap0 = ap[0];
"vzero %%v16 \n\t" register FLOAT *ap1 = ap[1];
__asm__("vzero %%v16\n\t"
"vzero %%v17\n\t" "vzero %%v17\n\t"
"vzero %%v18\n\t"
"vzero %%v19\n\t"
"vleib %%v2,0,0\n\t"
"vleib %%v2,1,1\n\t"
"vleib %%v2,2,2\n\t"
"vleib %%v2,3,3\n\t"
"vleib %%v2,0,4\n\t"
"vleib %%v2,1,5\n\t"
"vleib %%v2,2,6\n\t"
"vleib %%v2,3,7\n\t"
"vleib %%v2,8,8\n\t"
"vleib %%v2,9,9\n\t"
"vleib %%v2,10,10\n\t"
"vleib %%v2,11,11\n\t"
"vleib %%v2,8,12\n\t"
"vleib %%v2,9,13\n\t"
"vleib %%v2,10,14\n\t"
"vleib %%v2,11,15\n\t"
"vleib %%v3,4,0\n\t"
"vleib %%v3,5,1\n\t"
"vleib %%v3,6,2\n\t"
"vleib %%v3,7,3\n\t"
"vleib %%v3,4,4\n\t"
"vleib %%v3,5,5\n\t"
"vleib %%v3,6,6\n\t"
"vleib %%v3,7,7\n\t"
"vleib %%v3,12,8\n\t"
"vleib %%v3,13,9\n\t"
"vleib %%v3,14,10\n\t"
"vleib %%v3,15,11\n\t"
"vleib %%v3,12,12\n\t"
"vleib %%v3,13,13\n\t"
"vleib %%v3,14,14\n\t"
"vleib %%v3,15,15\n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"srlg %%r0,%0,1 \n\t" "srlg %[n],%[n],1\n\t"
"0:\n\t" "0:\n\t"
"pfd 1,1024(%%r1,%1) \n\t" "pfd 1,1024(%%r1,%[ap0])\n\t"
"pfd 1,1024(%%r1,%2) \n\t" "pfd 1,1024(%%r1,%[ap1])\n\t"
"pfd 1,1024(%%r1,%3) \n\t" "pfd 1,1024(%%r1,%[x])\n\t"
"vl %%v0,0(%%r1,%[x])\n\t"
"vl %%v18,0(%%r1,%3) \n\t"
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
"vlef %%v19,4(%%r1,%3),0 \n\t" "vlef %%v1,4(%%r1,%[x]),0\n\t"
"vlef %%v19,12(%%r1,%3),2 \n\t" "vlef %%v1,12(%%r1,%[x]),2\n\t"
"vflcsb %%v19,%%v19 \n\t" "vflcsb %%v1,%%v1\n\t"
"vlef %%v19,0(%%r1,%3),1 \n\t" "vlef %%v1,0(%%r1,%[x]),1\n\t"
"vlef %%v19,8(%%r1,%3),3 \n\t" "vlef %%v1,8(%%r1,%[x]),3\n\t"
#else #else
"vlef %%v19,0(%%r1,%3),1 \n\t" "vlef %%v1,0(%%r1,%[x]),1\n\t"
"vlef %%v19,8(%%r1,%3),3 \n\t" "vlef %%v1,8(%%r1,%[x]),3\n\t"
"vflcsb %%v19,%%v19 \n\t" "vflcsb %%v1,%%v1\n\t"
"vlef %%v19,4(%%r1,%3),0 \n\t" "vlef %%v1,4(%%r1,%[x]),0\n\t"
"vlef %%v19,12(%%r1,%3),2 \n\t" "vlef %%v1,12(%%r1,%[x]),2\n\t"
#endif #endif
"vl %%v20,0(%%r1,%[ap0])\n\t"
"vlef %%v20,0(%%r1,%1),0 \n\t" "vperm %%v21,%%v20,%%v20,%%v3\n\t"
"vlef %%v20,0(%%r1,%1),1 \n\t" "vperm %%v20,%%v20,%%v20,%%v2\n\t"
"vlef %%v20,8(%%r1,%1),2 \n\t" "vl %%v22,0(%%r1,%[ap1])\n\t"
"vlef %%v20,8(%%r1,%1),3 \n\t" "vperm %%v23,%%v22,%%v22,%%v3\n\t"
"vlef %%v21,4(%%r1,%1),0 \n\t" "vperm %%v22,%%v22,%%v22,%%v2\n\t"
"vlef %%v21,4(%%r1,%1),1 \n\t" "vfmasb %%v16,%%v20,%%v0,%%v16\n\t"
"vlef %%v21,12(%%r1,%1),2 \n\t" "vfmasb %%v18,%%v21,%%v1,%%v18\n\t"
"vlef %%v21,12(%%r1,%1),3 \n\t" "vfmasb %%v17,%%v22,%%v0,%%v17\n\t"
"vlef %%v22,0(%%r1,%2),0 \n\t" "vfmasb %%v19,%%v23,%%v1,%%v19\n\t"
"vlef %%v22,0(%%r1,%2),1 \n\t"
"vlef %%v22,8(%%r1,%2),2 \n\t"
"vlef %%v22,8(%%r1,%2),3 \n\t"
"vlef %%v23,4(%%r1,%2),0 \n\t"
"vlef %%v23,4(%%r1,%2),1 \n\t"
"vlef %%v23,12(%%r1,%2),2 \n\t"
"vlef %%v23,12(%%r1,%2),3 \n\t"
"vfmasb %%v16,%%v20,%%v18,%%v16 \n\t"
"vfmasb %%v16,%%v21,%%v19,%%v16 \n\t"
"vfmasb %%v17,%%v22,%%v18,%%v17 \n\t"
"vfmasb %%v17,%%v23,%%v19,%%v17 \n\t"
"agfi %%r1,16\n\t" "agfi %%r1,16\n\t"
"brctg %%r0,0b \n\t" "brctg %[n],0b\n\t"
"vfasb %%v16,%%v16,%%v18\n\t"
"vfasb %%v17,%%v17,%%v19\n\t"
"vrepg %%v18,%%v16,1\n\t" "vrepg %%v18,%%v16,1\n\t"
"vrepg %%v19,%%v17,1\n\t" "vrepg %%v19,%%v17,1\n\t"
"vfasb %%v16,%%v16,%%v18\n\t" "vfasb %%v16,%%v16,%%v18\n\t"
@ -207,99 +253,124 @@ static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *
"vmrhg %%v16,%%v16,%%v17\n\t" "vmrhg %%v16,%%v16,%%v17\n\t"
"verllg %%v17,%%v16,32\n\t" "verllg %%v17,%%v16,32\n\t"
#if !defined(XCONJ) #if !defined(XCONJ)
"vlrepf %%v18,0(%5) \n\t" "vlrepf %%v18,0(%[alpha])\n\t"
"vlef %%v19,4(%5),0 \n\t" "vlef %%v19,4(%[alpha]),0\n\t"
"vlef %%v19,4(%5),2 \n\t" "vlef %%v19,4(%[alpha]),2\n\t"
"vflcsb %%v19,%%v19\n\t" "vflcsb %%v19,%%v19\n\t"
"vlef %%v19,4(%5),1 \n\t" "vlef %%v19,4(%[alpha]),1\n\t"
"vlef %%v19,4(%5),3 \n\t" "vlef %%v19,4(%[alpha]),3\n\t"
#else #else
"vlef %%v18,0(%5),1 \n\t" "vlef %%v18,0(%[alpha]),1\n\t"
"vlef %%v18,0(%5),3 \n\t" "vlef %%v18,0(%[alpha]),3\n\t"
"vflcsb %%v18,%%v18\n\t" "vflcsb %%v18,%%v18\n\t"
"vlef %%v18,0(%5),0 \n\t" "vlef %%v18,0(%[alpha]),0\n\t"
"vlef %%v18,0(%5),2 \n\t" "vlef %%v18,0(%[alpha]),2\n\t"
"vlrepf %%v19,4(%5) \n\t" "vlrepf %%v19,4(%[alpha])\n\t"
#endif #endif
"vl %%v20,0(%4) \n\t" "vl %%v20,0(%[y])\n\t"
"vfmasb %%v20,%%v16,%%v18,%%v20\n\t" "vfmasb %%v20,%%v16,%%v18,%%v20\n\t"
"vfmasb %%v20,%%v17,%%v19,%%v20\n\t" "vfmasb %%v20,%%v17,%%v19,%%v20\n\t"
"vst %%v20,0(%4) " "vst %%v20,0(%[y])"
: : "+m"(*(struct { FLOAT x[4]; } *) y),[n] "+&r"(n)
:"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZR"((const FLOAT (*)[n * 2])x),"ZQ"((FLOAT (*)[4])y),"ZQ"((const FLOAT (*)[2])alpha) : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0),
:"memory","cc","r0","r1","v16","v17","v18","v19","v20","v21","v22","v23" "m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1),
); "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x),
"m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha)
: "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20",
"v21", "v22", "v23");
} }
static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y,
{ FLOAT *alpha) {
__asm__ volatile ( __asm__("vzero %%v16\n\t"
"vzero %%v16 \n\t" "vzero %%v17\n\t"
"vleib %%v2,0,0\n\t"
"vleib %%v2,1,1\n\t"
"vleib %%v2,2,2\n\t"
"vleib %%v2,3,3\n\t"
"vleib %%v2,0,4\n\t"
"vleib %%v2,1,5\n\t"
"vleib %%v2,2,6\n\t"
"vleib %%v2,3,7\n\t"
"vleib %%v2,8,8\n\t"
"vleib %%v2,9,9\n\t"
"vleib %%v2,10,10\n\t"
"vleib %%v2,11,11\n\t"
"vleib %%v2,8,12\n\t"
"vleib %%v2,9,13\n\t"
"vleib %%v2,10,14\n\t"
"vleib %%v2,11,15\n\t"
"vleib %%v3,4,0\n\t"
"vleib %%v3,5,1\n\t"
"vleib %%v3,6,2\n\t"
"vleib %%v3,7,3\n\t"
"vleib %%v3,4,4\n\t"
"vleib %%v3,5,5\n\t"
"vleib %%v3,6,6\n\t"
"vleib %%v3,7,7\n\t"
"vleib %%v3,12,8\n\t"
"vleib %%v3,13,9\n\t"
"vleib %%v3,14,10\n\t"
"vleib %%v3,15,11\n\t"
"vleib %%v3,12,12\n\t"
"vleib %%v3,13,13\n\t"
"vleib %%v3,14,14\n\t"
"vleib %%v3,15,15\n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"srlg %%r0,%0,1 \n\t" "srlg %[n],%[n],1\n\t"
"0:\n\t" "0:\n\t"
"pfd 1,1024(%%r1,%1) \n\t" "pfd 1,1024(%%r1,%[ap])\n\t"
"pfd 1,1024(%%r1,%2) \n\t" "pfd 1,1024(%%r1,%[x])\n\t"
"vl %%v0,0(%%r1,%[x])\n\t"
"vl %%v17,0(%%r1,%2) \n\t"
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
"vlef %%v18,4(%%r1,%2),0 \n\t" "vlef %%v1,4(%%r1,%[x]),0\n\t"
"vlef %%v18,12(%%r1,%2),2 \n\t" "vlef %%v1,12(%%r1,%[x]),2\n\t"
"vflcsb %%v18,%%v18 \n\t" "vflcsb %%v1,%%v1\n\t"
"vlef %%v18,0(%%r1,%2),1 \n\t" "vlef %%v1,0(%%r1,%[x]),1\n\t"
"vlef %%v18,8(%%r1,%2),3 \n\t" "vlef %%v1,8(%%r1,%[x]),3\n\t"
#else #else
"vlef %%v18,0(%%r1,%2),1 \n\t" "vlef %%v1,0(%%r1,%[x]),1\n\t"
"vlef %%v18,8(%%r1,%2),3 \n\t" "vlef %%v1,8(%%r1,%[x]),3\n\t"
"vflcsb %%v18,%%v18 \n\t" "vflcsb %%v1,%%v1\n\t"
"vlef %%v18,4(%%r1,%2),0 \n\t" "vlef %%v1,4(%%r1,%[x]),0\n\t"
"vlef %%v18,12(%%r1,%2),2 \n\t" "vlef %%v1,12(%%r1,%[x]),2\n\t"
#endif #endif
"vl %%v18,0(%%r1,%[ap])\n\t"
"vlef %%v19,0(%%r1,%1),0 \n\t" "vperm %%v19,%%v18,%%v18,%%v3\n\t"
"vlef %%v19,0(%%r1,%1),1 \n\t" "vperm %%v18,%%v18,%%v18,%%v2\n\t"
"vlef %%v19,8(%%r1,%1),2 \n\t" "vfmasb %%v16,%%v18,%%v0,%%v16\n\t"
"vlef %%v19,8(%%r1,%1),3 \n\t" "vfmasb %%v17,%%v19,%%v1,%%v17\n\t"
"vlef %%v20,4(%%r1,%1),0 \n\t"
"vlef %%v20,4(%%r1,%1),1 \n\t"
"vlef %%v20,12(%%r1,%1),2 \n\t"
"vlef %%v20,12(%%r1,%1),3 \n\t"
"vfmasb %%v16,%%v19,%%v17,%%v16 \n\t"
"vfmasb %%v16,%%v20,%%v18,%%v16 \n\t"
"agfi %%r1,16\n\t" "agfi %%r1,16\n\t"
"brctg %%r0,0b \n\t" "brctg %[n],0b\n\t"
"vfasb %%v16,%%v16,%%v17\n\t"
"vrepg %%v17,%%v16,1\n\t" "vrepg %%v17,%%v16,1\n\t"
"vfasb %%v16,%%v16,%%v17\n\t" "vfasb %%v16,%%v16,%%v17\n\t"
"verllg %%v17,%%v16,32\n\t" "verllg %%v17,%%v16,32\n\t"
#if !defined(XCONJ) #if !defined(XCONJ)
"vlrepf %%v18,0(%4) \n\t" "vlrepf %%v18,0(%[alpha])\n\t"
"vlef %%v19,4(%4),0 \n\t" "vlef %%v19,4(%[alpha]),0\n\t"
"vflcsb %%v19,%%v19\n\t" "vflcsb %%v19,%%v19\n\t"
"vlef %%v19,4(%4),1 \n\t" "vlef %%v19,4(%[alpha]),1\n\t"
#else #else
"vlef %%v18,0(%4),1 \n\t" "vlef %%v18,0(%[alpha]),1\n\t"
"vflcsb %%v18,%%v18\n\t" "vflcsb %%v18,%%v18\n\t"
"vlef %%v18,0(%4),0 \n\t" "vlef %%v18,0(%[alpha]),0\n\t"
"vlrepf %%v19,4(%4) \n\t" "vlrepf %%v19,4(%[alpha])\n\t"
#endif #endif
"vleg %%v20,0(%3),0 \n\t" "vleg %%v0,0(%[y]),0\n\t"
"vfmasb %%v20,%%v16,%%v18,%%v20 \n\t" "vfmasb %%v0,%%v16,%%v18,%%v0\n\t"
"vfmasb %%v20,%%v17,%%v19,%%v20 \n\t" "vfmasb %%v0,%%v17,%%v19,%%v0\n\t"
"vsteg %%v20,0(%3),0 " "vsteg %%v0,0(%[y]),0"
: : "+m"(*(struct { FLOAT x[2]; } *) y),[n] "+&r"(n)
:"r"(n),"ZR"((const FLOAT (*)[n * 2])ap),"ZR"((const FLOAT (*)[n * 2])x),"ZQ"((FLOAT (*)[2])y),"ZQ"((const FLOAT (*)[2])alpha) : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap),[ap] "a"(ap),
:"memory","cc","r0","r1","v16","v17","v18","v19","v20","v21","v22","v23" "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x),
); "m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha)
: "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
} }
static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) {
{
BLASLONG i; BLASLONG i;
for ( i=0; i<n; i++ ) for (i = 0; i < n; i++) {
{
*dest = *src; *dest = *src;
*(dest + 1) = *(src + 1); *(dest + 1) = *(src + 1);
dest += 2; dest += 2;
@ -307,8 +378,9 @@ static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src)
} }
} }
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
{ FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y,
BLASLONG inc_y, FLOAT *buffer) {
BLASLONG i; BLASLONG i;
BLASLONG j; BLASLONG j;
FLOAT *a_ptr; FLOAT *a_ptr;
@ -324,8 +396,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
FLOAT ybuffer[8], *xbuffer; FLOAT ybuffer[8], *xbuffer;
FLOAT alpha[2]; FLOAT alpha[2];
if ( m < 1 ) return(0); if (m < 1)
if ( n < 1 ) return(0); return (0);
if (n < 1)
return (0);
inc_x <<= 1; inc_x <<= 1;
inc_y <<= 1; inc_y <<= 1;
@ -346,13 +420,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
BLASLONG NB = NBMAX; BLASLONG NB = NBMAX;
while ( NB == NBMAX ) while (NB == NBMAX) {
{
m1 -= NB; m1 -= NB;
if ( m1 < 0) if (m1 < 0) {
{ if (m2 == 0)
if ( m2 == 0 ) break; break;
NB = m2; NB = m2;
} }
@ -368,11 +441,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
else else
xbuffer = x_ptr; xbuffer = x_ptr;
if ( inc_y == 2 ) if (inc_y == 2) {
{
for( i = 0; i < n1 ; i++) for (i = 0; i < n1; i++) {
{
cgemv_kernel_4x4(NB, ap, xbuffer, y_ptr, alpha); cgemv_kernel_4x4(NB, ap, xbuffer, y_ptr, alpha);
ap[0] += lda4; ap[0] += lda4;
ap[1] += lda4; ap[1] += lda4;
@ -383,28 +454,23 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
} }
if ( n2 & 2 ) if (n2 & 2) {
{
cgemv_kernel_4x2(NB, ap, xbuffer, y_ptr, alpha); cgemv_kernel_4x2(NB, ap, xbuffer, y_ptr, alpha);
a_ptr += lda * 2; a_ptr += lda * 2;
y_ptr += 4; y_ptr += 4;
} }
if ( n2 & 1 ) if (n2 & 1) {
{
cgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha); cgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha);
/* a_ptr += lda; /* a_ptr += lda;
y_ptr += 2; */ y_ptr += 2; */
} }
} } else {
else
{
for( i = 0; i < n1 ; i++) for (i = 0; i < n1; i++) {
{
memset(ybuffer, 0, sizeof(ybuffer)); memset(ybuffer, 0, sizeof(ybuffer));
cgemv_kernel_4x4(NB, ap, xbuffer, ybuffer, alpha); cgemv_kernel_4x4(NB, ap, xbuffer, ybuffer, alpha);
ap[0] += lda4; ap[0] += lda4;
@ -428,8 +494,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
} }
for( i = 0; i < n2 ; i++) for (i = 0; i < n2; i++) {
{
memset(ybuffer, 0, sizeof(ybuffer)); memset(ybuffer, 0, sizeof(ybuffer));
cgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, alpha); cgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, alpha);
a_ptr += lda; a_ptr += lda;
@ -444,17 +509,15 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
x += NB * inc_x; x += NB * inc_x;
} }
if (m3 == 0)
return (0);
if ( m3 == 0 ) return(0);
x_ptr = x; x_ptr = x;
j = 0; j = 0;
a_ptr = a; a_ptr = a;
y_ptr = y; y_ptr = y;
if ( m3 == 3 ) if (m3 == 3) {
{
FLOAT temp_r; FLOAT temp_r;
FLOAT temp_i; FLOAT temp_i;
@ -466,8 +529,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
x_ptr += inc_x; x_ptr += inc_x;
FLOAT x4 = x_ptr[0]; FLOAT x4 = x_ptr[0];
FLOAT x5 = x_ptr[1]; FLOAT x5 = x_ptr[1];
while ( j < n) while (j < n) {
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
@ -500,9 +562,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
return (0); return (0);
} }
if (m3 == 2) {
if ( m3 == 2 )
{
FLOAT temp_r; FLOAT temp_r;
FLOAT temp_i; FLOAT temp_i;
@ -516,8 +576,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
FLOAT ar = alpha[0]; FLOAT ar = alpha[0];
FLOAT ai = alpha[1]; FLOAT ai = alpha[1];
while ( j < ( n & -2 )) while (j < (n & -2)) {
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
@ -560,9 +619,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
j += 2; j += 2;
} }
while (j < n) {
while ( j < n)
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
@ -592,9 +649,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
return (0); return (0);
} }
if (m3 == 1) {
if ( m3 == 1 )
{
FLOAT temp_r; FLOAT temp_r;
FLOAT temp_i; FLOAT temp_i;
@ -605,8 +660,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
FLOAT ar = alpha[0]; FLOAT ar = alpha[0];
FLOAT ai = alpha[1]; FLOAT ai = alpha[1];
while ( j < ( n & -2 )) while (j < (n & -2)) {
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
@ -641,8 +695,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
j += 2; j += 2;
} }
while ( j < n) while (j < n) {
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -27,25 +27,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
static void crot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) static void crot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) {
{ __asm__("vlrepf %%v0,%[c]\n\t"
__asm__ ( "vlrepf %%v1,%[s]\n\t"
"vlrepf %%v0,%3 \n\t" "srlg %[n],%[n],5\n\t"
"vlrepf %%v1,%4 \n\t"
"srlg %%r0,%0,5 \n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"0:\n\t" "0:\n\t"
"pfd 2, 1024(%%r1,%1) \n\t" "pfd 2, 1024(%%r1,%[x])\n\t"
"pfd 2, 1024(%%r1,%2) \n\t" "pfd 2, 1024(%%r1,%[y])\n\t"
"vl %%v24, 0(%%r1,%1) \n\t" "vl %%v24, 0(%%r1,%[x])\n\t"
"vl %%v25, 16(%%r1,%1) \n\t" "vl %%v25, 16(%%r1,%[x])\n\t"
"vl %%v26, 32(%%r1,%1) \n\t" "vl %%v26, 32(%%r1,%[x])\n\t"
"vl %%v27, 48(%%r1,%1) \n\t" "vl %%v27, 48(%%r1,%[x])\n\t"
"vl %%v16, 0(%%r1,%2) \n\t" "vl %%v16, 0(%%r1,%[y])\n\t"
"vl %%v17, 16(%%r1,%2) \n\t" "vl %%v17, 16(%%r1,%[y])\n\t"
"vl %%v18, 32(%%r1,%2) \n\t" "vl %%v18, 32(%%r1,%[y])\n\t"
"vl %%v19, 48(%%r1,%2) \n\t" "vl %%v19, 48(%%r1,%[y])\n\t"
"vfmsb %%v28,%%v24,%%v0\n\t" "vfmsb %%v28,%%v24,%%v0\n\t"
"vfmsb %%v29,%%v25,%%v0\n\t" "vfmsb %%v29,%%v25,%%v0\n\t"
"vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
@ -63,25 +60,22 @@ static void crot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
"vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmasb %%v31,%%v19,%%v1,%%v31\n\t" "vfmasb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vst %%v28, 0(%%r1,%[x])\n\t"
"vst %%v28, 0(%%r1,%1) \n\t" "vst %%v29, 16(%%r1,%[x])\n\t"
"vst %%v29, 16(%%r1,%1) \n\t" "vst %%v30, 32(%%r1,%[x])\n\t"
"vst %%v30, 32(%%r1,%1) \n\t" "vst %%v31, 48(%%r1,%[x])\n\t"
"vst %%v31, 48(%%r1,%1) \n\t" "vst %%v20, 0(%%r1,%[y])\n\t"
"vst %%v20, 0(%%r1,%2) \n\t" "vst %%v21, 16(%%r1,%[y])\n\t"
"vst %%v21, 16(%%r1,%2) \n\t" "vst %%v22, 32(%%r1,%[y])\n\t"
"vst %%v22, 32(%%r1,%2) \n\t" "vst %%v23, 48(%%r1,%[y])\n\t"
"vst %%v23, 48(%%r1,%2) \n\t" "vl %%v24, 64(%%r1,%[x])\n\t"
"vl %%v25, 80(%%r1,%[x])\n\t"
"vl %%v24, 64(%%r1,%1) \n\t" "vl %%v26, 96(%%r1,%[x])\n\t"
"vl %%v25, 80(%%r1,%1) \n\t" "vl %%v27, 112(%%r1,%[x])\n\t"
"vl %%v26, 96(%%r1,%1) \n\t" "vl %%v16, 64(%%r1,%[y])\n\t"
"vl %%v27, 112(%%r1,%1) \n\t" "vl %%v17, 80(%%r1,%[y])\n\t"
"vl %%v16, 64(%%r1,%2) \n\t" "vl %%v18, 96(%%r1,%[y])\n\t"
"vl %%v17, 80(%%r1,%2) \n\t" "vl %%v19, 112(%%r1,%[y])\n\t"
"vl %%v18, 96(%%r1,%2) \n\t"
"vl %%v19, 112(%%r1,%2) \n\t"
"vfmsb %%v28,%%v24,%%v0\n\t" "vfmsb %%v28,%%v24,%%v0\n\t"
"vfmsb %%v29,%%v25,%%v0\n\t" "vfmsb %%v29,%%v25,%%v0\n\t"
"vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
@ -99,25 +93,22 @@ static void crot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
"vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmasb %%v31,%%v19,%%v1,%%v31\n\t" "vfmasb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vst %%v28, 64(%%r1,%[x])\n\t"
"vst %%v28, 64(%%r1,%1) \n\t" "vst %%v29, 80(%%r1,%[x])\n\t"
"vst %%v29, 80(%%r1,%1) \n\t" "vst %%v30, 96(%%r1,%[x])\n\t"
"vst %%v30, 96(%%r1,%1) \n\t" "vst %%v31, 112(%%r1,%[x])\n\t"
"vst %%v31, 112(%%r1,%1) \n\t" "vst %%v20, 64(%%r1,%[y])\n\t"
"vst %%v20, 64(%%r1,%2) \n\t" "vst %%v21, 80(%%r1,%[y])\n\t"
"vst %%v21, 80(%%r1,%2) \n\t" "vst %%v22, 96(%%r1,%[y])\n\t"
"vst %%v22, 96(%%r1,%2) \n\t" "vst %%v23, 112(%%r1,%[y])\n\t"
"vst %%v23, 112(%%r1,%2) \n\t" "vl %%v24, 128(%%r1,%[x])\n\t"
"vl %%v25, 144(%%r1,%[x])\n\t"
"vl %%v24, 128(%%r1,%1) \n\t" "vl %%v26, 160(%%r1,%[x])\n\t"
"vl %%v25, 144(%%r1,%1) \n\t" "vl %%v27, 176(%%r1,%[x])\n\t"
"vl %%v26, 160(%%r1,%1) \n\t" "vl %%v16, 128(%%r1,%[y])\n\t"
"vl %%v27, 176(%%r1,%1) \n\t" "vl %%v17, 144(%%r1,%[y])\n\t"
"vl %%v16, 128(%%r1,%2) \n\t" "vl %%v18, 160(%%r1,%[y])\n\t"
"vl %%v17, 144(%%r1,%2) \n\t" "vl %%v19, 176(%%r1,%[y])\n\t"
"vl %%v18, 160(%%r1,%2) \n\t"
"vl %%v19, 176(%%r1,%2) \n\t"
"vfmsb %%v28,%%v24,%%v0\n\t" "vfmsb %%v28,%%v24,%%v0\n\t"
"vfmsb %%v29,%%v25,%%v0\n\t" "vfmsb %%v29,%%v25,%%v0\n\t"
"vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
@ -135,25 +126,22 @@ static void crot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
"vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmasb %%v31,%%v19,%%v1,%%v31\n\t" "vfmasb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vst %%v28, 128(%%r1,%[x])\n\t"
"vst %%v28, 128(%%r1,%1) \n\t" "vst %%v29, 144(%%r1,%[x])\n\t"
"vst %%v29, 144(%%r1,%1) \n\t" "vst %%v30, 160(%%r1,%[x])\n\t"
"vst %%v30, 160(%%r1,%1) \n\t" "vst %%v31, 176(%%r1,%[x])\n\t"
"vst %%v31, 176(%%r1,%1) \n\t" "vst %%v20, 128(%%r1,%[y])\n\t"
"vst %%v20, 128(%%r1,%2) \n\t" "vst %%v21, 144(%%r1,%[y])\n\t"
"vst %%v21, 144(%%r1,%2) \n\t" "vst %%v22, 160(%%r1,%[y])\n\t"
"vst %%v22, 160(%%r1,%2) \n\t" "vst %%v23, 176(%%r1,%[y])\n\t"
"vst %%v23, 176(%%r1,%2) \n\t" "vl %%v24, 192(%%r1,%[x])\n\t"
"vl %%v25, 208(%%r1,%[x])\n\t"
"vl %%v24, 192(%%r1,%1) \n\t" "vl %%v26, 224(%%r1,%[x])\n\t"
"vl %%v25, 208(%%r1,%1) \n\t" "vl %%v27, 240(%%r1,%[x])\n\t"
"vl %%v26, 224(%%r1,%1) \n\t" "vl %%v16, 192(%%r1,%[y])\n\t"
"vl %%v27, 240(%%r1,%1) \n\t" "vl %%v17, 208(%%r1,%[y])\n\t"
"vl %%v16, 192(%%r1,%2) \n\t" "vl %%v18, 224(%%r1,%[y])\n\t"
"vl %%v17, 208(%%r1,%2) \n\t" "vl %%v19, 240(%%r1,%[y])\n\t"
"vl %%v18, 224(%%r1,%2) \n\t"
"vl %%v19, 240(%%r1,%2) \n\t"
"vfmsb %%v28,%%v24,%%v0\n\t" "vfmsb %%v28,%%v24,%%v0\n\t"
"vfmsb %%v29,%%v25,%%v0\n\t" "vfmsb %%v29,%%v25,%%v0\n\t"
"vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
@ -171,40 +159,39 @@ static void crot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
"vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmasb %%v31,%%v19,%%v1,%%v31\n\t" "vfmasb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vst %%v28, 192(%%r1,%[x])\n\t"
"vst %%v28, 192(%%r1,%1) \n\t" "vst %%v29, 208(%%r1,%[x])\n\t"
"vst %%v29, 208(%%r1,%1) \n\t" "vst %%v30, 224(%%r1,%[x])\n\t"
"vst %%v30, 224(%%r1,%1) \n\t" "vst %%v31, 240(%%r1,%[x])\n\t"
"vst %%v31, 240(%%r1,%1) \n\t" "vst %%v20, 192(%%r1,%[y])\n\t"
"vst %%v20, 192(%%r1,%2) \n\t" "vst %%v21, 208(%%r1,%[y])\n\t"
"vst %%v21, 208(%%r1,%2) \n\t" "vst %%v22, 224(%%r1,%[y])\n\t"
"vst %%v22, 224(%%r1,%2) \n\t" "vst %%v23, 240(%%r1,%[y])\n\t"
"vst %%v23, 240(%%r1,%2) \n\t"
"agfi %%r1,256\n\t" "agfi %%r1,256\n\t"
"brctg %%r0,0b " "brctg %[n],0b"
: : "+m"(*(struct { FLOAT x[n * 2]; } *) x),
:"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"m"(*c),"m"(*s) "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n)
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" : [x] "a"(x),[y] "a"(y),[c] "Q"(*c),[s] "Q"(*s)
); : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
"v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
} }
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
{ FLOAT c, FLOAT s) {
BLASLONG i = 0; BLASLONG i = 0;
BLASLONG ix = 0, iy = 0; BLASLONG ix = 0, iy = 0;
FLOAT temp[2]; FLOAT temp[2];
BLASLONG inc_x2; BLASLONG inc_x2;
BLASLONG inc_y2; BLASLONG inc_y2;
if ( n <= 0 ) return(0); if (n <= 0)
return (0);
if ( (inc_x == 1) && (inc_y == 1) ) if ((inc_x == 1) && (inc_y == 1)) {
{
BLASLONG n1 = n & -32; BLASLONG n1 = n & -32;
if ( n1 > 0 ) if (n1 > 0) {
{
FLOAT cosa, sina; FLOAT cosa, sina;
cosa = c; cosa = c;
sina = s; sina = s;
@ -213,8 +200,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
ix = 2 * n1; ix = 2 * n1;
} }
while(i < n) while (i < n) {
{
temp[0] = c * x[ix] + s * y[ix]; temp[0] = c * x[ix] + s * y[ix];
temp[1] = c * x[ix + 1] + s * y[ix + 1]; temp[1] = c * x[ix + 1] + s * y[ix + 1];
y[ix] = c * y[ix] - s * x[ix]; y[ix] = c * y[ix] - s * x[ix];
@ -227,14 +213,10 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
} }
} else {
}
else
{
inc_x2 = 2 * inc_x; inc_x2 = 2 * inc_x;
inc_y2 = 2 * inc_y; inc_y2 = 2 * inc_y;
while(i < n) while (i < n) {
{
temp[0] = c * x[ix] + s * y[iy]; temp[0] = c * x[ix] + s * y[iy];
temp[1] = c * x[ix + 1] + s * y[iy + 1]; temp[1] = c * x[ix + 1] + s * y[iy + 1];
y[iy] = c * y[iy] - s * x[ix]; y[iy] = c * y[iy] - s * x[ix];
@ -252,5 +234,3 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
return (0); return (0);
} }

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013 - 2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -27,28 +27,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
static void cscal_kernel_16(BLASLONG n, FLOAT *alpha, FLOAT *x) static void cscal_kernel_16(BLASLONG n, FLOAT *alpha, FLOAT *x) {
{ __asm__("vlrepf %%v0,0(%[alpha])\n\t"
__asm__ volatile( "vlef %%v1,4(%[alpha]),0\n\t"
"vlrepf %%v0,0(%1) \n\t" "vlef %%v1,4(%[alpha]),2\n\t"
"vlef %%v1,4(%1),0 \n\t"
"vlef %%v1,4(%1),2 \n\t"
"vflcsb %%v1,%%v1\n\t" "vflcsb %%v1,%%v1\n\t"
"vlef %%v1,4(%1),1 \n\t" "vlef %%v1,4(%[alpha]),1\n\t"
"vlef %%v1,4(%1),3 \n\t" "vlef %%v1,4(%[alpha]),3\n\t"
"srlg %%r0,%0,4 \n\t" "srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"0:\n\t" "0:\n\t"
"pfd 2, 1024(%%r1,%2) \n\t" "pfd 2, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%2) \n\t" "vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%2) \n\t" "vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%2) \n\t" "vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%2) \n\t" "vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%2) \n\t" "vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%2) \n\t" "vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%2) \n\t" "vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"verllg %%v24,%%v16,32\n\t" "verllg %%v24,%%v16,32\n\t"
"verllg %%v25,%%v17,32\n\t" "verllg %%v25,%%v17,32\n\t"
"verllg %%v26,%%v18,32\n\t" "verllg %%v26,%%v18,32\n\t"
@ -57,7 +54,6 @@ static void cscal_kernel_16(BLASLONG n, FLOAT *alpha, FLOAT *x)
"verllg %%v29,%%v21,32\n\t" "verllg %%v29,%%v21,32\n\t"
"verllg %%v30,%%v22,32\n\t" "verllg %%v30,%%v22,32\n\t"
"verllg %%v31,%%v23,32\n\t" "verllg %%v31,%%v23,32\n\t"
"vfmsb %%v16,%%v16,%%v0\n\t" "vfmsb %%v16,%%v16,%%v0\n\t"
"vfmsb %%v17,%%v17,%%v0\n\t" "vfmsb %%v17,%%v17,%%v0\n\t"
"vfmsb %%v18,%%v18,%%v0\n\t" "vfmsb %%v18,%%v18,%%v0\n\t"
@ -74,45 +70,42 @@ static void cscal_kernel_16(BLASLONG n, FLOAT *alpha, FLOAT *x)
"vfmasb %%v21,%%v29,%%v1,%%v21\n\t" "vfmasb %%v21,%%v29,%%v1,%%v21\n\t"
"vfmasb %%v22,%%v30,%%v1,%%v22\n\t" "vfmasb %%v22,%%v30,%%v1,%%v22\n\t"
"vfmasb %%v23,%%v31,%%v1,%%v23\n\t" "vfmasb %%v23,%%v31,%%v1,%%v23\n\t"
"vst %%v16,0(%%r1,%[x])\n\t"
"vst %%v16,0(%%r1,%2) \n\t" "vst %%v17,16(%%r1,%[x])\n\t"
"vst %%v17,16(%%r1,%2) \n\t" "vst %%v18,32(%%r1,%[x])\n\t"
"vst %%v18,32(%%r1,%2) \n\t" "vst %%v19,48(%%r1,%[x])\n\t"
"vst %%v19,48(%%r1,%2) \n\t" "vst %%v20,64(%%r1,%[x])\n\t"
"vst %%v20,64(%%r1,%2) \n\t" "vst %%v21,80(%%r1,%[x])\n\t"
"vst %%v21,80(%%r1,%2) \n\t" "vst %%v22,96(%%r1,%[x])\n\t"
"vst %%v22,96(%%r1,%2) \n\t" "vst %%v23,112(%%r1,%[x])\n\t"
"vst %%v23,112(%%r1,%2) \n\t"
"agfi %%r1,128\n\t" "agfi %%r1,128\n\t"
"brctg %%r0,0b " "brctg %[n],0b"
: : "+m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n)
:"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x) : [x] "a"(x), "m"(*(const struct { FLOAT x[2]; } *) alpha),
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" [alpha] "a"(alpha)
); : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
"v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
} }
static void cscal_kernel_16_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) static void cscal_kernel_16_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) {
{ __asm__("vlef %%v0,4(%[alpha]),0\n\t"
__asm__ volatile( "vlef %%v0,4(%[alpha]),2\n\t"
"vlef %%v0,4(%1),0 \n\t"
"vlef %%v0,4(%1),2 \n\t"
"vflcsb %%v0,%%v0\n\t" "vflcsb %%v0,%%v0\n\t"
"vlef %%v0,4(%1),1 \n\t" "vlef %%v0,4(%[alpha]),1\n\t"
"vlef %%v0,4(%1),3 \n\t" "vlef %%v0,4(%[alpha]),3\n\t"
"srlg %%r0,%0,4 \n\t" "srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"0:\n\t" "0:\n\t"
"pfd 2, 1024(%%r1,%2) \n\t" "pfd 2, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%2) \n\t" "vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%2) \n\t" "vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%2) \n\t" "vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%2) \n\t" "vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%2) \n\t" "vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%2) \n\t" "vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%2) \n\t" "vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"verllg %%v16,%%v16,32\n\t" "verllg %%v16,%%v16,32\n\t"
"verllg %%v17,%%v17,32\n\t" "verllg %%v17,%%v17,32\n\t"
"verllg %%v18,%%v18,32\n\t" "verllg %%v18,%%v18,32\n\t"
@ -121,7 +114,6 @@ static void cscal_kernel_16_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x)
"verllg %%v21,%%v21,32\n\t" "verllg %%v21,%%v21,32\n\t"
"verllg %%v22,%%v22,32\n\t" "verllg %%v22,%%v22,32\n\t"
"verllg %%v23,%%v23,32\n\t" "verllg %%v23,%%v23,32\n\t"
"vfmsb %%v16,%%v16,%%v0\n\t" "vfmsb %%v16,%%v16,%%v0\n\t"
"vfmsb %%v17,%%v17,%%v0\n\t" "vfmsb %%v17,%%v17,%%v0\n\t"
"vfmsb %%v18,%%v18,%%v0\n\t" "vfmsb %%v18,%%v18,%%v0\n\t"
@ -130,42 +122,37 @@ static void cscal_kernel_16_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x)
"vfmsb %%v21,%%v21,%%v0\n\t" "vfmsb %%v21,%%v21,%%v0\n\t"
"vfmsb %%v22,%%v22,%%v0\n\t" "vfmsb %%v22,%%v22,%%v0\n\t"
"vfmsb %%v23,%%v23,%%v0\n\t" "vfmsb %%v23,%%v23,%%v0\n\t"
"vst %%v16,0(%%r1,%[x])\n\t"
"vst %%v16,0(%%r1,%2) \n\t" "vst %%v17,16(%%r1,%[x])\n\t"
"vst %%v17,16(%%r1,%2) \n\t" "vst %%v18,32(%%r1,%[x])\n\t"
"vst %%v18,32(%%r1,%2) \n\t" "vst %%v19,48(%%r1,%[x])\n\t"
"vst %%v19,48(%%r1,%2) \n\t" "vst %%v20,64(%%r1,%[x])\n\t"
"vst %%v20,64(%%r1,%2) \n\t" "vst %%v21,80(%%r1,%[x])\n\t"
"vst %%v21,80(%%r1,%2) \n\t" "vst %%v22,96(%%r1,%[x])\n\t"
"vst %%v22,96(%%r1,%2) \n\t" "vst %%v23,112(%%r1,%[x])\n\t"
"vst %%v23,112(%%r1,%2) \n\t"
"agfi %%r1,128\n\t" "agfi %%r1,128\n\t"
"brctg %%r0,0b " "brctg %[n],0b"
: : "+m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n)
:"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x) : [x] "a"(x), "m"(*(const struct { FLOAT x[2]; } *) alpha),
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23" [alpha] "a"(alpha)
); : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23");
} }
static void cscal_kernel_16_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) static void cscal_kernel_16_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) {
{ __asm__("vlrepf %%v0,0(%[alpha])\n\t"
__asm__ volatile( "srlg %[n],%[n],4\n\t"
"vlrepf %%v0,0(%1) \n\t"
"srlg %%r0,%0,4 \n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"0:\n\t" "0:\n\t"
"pfd 2, 1024(%%r1,%2) \n\t" "pfd 2, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%2) \n\t" "vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%2) \n\t" "vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%2) \n\t" "vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%2) \n\t" "vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%2) \n\t" "vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%2) \n\t" "vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%2) \n\t" "vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"vfmsb %%v16,%%v16,%%v0\n\t" "vfmsb %%v16,%%v16,%%v0\n\t"
"vfmsb %%v17,%%v17,%%v0\n\t" "vfmsb %%v17,%%v17,%%v0\n\t"
"vfmsb %%v18,%%v18,%%v0\n\t" "vfmsb %%v18,%%v18,%%v0\n\t"
@ -174,55 +161,46 @@ static void cscal_kernel_16_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x)
"vfmsb %%v21,%%v21,%%v0\n\t" "vfmsb %%v21,%%v21,%%v0\n\t"
"vfmsb %%v22,%%v22,%%v0\n\t" "vfmsb %%v22,%%v22,%%v0\n\t"
"vfmsb %%v23,%%v23,%%v0\n\t" "vfmsb %%v23,%%v23,%%v0\n\t"
"vst %%v16,0(%%r1,%[x])\n\t"
"vst %%v16,0(%%r1,%2) \n\t" "vst %%v17,16(%%r1,%[x])\n\t"
"vst %%v17,16(%%r1,%2) \n\t" "vst %%v18,32(%%r1,%[x])\n\t"
"vst %%v18,32(%%r1,%2) \n\t" "vst %%v19,48(%%r1,%[x])\n\t"
"vst %%v19,48(%%r1,%2) \n\t" "vst %%v20,64(%%r1,%[x])\n\t"
"vst %%v20,64(%%r1,%2) \n\t" "vst %%v21,80(%%r1,%[x])\n\t"
"vst %%v21,80(%%r1,%2) \n\t" "vst %%v22,96(%%r1,%[x])\n\t"
"vst %%v22,96(%%r1,%2) \n\t" "vst %%v23,112(%%r1,%[x])\n\t"
"vst %%v23,112(%%r1,%2) \n\t"
"agfi %%r1,128\n\t" "agfi %%r1,128\n\t"
"brctg %%r0,0b " "brctg %[n],0b"
: : "+m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n)
:"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x) : [x] "a"(x), "m"(*(const struct { FLOAT x[2]; } *) alpha),
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23" [alpha] "a"(alpha)
); : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23");
} }
static void cscal_kernel_16_zero(BLASLONG n, FLOAT *x) static void cscal_kernel_16_zero(BLASLONG n, FLOAT *x) {
{ __asm__("vzero %%v0\n\t"
__asm__ volatile( "srlg %[n],%[n],4\n\t"
"vzero %%v24 \n\t"
"vzero %%v25 \n\t"
"vzero %%v26 \n\t"
"vzero %%v27 \n\t"
"srlg %%r0,%0,4 \n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"0:\n\t" "0:\n\t"
"pfd 2, 1024(%%r1,%1) \n\t" "pfd 2, 1024(%%r1,%[x])\n\t"
"vst %%v0,0(%%r1,%[x])\n\t"
"vst %%v24,0(%%r1,%1) \n\t" "vst %%v0,16(%%r1,%[x])\n\t"
"vst %%v25,16(%%r1,%1) \n\t" "vst %%v0,32(%%r1,%[x])\n\t"
"vst %%v26,32(%%r1,%1) \n\t" "vst %%v0,48(%%r1,%[x])\n\t"
"vst %%v27,48(%%r1,%1) \n\t" "vst %%v0,64(%%r1,%[x])\n\t"
"vst %%v24,64(%%r1,%1) \n\t" "vst %%v0,80(%%r1,%[x])\n\t"
"vst %%v25,80(%%r1,%1) \n\t" "vst %%v0,96(%%r1,%[x])\n\t"
"vst %%v26,96(%%r1,%1) \n\t" "vst %%v0,112(%%r1,%[x])\n\t"
"vst %%v27,112(%%r1,%1) \n\t"
"agfi %%r1,128\n\t" "agfi %%r1,128\n\t"
"brctg %%r0,0b " "brctg %[n],0b"
: : "=m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n)
:"r"(n),"ZR"((FLOAT (*)[n * 2])x) : [x] "a"(x)
:"memory","cc","r0","r1","v24","v25","v26","v27" : "cc", "r1", "v0");
);
} }
static void cscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_x) static void cscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x,
{ BLASLONG inc_x) {
BLASLONG i; BLASLONG i;
BLASLONG inc_x2 = 2 * inc_x; BLASLONG inc_x2 = 2 * inc_x;
BLASLONG inc_x3 = inc_x2 + inc_x; BLASLONG inc_x3 = inc_x2 + inc_x;
@ -230,8 +208,7 @@ static void cscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_
FLOAT da_r = alpha[0]; FLOAT da_r = alpha[0];
FLOAT da_i = alpha[1]; FLOAT da_i = alpha[1];
for (i = 0; i < n; i += 4) for (i = 0; i < n; i += 4) {
{
t0 = da_r * x[0] - da_i * x[1]; t0 = da_r * x[0] - da_i * x[1];
t1 = da_r * x[inc_x] - da_i * x[inc_x + 1]; t1 = da_r * x[inc_x] - da_i * x[inc_x + 1];
t2 = da_r * x[inc_x2] - da_i * x[inc_x2 + 1]; t2 = da_r * x[inc_x2] - da_i * x[inc_x2 + 1];
@ -251,7 +228,9 @@ static void cscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_
} }
} }
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
BLASLONG dummy2) {
BLASLONG i = 0, j = 0; BLASLONG i = 0, j = 0;
FLOAT temp0; FLOAT temp0;
FLOAT temp1; FLOAT temp1;
@ -311,13 +290,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
} }
} }
} else { } else {
if (da_i == 0.0) { if (da_i == 0.0) {
BLASLONG n1 = n & -2; BLASLONG n1 = n & -2;
@ -372,7 +348,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
return (0); return (0);
} }
BLASLONG n1 = n & -16; BLASLONG n1 = n & -16;
if (n1 > 0) { if (n1 > 0) {
@ -384,8 +359,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
cscal_kernel_16_zero(n1, x); cscal_kernel_16_zero(n1, x);
else else
cscal_kernel_16_zero_r(n1, alpha, x); cscal_kernel_16_zero_r(n1, alpha, x);
else else if (da_i == 0)
if (da_i == 0)
cscal_kernel_16_zero_i(n1, alpha, x); cscal_kernel_16_zero_i(n1, alpha, x);
else else
cscal_kernel_16(n1, alpha, x); cscal_kernel_16(n1, alpha, x);
@ -394,7 +368,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
j = n1; j = n1;
} }
if (da_r == 0.0) { if (da_r == 0.0) {
if (da_i == 0.0) { if (da_i == 0.0) {

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -27,114 +27,108 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
static void cswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) static void cswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) {
{ __asm__("srlg %[n],%[n],5\n\t"
__asm__ volatile(
"srlg %%r0,%0,5 \n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"0:\n\t" "0:\n\t"
"pfd 2, 1024(%%r1,%1) \n\t" "pfd 2, 1024(%%r1,%[x])\n\t"
"pfd 2, 1024(%%r1,%2) \n\t" "pfd 2, 1024(%%r1,%[y])\n\t"
"vl %%v16, 0(%%r1,%[x])\n\t"
"vl %%v16, 0(%%r1,%1) \n\t" "vl %%v17, 16(%%r1,%[x])\n\t"
"vl %%v17, 16(%%r1,%1) \n\t" "vl %%v18, 32(%%r1,%[x])\n\t"
"vl %%v18, 32(%%r1,%1) \n\t" "vl %%v19, 48(%%r1,%[x])\n\t"
"vl %%v19, 48(%%r1,%1) \n\t" "vl %%v20, 64(%%r1,%[x])\n\t"
"vl %%v20, 64(%%r1,%1) \n\t" "vl %%v21, 80(%%r1,%[x])\n\t"
"vl %%v21, 80(%%r1,%1) \n\t" "vl %%v22, 96(%%r1,%[x])\n\t"
"vl %%v22, 96(%%r1,%1) \n\t" "vl %%v23, 112(%%r1,%[x])\n\t"
"vl %%v23, 112(%%r1,%1) \n\t" "vl %%v24, 128(%%r1,%[x])\n\t"
"vl %%v24, 128(%%r1,%1) \n\t" "vl %%v25, 144(%%r1,%[x])\n\t"
"vl %%v25, 144(%%r1,%1) \n\t" "vl %%v26, 160(%%r1,%[x])\n\t"
"vl %%v26, 160(%%r1,%1) \n\t" "vl %%v27, 176(%%r1,%[x])\n\t"
"vl %%v27, 176(%%r1,%1) \n\t" "vl %%v28, 192(%%r1,%[x])\n\t"
"vl %%v28, 192(%%r1,%1) \n\t" "vl %%v29, 208(%%r1,%[x])\n\t"
"vl %%v29, 208(%%r1,%1) \n\t" "vl %%v30, 224(%%r1,%[x])\n\t"
"vl %%v30, 224(%%r1,%1) \n\t" "vl %%v31, 240(%%r1,%[x])\n\t"
"vl %%v31, 240(%%r1,%1) \n\t" "vl %%v0, 0(%%r1,%[y])\n\t"
"vl %%v1, 16(%%r1,%[y])\n\t"
"vl %%v0, 0(%%r1,%2) \n\t" "vl %%v2, 32(%%r1,%[y])\n\t"
"vl %%v1, 16(%%r1,%2) \n\t" "vl %%v3, 48(%%r1,%[y])\n\t"
"vl %%v2, 32(%%r1,%2) \n\t" "vl %%v4, 64(%%r1,%[y])\n\t"
"vl %%v3, 48(%%r1,%2) \n\t" "vl %%v5, 80(%%r1,%[y])\n\t"
"vl %%v4, 64(%%r1,%2) \n\t" "vl %%v6, 96(%%r1,%[y])\n\t"
"vl %%v5, 80(%%r1,%2) \n\t" "vl %%v7, 112(%%r1,%[y])\n\t"
"vl %%v6, 96(%%r1,%2) \n\t" "vst %%v0, 0(%%r1,%[x])\n\t"
"vl %%v7, 112(%%r1,%2) \n\t" "vst %%v1, 16(%%r1,%[x])\n\t"
"vst %%v0, 0(%%r1,%1) \n\t" "vst %%v2, 32(%%r1,%[x])\n\t"
"vst %%v1, 16(%%r1,%1) \n\t" "vst %%v3, 48(%%r1,%[x])\n\t"
"vst %%v2, 32(%%r1,%1) \n\t" "vst %%v4, 64(%%r1,%[x])\n\t"
"vst %%v3, 48(%%r1,%1) \n\t" "vst %%v5, 80(%%r1,%[x])\n\t"
"vst %%v4, 64(%%r1,%1) \n\t" "vst %%v6, 96(%%r1,%[x])\n\t"
"vst %%v5, 80(%%r1,%1) \n\t" "vst %%v7, 112(%%r1,%[x])\n\t"
"vst %%v6, 96(%%r1,%1) \n\t" "vl %%v0, 128(%%r1,%[y])\n\t"
"vst %%v7, 112(%%r1,%1) \n\t" "vl %%v1, 144(%%r1,%[y])\n\t"
"vl %%v2, 160(%%r1,%[y])\n\t"
"vl %%v0, 128(%%r1,%2) \n\t" "vl %%v3, 176(%%r1,%[y])\n\t"
"vl %%v1, 144(%%r1,%2) \n\t" "vl %%v4, 192(%%r1,%[y])\n\t"
"vl %%v2, 160(%%r1,%2) \n\t" "vl %%v5, 208(%%r1,%[y])\n\t"
"vl %%v3, 176(%%r1,%2) \n\t" "vl %%v6, 224(%%r1,%[y])\n\t"
"vl %%v4, 192(%%r1,%2) \n\t" "vl %%v7, 240(%%r1,%[y])\n\t"
"vl %%v5, 208(%%r1,%2) \n\t" "vst %%v0, 128(%%r1,%[x])\n\t"
"vl %%v6, 224(%%r1,%2) \n\t" "vst %%v1, 144(%%r1,%[x])\n\t"
"vl %%v7, 240(%%r1,%2) \n\t" "vst %%v2, 160(%%r1,%[x])\n\t"
"vst %%v0, 128(%%r1,%1) \n\t" "vst %%v3, 176(%%r1,%[x])\n\t"
"vst %%v1, 144(%%r1,%1) \n\t" "vst %%v4, 192(%%r1,%[x])\n\t"
"vst %%v2, 160(%%r1,%1) \n\t" "vst %%v5, 208(%%r1,%[x])\n\t"
"vst %%v3, 176(%%r1,%1) \n\t" "vst %%v6, 224(%%r1,%[x])\n\t"
"vst %%v4, 192(%%r1,%1) \n\t" "vst %%v7, 240(%%r1,%[x])\n\t"
"vst %%v5, 208(%%r1,%1) \n\t" "vst %%v16, 0(%%r1,%[y])\n\t"
"vst %%v6, 224(%%r1,%1) \n\t" "vst %%v17, 16(%%r1,%[y])\n\t"
"vst %%v7, 240(%%r1,%1) \n\t" "vst %%v18, 32(%%r1,%[y])\n\t"
"vst %%v19, 48(%%r1,%[y])\n\t"
"vst %%v16, 0(%%r1,%2) \n\t" "vst %%v20, 64(%%r1,%[y])\n\t"
"vst %%v17, 16(%%r1,%2) \n\t" "vst %%v21, 80(%%r1,%[y])\n\t"
"vst %%v18, 32(%%r1,%2) \n\t" "vst %%v22, 96(%%r1,%[y])\n\t"
"vst %%v19, 48(%%r1,%2) \n\t" "vst %%v23, 112(%%r1,%[y])\n\t"
"vst %%v20, 64(%%r1,%2) \n\t" "vst %%v24, 128(%%r1,%[y])\n\t"
"vst %%v21, 80(%%r1,%2) \n\t" "vst %%v25, 144(%%r1,%[y])\n\t"
"vst %%v22, 96(%%r1,%2) \n\t" "vst %%v26, 160(%%r1,%[y])\n\t"
"vst %%v23, 112(%%r1,%2) \n\t" "vst %%v27, 176(%%r1,%[y])\n\t"
"vst %%v24, 128(%%r1,%2) \n\t" "vst %%v28, 192(%%r1,%[y])\n\t"
"vst %%v25, 144(%%r1,%2) \n\t" "vst %%v29, 208(%%r1,%[y])\n\t"
"vst %%v26, 160(%%r1,%2) \n\t" "vst %%v30, 224(%%r1,%[y])\n\t"
"vst %%v27, 176(%%r1,%2) \n\t" "vst %%v31, 240(%%r1,%[y])\n\t"
"vst %%v28, 192(%%r1,%2) \n\t"
"vst %%v29, 208(%%r1,%2) \n\t"
"vst %%v30, 224(%%r1,%2) \n\t"
"vst %%v31, 240(%%r1,%2) \n\t"
"agfi %%r1,256\n\t" "agfi %%r1,256\n\t"
"brctg %%r0,0b " "brctg %[n],0b"
: : "+m"(*(struct { FLOAT x[n * 2]; } *) x),
:"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y) "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" : [x] "a"(x),[y] "a"(y)
); : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");
} }
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3,
{ FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
FLOAT *dummy, BLASLONG dummy2) {
BLASLONG i = 0; BLASLONG i = 0;
BLASLONG ix = 0, iy = 0; BLASLONG ix = 0, iy = 0;
FLOAT temp[2]; FLOAT temp[2];
BLASLONG inc_x2, inc_y2; BLASLONG inc_x2, inc_y2;
if ( n <= 0 ) return(0); if (n <= 0)
return (0);
if ( (inc_x == 1) && (inc_y == 1 )) if ((inc_x == 1) && (inc_y == 1)) {
{
BLASLONG n1 = n & -32; BLASLONG n1 = n & -32;
if ( n1 > 0 ) if (n1 > 0) {
{
cswap_kernel_32(n1, x, y); cswap_kernel_32(n1, x, y);
i = n1; i = n1;
ix = 2 * n1; ix = 2 * n1;
iy = 2 * n1; iy = 2 * n1;
} }
while(i < n) while (i < n) {
{
temp[0] = x[ix]; temp[0] = x[ix];
temp[1] = x[ix + 1]; temp[1] = x[ix + 1];
@ -147,19 +141,14 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm
iy += 2; iy += 2;
i++; i++;
} }
} else {
}
else
{
inc_x2 = 2 * inc_x; inc_x2 = 2 * inc_x;
inc_y2 = 2 * inc_y; inc_y2 = 2 * inc_y;
while(i < n) while (i < n) {
{
temp[0] = x[ix]; temp[0] = x[ix];
temp[1] = x[ix + 1]; temp[1] = x[ix + 1];
@ -177,7 +166,4 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm
} }
return (0); return (0);
} }

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -28,40 +28,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>
#if defined(DOUBLE)
#define ABS fabs #define ABS fabs
#else
#define ABS fabsf
#endif
static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) {
{
FLOAT amax; FLOAT amax;
__asm__ volatile ( __asm__("vl %%v0,0(%[x])\n\t"
"vl %%v0,0(%2) \n\t" "srlg %[n],%[n],5\n\t"
"srlg %%r0,%1,5 \n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"0:\n\t" "0:\n\t"
"pfd 1, 1024(%%r1,%2) \n\t" "pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%2) \n\t" "vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%2) \n\t" "vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%2) \n\t" "vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%2) \n\t" "vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%2) \n\t" "vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%2) \n\t" "vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%2) \n\t" "vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%2) \n\t" "vl %%v24,128(%%r1,%[x])\n\t"
"vl %%v24,128(%%r1,%2) \n\t" "vl %%v25,144(%%r1,%[x])\n\t"
"vl %%v25,144(%%r1,%2) \n\t" "vl %%v26,160(%%r1,%[x])\n\t"
"vl %%v26,160(%%r1,%2) \n\t" "vl %%v27,176(%%r1,%[x])\n\t"
"vl %%v27,176(%%r1,%2) \n\t" "vl %%v28,192(%%r1,%[x])\n\t"
"vl %%v28,192(%%r1,%2) \n\t" "vl %%v29,208(%%r1,%[x])\n\t"
"vl %%v29,208(%%r1,%2) \n\t" "vl %%v30,224(%%r1,%[x])\n\t"
"vl %%v30,224(%%r1,%2) \n\t" "vl %%v31,240(%%r1,%[x])\n\t"
"vl %%v31,240(%%r1,%2) \n\t"
"vfmaxdb %%v16,%%v16,%%v24,8\n\t" "vfmaxdb %%v16,%%v16,%%v24,8\n\t"
"vfmaxdb %%v17,%%v17,%%v25,8\n\t" "vfmaxdb %%v17,%%v17,%%v25,8\n\t"
"vfmaxdb %%v18,%%v18,%%v26,8\n\t" "vfmaxdb %%v18,%%v18,%%v26,8\n\t"
@ -70,29 +62,23 @@ static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x)
"vfmaxdb %%v21,%%v21,%%v29,8\n\t" "vfmaxdb %%v21,%%v21,%%v29,8\n\t"
"vfmaxdb %%v22,%%v22,%%v30,8\n\t" "vfmaxdb %%v22,%%v22,%%v30,8\n\t"
"vfmaxdb %%v23,%%v23,%%v31,8\n\t" "vfmaxdb %%v23,%%v23,%%v31,8\n\t"
"vfmaxdb %%v16,%%v16,%%v20,8\n\t" "vfmaxdb %%v16,%%v16,%%v20,8\n\t"
"vfmaxdb %%v17,%%v17,%%v21,8\n\t" "vfmaxdb %%v17,%%v17,%%v21,8\n\t"
"vfmaxdb %%v18,%%v18,%%v22,8\n\t" "vfmaxdb %%v18,%%v18,%%v22,8\n\t"
"vfmaxdb %%v19,%%v19,%%v23,8\n\t" "vfmaxdb %%v19,%%v19,%%v23,8\n\t"
"vfmaxdb %%v16,%%v16,%%v18,8\n\t" "vfmaxdb %%v16,%%v16,%%v18,8\n\t"
"vfmaxdb %%v17,%%v17,%%v19,8\n\t" "vfmaxdb %%v17,%%v17,%%v19,8\n\t"
"vfmaxdb %%v16,%%v16,%%v17,8\n\t" "vfmaxdb %%v16,%%v16,%%v17,8\n\t"
"vfmaxdb %%v0,%%v0,%%v16,8\n\t" "vfmaxdb %%v0,%%v0,%%v16,8\n\t"
"agfi %%r1, 256\n\t" "agfi %%r1, 256\n\t"
"brctg %%r0, 0b \n\t" "brctg %[n], 0b\n\t"
"vrepg %%v16,%%v0,1\n\t" "vrepg %%v16,%%v0,1\n\t"
"wfmaxdb %%v0,%%v0,%%v16,8\n\t" "wfmaxdb %%v0,%%v0,%%v16,8\n\t"
"lpdr %0,%%f0 " "lpdr %[amax],%%f0"
:"=f"(amax) : [amax] "=f"(amax),[n] "+&r"(n)
:"r"(n),"ZR"((const FLOAT (*)[n])x) : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
); "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
return amax; return amax;
} }
@ -102,7 +88,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG j = 0; BLASLONG j = 0;
FLOAT maxf = 0.0; FLOAT maxf = 0.0;
if (n <= 0 || inc_x <= 0) return (maxf); if (n <= 0 || inc_x <= 0)
return (maxf);
if (inc_x == 1) { if (inc_x == 1) {
@ -112,9 +99,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
maxf = damax_kernel_32(n1, x); maxf = damax_kernel_32(n1, x);
i = n1; i = n1;
} } else {
else
{
maxf = ABS(x[0]); maxf = ABS(x[0]);
i++; i++;
} }
@ -153,7 +138,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
} }
while (j < n) { while (j < n) {
if (ABS(x[i]) > maxf) { if (ABS(x[i]) > maxf) {
maxf = ABS(x[i]); maxf = ABS(x[i]);

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -28,32 +28,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>
#if defined(DOUBLE)
#define ABS fabs #define ABS fabs
#else
#define ABS fabsf
#endif
static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) {
{
FLOAT amax; FLOAT amax;
__asm__ volatile ( __asm__("vl %%v0,0(%[x])\n\t"
"vl %%v0,0(%2) \n\t"
"vflpdb %%v0,%%v0\n\t" "vflpdb %%v0,%%v0\n\t"
"srlg %%r0,%1,5 \n\t" "srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"0:\n\t" "0:\n\t"
"pfd 1, 1024(%%r1,%2) \n\t" "pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%2) \n\t" "vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%2) \n\t" "vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%2) \n\t" "vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%2) \n\t" "vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%2) \n\t" "vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%2) \n\t" "vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%2) \n\t" "vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"vflpdb %%v16, %%v16\n\t" "vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t" "vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t" "vflpdb %%v18, %%v18\n\t"
@ -62,7 +55,6 @@ static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x)
"vflpdb %%v21, %%v21\n\t" "vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t" "vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t" "vflpdb %%v23, %%v23\n\t"
"vfchdb %%v24,%%v16,%%v17\n\t" "vfchdb %%v24,%%v16,%%v17\n\t"
"vfchdb %%v25,%%v18,%%v19\n\t" "vfchdb %%v25,%%v18,%%v19\n\t"
"vfchdb %%v26,%%v20,%%v21\n\t" "vfchdb %%v26,%%v20,%%v21\n\t"
@ -71,26 +63,22 @@ static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x)
"vsel %%v25,%%v18,%%v19,%%v25\n\t" "vsel %%v25,%%v18,%%v19,%%v25\n\t"
"vsel %%v26,%%v20,%%v21,%%v26\n\t" "vsel %%v26,%%v20,%%v21,%%v26\n\t"
"vsel %%v27,%%v22,%%v23,%%v27\n\t" "vsel %%v27,%%v22,%%v23,%%v27\n\t"
"vfchdb %%v28,%%v24,%%v25\n\t" "vfchdb %%v28,%%v24,%%v25\n\t"
"vfchdb %%v29,%%v26,%%v27\n\t" "vfchdb %%v29,%%v26,%%v27\n\t"
"vsel %%v28,%%v24,%%v25,%%v28\n\t" "vsel %%v28,%%v24,%%v25,%%v28\n\t"
"vsel %%v29,%%v26,%%v27,%%v29\n\t" "vsel %%v29,%%v26,%%v27,%%v29\n\t"
"vfchdb %%v30,%%v28,%%v29\n\t" "vfchdb %%v30,%%v28,%%v29\n\t"
"vsel %%v30,%%v28,%%v29,%%v30\n\t" "vsel %%v30,%%v28,%%v29,%%v30\n\t"
"vfchdb %%v31,%%v30,%%v0\n\t" "vfchdb %%v31,%%v30,%%v0\n\t"
"vsel %%v0,%%v30,%%v0,%%v31\n\t" "vsel %%v0,%%v30,%%v0,%%v31\n\t"
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v16,128(%%r1,%2) \n\t" "vl %%v17,144(%%r1,%[x])\n\t"
"vl %%v17,144(%%r1,%2) \n\t" "vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v18,160(%%r1,%2) \n\t" "vl %%v19,176(%%r1,%[x])\n\t"
"vl %%v19,176(%%r1,%2) \n\t" "vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v20,192(%%r1,%2) \n\t" "vl %%v21,208(%%r1,%[x])\n\t"
"vl %%v21,208(%%r1,%2) \n\t" "vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v22,224(%%r1,%2) \n\t" "vl %%v23,240(%%r1,%[x])\n\t"
"vl %%v23,240(%%r1,%2) \n\t"
"vflpdb %%v16, %%v16\n\t" "vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t" "vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t" "vflpdb %%v18, %%v18\n\t"
@ -99,7 +87,6 @@ static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x)
"vflpdb %%v21, %%v21\n\t" "vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t" "vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t" "vflpdb %%v23, %%v23\n\t"
"vfchdb %%v24,%%v16,%%v17\n\t" "vfchdb %%v24,%%v16,%%v17\n\t"
"vfchdb %%v25,%%v18,%%v19\n\t" "vfchdb %%v25,%%v18,%%v19\n\t"
"vfchdb %%v26,%%v20,%%v21\n\t" "vfchdb %%v26,%%v20,%%v21\n\t"
@ -108,29 +95,24 @@ static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x)
"vsel %%v25,%%v18,%%v19,%%v25\n\t" "vsel %%v25,%%v18,%%v19,%%v25\n\t"
"vsel %%v26,%%v20,%%v21,%%v26\n\t" "vsel %%v26,%%v20,%%v21,%%v26\n\t"
"vsel %%v27,%%v22,%%v23,%%v27\n\t" "vsel %%v27,%%v22,%%v23,%%v27\n\t"
"vfchdb %%v28,%%v24,%%v25\n\t" "vfchdb %%v28,%%v24,%%v25\n\t"
"vfchdb %%v29,%%v26,%%v27\n\t" "vfchdb %%v29,%%v26,%%v27\n\t"
"vsel %%v28,%%v24,%%v25,%%v28\n\t" "vsel %%v28,%%v24,%%v25,%%v28\n\t"
"vsel %%v29,%%v26,%%v27,%%v29\n\t" "vsel %%v29,%%v26,%%v27,%%v29\n\t"
"vfchdb %%v30,%%v28,%%v29\n\t" "vfchdb %%v30,%%v28,%%v29\n\t"
"vsel %%v30,%%v28,%%v29,%%v30\n\t" "vsel %%v30,%%v28,%%v29,%%v30\n\t"
"vfchdb %%v31,%%v30,%%v0\n\t" "vfchdb %%v31,%%v30,%%v0\n\t"
"vsel %%v0,%%v30,%%v0,%%v31\n\t" "vsel %%v0,%%v30,%%v0,%%v31\n\t"
"agfi %%r1, 256\n\t" "agfi %%r1, 256\n\t"
"brctg %%r0, 0b \n\t" "brctg %[n], 0b\n\t"
"vrepg %%v16,%%v0,1\n\t" "vrepg %%v16,%%v0,1\n\t"
"wfchdb %%v17,%%v0,%%v16\n\t" "wfchdb %%v17,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v17\n\t" "vsel %%v0,%%v0,%%v16,%%v17\n\t"
"ldr %0,%%f0 " "ldr %[amax],%%f0"
:"=f"(amax) : [amax] "=f"(amax),[n] "+&r"(n)
:"r"(n),"ZR"((const FLOAT (*)[n])x) : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
); "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
return amax; return amax;
} }
@ -140,7 +122,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG j = 0; BLASLONG j = 0;
FLOAT maxf = 0.0; FLOAT maxf = 0.0;
if (n <= 0 || inc_x <= 0) return (maxf); if (n <= 0 || inc_x <= 0)
return (maxf);
if (inc_x == 1) { if (inc_x == 1) {
@ -150,9 +133,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
maxf = damax_kernel_32(n1, x); maxf = damax_kernel_32(n1, x);
i = n1; i = n1;
} } else {
else
{
maxf = ABS(x[0]); maxf = ABS(x[0]);
i++; i++;
} }
@ -191,7 +172,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
} }
while (j < n) { while (j < n) {
if (ABS(x[i]) > maxf) { if (ABS(x[i]) > maxf) {
maxf = ABS(x[i]); maxf = ABS(x[i]);

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -28,40 +28,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>
#if defined(DOUBLE)
#define ABS fabs #define ABS fabs
#else
#define ABS fabsf
#endif
static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) {
{
FLOAT amin; FLOAT amin;
__asm__ volatile ( __asm__("vl %%v0,0(%[x])\n\t"
"vl %%v0,0(%2) \n\t" "srlg %[n],%[n],5\n\t"
"srlg %%r0,%1,5 \n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"0:\n\t" "0:\n\t"
"pfd 1, 1024(%%r1,%2) \n\t" "pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%2) \n\t" "vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%2) \n\t" "vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%2) \n\t" "vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%2) \n\t" "vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%2) \n\t" "vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%2) \n\t" "vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%2) \n\t" "vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%2) \n\t" "vl %%v24,128(%%r1,%[x])\n\t"
"vl %%v24,128(%%r1,%2) \n\t" "vl %%v25,144(%%r1,%[x])\n\t"
"vl %%v25,144(%%r1,%2) \n\t" "vl %%v26,160(%%r1,%[x])\n\t"
"vl %%v26,160(%%r1,%2) \n\t" "vl %%v27,176(%%r1,%[x])\n\t"
"vl %%v27,176(%%r1,%2) \n\t" "vl %%v28,192(%%r1,%[x])\n\t"
"vl %%v28,192(%%r1,%2) \n\t" "vl %%v29,208(%%r1,%[x])\n\t"
"vl %%v29,208(%%r1,%2) \n\t" "vl %%v30,224(%%r1,%[x])\n\t"
"vl %%v30,224(%%r1,%2) \n\t" "vl %%v31,240(%%r1,%[x])\n\t"
"vl %%v31,240(%%r1,%2) \n\t"
"vfmindb %%v16,%%v16,%%v24,8\n\t" "vfmindb %%v16,%%v16,%%v24,8\n\t"
"vfmindb %%v17,%%v17,%%v25,8\n\t" "vfmindb %%v17,%%v17,%%v25,8\n\t"
"vfmindb %%v18,%%v18,%%v26,8\n\t" "vfmindb %%v18,%%v18,%%v26,8\n\t"
@ -70,29 +62,23 @@ static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x)
"vfmindb %%v21,%%v21,%%v29,8\n\t" "vfmindb %%v21,%%v21,%%v29,8\n\t"
"vfmindb %%v22,%%v22,%%v30,8\n\t" "vfmindb %%v22,%%v22,%%v30,8\n\t"
"vfmindb %%v23,%%v23,%%v31,8\n\t" "vfmindb %%v23,%%v23,%%v31,8\n\t"
"vfmindb %%v16,%%v16,%%v20,8\n\t" "vfmindb %%v16,%%v16,%%v20,8\n\t"
"vfmindb %%v17,%%v17,%%v21,8\n\t" "vfmindb %%v17,%%v17,%%v21,8\n\t"
"vfmindb %%v18,%%v18,%%v22,8\n\t" "vfmindb %%v18,%%v18,%%v22,8\n\t"
"vfmindb %%v19,%%v19,%%v23,8\n\t" "vfmindb %%v19,%%v19,%%v23,8\n\t"
"vfmindb %%v16,%%v16,%%v18,8\n\t" "vfmindb %%v16,%%v16,%%v18,8\n\t"
"vfmindb %%v17,%%v17,%%v19,8\n\t" "vfmindb %%v17,%%v17,%%v19,8\n\t"
"vfmindb %%v16,%%v16,%%v17,8\n\t" "vfmindb %%v16,%%v16,%%v17,8\n\t"
"vfmindb %%v0,%%v0,%%v16,8\n\t" "vfmindb %%v0,%%v0,%%v16,8\n\t"
"agfi %%r1, 256\n\t" "agfi %%r1, 256\n\t"
"brctg %%r0, 0b \n\t" "brctg %[n], 0b\n\t"
"vrepg %%v16,%%v0,1\n\t" "vrepg %%v16,%%v0,1\n\t"
"wfmindb %%v0,%%v0,%%v16,8\n\t" "wfmindb %%v0,%%v0,%%v16,8\n\t"
"lpdr %0,%%f0 " "lpdr %[amin],%%f0"
:"=f"(amin) : [amin] "=f"(amin),[n] "+&r"(n)
:"r"(n),"ZR"((const FLOAT (*)[n])x) : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
); "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
return amin; return amin;
} }
@ -102,7 +88,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG j = 0; BLASLONG j = 0;
FLOAT minf = 0.0; FLOAT minf = 0.0;
if (n <= 0 || inc_x <= 0) return (minf); if (n <= 0 || inc_x <= 0)
return (minf);
if (inc_x == 1) { if (inc_x == 1) {
@ -112,9 +99,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
minf = damin_kernel_32(n1, x); minf = damin_kernel_32(n1, x);
i = n1; i = n1;
} } else {
else
{
minf = ABS(x[0]); minf = ABS(x[0]);
i++; i++;
} }
@ -153,7 +138,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
} }
while (j < n) { while (j < n) {
if (ABS(x[i]) < minf) { if (ABS(x[i]) < minf) {
minf = ABS(x[i]); minf = ABS(x[i]);

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -28,32 +28,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>
#if defined(DOUBLE)
#define ABS fabs #define ABS fabs
#else
#define ABS fabsf
#endif
static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) {
{
FLOAT amin; FLOAT amin;
__asm__ volatile ( __asm__("vl %%v0,0(%[x])\n\t"
"vl %%v0,0(%2) \n\t"
"vflpdb %%v0,%%v0\n\t" "vflpdb %%v0,%%v0\n\t"
"srlg %%r0,%1,5 \n\t" "srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"0:\n\t" "0:\n\t"
"pfd 1, 1024(%%r1,%2) \n\t" "pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%2) \n\t" "vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%2) \n\t" "vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%2) \n\t" "vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%2) \n\t" "vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%2) \n\t" "vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%2) \n\t" "vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%2) \n\t" "vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"vflpdb %%v16, %%v16\n\t" "vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t" "vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t" "vflpdb %%v18, %%v18\n\t"
@ -62,7 +55,6 @@ static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x)
"vflpdb %%v21, %%v21\n\t" "vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t" "vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t" "vflpdb %%v23, %%v23\n\t"
"vfchdb %%v24,%%v17,%%v16\n\t" "vfchdb %%v24,%%v17,%%v16\n\t"
"vfchdb %%v25,%%v19,%%v18\n\t" "vfchdb %%v25,%%v19,%%v18\n\t"
"vfchdb %%v26,%%v21,%%v20\n\t" "vfchdb %%v26,%%v21,%%v20\n\t"
@ -71,26 +63,22 @@ static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x)
"vsel %%v25,%%v18,%%v19,%%v25\n\t" "vsel %%v25,%%v18,%%v19,%%v25\n\t"
"vsel %%v26,%%v20,%%v21,%%v26\n\t" "vsel %%v26,%%v20,%%v21,%%v26\n\t"
"vsel %%v27,%%v22,%%v23,%%v27\n\t" "vsel %%v27,%%v22,%%v23,%%v27\n\t"
"vfchdb %%v28,%%v25,%%v24\n\t" "vfchdb %%v28,%%v25,%%v24\n\t"
"vfchdb %%v29,%%v27,%%v26\n\t" "vfchdb %%v29,%%v27,%%v26\n\t"
"vsel %%v28,%%v24,%%v25,%%v28\n\t" "vsel %%v28,%%v24,%%v25,%%v28\n\t"
"vsel %%v29,%%v26,%%v27,%%v29\n\t" "vsel %%v29,%%v26,%%v27,%%v29\n\t"
"vfchdb %%v30,%%v29,%%v28\n\t" "vfchdb %%v30,%%v29,%%v28\n\t"
"vsel %%v30,%%v28,%%v29,%%v30\n\t" "vsel %%v30,%%v28,%%v29,%%v30\n\t"
"vfchdb %%v31,%%v0,%%v30\n\t" "vfchdb %%v31,%%v0,%%v30\n\t"
"vsel %%v0,%%v30,%%v0,%%v31\n\t" "vsel %%v0,%%v30,%%v0,%%v31\n\t"
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v16,128(%%r1,%2) \n\t" "vl %%v17,144(%%r1,%[x])\n\t"
"vl %%v17,144(%%r1,%2) \n\t" "vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v18,160(%%r1,%2) \n\t" "vl %%v19,176(%%r1,%[x])\n\t"
"vl %%v19,176(%%r1,%2) \n\t" "vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v20,192(%%r1,%2) \n\t" "vl %%v21,208(%%r1,%[x])\n\t"
"vl %%v21,208(%%r1,%2) \n\t" "vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v22,224(%%r1,%2) \n\t" "vl %%v23,240(%%r1,%[x])\n\t"
"vl %%v23,240(%%r1,%2) \n\t"
"vflpdb %%v16, %%v16\n\t" "vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t" "vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t" "vflpdb %%v18, %%v18\n\t"
@ -99,7 +87,6 @@ static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x)
"vflpdb %%v21, %%v21\n\t" "vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t" "vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t" "vflpdb %%v23, %%v23\n\t"
"vfchdb %%v24,%%v17,%%v16\n\t" "vfchdb %%v24,%%v17,%%v16\n\t"
"vfchdb %%v25,%%v19,%%v18\n\t" "vfchdb %%v25,%%v19,%%v18\n\t"
"vfchdb %%v26,%%v21,%%v20\n\t" "vfchdb %%v26,%%v21,%%v20\n\t"
@ -108,29 +95,24 @@ static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x)
"vsel %%v25,%%v18,%%v19,%%v25\n\t" "vsel %%v25,%%v18,%%v19,%%v25\n\t"
"vsel %%v26,%%v20,%%v21,%%v26\n\t" "vsel %%v26,%%v20,%%v21,%%v26\n\t"
"vsel %%v27,%%v22,%%v23,%%v27\n\t" "vsel %%v27,%%v22,%%v23,%%v27\n\t"
"vfchdb %%v28,%%v25,%%v24\n\t" "vfchdb %%v28,%%v25,%%v24\n\t"
"vfchdb %%v29,%%v27,%%v26\n\t" "vfchdb %%v29,%%v27,%%v26\n\t"
"vsel %%v28,%%v24,%%v25,%%v28\n\t" "vsel %%v28,%%v24,%%v25,%%v28\n\t"
"vsel %%v29,%%v26,%%v27,%%v29\n\t" "vsel %%v29,%%v26,%%v27,%%v29\n\t"
"vfchdb %%v30,%%v29,%%v28\n\t" "vfchdb %%v30,%%v29,%%v28\n\t"
"vsel %%v30,%%v28,%%v29,%%v30\n\t" "vsel %%v30,%%v28,%%v29,%%v30\n\t"
"vfchdb %%v31,%%v0,%%v30\n\t" "vfchdb %%v31,%%v0,%%v30\n\t"
"vsel %%v0,%%v30,%%v0,%%v31\n\t" "vsel %%v0,%%v30,%%v0,%%v31\n\t"
"agfi %%r1, 256\n\t" "agfi %%r1, 256\n\t"
"brctg %%r0, 0b \n\t" "brctg %[n], 0b\n\t"
"vrepg %%v16,%%v0,1\n\t" "vrepg %%v16,%%v0,1\n\t"
"wfchdb %%v17,%%v16,%%v0\n\t" "wfchdb %%v17,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v17\n\t" "vsel %%v0,%%v0,%%v16,%%v17\n\t"
"ldr %0,%%f0 " "ldr %[amin],%%f0"
:"=f"(amin) : [amin] "=f"(amin),[n] "+&r"(n)
:"r"(n),"ZR"((const FLOAT (*)[n])x) : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
); "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
return amin; return amin;
} }
@ -140,7 +122,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG j = 0; BLASLONG j = 0;
FLOAT minf = 0.0; FLOAT minf = 0.0;
if (n <= 0 || inc_x <= 0) return (minf); if (n <= 0 || inc_x <= 0)
return (minf);
if (inc_x == 1) { if (inc_x == 1) {
@ -150,9 +133,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
minf = damin_kernel_32(n1, x); minf = damin_kernel_32(n1, x);
i = n1; i = n1;
} } else {
else
{
minf = ABS(x[0]); minf = ABS(x[0]);
i++; i++;
} }
@ -191,7 +172,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
} }
while (j < n) { while (j < n) {
if (ABS(x[i]) < minf) { if (ABS(x[i]) < minf) {
minf = ABS(x[i]); minf = ABS(x[i]);

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -28,34 +28,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>
#if defined(DOUBLE)
#define ABS fabs #define ABS fabs
#else
#define ABS fabsf
#endif
static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) {
{
FLOAT asum; FLOAT asum;
__asm__ ( __asm__("vzero %%v24\n\t"
"vzero %%v0 \n\t" "vzero %%v25\n\t"
"vzero %%v1 \n\t" "vzero %%v26\n\t"
"vzero %%v2 \n\t" "vzero %%v27\n\t"
"vzero %%v3 \n\t" "vzero %%v28\n\t"
"srlg %%r0,%1,5 \n\t" "vzero %%v29\n\t"
"vzero %%v30\n\t"
"vzero %%v31\n\t"
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"0:\n\t" "0:\n\t"
"pfd 1, 1024(%%r1,%2) \n\t" "pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16, 0(%%r1,%2) \n\t" "vl %%v16, 0(%%r1,%[x])\n\t"
"vl %%v17, 16(%%r1,%2) \n\t" "vl %%v17, 16(%%r1,%[x])\n\t"
"vl %%v18, 32(%%r1,%2) \n\t" "vl %%v18, 32(%%r1,%[x])\n\t"
"vl %%v19, 48(%%r1,%2) \n\t" "vl %%v19, 48(%%r1,%[x])\n\t"
"vl %%v20, 64(%%r1,%2) \n\t" "vl %%v20, 64(%%r1,%[x])\n\t"
"vl %%v21, 80(%%r1,%2) \n\t" "vl %%v21, 80(%%r1,%[x])\n\t"
"vl %%v22, 96(%%r1,%2) \n\t" "vl %%v22, 96(%%r1,%[x])\n\t"
"vl %%v23, 112(%%r1,%2) \n\t" "vl %%v23, 112(%%r1,%[x])\n\t"
"vflpdb %%v16, %%v16\n\t" "vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t" "vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t" "vflpdb %%v18, %%v18\n\t"
@ -64,25 +61,22 @@ static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x)
"vflpdb %%v21, %%v21\n\t" "vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t" "vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t" "vflpdb %%v23, %%v23\n\t"
"vfadb %%v24,%%v24,%%v16\n\t"
"vfadb %%v0,%%v0,%%v16 \n\t" "vfadb %%v25,%%v25,%%v17\n\t"
"vfadb %%v1,%%v1,%%v17 \n\t" "vfadb %%v26,%%v26,%%v18\n\t"
"vfadb %%v2,%%v2,%%v18 \n\t" "vfadb %%v27,%%v27,%%v19\n\t"
"vfadb %%v3,%%v3,%%v19 \n\t" "vfadb %%v28,%%v28,%%v20\n\t"
"vfadb %%v0,%%v0,%%v20 \n\t" "vfadb %%v29,%%v29,%%v21\n\t"
"vfadb %%v1,%%v1,%%v21 \n\t" "vfadb %%v30,%%v30,%%v22\n\t"
"vfadb %%v2,%%v2,%%v22 \n\t" "vfadb %%v31,%%v31,%%v23\n\t"
"vfadb %%v3,%%v3,%%v23 \n\t" "vl %%v16, 128(%%r1,%[x])\n\t"
"vl %%v17, 144(%%r1,%[x])\n\t"
"vl %%v16, 128(%%r1,%2) \n\t" "vl %%v18, 160(%%r1,%[x])\n\t"
"vl %%v17, 144(%%r1,%2) \n\t" "vl %%v19, 176(%%r1,%[x])\n\t"
"vl %%v18, 160(%%r1,%2) \n\t" "vl %%v20, 192(%%r1,%[x])\n\t"
"vl %%v19, 176(%%r1,%2) \n\t" "vl %%v21, 208(%%r1,%[x])\n\t"
"vl %%v20, 192(%%r1,%2) \n\t" "vl %%v22, 224(%%r1,%[x])\n\t"
"vl %%v21, 208(%%r1,%2) \n\t" "vl %%v23, 240(%%r1,%[x])\n\t"
"vl %%v22, 224(%%r1,%2) \n\t"
"vl %%v23, 240(%%r1,%2) \n\t"
"vflpdb %%v16, %%v16\n\t" "vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t" "vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t" "vflpdb %%v18, %%v18\n\t"
@ -91,28 +85,30 @@ static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x)
"vflpdb %%v21, %%v21\n\t" "vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t" "vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t" "vflpdb %%v23, %%v23\n\t"
"vfadb %%v24,%%v24,%%v16\n\t"
"vfadb %%v0,%%v0,%%v16 \n\t" "vfadb %%v25,%%v25,%%v17\n\t"
"vfadb %%v1,%%v1,%%v17 \n\t" "vfadb %%v26,%%v26,%%v18\n\t"
"vfadb %%v2,%%v2,%%v18 \n\t" "vfadb %%v27,%%v27,%%v19\n\t"
"vfadb %%v3,%%v3,%%v19 \n\t" "vfadb %%v28,%%v28,%%v20\n\t"
"vfadb %%v0,%%v0,%%v20 \n\t" "vfadb %%v29,%%v29,%%v21\n\t"
"vfadb %%v1,%%v1,%%v21 \n\t" "vfadb %%v30,%%v30,%%v22\n\t"
"vfadb %%v2,%%v2,%%v22 \n\t" "vfadb %%v31,%%v31,%%v23\n\t"
"vfadb %%v3,%%v3,%%v23 \n\t"
"agfi %%r1,256\n\t" "agfi %%r1,256\n\t"
"brctg %%r0,0b \n\t" "brctg %[n],0b\n\t"
"vfadb %%v0,%%v0,%%v1 \n\t" "vfadb %%v24,%%v24,%%v25\n\t"
"vfadb %%v0,%%v0,%%v2 \n\t" "vfadb %%v24,%%v24,%%v26\n\t"
"vfadb %%v0,%%v0,%%v3 \n\t" "vfadb %%v24,%%v24,%%v27\n\t"
"vrepg %%v1,%%v0,1 \n\t" "vfadb %%v24,%%v24,%%v28\n\t"
"adbr %%f0,%%f1 \n\t" "vfadb %%v24,%%v24,%%v29\n\t"
"ldr %0,%%f0 " "vfadb %%v24,%%v24,%%v30\n\t"
:"=f"(asum) "vfadb %%v24,%%v24,%%v31\n\t"
:"r"(n),"ZR"((const FLOAT (*)[n])x) "vrepg %%v25,%%v24,1\n\t"
:"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23" "vfadb %%v24,%%v24,%%v25\n\t"
); "vsteg %%v24,%[asum],0"
: [asum] "=Q"(asum),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
: "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
"v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
return asum; return asum;
} }
@ -123,7 +119,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
FLOAT sumf = 0.0; FLOAT sumf = 0.0;
BLASLONG n1; BLASLONG n1;
if (n <= 0 || inc_x <= 0) return sumf; if (n <= 0 || inc_x <= 0)
return sumf;
if (inc_x == 1) { if (inc_x == 1) {
@ -164,9 +161,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
j++; j++;
} }
} }
return sumf; return sumf;
} }

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -27,107 +27,96 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) {
{ __asm__("vlrepg %%v0,%[alpha]\n\t"
__asm__ volatile( "srlg %[n],%[n],5\n\t"
"vlrepg %%v0,%3 \n\t"
"srlg %%r0,%0,5 \n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"0:\n\t" "0:\n\t"
"pfd 1, 1024(%%r1,%1) \n\t" "pfd 1, 1024(%%r1,%[x])\n\t"
"pfd 2, 1024(%%r1,%2) \n\t" "pfd 2, 1024(%%r1,%[y])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%1) \n\t" "vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%1) \n\t" "vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%1) \n\t" "vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%1) \n\t" "vl %%v20,0(%%r1,%[y])\n\t"
"vl %%v20,0(%%r1,%2) \n\t" "vl %%v21,16(%%r1,%[y])\n\t"
"vl %%v21,16(%%r1,%2) \n\t" "vl %%v22,32(%%r1,%[y])\n\t"
"vl %%v22,32(%%r1,%2) \n\t" "vl %%v23,48(%%r1,%[y])\n\t"
"vl %%v23,48(%%r1,%2) \n\t" "vl %%v24,64(%%r1,%[x])\n\t"
"vl %%v25,80(%%r1,%[x])\n\t"
"vl %%v26,96(%%r1,%[x])\n\t"
"vl %%v27,112(%%r1,%[x])\n\t"
"vl %%v28,64(%%r1,%[y])\n\t"
"vl %%v29,80(%%r1,%[y])\n\t"
"vl %%v30,96(%%r1,%[y])\n\t"
"vl %%v31,112(%%r1,%[y])\n\t"
"vfmadb %%v16,%%v0,%%v16,%%v20\n\t" "vfmadb %%v16,%%v0,%%v16,%%v20\n\t"
"vfmadb %%v17,%%v0,%%v17,%%v21\n\t" "vfmadb %%v17,%%v0,%%v17,%%v21\n\t"
"vfmadb %%v18,%%v0,%%v18,%%v22\n\t" "vfmadb %%v18,%%v0,%%v18,%%v22\n\t"
"vfmadb %%v19,%%v0,%%v19,%%v23\n\t" "vfmadb %%v19,%%v0,%%v19,%%v23\n\t"
"vfmadb %%v24,%%v0,%%v24,%%v28\n\t"
"vl %%v24,64(%%r1,%1) \n\t" "vfmadb %%v25,%%v0,%%v25,%%v29\n\t"
"vl %%v25,80(%%r1,%1) \n\t" "vfmadb %%v26,%%v0,%%v26,%%v30\n\t"
"vl %%v26,96(%%r1,%1) \n\t" "vfmadb %%v27,%%v0,%%v27,%%v31\n\t"
"vl %%v27,112(%%r1,%1) \n\t" "vst %%v16,0(%%r1,%[y])\n\t"
"vl %%v28,64(%%r1,%2) \n\t" "vst %%v17,16(%%r1,%[y])\n\t"
"vl %%v29,80(%%r1,%2) \n\t" "vst %%v18,32(%%r1,%[y])\n\t"
"vl %%v30,96(%%r1,%2) \n\t" "vst %%v19,48(%%r1,%[y])\n\t"
"vl %%v31,112(%%r1,%2) \n\t" "vst %%v24,64(%%r1,%[y])\n\t"
"vst %%v25,80(%%r1,%[y])\n\t"
"vfmadb %%v20,%%v0,%%v24,%%v28 \n\t" "vst %%v26,96(%%r1,%[y])\n\t"
"vfmadb %%v21,%%v0,%%v25,%%v29 \n\t" "vst %%v27,112(%%r1,%[y])\n\t"
"vfmadb %%v22,%%v0,%%v26,%%v30 \n\t" "vl %%v16,128(%%r1,%[x])\n\t"
"vfmadb %%v23,%%v0,%%v27,%%v31 \n\t" "vl %%v17,144(%%r1,%[x])\n\t"
"vl %%v18,160(%%r1,%[x])\n\t"
"vst %%v16,0(%%r1,%2) \n\t" "vl %%v19,176(%%r1,%[x])\n\t"
"vst %%v17,16(%%r1,%2) \n\t" "vl %%v20,128(%%r1,%[y])\n\t"
"vst %%v18,32(%%r1,%2) \n\t" "vl %%v21,144(%%r1,%[y])\n\t"
"vst %%v19,48(%%r1,%2) \n\t" "vl %%v22,160(%%r1,%[y])\n\t"
"vst %%v20,64(%%r1,%2) \n\t" "vl %%v23,176(%%r1,%[y])\n\t"
"vst %%v21,80(%%r1,%2) \n\t" "vl %%v24,192(%%r1,%[x])\n\t"
"vst %%v22,96(%%r1,%2) \n\t" "vl %%v25,208(%%r1,%[x])\n\t"
"vst %%v23,112(%%r1,%2) \n\t" "vl %%v26,224(%%r1,%[x])\n\t"
"vl %%v27,240(%%r1,%[x])\n\t"
"vl %%v16,128(%%r1,%1) \n\t" "vl %%v28,192(%%r1,%[y])\n\t"
"vl %%v17,144(%%r1,%1) \n\t" "vl %%v29,208(%%r1,%[y])\n\t"
"vl %%v18,160(%%r1,%1) \n\t" "vl %%v30,224(%%r1,%[y])\n\t"
"vl %%v19,176(%%r1,%1) \n\t" "vl %%v31,240(%%r1,%[y])\n\t"
"vl %%v20,128(%%r1,%2) \n\t"
"vl %%v21,144(%%r1,%2) \n\t"
"vl %%v22,160(%%r1,%2) \n\t"
"vl %%v23,176(%%r1,%2) \n\t"
"vfmadb %%v16,%%v0,%%v16,%%v20\n\t" "vfmadb %%v16,%%v0,%%v16,%%v20\n\t"
"vfmadb %%v17,%%v0,%%v17,%%v21\n\t" "vfmadb %%v17,%%v0,%%v17,%%v21\n\t"
"vfmadb %%v18,%%v0,%%v18,%%v22\n\t" "vfmadb %%v18,%%v0,%%v18,%%v22\n\t"
"vfmadb %%v19,%%v0,%%v19,%%v23\n\t" "vfmadb %%v19,%%v0,%%v19,%%v23\n\t"
"vfmadb %%v24,%%v0,%%v24,%%v28\n\t"
"vl %%v24,192(%%r1,%1) \n\t" "vfmadb %%v25,%%v0,%%v25,%%v29\n\t"
"vl %%v25,208(%%r1,%1) \n\t" "vfmadb %%v26,%%v0,%%v26,%%v30\n\t"
"vl %%v26,224(%%r1,%1) \n\t" "vfmadb %%v27,%%v0,%%v27,%%v31\n\t"
"vl %%v27,240(%%r1,%1) \n\t" "vst %%v16,128(%%r1,%[y])\n\t"
"vl %%v28,192(%%r1,%2) \n\t" "vst %%v17,144(%%r1,%[y])\n\t"
"vl %%v29,208(%%r1,%2) \n\t" "vst %%v18,160(%%r1,%[y])\n\t"
"vl %%v30,224(%%r1,%2) \n\t" "vst %%v19,176(%%r1,%[y])\n\t"
"vl %%v31,240(%%r1,%2) \n\t" "vst %%v24,192(%%r1,%[y])\n\t"
"vst %%v25,208(%%r1,%[y])\n\t"
"vfmadb %%v20,%%v0,%%v24,%%v28 \n\t" "vst %%v26,224(%%r1,%[y])\n\t"
"vfmadb %%v21,%%v0,%%v25,%%v29 \n\t" "vst %%v27,240(%%r1,%[y])\n\t"
"vfmadb %%v22,%%v0,%%v26,%%v30 \n\t"
"vfmadb %%v23,%%v0,%%v27,%%v31 \n\t"
"vst %%v16,128(%%r1,%2) \n\t"
"vst %%v17,144(%%r1,%2) \n\t"
"vst %%v18,160(%%r1,%2) \n\t"
"vst %%v19,176(%%r1,%2) \n\t"
"vst %%v20,192(%%r1,%2) \n\t"
"vst %%v21,208(%%r1,%2) \n\t"
"vst %%v22,224(%%r1,%2) \n\t"
"vst %%v23,240(%%r1,%2) \n\t"
"agfi %%r1,256\n\t" "agfi %%r1,256\n\t"
"brctg %%r0,0b " "brctg %[n],0b"
: : "+m"(*(struct { FLOAT x[n]; } *) y),[n] "+&r"(n)
:"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y),"m"(*alpha) : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" [alpha] "Q"(*alpha)
); : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
} }
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x,
{ BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
BLASLONG dummy2) {
BLASLONG i = 0; BLASLONG i = 0;
BLASLONG ix = 0, iy = 0; BLASLONG ix = 0, iy = 0;
if ( n <= 0 ) return 0 ; if (n <= 0)
return 0;
if ( (inc_x == 1) && (inc_y == 1) ) if ((inc_x == 1) && (inc_y == 1)) {
{
BLASLONG n1 = n & -32; BLASLONG n1 = n & -32;
@ -135,8 +124,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
daxpy_kernel_32(n1, x, y, &da); daxpy_kernel_32(n1, x, y, &da);
i = n1; i = n1;
while(i < n) while (i < n) {
{
y[i] += da * x[i]; y[i] += da * x[i];
i++; i++;
@ -144,13 +132,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
} }
return 0; return 0;
} }
BLASLONG n1 = n & -4; BLASLONG n1 = n & -4;
while(i < n1) while (i < n1) {
{
FLOAT m1 = da * x[ix]; FLOAT m1 = da * x[ix];
FLOAT m2 = da * x[ix + inc_x]; FLOAT m2 = da * x[ix + inc_x];
@ -168,8 +154,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
} }
while(i < n) while (i < n) {
{
y[iy] += da * x[ix]; y[iy] += da * x[ix];
ix += inc_x; ix += inc_x;
@ -180,5 +165,3 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
return 0; return 0;
} }

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -27,30 +27,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
static void dcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) static void dcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) {
{ __asm__("srlg %[n],%[n],5\n\t"
__asm__ volatile (
"lgr %%r1,%1 \n\t"
"lgr %%r2,%2 \n\t"
"srlg %%r0,%0,5 \n\t"
"0:\n\t" "0:\n\t"
"pfd 1, 1024(%%r1) \n\t" "pfd 1, 1024(%[x])\n\t"
"pfd 2, 1024(%%r2) \n\t" "pfd 2, 1024(%[y])\n\t"
"mvc 0(256,%%r2),0(%%r1) \n\t" "mvc 0(256,%[y]),0(%[x])\n\t"
"agfi %%r1,256 \n\t" "la %[x],256(%[x])\n\t"
"agfi %%r2,256 \n\t" "la %[y],256(%[y])\n\t"
"brctg %%r0,0b " "brctg %[n],0b"
: : "=m"(*(struct { FLOAT x[n]; } *) y),[x] "+&a"(x),[y] "+&a"(y),[n] "+&r"(n)
:"r"(n),"a"((const FLOAT (*)[n])x),"a"((FLOAT (*)[n])y) : "m"(*(const struct { FLOAT x[n]; } *) x)
:"memory","cc","r0","r1","r2" : "cc");
);
} }
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
BLASLONG i = 0; BLASLONG i = 0;
BLASLONG ix = 0, iy = 0; BLASLONG ix = 0, iy = 0;
if (n <= 0) return 0; if (n <= 0)
return 0;
if ((inc_x == 1) && (inc_y == 1)) { if ((inc_x == 1) && (inc_y == 1)) {
@ -66,7 +62,6 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
} }
} else { } else {
while (i < n) { while (i < n) {
@ -81,5 +76,4 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
} }
return 0; return 0;
} }

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -27,68 +27,78 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
static FLOAT ddot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) static FLOAT ddot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) {
{
FLOAT dot; FLOAT dot;
__asm__ volatile ( __asm__("vzero %%v0\n\t"
"vzero %%v0 \n\t" "vzero %%v1\n\t"
"srlg %%r0,%1,4 \n\t" "vzero %%v2\n\t"
"vzero %%v3\n\t"
"vzero %%v4\n\t"
"vzero %%v5\n\t"
"vzero %%v6\n\t"
"vzero %%v7\n\t"
"srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"0:\n\t" "0:\n\t"
"pfd 1,1024(%%r1,%2) \n\t" "pfd 1,1024(%%r1,%[x])\n\t"
"pfd 1,1024(%%r1,%3) \n\t" "pfd 1,1024(%%r1,%[y])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%2) \n\t" "vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%2) \n\t" "vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%2) \n\t" "vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%2) \n\t" "vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%2) \n\t" "vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%2) \n\t" "vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%2) \n\t" "vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%2) \n\t" "vl %%v24,0(%%r1,%[y])\n\t"
"vl %%v25,16(%%r1,%[y])\n\t"
"vl %%v24,0(%%r1,%3) \n\t" "vl %%v26,32(%%r1,%[y])\n\t"
"vl %%v27,48(%%r1,%[y])\n\t"
"vl %%v28,64(%%r1,%[y])\n\t"
"vl %%v29,80(%%r1,%[y])\n\t"
"vl %%v30,96(%%r1,%[y])\n\t"
"vl %%v31,112(%%r1,%[y])\n\t"
"vfmadb %%v0,%%v16,%%v24,%%v0\n\t" "vfmadb %%v0,%%v16,%%v24,%%v0\n\t"
"vl %%v25,16(%%r1,%3) \n\t" "vfmadb %%v1,%%v17,%%v25,%%v1\n\t"
"vfmadb %%v0,%%v17,%%v25,%%v0 \n\t" "vfmadb %%v2,%%v18,%%v26,%%v2\n\t"
"vl %%v26,32(%%r1,%3) \n\t" "vfmadb %%v3,%%v19,%%v27,%%v3\n\t"
"vfmadb %%v0,%%v18,%%v26,%%v0 \n\t" "vfmadb %%v4,%%v20,%%v28,%%v4\n\t"
"vl %%v27,48(%%r1,%3) \n\t" "vfmadb %%v5,%%v21,%%v29,%%v5\n\t"
"vfmadb %%v0,%%v19,%%v27,%%v0 \n\t" "vfmadb %%v6,%%v22,%%v30,%%v6\n\t"
"vl %%v28,64(%%r1,%3) \n\t" "vfmadb %%v7,%%v23,%%v31,%%v7\n\t"
"vfmadb %%v0,%%v20,%%v28,%%v0 \n\t"
"vl %%v29,80(%%r1,%3) \n\t"
"vfmadb %%v0,%%v21,%%v29,%%v0 \n\t"
"vl %%v30,96(%%r1,%3) \n\t"
"vfmadb %%v0,%%v22,%%v30,%%v0 \n\t"
"vl %%v31,112(%%r1,%3) \n\t"
"vfmadb %%v0,%%v23,%%v31,%%v0 \n\t"
"agfi %%r1,128\n\t" "agfi %%r1,128\n\t"
"brctg %%r0,0b \n\t" "brctg %[n],0b\n\t"
"vfadb %%v0,%%v0,%%v1\n\t"
"vfadb %%v0,%%v0,%%v2\n\t"
"vfadb %%v0,%%v0,%%v3\n\t"
"vfadb %%v0,%%v0,%%v4\n\t"
"vfadb %%v0,%%v0,%%v5\n\t"
"vfadb %%v0,%%v0,%%v6\n\t"
"vfadb %%v0,%%v0,%%v7\n\t"
"vrepg %%v1,%%v0,1\n\t" "vrepg %%v1,%%v0,1\n\t"
"adbr %%f0,%%f1\n\t" "adbr %%f0,%%f1\n\t"
"ldr %0,%%f0 " "ldr %[dot],%%f0"
:"=f"(dot) : [dot] "=f"(dot),[n] "+&r"(n)
:"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((const FLOAT (*)[n])y) : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" "m"(*(const struct { FLOAT x[n]; } *) y),[y] "a"(y)
); : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");
return dot; return dot;
} }
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
{
BLASLONG i = 0; BLASLONG i = 0;
BLASLONG ix = 0, iy = 0; BLASLONG ix = 0, iy = 0;
FLOAT dot = 0.0; FLOAT dot = 0.0;
if ( n <= 0 ) return(dot); if (n <= 0)
return (dot);
if ( (inc_x == 1) && (inc_y == 1) ) if ((inc_x == 1) && (inc_y == 1)) {
{
BLASLONG n1 = n & -16; BLASLONG n1 = n & -16;
@ -96,8 +106,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
dot = ddot_kernel_16(n1, x, y); dot = ddot_kernel_16(n1, x, y);
i = n1; i = n1;
while(i < n) while (i < n) {
{
dot += y[i] * x[i]; dot += y[i] * x[i];
i++; i++;
@ -105,7 +114,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
} }
return (dot); return (dot);
} }
FLOAT temp1 = 0.0; FLOAT temp1 = 0.0;
@ -113,8 +121,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
BLASLONG n1 = n & -4; BLASLONG n1 = n & -4;
while(i < n1) while (i < n1) {
{
FLOAT m1 = y[iy] * x[ix]; FLOAT m1 = y[iy] * x[ix];
FLOAT m2 = y[iy + inc_y] * x[ix + inc_x]; FLOAT m2 = y[iy + inc_y] * x[ix + inc_x];
@ -132,8 +139,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
} }
while(i < n) while (i < n) {
{
temp1 += y[iy] * x[ix]; temp1 += y[iy] * x[ix];
ix += inc_x; ix += inc_x;
@ -145,5 +151,3 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
return (dot); return (dot);
} }

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2017, The OpenBLAS Project Copyright (c) 2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -29,387 +29,349 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define NBMAX 2048 #define NBMAX 2048
static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y,
{ FLOAT *alpha) {
__asm__ volatile ( register FLOAT *ap0 = ap[0];
"vlrepg %%v0,0(%5) \n\t" register FLOAT *ap1 = ap[1];
"vlrepg %%v1,8(%5) \n\t" register FLOAT *ap2 = ap[2];
"vlrepg %%v2,16(%5) \n\t" register FLOAT *ap3 = ap[3];
"vlrepg %%v3,24(%5) \n\t"
"vlrepg %%v4,%7 \n\t" __asm__("vlrepg %%v0,0(%[x])\n\t"
"vlrepg %%v1,8(%[x])\n\t"
"vlrepg %%v2,16(%[x])\n\t"
"vlrepg %%v3,24(%[x])\n\t"
"vlrepg %%v4,%[alpha]\n\t"
"vfmdb %%v0,%%v0,%%v4\n\t" "vfmdb %%v0,%%v0,%%v4\n\t"
"vfmdb %%v1,%%v1,%%v4\n\t" "vfmdb %%v1,%%v1,%%v4\n\t"
"vfmdb %%v2,%%v2,%%v4\n\t" "vfmdb %%v2,%%v2,%%v4\n\t"
"vfmdb %%v3,%%v3,%%v4\n\t" "vfmdb %%v3,%%v3,%%v4\n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"lghi %%r0,-16\n\t" "lghi %%r0,-16\n\t"
"ngr %%r0,%0 \n\t" "ngr %%r0,%[n]\n\t"
"ltgr %%r0,%%r0\n\t" "ltgr %%r0,%%r0\n\t"
"jz 1f\n\t" "jz 1f\n\t"
"srlg %%r0,%%r0,4\n\t" "srlg %%r0,%%r0,4\n\t"
"0:\n\t" "0:\n\t"
"pfd 1,1024(%%r1,%1) \n\t" "pfd 1,1024(%%r1,%[ap0])\n\t"
"pfd 1,1024(%%r1,%2) \n\t" "pfd 1,1024(%%r1,%[ap1])\n\t"
"pfd 1,1024(%%r1,%3) \n\t" "pfd 1,1024(%%r1,%[ap2])\n\t"
"pfd 1,1024(%%r1,%4) \n\t" "pfd 1,1024(%%r1,%[ap3])\n\t"
"pfd 2,1024(%%r1,%6) \n\t" "pfd 2,1024(%%r1,%[y])\n\t"
"vl %%v16,0(%%r1,%[ap0])\n\t"
"vl %%v16,0(%%r1,%1) \n\t" "vl %%v17,0(%%r1,%[ap1])\n\t"
"vl %%v17,0(%%r1,%2) \n\t" "vl %%v18,0(%%r1,%[ap2])\n\t"
"vl %%v18,0(%%r1,%3) \n\t" "vl %%v19,0(%%r1,%[ap3])\n\t"
"vl %%v19,0(%%r1,%4) \n\t" "vl %%v20,16(%%r1,%[ap0])\n\t"
"vl %%v20,16(%%r1,%1) \n\t" "vl %%v21,16(%%r1,%[ap1])\n\t"
"vl %%v21,16(%%r1,%2) \n\t" "vl %%v22,16(%%r1,%[ap2])\n\t"
"vl %%v22,16(%%r1,%3) \n\t" "vl %%v23,16(%%r1,%[ap3])\n\t"
"vl %%v23,16(%%r1,%4) \n\t" "vl %%v24,32(%%r1,%[ap0])\n\t"
"vl %%v24,32(%%r1,%1) \n\t" "vl %%v25,32(%%r1,%[ap1])\n\t"
"vl %%v25,32(%%r1,%2) \n\t" "vl %%v26,32(%%r1,%[ap2])\n\t"
"vl %%v26,32(%%r1,%3) \n\t" "vl %%v27,32(%%r1,%[ap3])\n\t"
"vl %%v27,32(%%r1,%4) \n\t" "vl %%v28,48(%%r1,%[ap0])\n\t"
"vl %%v28,48(%%r1,%1) \n\t" "vl %%v29,48(%%r1,%[ap1])\n\t"
"vl %%v29,48(%%r1,%2) \n\t" "vl %%v30,48(%%r1,%[ap2])\n\t"
"vl %%v30,48(%%r1,%3) \n\t" "vl %%v31,48(%%r1,%[ap3])\n\t"
"vl %%v31,48(%%r1,%4) \n\t" "vl %%v4,0(%%r1,%[y])\n\t"
"vl %%v5,16(%%r1,%[y])\n\t"
"vl %%v4,0(%%r1,%6) \n\t" "vl %%v6,32(%%r1,%[y])\n\t"
"vl %%v7,48(%%r1,%[y])\n\t"
"vfmadb %%v4,%%v16,%%v0,%%v4\n\t" "vfmadb %%v4,%%v16,%%v0,%%v4\n\t"
"vfmadb %%v5,%%v20,%%v0,%%v5\n\t"
"vfmadb %%v6,%%v24,%%v0,%%v6\n\t"
"vfmadb %%v7,%%v28,%%v0,%%v7\n\t"
"vfmadb %%v4,%%v17,%%v1,%%v4\n\t" "vfmadb %%v4,%%v17,%%v1,%%v4\n\t"
"vfmadb %%v5,%%v21,%%v1,%%v5\n\t"
"vfmadb %%v6,%%v25,%%v1,%%v6\n\t"
"vfmadb %%v7,%%v29,%%v1,%%v7\n\t"
"vfmadb %%v4,%%v18,%%v2,%%v4\n\t" "vfmadb %%v4,%%v18,%%v2,%%v4\n\t"
"vfmadb %%v5,%%v22,%%v2,%%v5\n\t"
"vfmadb %%v6,%%v26,%%v2,%%v6\n\t"
"vfmadb %%v7,%%v30,%%v2,%%v7\n\t"
"vfmadb %%v4,%%v19,%%v3,%%v4\n\t" "vfmadb %%v4,%%v19,%%v3,%%v4\n\t"
"vst %%v4,0(%%r1,%6) \n\t" "vfmadb %%v5,%%v23,%%v3,%%v5\n\t"
"vfmadb %%v6,%%v27,%%v3,%%v6\n\t"
"vl %%v4,16(%%r1,%6) \n\t" "vfmadb %%v7,%%v31,%%v3,%%v7\n\t"
"vfmadb %%v4,%%v20,%%v0,%%v4 \n\t" "vst %%v4,0(%%r1,%[y])\n\t"
"vfmadb %%v4,%%v21,%%v1,%%v4 \n\t" "vst %%v5,16(%%r1,%[y])\n\t"
"vfmadb %%v4,%%v22,%%v2,%%v4 \n\t" "vst %%v6,32(%%r1,%[y])\n\t"
"vfmadb %%v4,%%v23,%%v3,%%v4 \n\t" "vst %%v7,48(%%r1,%[y])\n\t"
"vst %%v4,16(%%r1,%6) \n\t" "vl %%v16,64(%%r1,%[ap0])\n\t"
"vl %%v17,64(%%r1,%[ap1])\n\t"
"vl %%v4,32(%%r1,%6) \n\t" "vl %%v18,64(%%r1,%[ap2])\n\t"
"vfmadb %%v4,%%v24,%%v0,%%v4 \n\t" "vl %%v19,64(%%r1,%[ap3])\n\t"
"vfmadb %%v4,%%v25,%%v1,%%v4 \n\t" "vl %%v20,80(%%r1,%[ap0])\n\t"
"vfmadb %%v4,%%v26,%%v2,%%v4 \n\t" "vl %%v21,80(%%r1,%[ap1])\n\t"
"vfmadb %%v4,%%v27,%%v3,%%v4 \n\t" "vl %%v22,80(%%r1,%[ap2])\n\t"
"vst %%v4,32(%%r1,%6) \n\t" "vl %%v23,80(%%r1,%[ap3])\n\t"
"vl %%v24,96(%%r1,%[ap0])\n\t"
"vl %%v4,48(%%r1,%6) \n\t" "vl %%v25,96(%%r1,%[ap1])\n\t"
"vfmadb %%v4,%%v28,%%v0,%%v4 \n\t" "vl %%v26,96(%%r1,%[ap2])\n\t"
"vfmadb %%v4,%%v29,%%v1,%%v4 \n\t" "vl %%v27,96(%%r1,%[ap3])\n\t"
"vfmadb %%v4,%%v30,%%v2,%%v4 \n\t" "vl %%v28,112(%%r1,%[ap0])\n\t"
"vfmadb %%v4,%%v31,%%v3,%%v4 \n\t" "vl %%v29,112(%%r1,%[ap1])\n\t"
"vst %%v4,48(%%r1,%6) \n\t" "vl %%v30,112(%%r1,%[ap2])\n\t"
"vl %%v31,112(%%r1,%[ap3])\n\t"
"vl %%v16,64(%%r1,%1) \n\t" "vl %%v4,64(%%r1,%[y])\n\t"
"vl %%v17,64(%%r1,%2) \n\t" "vl %%v5,80(%%r1,%[y])\n\t"
"vl %%v18,64(%%r1,%3) \n\t" "vl %%v6,96(%%r1,%[y])\n\t"
"vl %%v19,64(%%r1,%4) \n\t" "vl %%v7,112(%%r1,%[y])\n\t"
"vl %%v20,80(%%r1,%1) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,80(%%r1,%3) \n\t"
"vl %%v23,80(%%r1,%4) \n\t"
"vl %%v24,96(%%r1,%1) \n\t"
"vl %%v25,96(%%r1,%2) \n\t"
"vl %%v26,96(%%r1,%3) \n\t"
"vl %%v27,96(%%r1,%4) \n\t"
"vl %%v28,112(%%r1,%1) \n\t"
"vl %%v29,112(%%r1,%2) \n\t"
"vl %%v30,112(%%r1,%3) \n\t"
"vl %%v31,112(%%r1,%4) \n\t"
"vl %%v4,64(%%r1,%6) \n\t"
"vfmadb %%v4,%%v16,%%v0,%%v4\n\t" "vfmadb %%v4,%%v16,%%v0,%%v4\n\t"
"vfmadb %%v5,%%v20,%%v0,%%v5\n\t"
"vfmadb %%v6,%%v24,%%v0,%%v6\n\t"
"vfmadb %%v7,%%v28,%%v0,%%v7\n\t"
"vfmadb %%v4,%%v17,%%v1,%%v4\n\t" "vfmadb %%v4,%%v17,%%v1,%%v4\n\t"
"vfmadb %%v5,%%v21,%%v1,%%v5\n\t"
"vfmadb %%v6,%%v25,%%v1,%%v6\n\t"
"vfmadb %%v7,%%v29,%%v1,%%v7\n\t"
"vfmadb %%v4,%%v18,%%v2,%%v4\n\t" "vfmadb %%v4,%%v18,%%v2,%%v4\n\t"
"vfmadb %%v5,%%v22,%%v2,%%v5\n\t"
"vfmadb %%v6,%%v26,%%v2,%%v6\n\t"
"vfmadb %%v7,%%v30,%%v2,%%v7\n\t"
"vfmadb %%v4,%%v19,%%v3,%%v4\n\t" "vfmadb %%v4,%%v19,%%v3,%%v4\n\t"
"vst %%v4,64(%%r1,%6) \n\t" "vfmadb %%v5,%%v23,%%v3,%%v5\n\t"
"vfmadb %%v6,%%v27,%%v3,%%v6\n\t"
"vl %%v4,80(%%r1,%6) \n\t" "vfmadb %%v7,%%v31,%%v3,%%v7\n\t"
"vfmadb %%v4,%%v20,%%v0,%%v4 \n\t" "vst %%v4,64(%%r1,%[y])\n\t"
"vfmadb %%v4,%%v21,%%v1,%%v4 \n\t" "vst %%v5,80(%%r1,%[y])\n\t"
"vfmadb %%v4,%%v22,%%v2,%%v4 \n\t" "vst %%v6,96(%%r1,%[y])\n\t"
"vfmadb %%v4,%%v23,%%v3,%%v4 \n\t" "vst %%v7,112(%%r1,%[y])\n\t"
"vst %%v4,80(%%r1,%6) \n\t"
"vl %%v4,96(%%r1,%6) \n\t"
"vfmadb %%v4,%%v24,%%v0,%%v4 \n\t"
"vfmadb %%v4,%%v25,%%v1,%%v4 \n\t"
"vfmadb %%v4,%%v26,%%v2,%%v4 \n\t"
"vfmadb %%v4,%%v27,%%v3,%%v4 \n\t"
"vst %%v4,96(%%r1,%6) \n\t"
"vl %%v4,112(%%r1,%6) \n\t"
"vfmadb %%v4,%%v28,%%v0,%%v4 \n\t"
"vfmadb %%v4,%%v29,%%v1,%%v4 \n\t"
"vfmadb %%v4,%%v30,%%v2,%%v4 \n\t"
"vfmadb %%v4,%%v31,%%v3,%%v4 \n\t"
"vst %%v4,112(%%r1,%6) \n\t"
"agfi %%r1,128\n\t" "agfi %%r1,128\n\t"
"brctg %%r0,0b\n\t" "brctg %%r0,0b\n\t"
"1:\n\t" "1:\n\t"
"lghi %%r0,12\n\t" "lghi %%r0,12\n\t"
"ngr %%r0,%0 \n\t" "ngr %%r0,%[n]\n\t"
"ltgr %%r0,%%r0\n\t" "ltgr %%r0,%%r0\n\t"
"jz 3f\n\t" "jz 3f\n\t"
"srlg %%r0,%%r0,2\n\t" "srlg %%r0,%%r0,2\n\t"
"2:\n\t" "2:\n\t"
"vl %%v16,0(%%r1,%1) \n\t" "vl %%v16,0(%%r1,%[ap0])\n\t"
"vl %%v17,0(%%r1,%2) \n\t" "vl %%v17,0(%%r1,%[ap1])\n\t"
"vl %%v18,0(%%r1,%3) \n\t" "vl %%v18,0(%%r1,%[ap2])\n\t"
"vl %%v19,0(%%r1,%4) \n\t" "vl %%v19,0(%%r1,%[ap3])\n\t"
"vl %%v20,16(%%r1,%1) \n\t" "vl %%v20,16(%%r1,%[ap0])\n\t"
"vl %%v21,16(%%r1,%2) \n\t" "vl %%v21,16(%%r1,%[ap1])\n\t"
"vl %%v22,16(%%r1,%3) \n\t" "vl %%v22,16(%%r1,%[ap2])\n\t"
"vl %%v23,16(%%r1,%4) \n\t" "vl %%v23,16(%%r1,%[ap3])\n\t"
"vl %%v4,0(%%r1,%[y])\n\t"
"vl %%v4,0(%%r1,%6) \n\t" "vl %%v5,16(%%r1,%[y])\n\t"
"vfmadb %%v4,%%v16,%%v0,%%v4\n\t" "vfmadb %%v4,%%v16,%%v0,%%v4\n\t"
"vfmadb %%v5,%%v20,%%v0,%%v5\n\t"
"vfmadb %%v4,%%v17,%%v1,%%v4\n\t" "vfmadb %%v4,%%v17,%%v1,%%v4\n\t"
"vfmadb %%v5,%%v21,%%v1,%%v5\n\t"
"vfmadb %%v4,%%v18,%%v2,%%v4\n\t" "vfmadb %%v4,%%v18,%%v2,%%v4\n\t"
"vfmadb %%v5,%%v22,%%v2,%%v5\n\t"
"vfmadb %%v4,%%v19,%%v3,%%v4\n\t" "vfmadb %%v4,%%v19,%%v3,%%v4\n\t"
"vst %%v4,0(%%r1,%6) \n\t" "vfmadb %%v5,%%v23,%%v3,%%v5\n\t"
"vst %%v4,0(%%r1,%[y])\n\t"
"vl %%v4,16(%%r1,%6) \n\t" "vst %%v5,16(%%r1,%[y])\n\t"
"vfmadb %%v4,%%v20,%%v0,%%v4 \n\t"
"vfmadb %%v4,%%v21,%%v1,%%v4 \n\t"
"vfmadb %%v4,%%v22,%%v2,%%v4 \n\t"
"vfmadb %%v4,%%v23,%%v3,%%v4 \n\t"
"vst %%v4,16(%%r1,%6) \n\t"
"agfi %%r1,32\n\t" "agfi %%r1,32\n\t"
"brctg %%r0,2b\n\t" "brctg %%r0,2b\n\t"
"3:\n\t" "3:\n\t"
"nop" "nop"
: : "+m"(*(struct { FLOAT x[n]; } *) y)
:"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])ap[2]),"ZR"((const FLOAT (*)[n])ap[3]),"ZQ"((const FLOAT (*)[4])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha) : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0),
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" "m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1),
); "m"(*(const struct { FLOAT x[n]; } *) ap2),[ap2] "a"(ap2),
"m"(*(const struct { FLOAT x[n]; } *) ap3),[ap3] "a"(ap3),
"m"(*(const struct { FLOAT x[4]; } *) x),[x] "a"(x),[alpha] "Q"(*alpha),
[n] "r"(n)
: "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
"v26", "v27", "v28", "v29", "v30", "v31");
} }
static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y,
{ FLOAT *alpha) {
__asm__ volatile ( register FLOAT *ap0 = ap[0];
"vlrepg %%v0,0(%3) \n\t" register FLOAT *ap1 = ap[1];
"vlrepg %%v1,8(%3) \n\t"
"vlrepg %%v2,%5 \n\t" __asm__("vlrepg %%v0,0(%[x])\n\t"
"vlrepg %%v1,8(%[x])\n\t"
"vlrepg %%v2,%[alpha]\n\t"
"vfmdb %%v0,%%v0,%%v2\n\t" "vfmdb %%v0,%%v0,%%v2\n\t"
"vfmdb %%v1,%%v1,%%v2\n\t" "vfmdb %%v1,%%v1,%%v2\n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"lghi %%r0,-16\n\t" "lghi %%r0,-16\n\t"
"ngr %%r0,%0 \n\t" "ngr %%r0,%[n]\n\t"
"ltgr %%r0,%%r0\n\t" "ltgr %%r0,%%r0\n\t"
"jz 1f\n\t" "jz 1f\n\t"
"srlg %%r0,%%r0,4\n\t" "srlg %%r0,%%r0,4\n\t"
"0:\n\t" "0:\n\t"
"pfd 1,1024(%%r1,%1) \n\t" "pfd 1,1024(%%r1,%[ap0])\n\t"
"pfd 1,1024(%%r1,%2) \n\t" "pfd 1,1024(%%r1,%[ap1])\n\t"
"pfd 2,1024(%%r1,%4) \n\t" "pfd 2,1024(%%r1,%[y])\n\t"
"vl %%v16,0(%%r1,%[ap0])\n\t"
"vl %%v16,0(%%r1,%1) \n\t" "vl %%v17,0(%%r1,%[ap1])\n\t"
"vl %%v17,0(%%r1,%2) \n\t" "vl %%v18,16(%%r1,%[ap0])\n\t"
"vl %%v18,16(%%r1,%1) \n\t" "vl %%v19,16(%%r1,%[ap1])\n\t"
"vl %%v19,16(%%r1,%2) \n\t" "vl %%v20,32(%%r1,%[ap0])\n\t"
"vl %%v20,32(%%r1,%1) \n\t" "vl %%v21,32(%%r1,%[ap1])\n\t"
"vl %%v21,32(%%r1,%2) \n\t" "vl %%v22,48(%%r1,%[ap0])\n\t"
"vl %%v22,48(%%r1,%1) \n\t" "vl %%v23,48(%%r1,%[ap1])\n\t"
"vl %%v23,48(%%r1,%2) \n\t" "vl %%v24,64(%%r1,%[ap0])\n\t"
"vl %%v24,64(%%r1,%1) \n\t" "vl %%v25,64(%%r1,%[ap1])\n\t"
"vl %%v25,64(%%r1,%2) \n\t" "vl %%v26,80(%%r1,%[ap0])\n\t"
"vl %%v26,80(%%r1,%1) \n\t" "vl %%v27,80(%%r1,%[ap1])\n\t"
"vl %%v27,80(%%r1,%2) \n\t" "vl %%v28,96(%%r1,%[ap0])\n\t"
"vl %%v28,96(%%r1,%1) \n\t" "vl %%v29,96(%%r1,%[ap1])\n\t"
"vl %%v29,96(%%r1,%2) \n\t" "vl %%v30,112(%%r1,%[ap0])\n\t"
"vl %%v30,112(%%r1,%1) \n\t" "vl %%v31,112(%%r1,%[ap1])\n\t"
"vl %%v31,112(%%r1,%2) \n\t" "vl %%v2,0(%%r1,%[y])\n\t"
"vl %%v3,16(%%r1,%[y])\n\t"
"vl %%v2,0(%%r1,%4) \n\t" "vl %%v4,32(%%r1,%[y])\n\t"
"vl %%v5,48(%%r1,%[y])\n\t"
"vl %%v6,64(%%r1,%[y])\n\t"
"vl %%v7,80(%%r1,%[y])\n\t"
"vl %%v8,96(%%r1,%[y])\n\t"
"vl %%v9,112(%%r1,%[y])\n\t"
"vfmadb %%v2,%%v16,%%v0,%%v2\n\t" "vfmadb %%v2,%%v16,%%v0,%%v2\n\t"
"vfmadb %%v3,%%v18,%%v0,%%v3\n\t"
"vfmadb %%v4,%%v20,%%v0,%%v4\n\t"
"vfmadb %%v5,%%v22,%%v0,%%v5\n\t"
"vfmadb %%v6,%%v24,%%v0,%%v6\n\t"
"vfmadb %%v7,%%v26,%%v0,%%v7\n\t"
"vfmadb %%v8,%%v28,%%v0,%%v8\n\t"
"vfmadb %%v9,%%v30,%%v0,%%v9\n\t"
"vfmadb %%v2,%%v17,%%v1,%%v2\n\t" "vfmadb %%v2,%%v17,%%v1,%%v2\n\t"
"vst %%v2,0(%%r1,%4) \n\t" "vfmadb %%v3,%%v19,%%v1,%%v3\n\t"
"vfmadb %%v4,%%v21,%%v1,%%v4\n\t"
"vl %%v2,16(%%r1,%4) \n\t" "vfmadb %%v5,%%v23,%%v1,%%v5\n\t"
"vfmadb %%v2,%%v18,%%v0,%%v2 \n\t" "vfmadb %%v6,%%v25,%%v1,%%v6\n\t"
"vfmadb %%v2,%%v19,%%v1,%%v2 \n\t" "vfmadb %%v7,%%v27,%%v1,%%v7\n\t"
"vst %%v2,16(%%r1,%4) \n\t" "vfmadb %%v8,%%v29,%%v1,%%v8\n\t"
"vfmadb %%v9,%%v31,%%v1,%%v9\n\t"
"vl %%v2,32(%%r1,%4) \n\t" "vst %%v2,0(%%r1,%[y])\n\t"
"vfmadb %%v2,%%v20,%%v0,%%v2 \n\t" "vst %%v3,16(%%r1,%[y])\n\t"
"vfmadb %%v2,%%v21,%%v1,%%v2 \n\t" "vst %%v4,32(%%r1,%[y])\n\t"
"vst %%v2,32(%%r1,%4) \n\t" "vst %%v5,48(%%r1,%[y])\n\t"
"vst %%v6,64(%%r1,%[y])\n\t"
"vl %%v2,48(%%r1,%4) \n\t" "vst %%v7,80(%%r1,%[y])\n\t"
"vfmadb %%v2,%%v22,%%v0,%%v2 \n\t" "vst %%v8,96(%%r1,%[y])\n\t"
"vfmadb %%v2,%%v23,%%v1,%%v2 \n\t" "vst %%v9,112(%%r1,%[y])\n\t"
"vst %%v2,48(%%r1,%4) \n\t"
"vl %%v2,64(%%r1,%4) \n\t"
"vfmadb %%v2,%%v24,%%v0,%%v2 \n\t"
"vfmadb %%v2,%%v25,%%v1,%%v2 \n\t"
"vst %%v2,64(%%r1,%4) \n\t"
"vl %%v2,80(%%r1,%4) \n\t"
"vfmadb %%v2,%%v26,%%v0,%%v2 \n\t"
"vfmadb %%v2,%%v27,%%v1,%%v2 \n\t"
"vst %%v2,80(%%r1,%4) \n\t"
"vl %%v2,96(%%r1,%4) \n\t"
"vfmadb %%v2,%%v28,%%v0,%%v2 \n\t"
"vfmadb %%v2,%%v29,%%v1,%%v2 \n\t"
"vst %%v2,96(%%r1,%4) \n\t"
"vl %%v2,112(%%r1,%4) \n\t"
"vfmadb %%v2,%%v30,%%v0,%%v2 \n\t"
"vfmadb %%v2,%%v31,%%v1,%%v2 \n\t"
"vst %%v2,112(%%r1,%4) \n\t"
"agfi %%r1,128\n\t" "agfi %%r1,128\n\t"
"brctg %%r0,0b\n\t" "brctg %%r0,0b\n\t"
"1:\n\t" "1:\n\t"
"lghi %%r0,12\n\t" "lghi %%r0,12\n\t"
"ngr %%r0,%0 \n\t" "ngr %%r0,%[n]\n\t"
"ltgr %%r0,%%r0\n\t" "ltgr %%r0,%%r0\n\t"
"jz 3f\n\t" "jz 3f\n\t"
"srlg %%r0,%%r0,2\n\t" "srlg %%r0,%%r0,2\n\t"
"2:\n\t" "2:\n\t"
"vl %%v16,0(%%r1,%1) \n\t" "vl %%v16,0(%%r1,%[ap0])\n\t"
"vl %%v17,0(%%r1,%2) \n\t" "vl %%v17,0(%%r1,%[ap1])\n\t"
"vl %%v18,16(%%r1,%1) \n\t" "vl %%v18,16(%%r1,%[ap0])\n\t"
"vl %%v19,16(%%r1,%2) \n\t" "vl %%v19,16(%%r1,%[ap1])\n\t"
"vl %%v2,0(%%r1,%[y])\n\t"
"vl %%v2,0(%%r1,%4) \n\t" "vl %%v3,16(%%r1,%[y])\n\t"
"vfmadb %%v2,%%v16,%%v0,%%v2\n\t" "vfmadb %%v2,%%v16,%%v0,%%v2\n\t"
"vfmadb %%v3,%%v18,%%v0,%%v3\n\t"
"vfmadb %%v2,%%v17,%%v1,%%v2\n\t" "vfmadb %%v2,%%v17,%%v1,%%v2\n\t"
"vst %%v2,0(%%r1,%4) \n\t" "vfmadb %%v3,%%v19,%%v1,%%v3\n\t"
"vst %%v2,0(%%r1,%[y])\n\t"
"vl %%v2,16(%%r1,%4) \n\t" "vst %%v3,16(%%r1,%[y])\n\t"
"vfmadb %%v2,%%v18,%%v0,%%v2 \n\t"
"vfmadb %%v2,%%v19,%%v1,%%v2 \n\t"
"vst %%v2,16(%%r1,%4) \n\t"
"agfi %%r1,32\n\t" "agfi %%r1,32\n\t"
"brctg %%r0,2b\n\t" "brctg %%r0,2b\n\t"
"3:\n\t" "3:\n\t"
"nop" "nop"
: : "+m"(*(struct { FLOAT x[n]; } *) y)
:"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZQ"((const FLOAT (*)[2])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha) : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0),
:"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" "m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1),
); "m"(*(const struct { FLOAT x[2]; } *) x),[x] "a"(x),[alpha] "Q"(*alpha),
[n] "r"(n)
: "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
"v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
} }
static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *xo, FLOAT *y, FLOAT *alpha) static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y,
{ FLOAT *alpha) {
__asm__ volatile ( __asm__("vlrepg %%v0,0(%[x])\n\t"
"vlrepg %%v0,0(%2) \n\t" "vlrepg %%v16,%[alpha]\n\t"
"vlrepg %%v1,%4 \n\t" "vfmdb %%v0,%%v0,%%v16\n\t"
"vfmdb %%v0,%%v0,%%v1 \n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"lghi %%r0,-16\n\t" "lghi %%r0,-16\n\t"
"ngr %%r0,%0 \n\t" "ngr %%r0,%[n]\n\t"
"ltgr %%r0,%%r0\n\t" "ltgr %%r0,%%r0\n\t"
"jz 1f\n\t" "jz 1f\n\t"
"srlg %%r0,%%r0,4\n\t" "srlg %%r0,%%r0,4\n\t"
"0:\n\t" "0:\n\t"
"pfd 1,1024(%%r1,%1) \n\t" "pfd 1,1024(%%r1,%[a0])\n\t"
"pfd 2,1024(%%r1,%3) \n\t" "pfd 2,1024(%%r1,%[y])\n\t"
"vl %%v16,0(%%r1,%[a0])\n\t"
"vl %%v16,0(%%r1,%1) \n\t" "vl %%v17,16(%%r1,%[a0])\n\t"
"vl %%v17,16(%%r1,%1) \n\t" "vl %%v18,32(%%r1,%[a0])\n\t"
"vl %%v18,32(%%r1,%1) \n\t" "vl %%v19,48(%%r1,%[a0])\n\t"
"vl %%v19,48(%%r1,%1) \n\t" "vl %%v20,64(%%r1,%[a0])\n\t"
"vl %%v20,64(%%r1,%1) \n\t" "vl %%v21,80(%%r1,%[a0])\n\t"
"vl %%v21,80(%%r1,%1) \n\t" "vl %%v22,96(%%r1,%[a0])\n\t"
"vl %%v22,96(%%r1,%1) \n\t" "vl %%v23,112(%%r1,%[a0])\n\t"
"vl %%v23,112(%%r1,%1) \n\t" "vl %%v24,0(%%r1,%[y])\n\t"
"vl %%v25,16(%%r1,%[y])\n\t"
"vl %%v1,0(%%r1,%3) \n\t" "vl %%v26,32(%%r1,%[y])\n\t"
"vfmadb %%v1,%%v16,%%v0,%%v1 \n\t" "vl %%v27,48(%%r1,%[y])\n\t"
"vst %%v1,0(%%r1,%3) \n\t" "vl %%v28,64(%%r1,%[y])\n\t"
"vl %%v29,80(%%r1,%[y])\n\t"
"vl %%v1,16(%%r1,%3) \n\t" "vl %%v30,96(%%r1,%[y])\n\t"
"vfmadb %%v1,%%v17,%%v0,%%v1 \n\t" "vl %%v31,112(%%r1,%[y])\n\t"
"vst %%v1,16(%%r1,%3) \n\t" "vfmadb %%v24,%%v16,%%v0,%%v24\n\t"
"vfmadb %%v25,%%v17,%%v0,%%v25\n\t"
"vl %%v1,32(%%r1,%3) \n\t" "vfmadb %%v26,%%v18,%%v0,%%v26\n\t"
"vfmadb %%v1,%%v18,%%v0,%%v1 \n\t" "vfmadb %%v27,%%v19,%%v0,%%v27\n\t"
"vst %%v1,32(%%r1,%3) \n\t" "vfmadb %%v28,%%v20,%%v0,%%v28\n\t"
"vfmadb %%v29,%%v21,%%v0,%%v29\n\t"
"vl %%v1,48(%%r1,%3) \n\t" "vfmadb %%v30,%%v22,%%v0,%%v30\n\t"
"vfmadb %%v1,%%v19,%%v0,%%v1 \n\t" "vfmadb %%v31,%%v23,%%v0,%%v31\n\t"
"vst %%v1,48(%%r1,%3) \n\t" "vst %%v24,0(%%r1,%[y])\n\t"
"vst %%v25,16(%%r1,%[y])\n\t"
"vl %%v1,64(%%r1,%3) \n\t" "vst %%v26,32(%%r1,%[y])\n\t"
"vfmadb %%v1,%%v20,%%v0,%%v1 \n\t" "vst %%v27,48(%%r1,%[y])\n\t"
"vst %%v1,64(%%r1,%3) \n\t" "vst %%v28,64(%%r1,%[y])\n\t"
"vst %%v29,80(%%r1,%[y])\n\t"
"vl %%v1,80(%%r1,%3) \n\t" "vst %%v30,96(%%r1,%[y])\n\t"
"vfmadb %%v1,%%v21,%%v0,%%v1 \n\t" "vst %%v31,112(%%r1,%[y])\n\t"
"vst %%v1,80(%%r1,%3) \n\t"
"vl %%v1,96(%%r1,%3) \n\t"
"vfmadb %%v1,%%v22,%%v0,%%v1 \n\t"
"vst %%v1,96(%%r1,%3) \n\t"
"vl %%v1,112(%%r1,%3) \n\t"
"vfmadb %%v1,%%v23,%%v0,%%v1 \n\t"
"vst %%v1,112(%%r1,%3) \n\t"
"agfi %%r1,128\n\t" "agfi %%r1,128\n\t"
"brctg %%r0,0b\n\t" "brctg %%r0,0b\n\t"
"1:\n\t" "1:\n\t"
"lghi %%r0,12\n\t" "lghi %%r0,12\n\t"
"ngr %%r0,%0 \n\t" "ngr %%r0,%[n]\n\t"
"ltgr %%r0,%%r0\n\t" "ltgr %%r0,%%r0\n\t"
"jz 3f\n\t" "jz 3f\n\t"
"srlg %%r0,%%r0,2\n\t" "srlg %%r0,%%r0,2\n\t"
"2:\n\t" "2:\n\t"
"vl %%v16,0(%%r1,%1) \n\t" "vl %%v16,0(%%r1,%[a0])\n\t"
"vl %%v17,16(%%r1,%1) \n\t" "vl %%v17,16(%%r1,%[a0])\n\t"
"vl %%v18,0(%%r1,%[y])\n\t"
"vl %%v1,0(%%r1,%3) \n\t" "vl %%v19,16(%%r1,%[y])\n\t"
"vfmadb %%v1,%%v16,%%v0,%%v1 \n\t" "vfmadb %%v18,%%v16,%%v0,%%v18\n\t"
"vst %%v1,0(%%r1,%3) \n\t" "vfmadb %%v19,%%v17,%%v0,%%v19\n\t"
"vst %%v18,0(%%r1,%[y])\n\t"
"vl %%v1,16(%%r1,%3) \n\t" "vst %%v19,16(%%r1,%[y])\n\t"
"vfmadb %%v1,%%v17,%%v0,%%v1 \n\t"
"vst %%v1,16(%%r1,%3) \n\t"
"agfi %%r1,32\n\t" "agfi %%r1,32\n\t"
"brctg %%r0,2b\n\t" "brctg %%r0,2b\n\t"
"3:\n\t" "3:\n\t"
"nop" "nop"
: : "+m"(*(struct { FLOAT x[n]; } *) y)
:"r"(n),"ZR"((const FLOAT (*)[n])a0),"ZQ"((const FLOAT (*)[1])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha) : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) a0),[a0] "a"(a0),
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" "m"(*(const FLOAT (*)[1]) x),[x] "a"(x),[alpha] "Q"(*alpha),
); [n] "r"(n)
: "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21",
"v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
} }
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) {
{
BLASLONG i; BLASLONG i;
for (i = 0; i < n; i++) for (i = 0; i < n; i++) {
{
*dest += src[i]; *dest += src[i];
dest += inc_dest; dest += inc_dest;
} }
} }
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a,
{ BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
FLOAT *buffer) {
BLASLONG i; BLASLONG i;
FLOAT *a_ptr; FLOAT *a_ptr;
FLOAT *x_ptr; FLOAT *x_ptr;
@ -423,8 +385,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
BLASLONG lda4 = lda << 2; BLASLONG lda4 = lda << 2;
FLOAT xbuffer[8], *ybuffer; FLOAT xbuffer[8], *ybuffer;
if ( m < 1 ) return(0); if (m < 1)
if ( n < 1 ) return(0); return (0);
if (n < 1)
return (0);
ybuffer = buffer; ybuffer = buffer;
@ -439,13 +403,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
BLASLONG NB = NBMAX; BLASLONG NB = NBMAX;
while ( NB == NBMAX ) while (NB == NBMAX) {
{
m1 -= NB; m1 -= NB;
if ( m1 < 0) if (m1 < 0) {
{ if (m2 == 0)
if ( m2 == 0 ) break; break;
NB = m2; NB = m2;
} }
@ -462,12 +425,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
else else
ybuffer = y_ptr; ybuffer = y_ptr;
if ( inc_x == 1 ) if (inc_x == 1) {
{
for (i = 0; i < n1; i++) {
for( i = 0; i < n1 ; i++)
{
dgemv_kernel_4x4(NB, ap, x_ptr, ybuffer, &alpha); dgemv_kernel_4x4(NB, ap, x_ptr, ybuffer, &alpha);
ap[0] += lda4; ap[0] += lda4;
ap[1] += lda4; ap[1] += lda4;
@ -477,29 +437,22 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
x_ptr += 4; x_ptr += 4;
} }
if ( n2 & 2 ) if (n2 & 2) {
{
dgemv_kernel_4x2(NB, ap, x_ptr, ybuffer, &alpha); dgemv_kernel_4x2(NB, ap, x_ptr, ybuffer, &alpha);
a_ptr += lda * 2; a_ptr += lda * 2;
x_ptr += 2; x_ptr += 2;
} }
if (n2 & 1) {
if ( n2 & 1 )
{
dgemv_kernel_4x1(NB, a_ptr, x_ptr, ybuffer, &alpha); dgemv_kernel_4x1(NB, a_ptr, x_ptr, ybuffer, &alpha);
/* a_ptr += lda; /* a_ptr += lda;
x_ptr += 1; */ x_ptr += 1; */
} }
} else {
} for (i = 0; i < n1; i++) {
else
{
for( i = 0; i < n1 ; i++)
{
xbuffer[0] = x_ptr[0]; xbuffer[0] = x_ptr[0];
x_ptr += inc_x; x_ptr += inc_x;
xbuffer[1] = x_ptr[0]; xbuffer[1] = x_ptr[0];
@ -516,8 +469,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
a_ptr += lda4; a_ptr += lda4;
} }
for( i = 0; i < n2 ; i++) for (i = 0; i < n2; i++) {
{
xbuffer[0] = x_ptr[0]; xbuffer[0] = x_ptr[0];
x_ptr += inc_x; x_ptr += inc_x;
dgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, &alpha); dgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, &alpha);
@ -528,30 +480,26 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
} }
a += NB; a += NB;
if ( inc_y != 1 ) if (inc_y != 1) {
{
add_y(NB, ybuffer, y_ptr, inc_y); add_y(NB, ybuffer, y_ptr, inc_y);
y_ptr += NB * inc_y; y_ptr += NB * inc_y;
} } else
else
y_ptr += NB; y_ptr += NB;
} }
if ( m3 == 0 ) return(0); if (m3 == 0)
return (0);
if ( m3 == 3 ) if (m3 == 3) {
{
a_ptr = a; a_ptr = a;
x_ptr = x; x_ptr = x;
FLOAT temp0 = 0.0; FLOAT temp0 = 0.0;
FLOAT temp1 = 0.0; FLOAT temp1 = 0.0;
FLOAT temp2 = 0.0; FLOAT temp2 = 0.0;
if ( lda == 3 && inc_x ==1 ) if (lda == 3 && inc_x == 1) {
{
for( i = 0; i < ( n & -4 ); i+=4 ) for (i = 0; i < (n & -4); i += 4) {
{
temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1];
temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1];
@ -565,8 +513,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
x_ptr += 4; x_ptr += 4;
} }
for( ; i < n; i++ ) for (; i < n; i++) {
{
temp0 += a_ptr[0] * x_ptr[0]; temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0]; temp1 += a_ptr[1] * x_ptr[0];
temp2 += a_ptr[2] * x_ptr[0]; temp2 += a_ptr[2] * x_ptr[0];
@ -574,19 +521,15 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
x_ptr++; x_ptr++;
} }
} } else {
else
{
for( i = 0; i < n; i++ ) for (i = 0; i < n; i++) {
{
temp0 += a_ptr[0] * x_ptr[0]; temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0]; temp1 += a_ptr[1] * x_ptr[0];
temp2 += a_ptr[2] * x_ptr[0]; temp2 += a_ptr[2] * x_ptr[0];
a_ptr += lda; a_ptr += lda;
x_ptr += inc_x; x_ptr += inc_x;
} }
} }
@ -598,18 +541,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
return (0); return (0);
} }
if (m3 == 2) {
if ( m3 == 2 )
{
a_ptr = a; a_ptr = a;
x_ptr = x; x_ptr = x;
FLOAT temp0 = 0.0; FLOAT temp0 = 0.0;
FLOAT temp1 = 0.0; FLOAT temp1 = 0.0;
if ( lda == 2 && inc_x ==1 ) if (lda == 2 && inc_x == 1) {
{
for( i = 0; i < (n & -4) ; i+=4 ) for (i = 0; i < (n & -4); i += 4) {
{
temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1];
temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1];
temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3];
@ -619,27 +558,21 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
} }
for (; i < n; i++) {
for( ; i < n; i++ )
{
temp0 += a_ptr[0] * x_ptr[0]; temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0]; temp1 += a_ptr[1] * x_ptr[0];
a_ptr += 2; a_ptr += 2;
x_ptr++; x_ptr++;
} }
} } else {
else
{
for( i = 0; i < n; i++ ) for (i = 0; i < n; i++) {
{
temp0 += a_ptr[0] * x_ptr[0]; temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0]; temp1 += a_ptr[1] * x_ptr[0];
a_ptr += lda; a_ptr += lda;
x_ptr += inc_x; x_ptr += inc_x;
} }
} }
@ -649,31 +582,27 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
return (0); return (0);
} }
if ( m3 == 1 ) if (m3 == 1) {
{
a_ptr = a; a_ptr = a;
x_ptr = x; x_ptr = x;
FLOAT temp = 0.0; FLOAT temp = 0.0;
if ( lda == 1 && inc_x ==1 ) if (lda == 1 && inc_x == 1) {
{
for( i = 0; i < (n & -4); i+=4 ) for (i = 0; i < (n & -4); i += 4) {
{ temp +=
temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; a_ptr[i] * x_ptr[i] + a_ptr[i + 1] * x_ptr[i + 1] + a_ptr[i +
2] *
x_ptr[i + 2] + a_ptr[i + 3] * x_ptr[i + 3];
} }
for( ; i < n; i++ ) for (; i < n; i++) {
{
temp += a_ptr[i] * x_ptr[i]; temp += a_ptr[i] * x_ptr[i];
} }
} } else {
else
{
for( i = 0; i < n; i++ ) for (i = 0; i < n; i++) {
{
temp += a_ptr[0] * x_ptr[0]; temp += a_ptr[0] * x_ptr[0];
a_ptr += lda; a_ptr += lda;
x_ptr += inc_x; x_ptr += inc_x;
@ -684,8 +613,5 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
return (0); return (0);
} }
return (0); return (0);
} }

File diff suppressed because it is too large Load Diff

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -27,34 +27,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) {
{
FLOAT max; FLOAT max;
__asm__ volatile ( __asm__("vl %%v0,0(%[x])\n\t"
"vl %%v0,0(%2) \n\t" "srlg %[n],%[n],5\n\t"
"srlg %%r0,%1,5 \n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"0:\n\t" "0:\n\t"
"pfd 1, 1024(%%r1,%2) \n\t" "pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%2) \n\t" "vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%2) \n\t" "vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%2) \n\t" "vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%2) \n\t" "vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%2) \n\t" "vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%2) \n\t" "vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%2) \n\t" "vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%2) \n\t" "vl %%v24,128(%%r1,%[x])\n\t"
"vl %%v24,128(%%r1,%2) \n\t" "vl %%v25,144(%%r1,%[x])\n\t"
"vl %%v25,144(%%r1,%2) \n\t" "vl %%v26,160(%%r1,%[x])\n\t"
"vl %%v26,160(%%r1,%2) \n\t" "vl %%v27,176(%%r1,%[x])\n\t"
"vl %%v27,176(%%r1,%2) \n\t" "vl %%v28,192(%%r1,%[x])\n\t"
"vl %%v28,192(%%r1,%2) \n\t" "vl %%v29,208(%%r1,%[x])\n\t"
"vl %%v29,208(%%r1,%2) \n\t" "vl %%v30,224(%%r1,%[x])\n\t"
"vl %%v30,224(%%r1,%2) \n\t" "vl %%v31,240(%%r1,%[x])\n\t"
"vl %%v31,240(%%r1,%2) \n\t"
"vfmaxdb %%v16,%%v16,%%v24,0\n\t" "vfmaxdb %%v16,%%v16,%%v24,0\n\t"
"vfmaxdb %%v17,%%v17,%%v25,0\n\t" "vfmaxdb %%v17,%%v17,%%v25,0\n\t"
"vfmaxdb %%v18,%%v18,%%v26,0\n\t" "vfmaxdb %%v18,%%v18,%%v26,0\n\t"
@ -63,29 +59,23 @@ static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x)
"vfmaxdb %%v21,%%v21,%%v29,0\n\t" "vfmaxdb %%v21,%%v21,%%v29,0\n\t"
"vfmaxdb %%v22,%%v22,%%v30,0\n\t" "vfmaxdb %%v22,%%v22,%%v30,0\n\t"
"vfmaxdb %%v23,%%v23,%%v31,0\n\t" "vfmaxdb %%v23,%%v23,%%v31,0\n\t"
"vfmaxdb %%v16,%%v16,%%v20,0\n\t" "vfmaxdb %%v16,%%v16,%%v20,0\n\t"
"vfmaxdb %%v17,%%v17,%%v21,0\n\t" "vfmaxdb %%v17,%%v17,%%v21,0\n\t"
"vfmaxdb %%v18,%%v18,%%v22,0\n\t" "vfmaxdb %%v18,%%v18,%%v22,0\n\t"
"vfmaxdb %%v19,%%v19,%%v23,0\n\t" "vfmaxdb %%v19,%%v19,%%v23,0\n\t"
"vfmaxdb %%v16,%%v16,%%v18,0\n\t" "vfmaxdb %%v16,%%v16,%%v18,0\n\t"
"vfmaxdb %%v17,%%v17,%%v19,0\n\t" "vfmaxdb %%v17,%%v17,%%v19,0\n\t"
"vfmaxdb %%v16,%%v16,%%v17,0\n\t" "vfmaxdb %%v16,%%v16,%%v17,0\n\t"
"vfmaxdb %%v0,%%v0,%%v16,0\n\t" "vfmaxdb %%v0,%%v0,%%v16,0\n\t"
"agfi %%r1, 256\n\t" "agfi %%r1, 256\n\t"
"brctg %%r0, 0b \n\t" "brctg %[n], 0b\n\t"
"vrepg %%v16,%%v0,1\n\t" "vrepg %%v16,%%v0,1\n\t"
"wfmaxdb %%v0,%%v0,%%v16,0\n\t" "wfmaxdb %%v0,%%v0,%%v16,0\n\t"
"ldr %0,%%f0 " "ldr %[max],%%f0"
:"=f"(max) : [max] "=f"(max),[n] "+&r"(n)
:"r"(n),"ZR"((const FLOAT (*)[n])x) : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
); "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
return max; return max;
} }
@ -95,7 +85,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG j = 0; BLASLONG j = 0;
FLOAT maxf = 0.0; FLOAT maxf = 0.0;
if (n <= 0 || inc_x <= 0) return (maxf); if (n <= 0 || inc_x <= 0)
return (maxf);
if (inc_x == 1) { if (inc_x == 1) {
@ -105,9 +96,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
maxf = dmax_kernel_32(n1, x); maxf = dmax_kernel_32(n1, x);
i = n1; i = n1;
} } else {
else
{
maxf = x[0]; maxf = x[0];
i++; i++;
} }
@ -146,7 +135,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
} }
while (j < n) { while (j < n) {
if (x[i] > maxf) { if (x[i] > maxf) {
maxf = x[i]; maxf = x[i];

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -27,26 +27,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) {
{
FLOAT max; FLOAT max;
__asm__ volatile ( __asm__("vl %%v0,0(%[x])\n\t"
"vl %%v0,0(%2) \n\t" "srlg %[n],%[n],5\n\t"
"srlg %%r0,%1,5 \n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"0:\n\t" "0:\n\t"
"pfd 1, 1024(%%r1,%2) \n\t" "pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%2) \n\t" "vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%2) \n\t" "vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%2) \n\t" "vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%2) \n\t" "vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%2) \n\t" "vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%2) \n\t" "vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%2) \n\t" "vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"vfchdb %%v24,%%v16,%%v17\n\t" "vfchdb %%v24,%%v16,%%v17\n\t"
"vfchdb %%v25,%%v18,%%v19\n\t" "vfchdb %%v25,%%v18,%%v19\n\t"
"vfchdb %%v26,%%v20,%%v21\n\t" "vfchdb %%v26,%%v20,%%v21\n\t"
@ -55,27 +51,22 @@ static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x)
"vsel %%v25,%%v18,%%v19,%%v25\n\t" "vsel %%v25,%%v18,%%v19,%%v25\n\t"
"vsel %%v26,%%v20,%%v21,%%v26\n\t" "vsel %%v26,%%v20,%%v21,%%v26\n\t"
"vsel %%v27,%%v22,%%v23,%%v27\n\t" "vsel %%v27,%%v22,%%v23,%%v27\n\t"
"vfchdb %%v28,%%v24,%%v25\n\t" "vfchdb %%v28,%%v24,%%v25\n\t"
"vfchdb %%v29,%%v26,%%v27\n\t" "vfchdb %%v29,%%v26,%%v27\n\t"
"vsel %%v28,%%v24,%%v25,%%v28\n\t" "vsel %%v28,%%v24,%%v25,%%v28\n\t"
"vsel %%v29,%%v26,%%v27,%%v29\n\t" "vsel %%v29,%%v26,%%v27,%%v29\n\t"
"vfchdb %%v30,%%v28,%%v29\n\t" "vfchdb %%v30,%%v28,%%v29\n\t"
"vsel %%v30,%%v28,%%v29,%%v30\n\t" "vsel %%v30,%%v28,%%v29,%%v30\n\t"
"vfchdb %%v31,%%v30,%%v0\n\t" "vfchdb %%v31,%%v30,%%v0\n\t"
"vsel %%v0,%%v30,%%v0,%%v31\n\t" "vsel %%v0,%%v30,%%v0,%%v31\n\t"
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v16,128(%%r1,%2) \n\t" "vl %%v17,144(%%r1,%[x])\n\t"
"vl %%v17,144(%%r1,%2) \n\t" "vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v18,160(%%r1,%2) \n\t" "vl %%v19,176(%%r1,%[x])\n\t"
"vl %%v19,176(%%r1,%2) \n\t" "vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v20,192(%%r1,%2) \n\t" "vl %%v21,208(%%r1,%[x])\n\t"
"vl %%v21,208(%%r1,%2) \n\t" "vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v22,224(%%r1,%2) \n\t" "vl %%v23,240(%%r1,%[x])\n\t"
"vl %%v23,240(%%r1,%2) \n\t"
"vfchdb %%v24,%%v16,%%v17\n\t" "vfchdb %%v24,%%v16,%%v17\n\t"
"vfchdb %%v25,%%v18,%%v19\n\t" "vfchdb %%v25,%%v18,%%v19\n\t"
"vfchdb %%v26,%%v20,%%v21\n\t" "vfchdb %%v26,%%v20,%%v21\n\t"
@ -84,29 +75,24 @@ static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x)
"vsel %%v25,%%v18,%%v19,%%v25\n\t" "vsel %%v25,%%v18,%%v19,%%v25\n\t"
"vsel %%v26,%%v20,%%v21,%%v26\n\t" "vsel %%v26,%%v20,%%v21,%%v26\n\t"
"vsel %%v27,%%v22,%%v23,%%v27\n\t" "vsel %%v27,%%v22,%%v23,%%v27\n\t"
"vfchdb %%v28,%%v24,%%v25\n\t" "vfchdb %%v28,%%v24,%%v25\n\t"
"vfchdb %%v29,%%v26,%%v27\n\t" "vfchdb %%v29,%%v26,%%v27\n\t"
"vsel %%v28,%%v24,%%v25,%%v28\n\t" "vsel %%v28,%%v24,%%v25,%%v28\n\t"
"vsel %%v29,%%v26,%%v27,%%v29\n\t" "vsel %%v29,%%v26,%%v27,%%v29\n\t"
"vfchdb %%v30,%%v28,%%v29\n\t" "vfchdb %%v30,%%v28,%%v29\n\t"
"vsel %%v30,%%v28,%%v29,%%v30\n\t" "vsel %%v30,%%v28,%%v29,%%v30\n\t"
"vfchdb %%v31,%%v30,%%v0\n\t" "vfchdb %%v31,%%v30,%%v0\n\t"
"vsel %%v0,%%v30,%%v0,%%v31\n\t" "vsel %%v0,%%v30,%%v0,%%v31\n\t"
"agfi %%r1, 256\n\t" "agfi %%r1, 256\n\t"
"brctg %%r0, 0b \n\t" "brctg %[n], 0b\n\t"
"vrepg %%v16,%%v0,1\n\t" "vrepg %%v16,%%v0,1\n\t"
"wfchdb %%v17,%%v0,%%v16\n\t" "wfchdb %%v17,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v17\n\t" "vsel %%v0,%%v0,%%v16,%%v17\n\t"
"ldr %0,%%f0 " "ldr %[max],%%f0"
:"=f"(max) : [max] "=f"(max),[n] "+&r"(n)
:"r"(n),"ZR"((const FLOAT (*)[n])x) : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
); "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
return max; return max;
} }
@ -116,7 +102,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG j = 0; BLASLONG j = 0;
FLOAT maxf = 0.0; FLOAT maxf = 0.0;
if (n <= 0 || inc_x <= 0) return (maxf); if (n <= 0 || inc_x <= 0)
return (maxf);
if (inc_x == 1) { if (inc_x == 1) {
@ -126,9 +113,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
maxf = dmax_kernel_32(n1, x); maxf = dmax_kernel_32(n1, x);
i = n1; i = n1;
} } else {
else
{
maxf = x[0]; maxf = x[0];
i++; i++;
} }
@ -167,7 +152,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
} }
while (j < n) { while (j < n) {
if (x[i] > maxf) { if (x[i] > maxf) {
maxf = x[i]; maxf = x[i];

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -27,34 +27,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) {
{
FLOAT min; FLOAT min;
__asm__ volatile ( __asm__("vl %%v0,0(%[x])\n\t"
"vl %%v0,0(%2) \n\t" "srlg %[n],%[n],5\n\t"
"srlg %%r0,%1,5 \n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"0:\n\t" "0:\n\t"
"pfd 1, 1024(%%r1,%2) \n\t" "pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%2) \n\t" "vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%2) \n\t" "vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%2) \n\t" "vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%2) \n\t" "vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%2) \n\t" "vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%2) \n\t" "vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%2) \n\t" "vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%2) \n\t" "vl %%v24,128(%%r1,%[x])\n\t"
"vl %%v24,128(%%r1,%2) \n\t" "vl %%v25,144(%%r1,%[x])\n\t"
"vl %%v25,144(%%r1,%2) \n\t" "vl %%v26,160(%%r1,%[x])\n\t"
"vl %%v26,160(%%r1,%2) \n\t" "vl %%v27,176(%%r1,%[x])\n\t"
"vl %%v27,176(%%r1,%2) \n\t" "vl %%v28,192(%%r1,%[x])\n\t"
"vl %%v28,192(%%r1,%2) \n\t" "vl %%v29,208(%%r1,%[x])\n\t"
"vl %%v29,208(%%r1,%2) \n\t" "vl %%v30,224(%%r1,%[x])\n\t"
"vl %%v30,224(%%r1,%2) \n\t" "vl %%v31,240(%%r1,%[x])\n\t"
"vl %%v31,240(%%r1,%2) \n\t"
"vfmindb %%v16,%%v16,%%v24,0\n\t" "vfmindb %%v16,%%v16,%%v24,0\n\t"
"vfmindb %%v17,%%v17,%%v25,0\n\t" "vfmindb %%v17,%%v17,%%v25,0\n\t"
"vfmindb %%v18,%%v18,%%v26,0\n\t" "vfmindb %%v18,%%v18,%%v26,0\n\t"
@ -63,29 +59,23 @@ static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x)
"vfmindb %%v21,%%v21,%%v29,0\n\t" "vfmindb %%v21,%%v21,%%v29,0\n\t"
"vfmindb %%v22,%%v22,%%v30,0\n\t" "vfmindb %%v22,%%v22,%%v30,0\n\t"
"vfmindb %%v23,%%v23,%%v31,0\n\t" "vfmindb %%v23,%%v23,%%v31,0\n\t"
"vfmindb %%v16,%%v16,%%v20,0\n\t" "vfmindb %%v16,%%v16,%%v20,0\n\t"
"vfmindb %%v17,%%v17,%%v21,0\n\t" "vfmindb %%v17,%%v17,%%v21,0\n\t"
"vfmindb %%v18,%%v18,%%v22,0\n\t" "vfmindb %%v18,%%v18,%%v22,0\n\t"
"vfmindb %%v19,%%v19,%%v23,0\n\t" "vfmindb %%v19,%%v19,%%v23,0\n\t"
"vfmindb %%v16,%%v16,%%v18,0\n\t" "vfmindb %%v16,%%v16,%%v18,0\n\t"
"vfmindb %%v17,%%v17,%%v19,0\n\t" "vfmindb %%v17,%%v17,%%v19,0\n\t"
"vfmindb %%v16,%%v16,%%v17,0\n\t" "vfmindb %%v16,%%v16,%%v17,0\n\t"
"vfmindb %%v0,%%v0,%%v16,0\n\t" "vfmindb %%v0,%%v0,%%v16,0\n\t"
"agfi %%r1, 256\n\t" "agfi %%r1, 256\n\t"
"brctg %%r0, 0b \n\t" "brctg %[n], 0b\n\t"
"vrepg %%v16,%%v0,1\n\t" "vrepg %%v16,%%v0,1\n\t"
"wfmindb %%v0,%%v0,%%v16,0\n\t" "wfmindb %%v0,%%v0,%%v16,0\n\t"
"ldr %0,%%f0 " "ldr %[min],%%f0"
:"=f"(min) : [min] "=f"(min),[n] "+&r"(n)
:"r"(n),"ZR"((const FLOAT (*)[n])x) : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
); "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
return min; return min;
} }
@ -95,7 +85,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG j = 0; BLASLONG j = 0;
FLOAT minf = 0.0; FLOAT minf = 0.0;
if (n <= 0 || inc_x <= 0) return (minf); if (n <= 0 || inc_x <= 0)
return (minf);
if (inc_x == 1) { if (inc_x == 1) {
@ -105,9 +96,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
minf = dmin_kernel_32(n1, x); minf = dmin_kernel_32(n1, x);
i = n1; i = n1;
} } else {
else
{
minf = x[0]; minf = x[0];
i++; i++;
} }
@ -146,7 +135,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
} }
while (j < n) { while (j < n) {
if (x[i] < minf) { if (x[i] < minf) {
minf = x[i]; minf = x[i];

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -27,26 +27,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) {
{
FLOAT min; FLOAT min;
__asm__ volatile ( __asm__("vl %%v0,0(%[x])\n\t"
"vl %%v0,0(%2) \n\t" "srlg %[n],%[n],5\n\t"
"srlg %%r0,%1,5 \n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"0:\n\t" "0:\n\t"
"pfd 1, 1024(%%r1,%2) \n\t" "pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%2) \n\t" "vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%2) \n\t" "vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%2) \n\t" "vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%2) \n\t" "vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%2) \n\t" "vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%2) \n\t" "vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%2) \n\t" "vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"vfchdb %%v24,%%v17,%%v16\n\t" "vfchdb %%v24,%%v17,%%v16\n\t"
"vfchdb %%v25,%%v19,%%v18\n\t" "vfchdb %%v25,%%v19,%%v18\n\t"
"vfchdb %%v26,%%v21,%%v20\n\t" "vfchdb %%v26,%%v21,%%v20\n\t"
@ -55,27 +51,22 @@ static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x)
"vsel %%v25,%%v18,%%v19,%%v25\n\t" "vsel %%v25,%%v18,%%v19,%%v25\n\t"
"vsel %%v26,%%v20,%%v21,%%v26\n\t" "vsel %%v26,%%v20,%%v21,%%v26\n\t"
"vsel %%v27,%%v22,%%v23,%%v27\n\t" "vsel %%v27,%%v22,%%v23,%%v27\n\t"
"vfchdb %%v28,%%v25,%%v24\n\t" "vfchdb %%v28,%%v25,%%v24\n\t"
"vfchdb %%v29,%%v27,%%v26\n\t" "vfchdb %%v29,%%v27,%%v26\n\t"
"vsel %%v28,%%v24,%%v25,%%v28\n\t" "vsel %%v28,%%v24,%%v25,%%v28\n\t"
"vsel %%v29,%%v26,%%v27,%%v29\n\t" "vsel %%v29,%%v26,%%v27,%%v29\n\t"
"vfchdb %%v30,%%v29,%%v28\n\t" "vfchdb %%v30,%%v29,%%v28\n\t"
"vsel %%v30,%%v28,%%v29,%%v30\n\t" "vsel %%v30,%%v28,%%v29,%%v30\n\t"
"vfchdb %%v31,%%v0,%%v30\n\t" "vfchdb %%v31,%%v0,%%v30\n\t"
"vsel %%v0,%%v30,%%v0,%%v31\n\t" "vsel %%v0,%%v30,%%v0,%%v31\n\t"
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v16,128(%%r1,%2) \n\t" "vl %%v17,144(%%r1,%[x])\n\t"
"vl %%v17,144(%%r1,%2) \n\t" "vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v18,160(%%r1,%2) \n\t" "vl %%v19,176(%%r1,%[x])\n\t"
"vl %%v19,176(%%r1,%2) \n\t" "vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v20,192(%%r1,%2) \n\t" "vl %%v21,208(%%r1,%[x])\n\t"
"vl %%v21,208(%%r1,%2) \n\t" "vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v22,224(%%r1,%2) \n\t" "vl %%v23,240(%%r1,%[x])\n\t"
"vl %%v23,240(%%r1,%2) \n\t"
"vfchdb %%v24,%%v17,%%v16\n\t" "vfchdb %%v24,%%v17,%%v16\n\t"
"vfchdb %%v25,%%v19,%%v18\n\t" "vfchdb %%v25,%%v19,%%v18\n\t"
"vfchdb %%v26,%%v21,%%v20\n\t" "vfchdb %%v26,%%v21,%%v20\n\t"
@ -84,29 +75,24 @@ static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x)
"vsel %%v25,%%v18,%%v19,%%v25\n\t" "vsel %%v25,%%v18,%%v19,%%v25\n\t"
"vsel %%v26,%%v20,%%v21,%%v26\n\t" "vsel %%v26,%%v20,%%v21,%%v26\n\t"
"vsel %%v27,%%v22,%%v23,%%v27\n\t" "vsel %%v27,%%v22,%%v23,%%v27\n\t"
"vfchdb %%v28,%%v25,%%v24\n\t" "vfchdb %%v28,%%v25,%%v24\n\t"
"vfchdb %%v29,%%v27,%%v26\n\t" "vfchdb %%v29,%%v27,%%v26\n\t"
"vsel %%v28,%%v24,%%v25,%%v28\n\t" "vsel %%v28,%%v24,%%v25,%%v28\n\t"
"vsel %%v29,%%v26,%%v27,%%v29\n\t" "vsel %%v29,%%v26,%%v27,%%v29\n\t"
"vfchdb %%v30,%%v29,%%v28\n\t" "vfchdb %%v30,%%v29,%%v28\n\t"
"vsel %%v30,%%v28,%%v29,%%v30\n\t" "vsel %%v30,%%v28,%%v29,%%v30\n\t"
"vfchdb %%v31,%%v0,%%v30\n\t" "vfchdb %%v31,%%v0,%%v30\n\t"
"vsel %%v0,%%v30,%%v0,%%v31\n\t" "vsel %%v0,%%v30,%%v0,%%v31\n\t"
"agfi %%r1, 256\n\t" "agfi %%r1, 256\n\t"
"brctg %%r0, 0b \n\t" "brctg %[n], 0b\n\t"
"vrepg %%v16,%%v0,1\n\t" "vrepg %%v16,%%v0,1\n\t"
"wfchdb %%v17,%%v16,%%v0\n\t" "wfchdb %%v17,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v17\n\t" "vsel %%v0,%%v0,%%v16,%%v17\n\t"
"ldr %0,%%f0 " "ldr %[min],%%f0"
:"=f"(min) : [min] "=f"(min),[n] "+&r"(n)
:"r"(n),"ZR"((const FLOAT (*)[n])x) : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
); "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
return min; return min;
} }
@ -116,7 +102,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG j = 0; BLASLONG j = 0;
FLOAT minf = 0.0; FLOAT minf = 0.0;
if (n <= 0 || inc_x <= 0) return (minf); if (n <= 0 || inc_x <= 0)
return (minf);
if (inc_x == 1) { if (inc_x == 1) {
@ -126,9 +113,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
minf = dmin_kernel_32(n1, x); minf = dmin_kernel_32(n1, x);
i = n1; i = n1;
} } else {
else
{
minf = x[0]; minf = x[0];
i++; i++;
} }
@ -167,7 +152,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
} }
while (j < n) { while (j < n) {
if (x[i] < minf) { if (x[i] < minf) {
minf = x[i]; minf = x[i];

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -27,25 +27,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) {
{ __asm__("vlrepg %%v0,%[c]\n\t"
__asm__ ( "vlrepg %%v1,%[s]\n\t"
"vlrepg %%v0,%3 \n\t" "srlg %[n],%[n],5\n\t"
"vlrepg %%v1,%4 \n\t"
"srlg %%r0,%0,5 \n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"0:\n\t" "0:\n\t"
"pfd 2, 1024(%%r1,%1) \n\t" "pfd 2, 1024(%%r1,%[x])\n\t"
"pfd 2, 1024(%%r1,%2) \n\t" "pfd 2, 1024(%%r1,%[y])\n\t"
"vl %%v24, 0(%%r1,%1) \n\t" "vl %%v24, 0(%%r1,%[x])\n\t"
"vl %%v25, 16(%%r1,%1) \n\t" "vl %%v25, 16(%%r1,%[x])\n\t"
"vl %%v26, 32(%%r1,%1) \n\t" "vl %%v26, 32(%%r1,%[x])\n\t"
"vl %%v27, 48(%%r1,%1) \n\t" "vl %%v27, 48(%%r1,%[x])\n\t"
"vl %%v16, 0(%%r1,%2) \n\t" "vl %%v16, 0(%%r1,%[y])\n\t"
"vl %%v17, 16(%%r1,%2) \n\t" "vl %%v17, 16(%%r1,%[y])\n\t"
"vl %%v18, 32(%%r1,%2) \n\t" "vl %%v18, 32(%%r1,%[y])\n\t"
"vl %%v19, 48(%%r1,%2) \n\t" "vl %%v19, 48(%%r1,%[y])\n\t"
"vfmdb %%v28,%%v24,%%v0\n\t" "vfmdb %%v28,%%v24,%%v0\n\t"
"vfmdb %%v29,%%v25,%%v0\n\t" "vfmdb %%v29,%%v25,%%v0\n\t"
"vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
@ -63,25 +60,22 @@ static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
"vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31\n\t" "vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vst %%v28, 0(%%r1,%[x])\n\t"
"vst %%v28, 0(%%r1,%1) \n\t" "vst %%v29, 16(%%r1,%[x])\n\t"
"vst %%v29, 16(%%r1,%1) \n\t" "vst %%v30, 32(%%r1,%[x])\n\t"
"vst %%v30, 32(%%r1,%1) \n\t" "vst %%v31, 48(%%r1,%[x])\n\t"
"vst %%v31, 48(%%r1,%1) \n\t" "vst %%v20, 0(%%r1,%[y])\n\t"
"vst %%v20, 0(%%r1,%2) \n\t" "vst %%v21, 16(%%r1,%[y])\n\t"
"vst %%v21, 16(%%r1,%2) \n\t" "vst %%v22, 32(%%r1,%[y])\n\t"
"vst %%v22, 32(%%r1,%2) \n\t" "vst %%v23, 48(%%r1,%[y])\n\t"
"vst %%v23, 48(%%r1,%2) \n\t" "vl %%v24, 64(%%r1,%[x])\n\t"
"vl %%v25, 80(%%r1,%[x])\n\t"
"vl %%v24, 64(%%r1,%1) \n\t" "vl %%v26, 96(%%r1,%[x])\n\t"
"vl %%v25, 80(%%r1,%1) \n\t" "vl %%v27, 112(%%r1,%[x])\n\t"
"vl %%v26, 96(%%r1,%1) \n\t" "vl %%v16, 64(%%r1,%[y])\n\t"
"vl %%v27, 112(%%r1,%1) \n\t" "vl %%v17, 80(%%r1,%[y])\n\t"
"vl %%v16, 64(%%r1,%2) \n\t" "vl %%v18, 96(%%r1,%[y])\n\t"
"vl %%v17, 80(%%r1,%2) \n\t" "vl %%v19, 112(%%r1,%[y])\n\t"
"vl %%v18, 96(%%r1,%2) \n\t"
"vl %%v19, 112(%%r1,%2) \n\t"
"vfmdb %%v28,%%v24,%%v0\n\t" "vfmdb %%v28,%%v24,%%v0\n\t"
"vfmdb %%v29,%%v25,%%v0\n\t" "vfmdb %%v29,%%v25,%%v0\n\t"
"vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
@ -99,25 +93,22 @@ static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
"vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31\n\t" "vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vst %%v28, 64(%%r1,%[x])\n\t"
"vst %%v28, 64(%%r1,%1) \n\t" "vst %%v29, 80(%%r1,%[x])\n\t"
"vst %%v29, 80(%%r1,%1) \n\t" "vst %%v30, 96(%%r1,%[x])\n\t"
"vst %%v30, 96(%%r1,%1) \n\t" "vst %%v31, 112(%%r1,%[x])\n\t"
"vst %%v31, 112(%%r1,%1) \n\t" "vst %%v20, 64(%%r1,%[y])\n\t"
"vst %%v20, 64(%%r1,%2) \n\t" "vst %%v21, 80(%%r1,%[y])\n\t"
"vst %%v21, 80(%%r1,%2) \n\t" "vst %%v22, 96(%%r1,%[y])\n\t"
"vst %%v22, 96(%%r1,%2) \n\t" "vst %%v23, 112(%%r1,%[y])\n\t"
"vst %%v23, 112(%%r1,%2) \n\t" "vl %%v24, 128(%%r1,%[x])\n\t"
"vl %%v25, 144(%%r1,%[x])\n\t"
"vl %%v24, 128(%%r1,%1) \n\t" "vl %%v26, 160(%%r1,%[x])\n\t"
"vl %%v25, 144(%%r1,%1) \n\t" "vl %%v27, 176(%%r1,%[x])\n\t"
"vl %%v26, 160(%%r1,%1) \n\t" "vl %%v16, 128(%%r1,%[y])\n\t"
"vl %%v27, 176(%%r1,%1) \n\t" "vl %%v17, 144(%%r1,%[y])\n\t"
"vl %%v16, 128(%%r1,%2) \n\t" "vl %%v18, 160(%%r1,%[y])\n\t"
"vl %%v17, 144(%%r1,%2) \n\t" "vl %%v19, 176(%%r1,%[y])\n\t"
"vl %%v18, 160(%%r1,%2) \n\t"
"vl %%v19, 176(%%r1,%2) \n\t"
"vfmdb %%v28,%%v24,%%v0\n\t" "vfmdb %%v28,%%v24,%%v0\n\t"
"vfmdb %%v29,%%v25,%%v0\n\t" "vfmdb %%v29,%%v25,%%v0\n\t"
"vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
@ -135,25 +126,22 @@ static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
"vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31\n\t" "vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vst %%v28, 128(%%r1,%[x])\n\t"
"vst %%v28, 128(%%r1,%1) \n\t" "vst %%v29, 144(%%r1,%[x])\n\t"
"vst %%v29, 144(%%r1,%1) \n\t" "vst %%v30, 160(%%r1,%[x])\n\t"
"vst %%v30, 160(%%r1,%1) \n\t" "vst %%v31, 176(%%r1,%[x])\n\t"
"vst %%v31, 176(%%r1,%1) \n\t" "vst %%v20, 128(%%r1,%[y])\n\t"
"vst %%v20, 128(%%r1,%2) \n\t" "vst %%v21, 144(%%r1,%[y])\n\t"
"vst %%v21, 144(%%r1,%2) \n\t" "vst %%v22, 160(%%r1,%[y])\n\t"
"vst %%v22, 160(%%r1,%2) \n\t" "vst %%v23, 176(%%r1,%[y])\n\t"
"vst %%v23, 176(%%r1,%2) \n\t" "vl %%v24, 192(%%r1,%[x])\n\t"
"vl %%v25, 208(%%r1,%[x])\n\t"
"vl %%v24, 192(%%r1,%1) \n\t" "vl %%v26, 224(%%r1,%[x])\n\t"
"vl %%v25, 208(%%r1,%1) \n\t" "vl %%v27, 240(%%r1,%[x])\n\t"
"vl %%v26, 224(%%r1,%1) \n\t" "vl %%v16, 192(%%r1,%[y])\n\t"
"vl %%v27, 240(%%r1,%1) \n\t" "vl %%v17, 208(%%r1,%[y])\n\t"
"vl %%v16, 192(%%r1,%2) \n\t" "vl %%v18, 224(%%r1,%[y])\n\t"
"vl %%v17, 208(%%r1,%2) \n\t" "vl %%v19, 240(%%r1,%[y])\n\t"
"vl %%v18, 224(%%r1,%2) \n\t"
"vl %%v19, 240(%%r1,%2) \n\t"
"vfmdb %%v28,%%v24,%%v0\n\t" "vfmdb %%v28,%%v24,%%v0\n\t"
"vfmdb %%v29,%%v25,%%v0\n\t" "vfmdb %%v29,%%v25,%%v0\n\t"
"vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
@ -171,39 +159,38 @@ static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
"vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31\n\t" "vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vst %%v28, 192(%%r1,%[x])\n\t"
"vst %%v28, 192(%%r1,%1) \n\t" "vst %%v29, 208(%%r1,%[x])\n\t"
"vst %%v29, 208(%%r1,%1) \n\t" "vst %%v30, 224(%%r1,%[x])\n\t"
"vst %%v30, 224(%%r1,%1) \n\t" "vst %%v31, 240(%%r1,%[x])\n\t"
"vst %%v31, 240(%%r1,%1) \n\t" "vst %%v20, 192(%%r1,%[y])\n\t"
"vst %%v20, 192(%%r1,%2) \n\t" "vst %%v21, 208(%%r1,%[y])\n\t"
"vst %%v21, 208(%%r1,%2) \n\t" "vst %%v22, 224(%%r1,%[y])\n\t"
"vst %%v22, 224(%%r1,%2) \n\t" "vst %%v23, 240(%%r1,%[y])\n\t"
"vst %%v23, 240(%%r1,%2) \n\t"
"agfi %%r1,256\n\t" "agfi %%r1,256\n\t"
"brctg %%r0,0b " "brctg %[n],0b"
: : "+m"(*(struct { FLOAT x[n]; } *) x), "+m"(*(struct { FLOAT x[n]; } *) y),
:"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y),"m"(*c),"m"(*s) [n] "+&r"(n)
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" : [x] "a"(x),[y] "a"(y),[c] "Q"(*c),[s] "Q"(*s)
); : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
"v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
} }
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
{ FLOAT c, FLOAT s) {
BLASLONG i = 0; BLASLONG i = 0;
BLASLONG ix = 0, iy = 0; BLASLONG ix = 0, iy = 0;
FLOAT temp; FLOAT temp;
if ( n <= 0 ) return(0); if (n <= 0)
return (0);
if ( (inc_x == 1) && (inc_y == 1) ) if ((inc_x == 1) && (inc_y == 1)) {
{
BLASLONG n1 = n & -32; BLASLONG n1 = n & -32;
if ( n1 > 0 ) if (n1 > 0) {
{
FLOAT cosa, sina; FLOAT cosa, sina;
cosa = c; cosa = c;
sina = s; sina = s;
@ -211,8 +198,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
i = n1; i = n1;
} }
while(i < n) while (i < n) {
{
temp = c * x[i] + s * y[i]; temp = c * x[i] + s * y[i];
y[i] = c * y[i] - s * x[i]; y[i] = c * y[i] - s * x[i];
x[i] = temp; x[i] = temp;
@ -221,13 +207,9 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
} }
} else {
} while (i < n) {
else
{
while(i < n)
{
temp = c * x[ix] + s * y[iy]; temp = c * x[ix] + s * y[iy];
y[iy] = c * y[iy] - s * x[ix]; y[iy] = c * y[iy] - s * x[ix];
x[ix] = temp; x[ix] = temp;
@ -242,5 +224,3 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
return (0); return (0);
} }

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -27,128 +27,106 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
/*
 * Scale a contiguous vector in place: x[i] = da * x[i] for i in [0, n).
 *
 * n  - element count. The loop is a do/while that handles 16 elements
 *      (8 vector registers of 16 bytes = 128 bytes) per pass and the count
 *      is n >> 4, so the caller must pass a positive multiple of 16; a
 *      value below 16 would make the countdown wrap.
 * da - scale factor, replicated into both lanes of v0.
 * x  - vector to scale; read and written.
 *
 * NOTE(review): the 64-bit vector instructions (vlrepg, vfmdb) assume FLOAT
 * is an 8-byte double -- confirm against the build configuration.
 */
static void dscal_kernel_16(BLASLONG n, FLOAT da, FLOAT *x) {
  __asm__("vlrepg %%v0,%[da]\n\t"        /* v0 = { da, da } */
          "srlg %[n],%[n],4\n\t"         /* n = number of 16-element passes */
          "xgr %%r1,%%r1\n\t"            /* r1 = running byte offset, starts at 0 */
          "0:\n\t"
          "pfd 2, 1024(%%r1,%[x])\n\t"   /* prefetch 1024 bytes ahead */
          "vl %%v24,0(%%r1,%[x])\n\t"
          "vfmdb %%v24,%%v24,%%v0\n\t"
          "vst %%v24,0(%%r1,%[x])\n\t"
          "vl %%v25,16(%%r1,%[x])\n\t"
          "vfmdb %%v25,%%v25,%%v0\n\t"
          "vst %%v25,16(%%r1,%[x])\n\t"
          "vl %%v26,32(%%r1,%[x])\n\t"
          "vfmdb %%v26,%%v26,%%v0\n\t"
          "vst %%v26,32(%%r1,%[x])\n\t"
          "vl %%v27,48(%%r1,%[x])\n\t"
          "vfmdb %%v27,%%v27,%%v0\n\t"
          "vst %%v27,48(%%r1,%[x])\n\t"
          "vl %%v28,64(%%r1,%[x])\n\t"
          "vfmdb %%v28,%%v28,%%v0\n\t"
          "vst %%v28,64(%%r1,%[x])\n\t"
          "vl %%v29,80(%%r1,%[x])\n\t"
          "vfmdb %%v29,%%v29,%%v0\n\t"
          "vst %%v29,80(%%r1,%[x])\n\t"
          "vl %%v30,96(%%r1,%[x])\n\t"
          "vfmdb %%v30,%%v30,%%v0\n\t"
          "vst %%v30,96(%%r1,%[x])\n\t"
          "vl %%v31,112(%%r1,%[x])\n\t"
          "vfmdb %%v31,%%v31,%%v0\n\t"
          "vst %%v31,112(%%r1,%[x])\n\t"
          "agfi %%r1,128\n\t"            /* advance by the 128 bytes just processed */
          "brctg %[n],0b"                /* decrement pass count, loop while non-zero */
          /* The memory operand tells the compiler that all n elements of x
             are read and written, so no "memory" clobber is needed. It is
             spelled as a pointer-to-VLA cast rather than a struct with a
             VLA member, because the latter is a GNU-only extension. n is
             early-clobbered since the asm overwrites it before x is last
             used. */
          : "+m"(*(FLOAT (*)[n]) x), [n] "+&r"(n)
          : [x] "a"(x), [da] "Q"(da)
          : "cc", "r1", "v0", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
            "v31");
}
/*
 * Zero-fill a contiguous vector: x[i] = 0 for i in [0, n).
 *
 * n - element count. The loop is a do/while that stores 128 bytes
 *     (8 vector stores of 16 bytes) per pass and the count is n >> 4, so
 *     the caller must pass a positive multiple of 16; a value below 16
 *     would make the countdown wrap.
 * x - vector to clear; write-only.
 *
 * NOTE(review): 128 bytes per 16 elements assumes FLOAT is an 8-byte
 * double -- confirm against the build configuration.
 */
static void dscal_kernel_16_zero(BLASLONG n, FLOAT *x) {
  __asm__("vzero %%v0\n\t"               /* v0 = all-zero source for every store */
          "srlg %[n],%[n],4\n\t"         /* n = number of 16-element passes */
          "xgr %%r1,%%r1\n\t"            /* r1 = running byte offset, starts at 0 */
          "0:\n\t"
          "pfd 2, 1024(%%r1,%[x])\n\t"   /* prefetch 1024 bytes ahead */
          "vst %%v0,0(%%r1,%[x])\n\t"
          "vst %%v0,16(%%r1,%[x])\n\t"
          "vst %%v0,32(%%r1,%[x])\n\t"
          "vst %%v0,48(%%r1,%[x])\n\t"
          "vst %%v0,64(%%r1,%[x])\n\t"
          "vst %%v0,80(%%r1,%[x])\n\t"
          "vst %%v0,96(%%r1,%[x])\n\t"
          "vst %%v0,112(%%r1,%[x])\n\t"
          "agfi %%r1,128\n\t"            /* advance by the 128 bytes just written */
          "brctg %[n],0b"                /* decrement pass count, loop while non-zero */
          /* Output-only memory operand covering all n elements of x. It is
             spelled as a pointer-to-VLA cast rather than a struct with a
             VLA member, because the latter is a GNU-only extension. n is
             early-clobbered since the asm overwrites it before x is last
             used. */
          : "=m"(*(FLOAT (*)[n]) x), [n] "+&r"(n)
          : [x] "a"(x)
          : "cc", "r1", "v0");
}
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x,
{ BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
BLASLONG dummy2) {
BLASLONG i = 0, j = 0; BLASLONG i = 0, j = 0;
if (n <= 0 || inc_x <= 0) if (n <= 0 || inc_x <= 0)
return (0); return (0);
if (inc_x == 1) {
if ( inc_x == 1 ) if (da == 0.0) {
{
if ( da == 0.0 )
{
BLASLONG n1 = n & -16; BLASLONG n1 = n & -16;
if ( n1 > 0 ) if (n1 > 0) {
{
dscal_kernel_16_zero(n1, x); dscal_kernel_16_zero(n1, x);
j = n1; j = n1;
} }
while(j < n) while (j < n) {
{
x[j] = 0.0; x[j] = 0.0;
j++; j++;
} }
} } else {
else
{
BLASLONG n1 = n & -16; BLASLONG n1 = n & -16;
if ( n1 > 0 ) if (n1 > 0) {
{
dscal_kernel_16(n1, da, x); dscal_kernel_16(n1, da, x);
j = n1; j = n1;
} }
while(j < n) while (j < n) {
{
x[j] = da * x[j]; x[j] = da * x[j];
j++; j++;
} }
} }
} else {
} if (da == 0.0) {
else
{
if ( da == 0.0 )
{
BLASLONG n1 = n & -4; BLASLONG n1 = n & -4;
@ -163,17 +141,14 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
j += 4; j += 4;
} }
while(j < n) while (j < n) {
{
x[i] = 0.0; x[i] = 0.0;
i += inc_x; i += inc_x;
j++; j++;
} }
} } else {
else
{
BLASLONG n1 = n & -4; BLASLONG n1 = n & -4;
while (j < n1) { while (j < n1) {
@ -188,8 +163,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
} }
while(j < n) while (j < n) {
{
x[i] = da * x[i]; x[i] = da * x[i];
i += inc_x; i += inc_x;
@ -201,5 +175,3 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
return 0; return 0;
} }

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2018,The OpenBLAS Project Copyright (c) 2013-2019,The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms,with or without Redistribution and use in source and binary forms,with or without
modification,are permitted provided that the following conditions are modification,are permitted provided that the following conditions are
@ -27,35 +27,38 @@ USE OF THIS SOFTWARE,EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
static double dsdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) static double dsdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) {
{
double dot; double dot;
__asm__ volatile ( __asm__("vzero %%v0\n\t"
"vzero %%v0 \n\t" "vzero %%v1\n\t"
"srlg %%r0,%1,4 \n\t" "vzero %%v2\n\t"
"vzero %%v3\n\t"
"vzero %%v4\n\t"
"vzero %%v5\n\t"
"vzero %%v6\n\t"
"vzero %%v7\n\t"
"srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"0:\n\t" "0:\n\t"
"pfd 1,1024(%%r1,%2) \n\t" "pfd 1,1024(%%r1,%[x])\n\t"
"pfd 1,1024(%%r1,%3) \n\t" "pfd 1,1024(%%r1,%[y])\n\t"
"vlef %%v16,0(%%r1,%[x]),0\n\t"
"vlef %%v16,0(%%r1,%2),0 \n\t" "vlef %%v16,4(%%r1,%[x]),2\n\t"
"vlef %%v16,4(%%r1,%2),2 \n\t" "vlef %%v17,8(%%r1,%[x]),0\n\t"
"vlef %%v17,8(%%r1,%2),0 \n\t" "vlef %%v17,12(%%r1,%[x]),2\n\t"
"vlef %%v17,12(%%r1,%2),2 \n\t" "vlef %%v18,16(%%r1,%[x]),0\n\t"
"vlef %%v18,16(%%r1,%2),0 \n\t" "vlef %%v18,20(%%r1,%[x]),2\n\t"
"vlef %%v18,20(%%r1,%2),2 \n\t" "vlef %%v19,24(%%r1,%[x]),0\n\t"
"vlef %%v19,24(%%r1,%2),0 \n\t" "vlef %%v19,28(%%r1,%[x]),2\n\t"
"vlef %%v19,28(%%r1,%2),2 \n\t" "vlef %%v20,32(%%r1,%[x]),0\n\t"
"vlef %%v20,32(%%r1,%2),0 \n\t" "vlef %%v20,36(%%r1,%[x]),2\n\t"
"vlef %%v20,36(%%r1,%2),2 \n\t" "vlef %%v21,40(%%r1,%[x]),0\n\t"
"vlef %%v21,40(%%r1,%2),0 \n\t" "vlef %%v21,44(%%r1,%[x]),2\n\t"
"vlef %%v21,44(%%r1,%2),2 \n\t" "vlef %%v22,48(%%r1,%[x]),0\n\t"
"vlef %%v22,48(%%r1,%2),0 \n\t" "vlef %%v22,52(%%r1,%[x]),2\n\t"
"vlef %%v22,52(%%r1,%2),2 \n\t" "vlef %%v23,56(%%r1,%[x]),0\n\t"
"vlef %%v23,56(%%r1,%2),0 \n\t" "vlef %%v23,60(%%r1,%[x]),2\n\t"
"vlef %%v23,60(%%r1,%2),2 \n\t"
"vflls %%v16,%%v16\n\t" "vflls %%v16,%%v16\n\t"
"vflls %%v17,%%v17\n\t" "vflls %%v17,%%v17\n\t"
"vflls %%v18,%%v18\n\t" "vflls %%v18,%%v18\n\t"
@ -64,64 +67,70 @@ static double dsdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
"vflls %%v21,%%v21\n\t" "vflls %%v21,%%v21\n\t"
"vflls %%v22,%%v22\n\t" "vflls %%v22,%%v22\n\t"
"vflls %%v23,%%v23\n\t" "vflls %%v23,%%v23\n\t"
"vlef %%v24,0(%%r1,%[y]),0\n\t"
"vlef %%v24,0(%%r1,%3),0 \n\t" "vlef %%v24,4(%%r1,%[y]),2\n\t"
"vlef %%v24,4(%%r1,%3),2 \n\t"
"vflls %%v24,%%v24\n\t" "vflls %%v24,%%v24\n\t"
"vfmadb %%v0,%%v16,%%v24,%%v0\n\t" "vfmadb %%v0,%%v16,%%v24,%%v0\n\t"
"vlef %%v25,8(%%r1,%3),0 \n\t" "vlef %%v25,8(%%r1,%[y]),0\n\t"
"vlef %%v25,12(%%r1,%3),2 \n\t" "vlef %%v25,12(%%r1,%[y]),2\n\t"
"vflls %%v25,%%v25\n\t" "vflls %%v25,%%v25\n\t"
"vfmadb %%v0,%%v17,%%v25,%%v0 \n\t" "vfmadb %%v1,%%v17,%%v25,%%v1\n\t"
"vlef %%v26,16(%%r1,%3),0 \n\t" "vlef %%v26,16(%%r1,%[y]),0\n\t"
"vlef %%v26,20(%%r1,%3),2 \n\t" "vlef %%v26,20(%%r1,%[y]),2\n\t"
"vflls %%v26,%%v26\n\t" "vflls %%v26,%%v26\n\t"
"vfmadb %%v0,%%v18,%%v26,%%v0 \n\t" "vfmadb %%v2,%%v18,%%v26,%%v2\n\t"
"vlef %%v27,24(%%r1,%3),0 \n\t" "vlef %%v27,24(%%r1,%[y]),0\n\t"
"vlef %%v27,28(%%r1,%3),2 \n\t" "vlef %%v27,28(%%r1,%[y]),2\n\t"
"vflls %%v27,%%v27\n\t" "vflls %%v27,%%v27\n\t"
"vfmadb %%v0,%%v19,%%v27,%%v0 \n\t" "vfmadb %%v3,%%v19,%%v27,%%v3\n\t"
"vlef %%v28,32(%%r1,%3),0 \n\t" "vlef %%v28,32(%%r1,%[y]),0\n\t"
"vlef %%v28,36(%%r1,%3),2 \n\t" "vlef %%v28,36(%%r1,%[y]),2\n\t"
"vflls %%v28,%%v28\n\t" "vflls %%v28,%%v28\n\t"
"vfmadb %%v0,%%v20,%%v28,%%v0 \n\t" "vfmadb %%v4,%%v20,%%v28,%%v4\n\t"
"vlef %%v29,40(%%r1,%3),0 \n\t" "vlef %%v29,40(%%r1,%[y]),0\n\t"
"vlef %%v29,44(%%r1,%3),2 \n\t" "vlef %%v29,44(%%r1,%[y]),2\n\t"
"vflls %%v29,%%v29\n\t" "vflls %%v29,%%v29\n\t"
"vfmadb %%v0,%%v21,%%v29,%%v0 \n\t" "vfmadb %%v5,%%v21,%%v29,%%v5\n\t"
"vlef %%v30,48(%%r1,%3),0 \n\t" "vlef %%v30,48(%%r1,%[y]),0\n\t"
"vlef %%v30,52(%%r1,%3),2 \n\t" "vlef %%v30,52(%%r1,%[y]),2\n\t"
"vflls %%v30,%%v30\n\t" "vflls %%v30,%%v30\n\t"
"vfmadb %%v0,%%v22,%%v30,%%v0 \n\t" "vfmadb %%v6,%%v22,%%v30,%%v6\n\t"
"vlef %%v31,56(%%r1,%3),0 \n\t" "vlef %%v31,56(%%r1,%[y]),0\n\t"
"vlef %%v31,60(%%r1,%3),2 \n\t" "vlef %%v31,60(%%r1,%[y]),2\n\t"
"vflls %%v31,%%v31\n\t" "vflls %%v31,%%v31\n\t"
"vfmadb %%v0,%%v23,%%v31,%%v0 \n\t" "vfmadb %%v7,%%v23,%%v31,%%v7\n\t"
"agfi %%r1,64\n\t" "agfi %%r1,64\n\t"
"brctg %%r0,0b \n\t" "brctg %[n],0b\n\t"
"vfadb %%v0,%%v0,%%v1\n\t"
"vfadb %%v0,%%v0,%%v2\n\t"
"vfadb %%v0,%%v0,%%v3\n\t"
"vfadb %%v0,%%v0,%%v4\n\t"
"vfadb %%v0,%%v0,%%v5\n\t"
"vfadb %%v0,%%v0,%%v6\n\t"
"vfadb %%v0,%%v0,%%v7\n\t"
"vrepg %%v1,%%v0,1\n\t" "vrepg %%v1,%%v0,1\n\t"
"adbr %%f0,%%f1\n\t" "adbr %%f0,%%f1\n\t"
"ldr %0,%%f0 " "ldr %[dot],%%f0"
:"=f"(dot) : [dot] "=f"(dot),[n] "+&r"(n)
:"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((const FLOAT (*)[n])y) : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),
:"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" "m"(*(const struct { FLOAT x[n]; } *) y),[y] "a"(y)
); : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");
return dot; return dot;
} }
double CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y) double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
{
BLASLONG i = 0; BLASLONG i = 0;
BLASLONG ix = 0, iy = 0; BLASLONG ix = 0, iy = 0;
double dot = 0.0; double dot = 0.0;
if ( n <= 0 ) return(dot); if (n <= 0)
return (dot);
if ( (inc_x == 1) && (inc_y == 1) ) if ((inc_x == 1) && (inc_y == 1)) {
{
BLASLONG n1 = n & -16; BLASLONG n1 = n & -16;
@ -129,8 +138,7 @@ double CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y)
dot = dsdot_kernel_16(n1, x, y); dot = dsdot_kernel_16(n1, x, y);
i = n1; i = n1;
while(i < n) while (i < n) {
{
dot += (double) y[i] * (double) x[i]; dot += (double) y[i] * (double) x[i];
i++; i++;
@ -138,13 +146,11 @@ double CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y)
} }
return (dot); return (dot);
} }
BLASLONG n1 = n & -2; BLASLONG n1 = n & -2;
while(i < n1) while (i < n1) {
{
dot += (double) y[iy] * (double) x[ix]; dot += (double) y[iy] * (double) x[ix];
dot += (double) y[iy + inc_y] * (double) x[ix + inc_x]; dot += (double) y[iy + inc_y] * (double) x[ix + inc_x];
@ -154,8 +160,7 @@ double CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y)
} }
while(i < n) while (i < n) {
{
dot += (double) y[iy] * (double) x[ix]; dot += (double) y[iy] * (double) x[ix];
ix += inc_x; ix += inc_x;
@ -166,5 +171,3 @@ double CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y)
return (dot); return (dot);
} }

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -27,111 +27,105 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) {
{ __asm__("srlg %[n],%[n],5\n\t"
__asm__ volatile(
"srlg %%r0,%0,5 \n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"0:\n\t" "0:\n\t"
"pfd 2, 1024(%%r1,%1) \n\t" "pfd 2, 1024(%%r1,%[x])\n\t"
"pfd 2, 1024(%%r1,%2) \n\t" "pfd 2, 1024(%%r1,%[y])\n\t"
"vl %%v16, 0(%%r1,%[x])\n\t"
"vl %%v16, 0(%%r1,%1) \n\t" "vl %%v17, 16(%%r1,%[x])\n\t"
"vl %%v17, 16(%%r1,%1) \n\t" "vl %%v18, 32(%%r1,%[x])\n\t"
"vl %%v18, 32(%%r1,%1) \n\t" "vl %%v19, 48(%%r1,%[x])\n\t"
"vl %%v19, 48(%%r1,%1) \n\t" "vl %%v20, 64(%%r1,%[x])\n\t"
"vl %%v20, 64(%%r1,%1) \n\t" "vl %%v21, 80(%%r1,%[x])\n\t"
"vl %%v21, 80(%%r1,%1) \n\t" "vl %%v22, 96(%%r1,%[x])\n\t"
"vl %%v22, 96(%%r1,%1) \n\t" "vl %%v23, 112(%%r1,%[x])\n\t"
"vl %%v23, 112(%%r1,%1) \n\t" "vl %%v24, 128(%%r1,%[x])\n\t"
"vl %%v24, 128(%%r1,%1) \n\t" "vl %%v25, 144(%%r1,%[x])\n\t"
"vl %%v25, 144(%%r1,%1) \n\t" "vl %%v26, 160(%%r1,%[x])\n\t"
"vl %%v26, 160(%%r1,%1) \n\t" "vl %%v27, 176(%%r1,%[x])\n\t"
"vl %%v27, 176(%%r1,%1) \n\t" "vl %%v28, 192(%%r1,%[x])\n\t"
"vl %%v28, 192(%%r1,%1) \n\t" "vl %%v29, 208(%%r1,%[x])\n\t"
"vl %%v29, 208(%%r1,%1) \n\t" "vl %%v30, 224(%%r1,%[x])\n\t"
"vl %%v30, 224(%%r1,%1) \n\t" "vl %%v31, 240(%%r1,%[x])\n\t"
"vl %%v31, 240(%%r1,%1) \n\t" "vl %%v0, 0(%%r1,%[y])\n\t"
"vl %%v1, 16(%%r1,%[y])\n\t"
"vl %%v0, 0(%%r1,%2) \n\t" "vl %%v2, 32(%%r1,%[y])\n\t"
"vl %%v1, 16(%%r1,%2) \n\t" "vl %%v3, 48(%%r1,%[y])\n\t"
"vl %%v2, 32(%%r1,%2) \n\t" "vl %%v4, 64(%%r1,%[y])\n\t"
"vl %%v3, 48(%%r1,%2) \n\t" "vl %%v5, 80(%%r1,%[y])\n\t"
"vl %%v4, 64(%%r1,%2) \n\t" "vl %%v6, 96(%%r1,%[y])\n\t"
"vl %%v5, 80(%%r1,%2) \n\t" "vl %%v7, 112(%%r1,%[y])\n\t"
"vl %%v6, 96(%%r1,%2) \n\t" "vst %%v0, 0(%%r1,%[x])\n\t"
"vl %%v7, 112(%%r1,%2) \n\t" "vst %%v1, 16(%%r1,%[x])\n\t"
"vst %%v0, 0(%%r1,%1) \n\t" "vst %%v2, 32(%%r1,%[x])\n\t"
"vst %%v1, 16(%%r1,%1) \n\t" "vst %%v3, 48(%%r1,%[x])\n\t"
"vst %%v2, 32(%%r1,%1) \n\t" "vst %%v4, 64(%%r1,%[x])\n\t"
"vst %%v3, 48(%%r1,%1) \n\t" "vst %%v5, 80(%%r1,%[x])\n\t"
"vst %%v4, 64(%%r1,%1) \n\t" "vst %%v6, 96(%%r1,%[x])\n\t"
"vst %%v5, 80(%%r1,%1) \n\t" "vst %%v7, 112(%%r1,%[x])\n\t"
"vst %%v6, 96(%%r1,%1) \n\t" "vl %%v0, 128(%%r1,%[y])\n\t"
"vst %%v7, 112(%%r1,%1) \n\t" "vl %%v1, 144(%%r1,%[y])\n\t"
"vl %%v2, 160(%%r1,%[y])\n\t"
"vl %%v0, 128(%%r1,%2) \n\t" "vl %%v3, 176(%%r1,%[y])\n\t"
"vl %%v1, 144(%%r1,%2) \n\t" "vl %%v4, 192(%%r1,%[y])\n\t"
"vl %%v2, 160(%%r1,%2) \n\t" "vl %%v5, 208(%%r1,%[y])\n\t"
"vl %%v3, 176(%%r1,%2) \n\t" "vl %%v6, 224(%%r1,%[y])\n\t"
"vl %%v4, 192(%%r1,%2) \n\t" "vl %%v7, 240(%%r1,%[y])\n\t"
"vl %%v5, 208(%%r1,%2) \n\t" "vst %%v0, 128(%%r1,%[x])\n\t"
"vl %%v6, 224(%%r1,%2) \n\t" "vst %%v1, 144(%%r1,%[x])\n\t"
"vl %%v7, 240(%%r1,%2) \n\t" "vst %%v2, 160(%%r1,%[x])\n\t"
"vst %%v0, 128(%%r1,%1) \n\t" "vst %%v3, 176(%%r1,%[x])\n\t"
"vst %%v1, 144(%%r1,%1) \n\t" "vst %%v4, 192(%%r1,%[x])\n\t"
"vst %%v2, 160(%%r1,%1) \n\t" "vst %%v5, 208(%%r1,%[x])\n\t"
"vst %%v3, 176(%%r1,%1) \n\t" "vst %%v6, 224(%%r1,%[x])\n\t"
"vst %%v4, 192(%%r1,%1) \n\t" "vst %%v7, 240(%%r1,%[x])\n\t"
"vst %%v5, 208(%%r1,%1) \n\t" "vst %%v16, 0(%%r1,%[y])\n\t"
"vst %%v6, 224(%%r1,%1) \n\t" "vst %%v17, 16(%%r1,%[y])\n\t"
"vst %%v7, 240(%%r1,%1) \n\t" "vst %%v18, 32(%%r1,%[y])\n\t"
"vst %%v19, 48(%%r1,%[y])\n\t"
"vst %%v16, 0(%%r1,%2) \n\t" "vst %%v20, 64(%%r1,%[y])\n\t"
"vst %%v17, 16(%%r1,%2) \n\t" "vst %%v21, 80(%%r1,%[y])\n\t"
"vst %%v18, 32(%%r1,%2) \n\t" "vst %%v22, 96(%%r1,%[y])\n\t"
"vst %%v19, 48(%%r1,%2) \n\t" "vst %%v23, 112(%%r1,%[y])\n\t"
"vst %%v20, 64(%%r1,%2) \n\t" "vst %%v24, 128(%%r1,%[y])\n\t"
"vst %%v21, 80(%%r1,%2) \n\t" "vst %%v25, 144(%%r1,%[y])\n\t"
"vst %%v22, 96(%%r1,%2) \n\t" "vst %%v26, 160(%%r1,%[y])\n\t"
"vst %%v23, 112(%%r1,%2) \n\t" "vst %%v27, 176(%%r1,%[y])\n\t"
"vst %%v24, 128(%%r1,%2) \n\t" "vst %%v28, 192(%%r1,%[y])\n\t"
"vst %%v25, 144(%%r1,%2) \n\t" "vst %%v29, 208(%%r1,%[y])\n\t"
"vst %%v26, 160(%%r1,%2) \n\t" "vst %%v30, 224(%%r1,%[y])\n\t"
"vst %%v27, 176(%%r1,%2) \n\t" "vst %%v31, 240(%%r1,%[y])\n\t"
"vst %%v28, 192(%%r1,%2) \n\t"
"vst %%v29, 208(%%r1,%2) \n\t"
"vst %%v30, 224(%%r1,%2) \n\t"
"vst %%v31, 240(%%r1,%2) \n\t"
"agfi %%r1,256\n\t" "agfi %%r1,256\n\t"
"brctg %%r0,0b " "brctg %[n],0b"
: : "+m"(*(struct { FLOAT x[n]; } *) x), "+m"(*(struct { FLOAT x[n]; } *) y),
:"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y) [n] "+&r"(n)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" : [x] "a"(x),[y] "a"(y)
); : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");
} }
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,
{ BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
BLASLONG dummy2) {
BLASLONG i = 0; BLASLONG i = 0;
BLASLONG ix = 0, iy = 0; BLASLONG ix = 0, iy = 0;
FLOAT temp; FLOAT temp;
if ( n <= 0 ) return(0); if (n <= 0)
return (0);
if ( (inc_x == 1) && (inc_y == 1 )) if ((inc_x == 1) && (inc_y == 1)) {
{
BLASLONG n1 = n & -32; BLASLONG n1 = n & -32;
if ( n1 > 0 ) if (n1 > 0) {
{
dswap_kernel_32(n1, x, y); dswap_kernel_32(n1, x, y);
i = n1; i = n1;
} }
while(i < n) while (i < n) {
{
temp = y[i]; temp = y[i];
y[i] = x[i]; y[i] = x[i];
x[i] = temp; x[i] = temp;
@ -139,13 +133,9 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,
} }
} else {
} while (i < n) {
else
{
while(i < n)
{
temp = y[iy]; temp = y[iy];
y[iy] = x[ix]; y[iy] = x[ix];
x[ix] = temp; x[ix] = temp;
@ -158,5 +148,4 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,
} }
return (0); return (0);
} }

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2017, The OpenBLAS Project Copyright (c) 2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -28,26 +28,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>
#if defined(DOUBLE) #define CABS1(x,i) (fabsf(x[i]) + fabsf(x[i + 1]))
#define ABS fabs
#else
#define ABS fabsf
#endif
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))
static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) {
{
BLASLONG iamax; BLASLONG iamax;
__asm__ volatile ( __asm__("vlef %%v0,0(%[x]),0\n\t"
"vlef %%v0,0(%3),0 \n\t" "vlef %%v1,4(%[x]),0\n\t"
"vlef %%v1,4(%3),0 \n\t" "vlef %%v0,8(%[x]),1\n\t"
"vlef %%v0,8(%3),1 \n\t" "vlef %%v1,12(%[x]),1\n\t"
"vlef %%v1,12(%3),1 \n\t" "vlef %%v0,16(%[x]),2\n\t"
"vlef %%v0,16(%3),2 \n\t" "vlef %%v1,20(%[x]),2\n\t"
"vlef %%v1,20(%3),2 \n\t" "vlef %%v0,24(%[x]),3\n\t"
"vlef %%v0,24(%3),3 \n\t" "vlef %%v1,28(%[x]),3\n\t"
"vlef %%v1,28(%3),3 \n\t"
"vflpsb %%v0,%%v0\n\t" "vflpsb %%v0,%%v0\n\t"
"vflpsb %%v1,%%v1\n\t" "vflpsb %%v1,%%v1\n\t"
"vfasb %%v0,%%v0,%%v1\n\t" "vfasb %%v0,%%v0,%%v1\n\t"
@ -89,31 +82,26 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax)
"vleif %%v27,13,1\n\t" "vleif %%v27,13,1\n\t"
"vleif %%v27,14,2\n\t" "vleif %%v27,14,2\n\t"
"vleif %%v27,15,3\n\t" "vleif %%v27,15,3\n\t"
"srlg %%r0,%2,5 \n\t" "srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"0:\n\t" "0:\n\t"
"pfd 1, 1024(%%r1,%3) \n\t" "pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%3) \n\t" "vl %%v28,16(%%r1,%[x])\n\t"
"vl %%v28,16(%%r1,%3) \n\t"
"vpkg %%v17,%%v16,%%v28\n\t" "vpkg %%v17,%%v16,%%v28\n\t"
"vperm %%v16,%%v16,%%v28,%%v9\n\t" "vperm %%v16,%%v16,%%v28,%%v9\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%3) \n\t" "vl %%v29,48(%%r1,%[x])\n\t"
"vl %%v29,48(%%r1,%3) \n\t"
"vpkg %%v19,%%v18,%%v29\n\t" "vpkg %%v19,%%v18,%%v29\n\t"
"vperm %%v18,%%v18,%%v29,%%v9\n\t" "vperm %%v18,%%v18,%%v29,%%v9\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%3) \n\t" "vl %%v30,80(%%r1,%[x])\n\t"
"vl %%v30,80(%%r1,%3) \n\t"
"vpkg %%v21,%%v20,%%v30\n\t" "vpkg %%v21,%%v20,%%v30\n\t"
"vperm %%v20,%%v20,%%v30,%%v9\n\t" "vperm %%v20,%%v20,%%v30,%%v9\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%3) \n\t" "vl %%v31,112(%%r1,%[x])\n\t"
"vl %%v31,112(%%r1,%3) \n\t"
"vpkg %%v23,%%v22,%%v31\n\t" "vpkg %%v23,%%v22,%%v31\n\t"
"vperm %%v22,%%v22,%%v31,%%v9\n\t" "vperm %%v22,%%v22,%%v31,%%v9\n\t"
"vflpsb %%v16, %%v16\n\t" "vflpsb %%v16, %%v16\n\t"
"vflpsb %%v17, %%v17\n\t" "vflpsb %%v17, %%v17\n\t"
"vflpsb %%v18, %%v18\n\t" "vflpsb %%v18, %%v18\n\t"
@ -126,14 +114,12 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax)
"vfasb %%v17,%%v18,%%v19\n\t" "vfasb %%v17,%%v18,%%v19\n\t"
"vfasb %%v18,%%v20,%%v21\n\t" "vfasb %%v18,%%v20,%%v21\n\t"
"vfasb %%v19,%%v22,%%v23\n\t" "vfasb %%v19,%%v22,%%v23\n\t"
"vfchesb %%v5,%%v16,%%v17\n\t" "vfchesb %%v5,%%v16,%%v17\n\t"
"vfchesb %%v6,%%v18,%%v19\n\t" "vfchesb %%v6,%%v18,%%v19\n\t"
"vsel %%v16,%%v16,%%v17,%%v5\n\t" "vsel %%v16,%%v16,%%v17,%%v5\n\t"
"vsel %%v5,%%v24,%%v25,%%v5\n\t" "vsel %%v5,%%v24,%%v25,%%v5\n\t"
"vsel %%v17,%%v18,%%v19,%%v6\n\t" "vsel %%v17,%%v18,%%v19,%%v6\n\t"
"vsel %%v6,%%v26,%%v27,%%v6\n\t" "vsel %%v6,%%v26,%%v27,%%v6\n\t"
"vfchesb %%v18,%%v16,%%v17\n\t" "vfchesb %%v18,%%v16,%%v17\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t" "vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v5,%%v5,%%v6,%%v18\n\t" "vsel %%v5,%%v5,%%v6,%%v18\n\t"
@ -141,7 +127,6 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax)
"vesrlg %%v5,%%v5,32\n\t" "vesrlg %%v5,%%v5,32\n\t"
"vag %%v5,%%v5,%%v4\n\t" "vag %%v5,%%v5,%%v4\n\t"
"vag %%v6,%%v6,%%v4\n\t" "vag %%v6,%%v6,%%v4\n\t"
"vfchesb %%v7,%%v0,%%v16\n\t" "vfchesb %%v7,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v7\n\t" "vsel %%v0,%%v0,%%v16,%%v7\n\t"
"vsegf %%v8,%%v7\n\t" "vsegf %%v8,%%v7\n\t"
@ -150,27 +135,22 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax)
"vsel %%v1,%%v1,%%v5,%%v7\n\t" "vsel %%v1,%%v1,%%v5,%%v7\n\t"
"vsel %%v2,%%v2,%%v6,%%v8\n\t" "vsel %%v2,%%v2,%%v6,%%v8\n\t"
"vag %%v4,%%v4,%%v3\n\t" "vag %%v4,%%v4,%%v3\n\t"
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v16,128(%%r1,%3) \n\t" "vl %%v28,144(%%r1,%[x])\n\t"
"vl %%v28,144(%%r1,%3) \n\t"
"vpkg %%v17,%%v16,%%v28\n\t" "vpkg %%v17,%%v16,%%v28\n\t"
"vperm %%v16,%%v16,%%v28,%%v9\n\t" "vperm %%v16,%%v16,%%v28,%%v9\n\t"
"vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v18,160(%%r1,%3) \n\t" "vl %%v29,176(%%r1,%[x])\n\t"
"vl %%v29,176(%%r1,%3) \n\t"
"vpkg %%v19,%%v18,%%v29\n\t" "vpkg %%v19,%%v18,%%v29\n\t"
"vperm %%v18,%%v18,%%v29,%%v9\n\t" "vperm %%v18,%%v18,%%v29,%%v9\n\t"
"vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v20,192(%%r1,%3) \n\t" "vl %%v30,208(%%r1,%[x])\n\t"
"vl %%v30,208(%%r1,%3) \n\t"
"vpkg %%v21,%%v20,%%v30\n\t" "vpkg %%v21,%%v20,%%v30\n\t"
"vperm %%v20,%%v20,%%v30,%%v9\n\t" "vperm %%v20,%%v20,%%v30,%%v9\n\t"
"vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v22,224(%%r1,%3) \n\t" "vl %%v31,240(%%r1,%[x])\n\t"
"vl %%v31,240(%%r1,%3) \n\t"
"vpkg %%v23,%%v22,%%v31\n\t" "vpkg %%v23,%%v22,%%v31\n\t"
"vperm %%v22,%%v22,%%v31,%%v9\n\t" "vperm %%v22,%%v22,%%v31,%%v9\n\t"
"vflpsb %%v16, %%v16\n\t" "vflpsb %%v16, %%v16\n\t"
"vflpsb %%v17, %%v17\n\t" "vflpsb %%v17, %%v17\n\t"
"vflpsb %%v18, %%v18\n\t" "vflpsb %%v18, %%v18\n\t"
@ -183,14 +163,12 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax)
"vfasb %%v17,%%v18,%%v19\n\t" "vfasb %%v17,%%v18,%%v19\n\t"
"vfasb %%v18,%%v20,%%v21\n\t" "vfasb %%v18,%%v20,%%v21\n\t"
"vfasb %%v19,%%v22,%%v23\n\t" "vfasb %%v19,%%v22,%%v23\n\t"
"vfchesb %%v5,%%v16,%%v17\n\t" "vfchesb %%v5,%%v16,%%v17\n\t"
"vfchesb %%v6,%%v18,%%v19\n\t" "vfchesb %%v6,%%v18,%%v19\n\t"
"vsel %%v16,%%v16,%%v17,%%v5\n\t" "vsel %%v16,%%v16,%%v17,%%v5\n\t"
"vsel %%v5,%%v24,%%v25,%%v5\n\t" "vsel %%v5,%%v24,%%v25,%%v5\n\t"
"vsel %%v17,%%v18,%%v19,%%v6\n\t" "vsel %%v17,%%v18,%%v19,%%v6\n\t"
"vsel %%v6,%%v26,%%v27,%%v6\n\t" "vsel %%v6,%%v26,%%v27,%%v6\n\t"
"vfchesb %%v18,%%v16,%%v17\n\t" "vfchesb %%v18,%%v16,%%v17\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t" "vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v5,%%v5,%%v6,%%v18\n\t" "vsel %%v5,%%v5,%%v6,%%v18\n\t"
@ -198,7 +176,6 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax)
"vesrlg %%v5,%%v5,32\n\t" "vesrlg %%v5,%%v5,32\n\t"
"vag %%v5,%%v5,%%v4\n\t" "vag %%v5,%%v5,%%v4\n\t"
"vag %%v6,%%v6,%%v4\n\t" "vag %%v6,%%v6,%%v4\n\t"
"vfchesb %%v7,%%v0,%%v16\n\t" "vfchesb %%v7,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v7\n\t" "vsel %%v0,%%v0,%%v16,%%v7\n\t"
"vsegf %%v8,%%v7\n\t" "vsegf %%v8,%%v7\n\t"
@ -207,10 +184,8 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax)
"vsel %%v1,%%v1,%%v5,%%v7\n\t" "vsel %%v1,%%v1,%%v5,%%v7\n\t"
"vsel %%v2,%%v2,%%v6,%%v8\n\t" "vsel %%v2,%%v2,%%v6,%%v8\n\t"
"vag %%v4,%%v4,%%v3\n\t" "vag %%v4,%%v4,%%v3\n\t"
"agfi %%r1, 256\n\t" "agfi %%r1, 256\n\t"
"brctg %%r0, 0b \n\t" "brctg %[n], 0b\n\t"
"veslg %%v3,%%v0,32\n\t" "veslg %%v3,%%v0,32\n\t"
"vfchsb %%v4,%%v0,%%v3\n\t" "vfchsb %%v4,%%v0,%%v3\n\t"
"vchlg %%v5,%%v2,%%v1\n\t" "vchlg %%v5,%%v2,%%v1\n\t"
@ -221,14 +196,13 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax)
"vesrlg %%v4,%%v4,32\n\t" "vesrlg %%v4,%%v4,32\n\t"
"vsegf %%v4,%%v4\n\t" "vsegf %%v4,%%v4\n\t"
"vsel %%v1,%%v1,%%v2,%%v4\n\t" "vsel %%v1,%%v1,%%v2,%%v4\n\t"
"vrepf %%v2,%%v0,2\n\t" "vrepf %%v2,%%v0,2\n\t"
"vrepg %%v3,%%v1,1\n\t" "vrepg %%v3,%%v1,1\n\t"
"wfcsb %%v2,%%v0\n\t" "wfcsb %%v2,%%v0\n\t"
"jne 1f\n\t" "jne 1f\n\t"
"vstef %%v0,%1,0 \n\t" "vstef %%v0,%[amax],0\n\t"
"vmnlg %%v0,%%v1,%%v3\n\t" "vmnlg %%v0,%%v1,%%v3\n\t"
"vlgvg %0,%%v0,0 \n\t" "vlgvg %[iamax],%%v0,0\n\t"
"j 2f\n\t" "j 2f\n\t"
"1:\n\t" "1:\n\t"
"wfchsb %%v4,%%v2,%%v0\n\t" "wfchsb %%v4,%%v2,%%v0\n\t"
@ -236,27 +210,28 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax)
"vsegf %%v4,%%v4\n\t" "vsegf %%v4,%%v4\n\t"
"vsel %%v1,%%v3,%%v1,%%v4\n\t" "vsel %%v1,%%v3,%%v1,%%v4\n\t"
"vsel %%v0,%%v2,%%v0,%%v4\n\t" "vsel %%v0,%%v2,%%v0,%%v4\n\t"
"ste %%f0,%1 \n\t" "ste %%f0,%[amax]\n\t"
"vlgvg %0,%%v1,0 \n\t" "vlgvg %[iamax],%%v1,0\n\t"
"2:\n\t" "2:\n\t"
"nop" "nop"
:"=r"(iamax),"=m"(*amax) : [iamax] "=r"(iamax),[amax] "=Q"(*amax),[n] "+&r"(n)
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x) : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8",
); "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
"v25", "v26", "v27", "v28", "v29", "v30", "v31");
return iamax; return iamax;
} }
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
{
BLASLONG i = 0; BLASLONG i = 0;
BLASLONG ix = 0; BLASLONG ix = 0;
FLOAT maxf = 0; FLOAT maxf = 0;
BLASLONG max = 0; BLASLONG max = 0;
BLASLONG inc_x2; BLASLONG inc_x2;
if (n <= 0 || inc_x <= 0) return(max); if (n <= 0 || inc_x <= 0)
return (max);
if (inc_x == 1) { if (inc_x == 1) {
@ -266,18 +241,14 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
max = icamax_kernel_32(n1, x, &maxf); max = icamax_kernel_32(n1, x, &maxf);
ix = n1 * 2; ix = n1 * 2;
i = n1; i = n1;
} } else {
else
{
maxf = CABS1(x, 0); maxf = CABS1(x, 0);
ix += 2; ix += 2;
i++; i++;
} }
while(i < n) while (i < n) {
{ if (CABS1(x, ix) > maxf) {
if( CABS1(x,ix) > maxf )
{
max = i; max = i;
maxf = CABS1(x, ix); maxf = CABS1(x, ix);
} }
@ -291,13 +262,35 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
max = 0; max = 0;
maxf = CABS1(x, 0); maxf = CABS1(x, 0);
inc_x2 = 2 * inc_x; inc_x2 = 2 * inc_x;
ix += inc_x2;
i++;
while(i < n) BLASLONG n1 = n & -4;
{ while (i < n1) {
if( CABS1(x,ix) > maxf )
{ if (CABS1(x, ix) > maxf) {
max = i;
maxf = CABS1(x, ix);
}
if (CABS1(x, ix + inc_x2) > maxf) {
max = i + 1;
maxf = CABS1(x, ix + inc_x2);
}
if (CABS1(x, ix + 2 * inc_x2) > maxf) {
max = i + 2;
maxf = CABS1(x, ix + 2 * inc_x2);
}
if (CABS1(x, ix + 3 * inc_x2) > maxf) {
max = i + 3;
maxf = CABS1(x, ix + 3 * inc_x2);
}
ix += inc_x2 * 4;
i += 4;
}
while (i < n) {
if (CABS1(x, ix) > maxf) {
max = i; max = i;
maxf = CABS1(x, ix); maxf = CABS1(x, ix);
} }
@ -307,5 +300,3 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
return (max + 1); return (max + 1);
} }
} }

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2017, The OpenBLAS Project Copyright (c) 2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -28,26 +28,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>
#if defined(DOUBLE) #define CABS1(x,i) (fabsf(x[i]) + fabsf(x[i + 1]))
#define ABS fabs
#else
#define ABS fabsf
#endif
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))
static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) {
{
BLASLONG iamin; BLASLONG iamin;
__asm__ volatile ( __asm__("vlef %%v0,0(%[x]),0\n\t"
"vlef %%v0,0(%3),0 \n\t" "vlef %%v1,4(%[x]),0\n\t"
"vlef %%v1,4(%3),0 \n\t" "vlef %%v0,8(%[x]),1\n\t"
"vlef %%v0,8(%3),1 \n\t" "vlef %%v1,12(%[x]),1\n\t"
"vlef %%v1,12(%3),1 \n\t" "vlef %%v0,16(%[x]),2\n\t"
"vlef %%v0,16(%3),2 \n\t" "vlef %%v1,20(%[x]),2\n\t"
"vlef %%v1,20(%3),2 \n\t" "vlef %%v0,24(%[x]),3\n\t"
"vlef %%v0,24(%3),3 \n\t" "vlef %%v1,28(%[x]),3\n\t"
"vlef %%v1,28(%3),3 \n\t"
"vflpsb %%v0,%%v0\n\t" "vflpsb %%v0,%%v0\n\t"
"vflpsb %%v1,%%v1\n\t" "vflpsb %%v1,%%v1\n\t"
"vfasb %%v0,%%v0,%%v1\n\t" "vfasb %%v0,%%v0,%%v1\n\t"
@ -89,31 +82,26 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin)
"vleif %%v27,13,1\n\t" "vleif %%v27,13,1\n\t"
"vleif %%v27,14,2\n\t" "vleif %%v27,14,2\n\t"
"vleif %%v27,15,3\n\t" "vleif %%v27,15,3\n\t"
"srlg %%r0,%2,5 \n\t" "srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"0:\n\t" "0:\n\t"
"pfd 1, 1024(%%r1,%3) \n\t" "pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%3) \n\t" "vl %%v28,16(%%r1,%[x])\n\t"
"vl %%v28,16(%%r1,%3) \n\t"
"vpkg %%v17,%%v16,%%v28\n\t" "vpkg %%v17,%%v16,%%v28\n\t"
"vperm %%v16,%%v16,%%v28,%%v9\n\t" "vperm %%v16,%%v16,%%v28,%%v9\n\t"
"vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%3) \n\t" "vl %%v29,48(%%r1,%[x])\n\t"
"vl %%v29,48(%%r1,%3) \n\t"
"vpkg %%v19,%%v18,%%v29\n\t" "vpkg %%v19,%%v18,%%v29\n\t"
"vperm %%v18,%%v18,%%v29,%%v9\n\t" "vperm %%v18,%%v18,%%v29,%%v9\n\t"
"vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%3) \n\t" "vl %%v30,80(%%r1,%[x])\n\t"
"vl %%v30,80(%%r1,%3) \n\t"
"vpkg %%v21,%%v20,%%v30\n\t" "vpkg %%v21,%%v20,%%v30\n\t"
"vperm %%v20,%%v20,%%v30,%%v9\n\t" "vperm %%v20,%%v20,%%v30,%%v9\n\t"
"vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%3) \n\t" "vl %%v31,112(%%r1,%[x])\n\t"
"vl %%v31,112(%%r1,%3) \n\t"
"vpkg %%v23,%%v22,%%v31\n\t" "vpkg %%v23,%%v22,%%v31\n\t"
"vperm %%v22,%%v22,%%v31,%%v9\n\t" "vperm %%v22,%%v22,%%v31,%%v9\n\t"
"vflpsb %%v16, %%v16\n\t" "vflpsb %%v16, %%v16\n\t"
"vflpsb %%v17, %%v17\n\t" "vflpsb %%v17, %%v17\n\t"
"vflpsb %%v18, %%v18\n\t" "vflpsb %%v18, %%v18\n\t"
@ -126,14 +114,12 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin)
"vfasb %%v17,%%v18,%%v19\n\t" "vfasb %%v17,%%v18,%%v19\n\t"
"vfasb %%v18,%%v20,%%v21\n\t" "vfasb %%v18,%%v20,%%v21\n\t"
"vfasb %%v19,%%v22,%%v23\n\t" "vfasb %%v19,%%v22,%%v23\n\t"
"vfchesb %%v5,%%v17,%%v16\n\t" "vfchesb %%v5,%%v17,%%v16\n\t"
"vfchesb %%v6,%%v19,%%v18\n\t" "vfchesb %%v6,%%v19,%%v18\n\t"
"vsel %%v16,%%v16,%%v17,%%v5\n\t" "vsel %%v16,%%v16,%%v17,%%v5\n\t"
"vsel %%v5,%%v24,%%v25,%%v5\n\t" "vsel %%v5,%%v24,%%v25,%%v5\n\t"
"vsel %%v17,%%v18,%%v19,%%v6\n\t" "vsel %%v17,%%v18,%%v19,%%v6\n\t"
"vsel %%v6,%%v26,%%v27,%%v6\n\t" "vsel %%v6,%%v26,%%v27,%%v6\n\t"
"vfchesb %%v18,%%v17,%%v16\n\t" "vfchesb %%v18,%%v17,%%v16\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t" "vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v5,%%v5,%%v6,%%v18\n\t" "vsel %%v5,%%v5,%%v6,%%v18\n\t"
@ -141,7 +127,6 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin)
"vesrlg %%v5,%%v5,32\n\t" "vesrlg %%v5,%%v5,32\n\t"
"vag %%v5,%%v5,%%v4\n\t" "vag %%v5,%%v5,%%v4\n\t"
"vag %%v6,%%v6,%%v4\n\t" "vag %%v6,%%v6,%%v4\n\t"
"vfchesb %%v7,%%v16,%%v0\n\t" "vfchesb %%v7,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v7\n\t" "vsel %%v0,%%v0,%%v16,%%v7\n\t"
"vsegf %%v8,%%v7\n\t" "vsegf %%v8,%%v7\n\t"
@ -150,27 +135,22 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin)
"vsel %%v1,%%v1,%%v5,%%v7\n\t" "vsel %%v1,%%v1,%%v5,%%v7\n\t"
"vsel %%v2,%%v2,%%v6,%%v8\n\t" "vsel %%v2,%%v2,%%v6,%%v8\n\t"
"vag %%v4,%%v4,%%v3\n\t" "vag %%v4,%%v4,%%v3\n\t"
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v16,128(%%r1,%3) \n\t" "vl %%v28,144(%%r1,%[x])\n\t"
"vl %%v28,144(%%r1,%3) \n\t"
"vpkg %%v17,%%v16,%%v28\n\t" "vpkg %%v17,%%v16,%%v28\n\t"
"vperm %%v16,%%v16,%%v28,%%v9\n\t" "vperm %%v16,%%v16,%%v28,%%v9\n\t"
"vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v18,160(%%r1,%3) \n\t" "vl %%v29,176(%%r1,%[x])\n\t"
"vl %%v29,176(%%r1,%3) \n\t"
"vpkg %%v19,%%v18,%%v29\n\t" "vpkg %%v19,%%v18,%%v29\n\t"
"vperm %%v18,%%v18,%%v29,%%v9\n\t" "vperm %%v18,%%v18,%%v29,%%v9\n\t"
"vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v20,192(%%r1,%3) \n\t" "vl %%v30,208(%%r1,%[x])\n\t"
"vl %%v30,208(%%r1,%3) \n\t"
"vpkg %%v21,%%v20,%%v30\n\t" "vpkg %%v21,%%v20,%%v30\n\t"
"vperm %%v20,%%v20,%%v30,%%v9\n\t" "vperm %%v20,%%v20,%%v30,%%v9\n\t"
"vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v22,224(%%r1,%3) \n\t" "vl %%v31,240(%%r1,%[x])\n\t"
"vl %%v31,240(%%r1,%3) \n\t"
"vpkg %%v23,%%v22,%%v31\n\t" "vpkg %%v23,%%v22,%%v31\n\t"
"vperm %%v22,%%v22,%%v31,%%v9\n\t" "vperm %%v22,%%v22,%%v31,%%v9\n\t"
"vflpsb %%v16, %%v16\n\t" "vflpsb %%v16, %%v16\n\t"
"vflpsb %%v17, %%v17\n\t" "vflpsb %%v17, %%v17\n\t"
"vflpsb %%v18, %%v18\n\t" "vflpsb %%v18, %%v18\n\t"
@ -183,14 +163,12 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin)
"vfasb %%v17,%%v18,%%v19\n\t" "vfasb %%v17,%%v18,%%v19\n\t"
"vfasb %%v18,%%v20,%%v21\n\t" "vfasb %%v18,%%v20,%%v21\n\t"
"vfasb %%v19,%%v22,%%v23\n\t" "vfasb %%v19,%%v22,%%v23\n\t"
"vfchesb %%v5,%%v17,%%v16\n\t" "vfchesb %%v5,%%v17,%%v16\n\t"
"vfchesb %%v6,%%v19,%%v18\n\t" "vfchesb %%v6,%%v19,%%v18\n\t"
"vsel %%v16,%%v16,%%v17,%%v5\n\t" "vsel %%v16,%%v16,%%v17,%%v5\n\t"
"vsel %%v5,%%v24,%%v25,%%v5\n\t" "vsel %%v5,%%v24,%%v25,%%v5\n\t"
"vsel %%v17,%%v18,%%v19,%%v6\n\t" "vsel %%v17,%%v18,%%v19,%%v6\n\t"
"vsel %%v6,%%v26,%%v27,%%v6\n\t" "vsel %%v6,%%v26,%%v27,%%v6\n\t"
"vfchesb %%v18,%%v17,%%v16\n\t" "vfchesb %%v18,%%v17,%%v16\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t" "vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v5,%%v5,%%v6,%%v18\n\t" "vsel %%v5,%%v5,%%v6,%%v18\n\t"
@ -198,7 +176,6 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin)
"vesrlg %%v5,%%v5,32\n\t" "vesrlg %%v5,%%v5,32\n\t"
"vag %%v5,%%v5,%%v4\n\t" "vag %%v5,%%v5,%%v4\n\t"
"vag %%v6,%%v6,%%v4\n\t" "vag %%v6,%%v6,%%v4\n\t"
"vfchesb %%v7,%%v16,%%v0\n\t" "vfchesb %%v7,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v7\n\t" "vsel %%v0,%%v0,%%v16,%%v7\n\t"
"vsegf %%v8,%%v7\n\t" "vsegf %%v8,%%v7\n\t"
@ -207,10 +184,8 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin)
"vsel %%v1,%%v1,%%v5,%%v7\n\t" "vsel %%v1,%%v1,%%v5,%%v7\n\t"
"vsel %%v2,%%v2,%%v6,%%v8\n\t" "vsel %%v2,%%v2,%%v6,%%v8\n\t"
"vag %%v4,%%v4,%%v3\n\t" "vag %%v4,%%v4,%%v3\n\t"
"agfi %%r1, 256\n\t" "agfi %%r1, 256\n\t"
"brctg %%r0, 0b \n\t" "brctg %[n], 0b\n\t"
"veslg %%v3,%%v0,32\n\t" "veslg %%v3,%%v0,32\n\t"
"vfchsb %%v4,%%v3,%%v0\n\t" "vfchsb %%v4,%%v3,%%v0\n\t"
"vchlg %%v5,%%v2,%%v1\n\t" "vchlg %%v5,%%v2,%%v1\n\t"
@ -221,14 +196,13 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin)
"vesrlg %%v4,%%v4,32\n\t" "vesrlg %%v4,%%v4,32\n\t"
"vsegf %%v4,%%v4\n\t" "vsegf %%v4,%%v4\n\t"
"vsel %%v1,%%v1,%%v2,%%v4\n\t" "vsel %%v1,%%v1,%%v2,%%v4\n\t"
"vrepf %%v2,%%v0,2\n\t" "vrepf %%v2,%%v0,2\n\t"
"vrepg %%v3,%%v1,1\n\t" "vrepg %%v3,%%v1,1\n\t"
"wfcsb %%v2,%%v0\n\t" "wfcsb %%v2,%%v0\n\t"
"jne 1f\n\t" "jne 1f\n\t"
"vstef %%v0,%1,0 \n\t" "vstef %%v0,%[amin],0\n\t"
"vmnlg %%v0,%%v1,%%v3\n\t" "vmnlg %%v0,%%v1,%%v3\n\t"
"vlgvg %0,%%v0,0 \n\t" "vlgvg %[iamin],%%v0,0\n\t"
"j 2f\n\t" "j 2f\n\t"
"1:\n\t" "1:\n\t"
"wfchsb %%v4,%%v0,%%v2\n\t" "wfchsb %%v4,%%v0,%%v2\n\t"
@ -236,27 +210,28 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin)
"vsegf %%v4,%%v4\n\t" "vsegf %%v4,%%v4\n\t"
"vsel %%v1,%%v3,%%v1,%%v4\n\t" "vsel %%v1,%%v3,%%v1,%%v4\n\t"
"vsel %%v0,%%v2,%%v0,%%v4\n\t" "vsel %%v0,%%v2,%%v0,%%v4\n\t"
"ste %%f0,%1 \n\t" "ste %%f0,%[amin]\n\t"
"vlgvg %0,%%v1,0 \n\t" "vlgvg %[iamin],%%v1,0\n\t"
"2:\n\t" "2:\n\t"
"nop" "nop"
:"=r"(iamin),"=m"(*amin) : [iamin] "=r"(iamin),[amin] "=Q"(*amin),[n] "+&r"(n)
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x) : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8",
); "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
"v25", "v26", "v27", "v28", "v29", "v30", "v31");
return iamin; return iamin;
} }
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
{
BLASLONG i = 0; BLASLONG i = 0;
BLASLONG ix = 0; BLASLONG ix = 0;
FLOAT minf = 0; FLOAT minf = 0;
BLASLONG min = 0; BLASLONG min = 0;
BLASLONG inc_x2; BLASLONG inc_x2;
if (n <= 0 || inc_x <= 0) return(min); if (n <= 0 || inc_x <= 0)
return (min);
if (inc_x == 1) { if (inc_x == 1) {
@ -266,18 +241,14 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
min = icamin_kernel_32(n1, x, &minf); min = icamin_kernel_32(n1, x, &minf);
ix = n1 * 2; ix = n1 * 2;
i = n1; i = n1;
} } else {
else
{
minf = CABS1(x, 0); minf = CABS1(x, 0);
ix += 2; ix += 2;
i++; i++;
} }
while(i < n) while (i < n) {
{ if (CABS1(x, ix) < minf) {
if( CABS1(x,ix) < minf )
{
min = i; min = i;
minf = CABS1(x, ix); minf = CABS1(x, ix);
} }
@ -291,13 +262,35 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
min = 0; min = 0;
minf = CABS1(x, 0); minf = CABS1(x, 0);
inc_x2 = 2 * inc_x; inc_x2 = 2 * inc_x;
ix += inc_x2;
i++;
while(i < n) BLASLONG n1 = n & -4;
{ while (i < n1) {
if( CABS1(x,ix) < minf )
{ if (CABS1(x, ix) < minf) {
min = i;
minf = CABS1(x, ix);
}
if (CABS1(x, ix + inc_x2) < minf) {
min = i + 1;
minf = CABS1(x, ix + inc_x2);
}
if (CABS1(x, ix + 2 * inc_x2) < minf) {
min = i + 2;
minf = CABS1(x, ix + 2 * inc_x2);
}
if (CABS1(x, ix + 3 * inc_x2) < minf) {
min = i + 3;
minf = CABS1(x, ix + 3 * inc_x2);
}
ix += inc_x2 * 4;
i += 4;
}
while (i < n) {
if (CABS1(x, ix) < minf) {
min = i; min = i;
minf = CABS1(x, ix); minf = CABS1(x, ix);
} }
@ -307,5 +300,3 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
return (min + 1); return (min + 1);
} }
} }

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -28,18 +28,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>
#if defined(DOUBLE)
#define ABS fabs #define ABS fabs
#else
#define ABS fabsf
#endif
static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) {
{
BLASLONG iamax; BLASLONG iamax;
__asm__ volatile ( __asm__("vl %%v0,0(%[x])\n\t"
"vl %%v0,0(%3) \n\t"
"vflpdb %%v0,%%v0\n\t" "vflpdb %%v0,%%v0\n\t"
"vleig %%v1,0,0\n\t" "vleig %%v1,0,0\n\t"
"vleig %%v1,1,1\n\t" "vleig %%v1,1,1\n\t"
@ -61,19 +55,18 @@ static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax)
"vleig %%v30,13,1\n\t" "vleig %%v30,13,1\n\t"
"vleig %%v31,14,0\n\t" "vleig %%v31,14,0\n\t"
"vleig %%v31,15,1\n\t" "vleig %%v31,15,1\n\t"
"srlg %%r0,%2,5 \n\t" "srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"0:\n\t" "0:\n\t"
"pfd 1, 1024(%%r1,%3) \n\t" "pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%3) \n\t" "vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%3) \n\t" "vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%3) \n\t" "vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%3) \n\t" "vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%3) \n\t" "vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%3) \n\t" "vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%3) \n\t" "vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%3) \n\t"
"vflpdb %%v16, %%v16\n\t" "vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t" "vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t" "vflpdb %%v18, %%v18\n\t"
@ -82,7 +75,6 @@ static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax)
"vflpdb %%v21, %%v21\n\t" "vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t" "vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t" "vflpdb %%v23, %%v23\n\t"
"vfchedb %%v4,%%v16,%%v17\n\t" "vfchedb %%v4,%%v16,%%v17\n\t"
"vfchedb %%v5,%%v18,%%v19\n\t" "vfchedb %%v5,%%v18,%%v19\n\t"
"vfchedb %%v6,%%v20,%%v21\n\t" "vfchedb %%v6,%%v20,%%v21\n\t"
@ -95,32 +87,28 @@ static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax)
"vsel %%v6,%%v28,%%v29,%%v6\n\t" "vsel %%v6,%%v28,%%v29,%%v6\n\t"
"vsel %%v19,%%v22,%%v23,%%v7\n\t" "vsel %%v19,%%v22,%%v23,%%v7\n\t"
"vsel %%v7,%%v30,%%v31,%%v7\n\t" "vsel %%v7,%%v30,%%v31,%%v7\n\t"
"vfchedb %%v20,%%v16,%%v17\n\t" "vfchedb %%v20,%%v16,%%v17\n\t"
"vfchedb %%v21,%%v18,%%v19\n\t" "vfchedb %%v21,%%v18,%%v19\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t" "vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v4,%%v4,%%v5,%%v20\n\t" "vsel %%v4,%%v4,%%v5,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t" "vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v5,%%v6,%%v7,%%v21\n\t" "vsel %%v5,%%v6,%%v7,%%v21\n\t"
"vfchedb %%v18,%%v16,%%v17\n\t" "vfchedb %%v18,%%v16,%%v17\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t" "vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v4,%%v4,%%v5,%%v18\n\t" "vsel %%v4,%%v4,%%v5,%%v18\n\t"
"vag %%v4,%%v4,%%v3\n\t" "vag %%v4,%%v4,%%v3\n\t"
"vfchedb %%v5,%%v0,%%v16\n\t" "vfchedb %%v5,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v5\n\t" "vsel %%v0,%%v0,%%v16,%%v5\n\t"
"vsel %%v1,%%v1,%%v4,%%v5\n\t" "vsel %%v1,%%v1,%%v4,%%v5\n\t"
"vag %%v3,%%v3,%%v2\n\t" "vag %%v3,%%v3,%%v2\n\t"
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v16,128(%%r1,%3) \n\t" "vl %%v17,144(%%r1,%[x])\n\t"
"vl %%v17,144(%%r1,%3) \n\t" "vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v18,160(%%r1,%3) \n\t" "vl %%v19,176(%%r1,%[x])\n\t"
"vl %%v19,176(%%r1,%3) \n\t" "vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v20,192(%%r1,%3) \n\t" "vl %%v21,208(%%r1,%[x])\n\t"
"vl %%v21,208(%%r1,%3) \n\t" "vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v22,224(%%r1,%3) \n\t" "vl %%v23,240(%%r1,%[x])\n\t"
"vl %%v23,240(%%r1,%3) \n\t"
"vflpdb %%v16, %%v16\n\t" "vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t" "vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t" "vflpdb %%v18, %%v18\n\t"
@ -129,7 +117,6 @@ static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax)
"vflpdb %%v21, %%v21\n\t" "vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t" "vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t" "vflpdb %%v23, %%v23\n\t"
"vfchedb %%v4,%%v16,%%v17\n\t" "vfchedb %%v4,%%v16,%%v17\n\t"
"vfchedb %%v5,%%v18,%%v19\n\t" "vfchedb %%v5,%%v18,%%v19\n\t"
"vfchedb %%v6,%%v20,%%v21\n\t" "vfchedb %%v6,%%v20,%%v21\n\t"
@ -142,47 +129,43 @@ static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax)
"vsel %%v6,%%v28,%%v29,%%v6\n\t" "vsel %%v6,%%v28,%%v29,%%v6\n\t"
"vsel %%v19,%%v22,%%v23,%%v7\n\t" "vsel %%v19,%%v22,%%v23,%%v7\n\t"
"vsel %%v7,%%v30,%%v31,%%v7\n\t" "vsel %%v7,%%v30,%%v31,%%v7\n\t"
"vfchedb %%v20,%%v16,%%v17\n\t" "vfchedb %%v20,%%v16,%%v17\n\t"
"vfchedb %%v21,%%v18,%%v19\n\t" "vfchedb %%v21,%%v18,%%v19\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t" "vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v4,%%v4,%%v5,%%v20\n\t" "vsel %%v4,%%v4,%%v5,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t" "vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v5,%%v6,%%v7,%%v21\n\t" "vsel %%v5,%%v6,%%v7,%%v21\n\t"
"vfchedb %%v18,%%v16,%%v17\n\t" "vfchedb %%v18,%%v16,%%v17\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t" "vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v4,%%v4,%%v5,%%v18\n\t" "vsel %%v4,%%v4,%%v5,%%v18\n\t"
"vag %%v4,%%v4,%%v3\n\t" "vag %%v4,%%v4,%%v3\n\t"
"vfchedb %%v5,%%v0,%%v16\n\t" "vfchedb %%v5,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v5\n\t" "vsel %%v0,%%v0,%%v16,%%v5\n\t"
"vsel %%v1,%%v1,%%v4,%%v5\n\t" "vsel %%v1,%%v1,%%v4,%%v5\n\t"
"vag %%v3,%%v3,%%v2\n\t" "vag %%v3,%%v3,%%v2\n\t"
"agfi %%r1, 256\n\t" "agfi %%r1, 256\n\t"
"brctg %%r0, 0b \n\t" "brctg %[n], 0b\n\t"
"vrepg %%v2,%%v0,1\n\t" "vrepg %%v2,%%v0,1\n\t"
"vrepg %%v3,%%v1,1\n\t" "vrepg %%v3,%%v1,1\n\t"
"wfcdb %%v2,%%v0\n\t" "wfcdb %%v2,%%v0\n\t"
"jne 1f\n\t" "jne 1f\n\t"
"vsteg %%v0,%1,0 \n\t" "vsteg %%v0,%[amax],0\n\t"
"vmnlg %%v0,%%v1,%%v3\n\t" "vmnlg %%v0,%%v1,%%v3\n\t"
"vlgvg %0,%%v0,0 \n\t" "vlgvg %[iamax],%%v0,0\n\t"
"j 2f\n\t" "j 2f\n\t"
"1:\n\t" "1:\n\t"
"wfchdb %%v4,%%v2,%%v0\n\t" "wfchdb %%v4,%%v2,%%v0\n\t"
"vsel %%v1,%%v3,%%v1,%%v4\n\t" "vsel %%v1,%%v3,%%v1,%%v4\n\t"
"vsel %%v0,%%v2,%%v0,%%v4\n\t" "vsel %%v0,%%v2,%%v0,%%v4\n\t"
"std %%f0,%1 \n\t" "std %%f0,%[amax]\n\t"
"vlgvg %0,%%v1,0 \n\t" "vlgvg %[iamax],%%v1,0\n\t"
"2:\n\t" "2:\n\t"
"nop" "nop"
:"=r"(iamax),"=m"(*amax) : [iamax] "=r"(iamax),[amax] "=Q"(*amax),[n] "+&r"(n)
:"r"(n),"ZR"((const FLOAT (*)[n])x) : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
); "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");
return iamax; return iamax;
} }
@ -193,7 +176,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
FLOAT maxf = 0.0; FLOAT maxf = 0.0;
BLASLONG max = 0; BLASLONG max = 0;
if (n <= 0 || inc_x <= 0) return (max); if (n <= 0 || inc_x <= 0)
return (max);
if (inc_x == 1) { if (inc_x == 1) {
@ -203,9 +187,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
max = idamax_kernel_32(n1, x, &maxf); max = idamax_kernel_32(n1, x, &maxf);
i = n1; i = n1;
} } else {
else
{
maxf = ABS(x[0]); maxf = ABS(x[0]);
i++; i++;
} }
@ -250,7 +232,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
} }
while (j < n) { while (j < n) {
if (ABS(x[i]) > maxf) { if (ABS(x[i]) > maxf) {
max = j; max = j;

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -28,18 +28,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>
#if defined(DOUBLE)
#define ABS fabs #define ABS fabs
#else
#define ABS fabsf
#endif
static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) {
{
BLASLONG iamin; BLASLONG iamin;
__asm__ volatile ( __asm__("vl %%v0,0(%[x])\n\t"
"vl %%v0,0(%3) \n\t"
"vflpdb %%v0,%%v0\n\t" "vflpdb %%v0,%%v0\n\t"
"vleig %%v1,0,0\n\t" "vleig %%v1,0,0\n\t"
"vleig %%v1,1,1\n\t" "vleig %%v1,1,1\n\t"
@ -61,19 +55,18 @@ static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin)
"vleig %%v30,13,1\n\t" "vleig %%v30,13,1\n\t"
"vleig %%v31,14,0\n\t" "vleig %%v31,14,0\n\t"
"vleig %%v31,15,1\n\t" "vleig %%v31,15,1\n\t"
"srlg %%r0,%2,5 \n\t" "srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"0:\n\t" "0:\n\t"
"pfd 1, 1024(%%r1,%3) \n\t" "pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%3) \n\t" "vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%3) \n\t" "vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%3) \n\t" "vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%3) \n\t" "vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%3) \n\t" "vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%3) \n\t" "vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%3) \n\t" "vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%3) \n\t"
"vflpdb %%v16, %%v16\n\t" "vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t" "vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t" "vflpdb %%v18, %%v18\n\t"
@ -82,7 +75,6 @@ static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin)
"vflpdb %%v21, %%v21\n\t" "vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t" "vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t" "vflpdb %%v23, %%v23\n\t"
"vfchedb %%v4,%%v17,%%v16\n\t" "vfchedb %%v4,%%v17,%%v16\n\t"
"vfchedb %%v5,%%v19,%%v18\n\t" "vfchedb %%v5,%%v19,%%v18\n\t"
"vfchedb %%v6,%%v21,%%v20\n\t" "vfchedb %%v6,%%v21,%%v20\n\t"
@ -95,32 +87,28 @@ static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin)
"vsel %%v6,%%v28,%%v29,%%v6\n\t" "vsel %%v6,%%v28,%%v29,%%v6\n\t"
"vsel %%v19,%%v22,%%v23,%%v7\n\t" "vsel %%v19,%%v22,%%v23,%%v7\n\t"
"vsel %%v7,%%v30,%%v31,%%v7\n\t" "vsel %%v7,%%v30,%%v31,%%v7\n\t"
"vfchedb %%v20,%%v17,%%v16\n\t" "vfchedb %%v20,%%v17,%%v16\n\t"
"vfchedb %%v21,%%v19,%%v18\n\t" "vfchedb %%v21,%%v19,%%v18\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t" "vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v4,%%v4,%%v5,%%v20\n\t" "vsel %%v4,%%v4,%%v5,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t" "vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v5,%%v6,%%v7,%%v21\n\t" "vsel %%v5,%%v6,%%v7,%%v21\n\t"
"vfchedb %%v18,%%v17,%%v16\n\t" "vfchedb %%v18,%%v17,%%v16\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t" "vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v4,%%v4,%%v5,%%v18\n\t" "vsel %%v4,%%v4,%%v5,%%v18\n\t"
"vag %%v4,%%v4,%%v3\n\t" "vag %%v4,%%v4,%%v3\n\t"
"vfchedb %%v5,%%v16,%%v0\n\t" "vfchedb %%v5,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v5\n\t" "vsel %%v0,%%v0,%%v16,%%v5\n\t"
"vsel %%v1,%%v1,%%v4,%%v5\n\t" "vsel %%v1,%%v1,%%v4,%%v5\n\t"
"vag %%v3,%%v3,%%v2\n\t" "vag %%v3,%%v3,%%v2\n\t"
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v16,128(%%r1,%3) \n\t" "vl %%v17,144(%%r1,%[x])\n\t"
"vl %%v17,144(%%r1,%3) \n\t" "vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v18,160(%%r1,%3) \n\t" "vl %%v19,176(%%r1,%[x])\n\t"
"vl %%v19,176(%%r1,%3) \n\t" "vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v20,192(%%r1,%3) \n\t" "vl %%v21,208(%%r1,%[x])\n\t"
"vl %%v21,208(%%r1,%3) \n\t" "vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v22,224(%%r1,%3) \n\t" "vl %%v23,240(%%r1,%[x])\n\t"
"vl %%v23,240(%%r1,%3) \n\t"
"vflpdb %%v16, %%v16\n\t" "vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t" "vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t" "vflpdb %%v18, %%v18\n\t"
@ -129,7 +117,6 @@ static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin)
"vflpdb %%v21, %%v21\n\t" "vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t" "vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t" "vflpdb %%v23, %%v23\n\t"
"vfchedb %%v4,%%v17,%%v16\n\t" "vfchedb %%v4,%%v17,%%v16\n\t"
"vfchedb %%v5,%%v19,%%v18\n\t" "vfchedb %%v5,%%v19,%%v18\n\t"
"vfchedb %%v6,%%v21,%%v20\n\t" "vfchedb %%v6,%%v21,%%v20\n\t"
@ -142,47 +129,43 @@ static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin)
"vsel %%v6,%%v28,%%v29,%%v6\n\t" "vsel %%v6,%%v28,%%v29,%%v6\n\t"
"vsel %%v19,%%v22,%%v23,%%v7\n\t" "vsel %%v19,%%v22,%%v23,%%v7\n\t"
"vsel %%v7,%%v30,%%v31,%%v7\n\t" "vsel %%v7,%%v30,%%v31,%%v7\n\t"
"vfchedb %%v20,%%v17,%%v16\n\t" "vfchedb %%v20,%%v17,%%v16\n\t"
"vfchedb %%v21,%%v19,%%v18\n\t" "vfchedb %%v21,%%v19,%%v18\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t" "vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v4,%%v4,%%v5,%%v20\n\t" "vsel %%v4,%%v4,%%v5,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t" "vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v5,%%v6,%%v7,%%v21\n\t" "vsel %%v5,%%v6,%%v7,%%v21\n\t"
"vfchedb %%v18,%%v17,%%v16\n\t" "vfchedb %%v18,%%v17,%%v16\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t" "vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v4,%%v4,%%v5,%%v18\n\t" "vsel %%v4,%%v4,%%v5,%%v18\n\t"
"vag %%v4,%%v4,%%v3\n\t" "vag %%v4,%%v4,%%v3\n\t"
"vfchedb %%v5,%%v16,%%v0\n\t" "vfchedb %%v5,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v5\n\t" "vsel %%v0,%%v0,%%v16,%%v5\n\t"
"vsel %%v1,%%v1,%%v4,%%v5\n\t" "vsel %%v1,%%v1,%%v4,%%v5\n\t"
"vag %%v3,%%v3,%%v2\n\t" "vag %%v3,%%v3,%%v2\n\t"
"agfi %%r1, 256\n\t" "agfi %%r1, 256\n\t"
"brctg %%r0, 0b \n\t" "brctg %[n], 0b\n\t"
"vrepg %%v2,%%v0,1\n\t" "vrepg %%v2,%%v0,1\n\t"
"vrepg %%v3,%%v1,1\n\t" "vrepg %%v3,%%v1,1\n\t"
"wfcdb %%v2,%%v0\n\t" "wfcdb %%v2,%%v0\n\t"
"jne 1f\n\t" "jne 1f\n\t"
"vsteg %%v0,%1,0 \n\t" "vsteg %%v0,%[amin],0\n\t"
"vmnlg %%v0,%%v1,%%v3\n\t" "vmnlg %%v0,%%v1,%%v3\n\t"
"vlgvg %0,%%v0,0 \n\t" "vlgvg %[iamin],%%v0,0\n\t"
"j 2f\n\t" "j 2f\n\t"
"1:\n\t" "1:\n\t"
"wfchdb %%v4,%%v0,%%v2\n\t" "wfchdb %%v4,%%v0,%%v2\n\t"
"vsel %%v1,%%v3,%%v1,%%v4\n\t" "vsel %%v1,%%v3,%%v1,%%v4\n\t"
"vsel %%v0,%%v2,%%v0,%%v4\n\t" "vsel %%v0,%%v2,%%v0,%%v4\n\t"
"std %%f0,%1 \n\t" "std %%f0,%[amin]\n\t"
"vlgvg %0,%%v1,0 \n\t" "vlgvg %[iamin],%%v1,0\n\t"
"2:\n\t" "2:\n\t"
"nop" "nop"
:"=r"(iamin),"=m"(*amin) : [iamin] "=r"(iamin),[amin] "=Q"(*amin),[n] "+&r"(n)
:"r"(n),"ZR"((const FLOAT (*)[n])x) : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
); "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");
return iamin; return iamin;
} }
@ -193,7 +176,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
FLOAT minf = 0.0; FLOAT minf = 0.0;
BLASLONG min = 0; BLASLONG min = 0;
if (n <= 0 || inc_x <= 0) return (min); if (n <= 0 || inc_x <= 0)
return (min);
if (inc_x == 1) { if (inc_x == 1) {
@ -203,9 +187,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
min = idamin_kernel_32(n1, x, &minf); min = idamin_kernel_32(n1, x, &minf);
i = n1; i = n1;
} } else {
else
{
minf = ABS(x[0]); minf = ABS(x[0]);
i++; i++;
} }
@ -250,7 +232,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
} }
while (j < n) { while (j < n) {
if (ABS(x[i]) < minf) { if (ABS(x[i]) < minf) {
min = j; min = j;

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -27,12 +27,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) {
{
BLASLONG imax; BLASLONG imax;
__asm__ volatile ( __asm__("vl %%v0,0(%[x])\n\t"
"vl %%v0,0(%3) \n\t"
"vleig %%v1,0,0\n\t" "vleig %%v1,0,0\n\t"
"vleig %%v1,1,1\n\t" "vleig %%v1,1,1\n\t"
"vrepig %%v2,16\n\t" "vrepig %%v2,16\n\t"
@ -53,20 +51,18 @@ static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max)
"vleig %%v30,13,1\n\t" "vleig %%v30,13,1\n\t"
"vleig %%v31,14,0\n\t" "vleig %%v31,14,0\n\t"
"vleig %%v31,15,1\n\t" "vleig %%v31,15,1\n\t"
"srlg %%r0,%2,5 \n\t" "srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"0:\n\t" "0:\n\t"
"pfd 1, 1024(%%r1,%3) \n\t" "pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%3) \n\t" "vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%3) \n\t" "vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%3) \n\t" "vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%3) \n\t" "vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%3) \n\t" "vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%3) \n\t" "vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%3) \n\t" "vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%3) \n\t"
"vfchedb %%v4,%%v16,%%v17\n\t" "vfchedb %%v4,%%v16,%%v17\n\t"
"vfchedb %%v5,%%v18,%%v19\n\t" "vfchedb %%v5,%%v18,%%v19\n\t"
"vfchedb %%v6,%%v20,%%v21\n\t" "vfchedb %%v6,%%v20,%%v21\n\t"
@ -79,33 +75,28 @@ static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max)
"vsel %%v6,%%v28,%%v29,%%v6\n\t" "vsel %%v6,%%v28,%%v29,%%v6\n\t"
"vsel %%v19,%%v22,%%v23,%%v7\n\t" "vsel %%v19,%%v22,%%v23,%%v7\n\t"
"vsel %%v7,%%v30,%%v31,%%v7\n\t" "vsel %%v7,%%v30,%%v31,%%v7\n\t"
"vfchedb %%v20,%%v16,%%v17\n\t" "vfchedb %%v20,%%v16,%%v17\n\t"
"vfchedb %%v21,%%v18,%%v19\n\t" "vfchedb %%v21,%%v18,%%v19\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t" "vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v4,%%v4,%%v5,%%v20\n\t" "vsel %%v4,%%v4,%%v5,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t" "vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v5,%%v6,%%v7,%%v21\n\t" "vsel %%v5,%%v6,%%v7,%%v21\n\t"
"vfchedb %%v18,%%v16,%%v17\n\t" "vfchedb %%v18,%%v16,%%v17\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t" "vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v4,%%v4,%%v5,%%v18\n\t" "vsel %%v4,%%v4,%%v5,%%v18\n\t"
"vag %%v4,%%v4,%%v3\n\t" "vag %%v4,%%v4,%%v3\n\t"
"vfchedb %%v5,%%v0,%%v16\n\t" "vfchedb %%v5,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v5\n\t" "vsel %%v0,%%v0,%%v16,%%v5\n\t"
"vsel %%v1,%%v1,%%v4,%%v5\n\t" "vsel %%v1,%%v1,%%v4,%%v5\n\t"
"vag %%v3,%%v3,%%v2\n\t" "vag %%v3,%%v3,%%v2\n\t"
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v16,128(%%r1,%3) \n\t" "vl %%v17,144(%%r1,%[x])\n\t"
"vl %%v17,144(%%r1,%3) \n\t" "vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v18,160(%%r1,%3) \n\t" "vl %%v19,176(%%r1,%[x])\n\t"
"vl %%v19,176(%%r1,%3) \n\t" "vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v20,192(%%r1,%3) \n\t" "vl %%v21,208(%%r1,%[x])\n\t"
"vl %%v21,208(%%r1,%3) \n\t" "vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v22,224(%%r1,%3) \n\t" "vl %%v23,240(%%r1,%[x])\n\t"
"vl %%v23,240(%%r1,%3) \n\t"
"vfchedb %%v4,%%v16,%%v17\n\t" "vfchedb %%v4,%%v16,%%v17\n\t"
"vfchedb %%v5,%%v18,%%v19\n\t" "vfchedb %%v5,%%v18,%%v19\n\t"
"vfchedb %%v6,%%v20,%%v21\n\t" "vfchedb %%v6,%%v20,%%v21\n\t"
@ -118,47 +109,43 @@ static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max)
"vsel %%v6,%%v28,%%v29,%%v6\n\t" "vsel %%v6,%%v28,%%v29,%%v6\n\t"
"vsel %%v19,%%v22,%%v23,%%v7\n\t" "vsel %%v19,%%v22,%%v23,%%v7\n\t"
"vsel %%v7,%%v30,%%v31,%%v7\n\t" "vsel %%v7,%%v30,%%v31,%%v7\n\t"
"vfchedb %%v20,%%v16,%%v17\n\t" "vfchedb %%v20,%%v16,%%v17\n\t"
"vfchedb %%v21,%%v18,%%v19\n\t" "vfchedb %%v21,%%v18,%%v19\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t" "vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v4,%%v4,%%v5,%%v20\n\t" "vsel %%v4,%%v4,%%v5,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t" "vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v5,%%v6,%%v7,%%v21\n\t" "vsel %%v5,%%v6,%%v7,%%v21\n\t"
"vfchedb %%v18,%%v16,%%v17\n\t" "vfchedb %%v18,%%v16,%%v17\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t" "vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v4,%%v4,%%v5,%%v18\n\t" "vsel %%v4,%%v4,%%v5,%%v18\n\t"
"vag %%v4,%%v4,%%v3\n\t" "vag %%v4,%%v4,%%v3\n\t"
"vfchedb %%v5,%%v0,%%v16\n\t" "vfchedb %%v5,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v5\n\t" "vsel %%v0,%%v0,%%v16,%%v5\n\t"
"vsel %%v1,%%v1,%%v4,%%v5\n\t" "vsel %%v1,%%v1,%%v4,%%v5\n\t"
"vag %%v3,%%v3,%%v2\n\t" "vag %%v3,%%v3,%%v2\n\t"
"agfi %%r1, 256\n\t" "agfi %%r1, 256\n\t"
"brctg %%r0, 0b \n\t" "brctg %[n], 0b\n\t"
"vrepg %%v2,%%v0,1\n\t" "vrepg %%v2,%%v0,1\n\t"
"vrepg %%v3,%%v1,1\n\t" "vrepg %%v3,%%v1,1\n\t"
"wfcdb %%v2,%%v0\n\t" "wfcdb %%v2,%%v0\n\t"
"jne 1f\n\t" "jne 1f\n\t"
"vsteg %%v0,%1,0 \n\t" "vsteg %%v0,%[max],0\n\t"
"vmnlg %%v0,%%v1,%%v3\n\t" "vmnlg %%v0,%%v1,%%v3\n\t"
"vlgvg %0,%%v0,0 \n\t" "vlgvg %[imax],%%v0,0\n\t"
"j 2f\n\t" "j 2f\n\t"
"1:\n\t" "1:\n\t"
"wfchdb %%v4,%%v2,%%v0\n\t" "wfchdb %%v4,%%v2,%%v0\n\t"
"vsel %%v1,%%v3,%%v1,%%v4\n\t" "vsel %%v1,%%v3,%%v1,%%v4\n\t"
"vsel %%v0,%%v2,%%v0,%%v4\n\t" "vsel %%v0,%%v2,%%v0,%%v4\n\t"
"std %%f0,%1 \n\t" "std %%f0,%[max]\n\t"
"vlgvg %0,%%v1,0 \n\t" "vlgvg %[imax],%%v1,0\n\t"
"2:\n\t" "2:\n\t"
"nop" "nop"
:"=r"(imax),"=m"(*max) : [imax] "=r"(imax),[max] "=Q"(*max),[n] "+&r"(n)
:"r"(n),"ZR"((const FLOAT (*)[n])x) : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
); "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");
return imax; return imax;
} }
@ -169,7 +156,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
FLOAT maxf = 0.0; FLOAT maxf = 0.0;
BLASLONG max = 0; BLASLONG max = 0;
if (n <= 0 || inc_x <= 0) return (max); if (n <= 0 || inc_x <= 0)
return (max);
if (inc_x == 1) { if (inc_x == 1) {
@ -179,9 +167,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
max = idmax_kernel_32(n1, x, &maxf); max = idmax_kernel_32(n1, x, &maxf);
i = n1; i = n1;
} } else {
else
{
maxf = x[0]; maxf = x[0];
i++; i++;
} }
@ -226,7 +212,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
} }
while (j < n) { while (j < n) {
if (x[i] > maxf) { if (x[i] > maxf) {
max = j; max = j;

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -27,12 +27,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) {
{
BLASLONG imin; BLASLONG imin;
__asm__ volatile ( __asm__("vl %%v0,0(%[x])\n\t"
"vl %%v0,0(%3) \n\t"
"vleig %%v1,0,0\n\t" "vleig %%v1,0,0\n\t"
"vleig %%v1,1,1\n\t" "vleig %%v1,1,1\n\t"
"vrepig %%v2,16\n\t" "vrepig %%v2,16\n\t"
@ -53,20 +51,18 @@ static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min)
"vleig %%v30,13,1\n\t" "vleig %%v30,13,1\n\t"
"vleig %%v31,14,0\n\t" "vleig %%v31,14,0\n\t"
"vleig %%v31,15,1\n\t" "vleig %%v31,15,1\n\t"
"srlg %%r0,%2,5 \n\t" "srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"0:\n\t" "0:\n\t"
"pfd 1, 1024(%%r1,%3) \n\t" "pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%3) \n\t" "vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%3) \n\t" "vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%3) \n\t" "vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%3) \n\t" "vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%3) \n\t" "vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%3) \n\t" "vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%3) \n\t" "vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%3) \n\t"
"vfchedb %%v4,%%v17,%%v16\n\t" "vfchedb %%v4,%%v17,%%v16\n\t"
"vfchedb %%v5,%%v19,%%v18\n\t" "vfchedb %%v5,%%v19,%%v18\n\t"
"vfchedb %%v6,%%v21,%%v20\n\t" "vfchedb %%v6,%%v21,%%v20\n\t"
@ -79,33 +75,28 @@ static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min)
"vsel %%v6,%%v28,%%v29,%%v6\n\t" "vsel %%v6,%%v28,%%v29,%%v6\n\t"
"vsel %%v19,%%v22,%%v23,%%v7\n\t" "vsel %%v19,%%v22,%%v23,%%v7\n\t"
"vsel %%v7,%%v30,%%v31,%%v7\n\t" "vsel %%v7,%%v30,%%v31,%%v7\n\t"
"vfchedb %%v20,%%v17,%%v16\n\t" "vfchedb %%v20,%%v17,%%v16\n\t"
"vfchedb %%v21,%%v19,%%v18\n\t" "vfchedb %%v21,%%v19,%%v18\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t" "vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v4,%%v4,%%v5,%%v20\n\t" "vsel %%v4,%%v4,%%v5,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t" "vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v5,%%v6,%%v7,%%v21\n\t" "vsel %%v5,%%v6,%%v7,%%v21\n\t"
"vfchedb %%v18,%%v17,%%v16\n\t" "vfchedb %%v18,%%v17,%%v16\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t" "vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v4,%%v4,%%v5,%%v18\n\t" "vsel %%v4,%%v4,%%v5,%%v18\n\t"
"vag %%v4,%%v4,%%v3\n\t" "vag %%v4,%%v4,%%v3\n\t"
"vfchedb %%v5,%%v16,%%v0\n\t" "vfchedb %%v5,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v5\n\t" "vsel %%v0,%%v0,%%v16,%%v5\n\t"
"vsel %%v1,%%v1,%%v4,%%v5\n\t" "vsel %%v1,%%v1,%%v4,%%v5\n\t"
"vag %%v3,%%v3,%%v2\n\t" "vag %%v3,%%v3,%%v2\n\t"
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v16,128(%%r1,%3) \n\t" "vl %%v17,144(%%r1,%[x])\n\t"
"vl %%v17,144(%%r1,%3) \n\t" "vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v18,160(%%r1,%3) \n\t" "vl %%v19,176(%%r1,%[x])\n\t"
"vl %%v19,176(%%r1,%3) \n\t" "vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v20,192(%%r1,%3) \n\t" "vl %%v21,208(%%r1,%[x])\n\t"
"vl %%v21,208(%%r1,%3) \n\t" "vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v22,224(%%r1,%3) \n\t" "vl %%v23,240(%%r1,%[x])\n\t"
"vl %%v23,240(%%r1,%3) \n\t"
"vfchedb %%v4,%%v17,%%v16\n\t" "vfchedb %%v4,%%v17,%%v16\n\t"
"vfchedb %%v5,%%v19,%%v18\n\t" "vfchedb %%v5,%%v19,%%v18\n\t"
"vfchedb %%v6,%%v21,%%v20\n\t" "vfchedb %%v6,%%v21,%%v20\n\t"
@ -118,47 +109,43 @@ static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min)
"vsel %%v6,%%v28,%%v29,%%v6\n\t" "vsel %%v6,%%v28,%%v29,%%v6\n\t"
"vsel %%v19,%%v22,%%v23,%%v7\n\t" "vsel %%v19,%%v22,%%v23,%%v7\n\t"
"vsel %%v7,%%v30,%%v31,%%v7\n\t" "vsel %%v7,%%v30,%%v31,%%v7\n\t"
"vfchedb %%v20,%%v17,%%v16\n\t" "vfchedb %%v20,%%v17,%%v16\n\t"
"vfchedb %%v21,%%v19,%%v18\n\t" "vfchedb %%v21,%%v19,%%v18\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t" "vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v4,%%v4,%%v5,%%v20\n\t" "vsel %%v4,%%v4,%%v5,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t" "vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v5,%%v6,%%v7,%%v21\n\t" "vsel %%v5,%%v6,%%v7,%%v21\n\t"
"vfchedb %%v18,%%v17,%%v16\n\t" "vfchedb %%v18,%%v17,%%v16\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t" "vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v4,%%v4,%%v5,%%v18\n\t" "vsel %%v4,%%v4,%%v5,%%v18\n\t"
"vag %%v4,%%v4,%%v3\n\t" "vag %%v4,%%v4,%%v3\n\t"
"vfchedb %%v5,%%v16,%%v0\n\t" "vfchedb %%v5,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v5\n\t" "vsel %%v0,%%v0,%%v16,%%v5\n\t"
"vsel %%v1,%%v1,%%v4,%%v5\n\t" "vsel %%v1,%%v1,%%v4,%%v5\n\t"
"vag %%v3,%%v3,%%v2\n\t" "vag %%v3,%%v3,%%v2\n\t"
"agfi %%r1, 256\n\t" "agfi %%r1, 256\n\t"
"brctg %%r0, 0b \n\t" "brctg %[n], 0b\n\t"
"vrepg %%v2,%%v0,1\n\t" "vrepg %%v2,%%v0,1\n\t"
"vrepg %%v3,%%v1,1\n\t" "vrepg %%v3,%%v1,1\n\t"
"wfcdb %%v2,%%v0\n\t" "wfcdb %%v2,%%v0\n\t"
"jne 1f\n\t" "jne 1f\n\t"
"vsteg %%v0,%1,0 \n\t" "vsteg %%v0,%[min],0\n\t"
"vmnlg %%v0,%%v1,%%v3\n\t" "vmnlg %%v0,%%v1,%%v3\n\t"
"vlgvg %0,%%v0,0 \n\t" "vlgvg %[imin],%%v0,0\n\t"
"j 2f\n\t" "j 2f\n\t"
"1:\n\t" "1:\n\t"
"wfchdb %%v4,%%v0,%%v2\n\t" "wfchdb %%v4,%%v0,%%v2\n\t"
"vsel %%v1,%%v3,%%v1,%%v4\n\t" "vsel %%v1,%%v3,%%v1,%%v4\n\t"
"vsel %%v0,%%v2,%%v0,%%v4\n\t" "vsel %%v0,%%v2,%%v0,%%v4\n\t"
"std %%f0,%1 \n\t" "std %%f0,%[min]\n\t"
"vlgvg %0,%%v1,0 \n\t" "vlgvg %[imin],%%v1,0\n\t"
"2:\n\t" "2:\n\t"
"nop" "nop"
:"=r"(imin),"=m"(*min) : [imin] "=r"(imin),[min] "=Q"(*min),[n] "+&r"(n)
:"r"(n),"ZR"((const FLOAT (*)[n])x) : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
); "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");
return imin; return imin;
} }
@ -169,7 +156,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
FLOAT minf = 0.0; FLOAT minf = 0.0;
BLASLONG min = 0; BLASLONG min = 0;
if (n <= 0 || inc_x <= 0) return (min); if (n <= 0 || inc_x <= 0)
return (min);
if (inc_x == 1) { if (inc_x == 1) {
@ -179,9 +167,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
min = idmin_kernel_32(n1, x, &minf); min = idmin_kernel_32(n1, x, &minf);
i = n1; i = n1;
} } else {
else
{
minf = x[0]; minf = x[0];
i++; i++;
} }
@ -226,7 +212,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
} }
while (j < n) { while (j < n) {
if (x[i] < minf) { if (x[i] < minf) {
min = j; min = j;

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -28,18 +28,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf #define ABS fabsf
#endif
static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax) static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax) {
{
BLASLONG iamax; BLASLONG iamax;
__asm__ volatile ( __asm__("vl %%v0,0(%[x])\n\t"
"vl %%v0,0(%3) \n\t"
"vflpsb %%v0,%%v0\n\t" "vflpsb %%v0,%%v0\n\t"
"vleig %%v1,0,0\n\t" "vleig %%v1,0,0\n\t"
"vleig %%v1,2,1\n\t" "vleig %%v1,2,1\n\t"
@ -79,19 +73,18 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax)
"vleif %%v31,29,1\n\t" "vleif %%v31,29,1\n\t"
"vleif %%v31,30,2\n\t" "vleif %%v31,30,2\n\t"
"vleif %%v31,31,3\n\t" "vleif %%v31,31,3\n\t"
"srlg %%r0,%2,6 \n\t" "srlg %[n],%[n],6\n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"0:\n\t" "0:\n\t"
"pfd 1, 1024(%%r1,%3) \n\t" "pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%3) \n\t" "vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%3) \n\t" "vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%3) \n\t" "vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%3) \n\t" "vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%3) \n\t" "vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%3) \n\t" "vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%3) \n\t" "vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%3) \n\t"
"vflpsb %%v16, %%v16\n\t" "vflpsb %%v16, %%v16\n\t"
"vflpsb %%v17, %%v17\n\t" "vflpsb %%v17, %%v17\n\t"
"vflpsb %%v18, %%v18\n\t" "vflpsb %%v18, %%v18\n\t"
@ -100,7 +93,6 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax)
"vflpsb %%v21, %%v21\n\t" "vflpsb %%v21, %%v21\n\t"
"vflpsb %%v22, %%v22\n\t" "vflpsb %%v22, %%v22\n\t"
"vflpsb %%v23, %%v23\n\t" "vflpsb %%v23, %%v23\n\t"
"vfchesb %%v5,%%v16,%%v17\n\t" "vfchesb %%v5,%%v16,%%v17\n\t"
"vfchesb %%v6,%%v18,%%v19\n\t" "vfchesb %%v6,%%v18,%%v19\n\t"
"vfchesb %%v7,%%v20,%%v21\n\t" "vfchesb %%v7,%%v20,%%v21\n\t"
@ -113,14 +105,12 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax)
"vsel %%v7,%%v28,%%v29,%%v7\n\t" "vsel %%v7,%%v28,%%v29,%%v7\n\t"
"vsel %%v19,%%v22,%%v23,%%v8\n\t" "vsel %%v19,%%v22,%%v23,%%v8\n\t"
"vsel %%v8,%%v30,%%v31,%%v8\n\t" "vsel %%v8,%%v30,%%v31,%%v8\n\t"
"vfchesb %%v20,%%v16,%%v17\n\t" "vfchesb %%v20,%%v16,%%v17\n\t"
"vfchesb %%v21,%%v18,%%v19\n\t" "vfchesb %%v21,%%v18,%%v19\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t" "vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v5,%%v5,%%v6,%%v20\n\t" "vsel %%v5,%%v5,%%v6,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t" "vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v6,%%v7,%%v8,%%v21\n\t" "vsel %%v6,%%v7,%%v8,%%v21\n\t"
"vfchesb %%v18,%%v16,%%v17\n\t" "vfchesb %%v18,%%v16,%%v17\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t" "vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v5,%%v5,%%v6,%%v18\n\t" "vsel %%v5,%%v5,%%v6,%%v18\n\t"
@ -128,7 +118,6 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax)
"vesrlg %%v5,%%v5,32\n\t" "vesrlg %%v5,%%v5,32\n\t"
"vag %%v5,%%v5,%%v4\n\t" "vag %%v5,%%v5,%%v4\n\t"
"vag %%v6,%%v6,%%v4\n\t" "vag %%v6,%%v6,%%v4\n\t"
"vfchesb %%v7,%%v0,%%v16\n\t" "vfchesb %%v7,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v7\n\t" "vsel %%v0,%%v0,%%v16,%%v7\n\t"
"vsegf %%v8,%%v7\n\t" "vsegf %%v8,%%v7\n\t"
@ -137,15 +126,14 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax)
"vsel %%v1,%%v1,%%v5,%%v7\n\t" "vsel %%v1,%%v1,%%v5,%%v7\n\t"
"vsel %%v2,%%v2,%%v6,%%v8\n\t" "vsel %%v2,%%v2,%%v6,%%v8\n\t"
"vag %%v4,%%v4,%%v3\n\t" "vag %%v4,%%v4,%%v3\n\t"
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v16,128(%%r1,%3) \n\t" "vl %%v17,144(%%r1,%[x])\n\t"
"vl %%v17,144(%%r1,%3) \n\t" "vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v18,160(%%r1,%3) \n\t" "vl %%v19,176(%%r1,%[x])\n\t"
"vl %%v19,176(%%r1,%3) \n\t" "vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v20,192(%%r1,%3) \n\t" "vl %%v21,208(%%r1,%[x])\n\t"
"vl %%v21,208(%%r1,%3) \n\t" "vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v22,224(%%r1,%3) \n\t" "vl %%v23,240(%%r1,%[x])\n\t"
"vl %%v23,240(%%r1,%3) \n\t"
"vflpsb %%v16, %%v16\n\t" "vflpsb %%v16, %%v16\n\t"
"vflpsb %%v17, %%v17\n\t" "vflpsb %%v17, %%v17\n\t"
"vflpsb %%v18, %%v18\n\t" "vflpsb %%v18, %%v18\n\t"
@ -154,7 +142,6 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax)
"vflpsb %%v21, %%v21\n\t" "vflpsb %%v21, %%v21\n\t"
"vflpsb %%v22, %%v22\n\t" "vflpsb %%v22, %%v22\n\t"
"vflpsb %%v23, %%v23\n\t" "vflpsb %%v23, %%v23\n\t"
"vfchesb %%v5,%%v16,%%v17\n\t" "vfchesb %%v5,%%v16,%%v17\n\t"
"vfchesb %%v6,%%v18,%%v19\n\t" "vfchesb %%v6,%%v18,%%v19\n\t"
"vfchesb %%v7,%%v20,%%v21\n\t" "vfchesb %%v7,%%v20,%%v21\n\t"
@ -167,14 +154,12 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax)
"vsel %%v7,%%v28,%%v29,%%v7\n\t" "vsel %%v7,%%v28,%%v29,%%v7\n\t"
"vsel %%v19,%%v22,%%v23,%%v8\n\t" "vsel %%v19,%%v22,%%v23,%%v8\n\t"
"vsel %%v8,%%v30,%%v31,%%v8\n\t" "vsel %%v8,%%v30,%%v31,%%v8\n\t"
"vfchesb %%v20,%%v16,%%v17\n\t" "vfchesb %%v20,%%v16,%%v17\n\t"
"vfchesb %%v21,%%v18,%%v19\n\t" "vfchesb %%v21,%%v18,%%v19\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t" "vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v5,%%v5,%%v6,%%v20\n\t" "vsel %%v5,%%v5,%%v6,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t" "vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v6,%%v7,%%v8,%%v21\n\t" "vsel %%v6,%%v7,%%v8,%%v21\n\t"
"vfchesb %%v18,%%v16,%%v17\n\t" "vfchesb %%v18,%%v16,%%v17\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t" "vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v5,%%v5,%%v6,%%v18\n\t" "vsel %%v5,%%v5,%%v6,%%v18\n\t"
@ -182,7 +167,6 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax)
"vesrlg %%v5,%%v5,32\n\t" "vesrlg %%v5,%%v5,32\n\t"
"vag %%v5,%%v5,%%v4\n\t" "vag %%v5,%%v5,%%v4\n\t"
"vag %%v6,%%v6,%%v4\n\t" "vag %%v6,%%v6,%%v4\n\t"
"vfchesb %%v7,%%v0,%%v16\n\t" "vfchesb %%v7,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v7\n\t" "vsel %%v0,%%v0,%%v16,%%v7\n\t"
"vsegf %%v8,%%v7\n\t" "vsegf %%v8,%%v7\n\t"
@ -191,10 +175,8 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax)
"vsel %%v1,%%v1,%%v5,%%v7\n\t" "vsel %%v1,%%v1,%%v5,%%v7\n\t"
"vsel %%v2,%%v2,%%v6,%%v8\n\t" "vsel %%v2,%%v2,%%v6,%%v8\n\t"
"vag %%v4,%%v4,%%v3\n\t" "vag %%v4,%%v4,%%v3\n\t"
"agfi %%r1, 256\n\t" "agfi %%r1, 256\n\t"
"brctg %%r0, 0b \n\t" "brctg %[n], 0b\n\t"
"veslg %%v3,%%v0,32\n\t" "veslg %%v3,%%v0,32\n\t"
"vfchsb %%v4,%%v0,%%v3\n\t" "vfchsb %%v4,%%v0,%%v3\n\t"
"vchlg %%v5,%%v2,%%v1\n\t" "vchlg %%v5,%%v2,%%v1\n\t"
@ -205,14 +187,13 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax)
"vesrlg %%v4,%%v4,32\n\t" "vesrlg %%v4,%%v4,32\n\t"
"vsegf %%v4,%%v4\n\t" "vsegf %%v4,%%v4\n\t"
"vsel %%v1,%%v1,%%v2,%%v4\n\t" "vsel %%v1,%%v1,%%v2,%%v4\n\t"
"vrepf %%v2,%%v0,2\n\t" "vrepf %%v2,%%v0,2\n\t"
"vrepg %%v3,%%v1,1\n\t" "vrepg %%v3,%%v1,1\n\t"
"wfcsb %%v2,%%v0\n\t" "wfcsb %%v2,%%v0\n\t"
"jne 1f\n\t" "jne 1f\n\t"
"vstef %%v0,%1,0 \n\t" "vstef %%v0,%[amax],0\n\t"
"vmnlg %%v0,%%v1,%%v3\n\t" "vmnlg %%v0,%%v1,%%v3\n\t"
"vlgvg %0,%%v0,0 \n\t" "vlgvg %[iamax],%%v0,0\n\t"
"j 2f\n\t" "j 2f\n\t"
"1:\n\t" "1:\n\t"
"wfchsb %%v4,%%v2,%%v0\n\t" "wfchsb %%v4,%%v2,%%v0\n\t"
@ -220,14 +201,15 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax)
"vsegf %%v4,%%v4\n\t" "vsegf %%v4,%%v4\n\t"
"vsel %%v1,%%v3,%%v1,%%v4\n\t" "vsel %%v1,%%v3,%%v1,%%v4\n\t"
"vsel %%v0,%%v2,%%v0,%%v4\n\t" "vsel %%v0,%%v2,%%v0,%%v4\n\t"
"ste %%f0,%1 \n\t" "ste %%f0,%[amax]\n\t"
"vlgvg %0,%%v1,0 \n\t" "vlgvg %[iamax],%%v1,0\n\t"
"2:\n\t" "2:\n\t"
"nop" "nop"
:"=r"(iamax),"=m"(*amax) : [iamax] "=r"(iamax),[amax] "=Q"(*amax),[n] "+&r"(n)
:"r"(n),"ZR"((const FLOAT (*)[n])x) : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
:"memory","cc","r0","r1","v0","v1","v2","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" : "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16",
); "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");
return iamax; return iamax;
} }
@ -238,7 +220,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
FLOAT maxf = 0.0; FLOAT maxf = 0.0;
BLASLONG max = 0; BLASLONG max = 0;
if (n <= 0 || inc_x <= 0) return (max); if (n <= 0 || inc_x <= 0)
return (max);
if (inc_x == 1) { if (inc_x == 1) {
@ -248,9 +231,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
max = isamax_kernel_64(n1, x, &maxf); max = isamax_kernel_64(n1, x, &maxf);
i = n1; i = n1;
} } else {
else
{
maxf = ABS(x[0]); maxf = ABS(x[0]);
i++; i++;
} }
@ -295,7 +276,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
} }
while (j < n) { while (j < n) {
if (ABS(x[i]) > maxf) { if (ABS(x[i]) > maxf) {
max = j; max = j;

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -28,18 +28,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf #define ABS fabsf
#endif
/*
 * isamin_kernel_64 - z/Architecture vector inline-assembly kernel used by the
 * single-precision "index of minimum absolute value" routine.
 *
 * n    : element count. The asm uses n >> 6 as the loop trip count and every
 *        trip reads 256 bytes of x (vl at offsets 0..240 from the running
 *        byte offset kept in r1).
 *        NOTE(review): presumably the caller only passes a positive multiple
 *        of 64; a zero trip count is not guarded before brctg - confirm.
 * x    : input vector; only read (vl loads, then vflpsb = absolute value).
 * amin : receives the selected value (vstef / ste from v0).
 * Returns iamin, moved out of the index vector with vlgvg.
 *
 * The newer column drops "volatile" and the "memory" clobber and instead
 * describes x through an "m" input operand and *amin through a "=Q" output.
 *
 * NOTE(review): v3 is written inside the asm (veslg, vrepg) but is missing
 * from the clobber list in both columns shown here - confirm whether "v3"
 * should be added.
 */
static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin) static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin) {
{
BLASLONG iamin; BLASLONG iamin;
/* v0 starts as |x[0..3]|; v1 (and the elided setup) hold lane indices */
__asm__ volatile ( __asm__("vl %%v0,0(%[x])\n\t"
"vl %%v0,0(%3) \n\t"
"vflpsb %%v0,%%v0\n\t" "vflpsb %%v0,%%v0\n\t"
"vleig %%v1,0,0\n\t" "vleig %%v1,0,0\n\t"
"vleig %%v1,2,1\n\t" "vleig %%v1,2,1\n\t"
@ -79,19 +73,18 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin)
"vleif %%v31,29,1\n\t" "vleif %%v31,29,1\n\t"
"vleif %%v31,30,2\n\t" "vleif %%v31,30,2\n\t"
"vleif %%v31,31,3\n\t" "vleif %%v31,31,3\n\t"
/* trip count = n >> 6; r1 = byte offset into x, cleared to 0 */
"srlg %%r0,%2,6 \n\t" "srlg %[n],%[n],6\n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"0:\n\t" "0:\n\t"
"pfd 1, 1024(%%r1,%3) \n\t" "pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%3) \n\t" "vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%3) \n\t" "vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%3) \n\t" "vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%3) \n\t" "vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%3) \n\t" "vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%3) \n\t" "vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%3) \n\t" "vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%3) \n\t"
/* absolute values of the vectors just loaded */
"vflpsb %%v16, %%v16\n\t" "vflpsb %%v16, %%v16\n\t"
"vflpsb %%v17, %%v17\n\t" "vflpsb %%v17, %%v17\n\t"
"vflpsb %%v18, %%v18\n\t" "vflpsb %%v18, %%v18\n\t"
@ -100,7 +93,6 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin)
"vflpsb %%v21, %%v21\n\t" "vflpsb %%v21, %%v21\n\t"
"vflpsb %%v22, %%v22\n\t" "vflpsb %%v22, %%v22\n\t"
"vflpsb %%v23, %%v23\n\t" "vflpsb %%v23, %%v23\n\t"
/* compare pairs; each mask drives one vsel on values and one on index vectors */
"vfchesb %%v5,%%v17,%%v16\n\t" "vfchesb %%v5,%%v17,%%v16\n\t"
"vfchesb %%v6,%%v19,%%v18\n\t" "vfchesb %%v6,%%v19,%%v18\n\t"
"vfchesb %%v7,%%v21,%%v20\n\t" "vfchesb %%v7,%%v21,%%v20\n\t"
@ -113,14 +105,12 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin)
"vsel %%v7,%%v28,%%v29,%%v7\n\t" "vsel %%v7,%%v28,%%v29,%%v7\n\t"
"vsel %%v19,%%v22,%%v23,%%v8\n\t" "vsel %%v19,%%v22,%%v23,%%v8\n\t"
"vsel %%v8,%%v30,%%v31,%%v8\n\t" "vsel %%v8,%%v30,%%v31,%%v8\n\t"
"vfchesb %%v20,%%v17,%%v16\n\t" "vfchesb %%v20,%%v17,%%v16\n\t"
"vfchesb %%v21,%%v19,%%v18\n\t" "vfchesb %%v21,%%v19,%%v18\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t" "vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v5,%%v5,%%v6,%%v20\n\t" "vsel %%v5,%%v5,%%v6,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t" "vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v6,%%v7,%%v8,%%v21\n\t" "vsel %%v6,%%v7,%%v8,%%v21\n\t"
"vfchesb %%v18,%%v17,%%v16\n\t" "vfchesb %%v18,%%v17,%%v16\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t" "vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v5,%%v5,%%v6,%%v18\n\t" "vsel %%v5,%%v5,%%v6,%%v18\n\t"
@ -128,7 +118,6 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin)
"vesrlg %%v5,%%v5,32\n\t" "vesrlg %%v5,%%v5,32\n\t"
"vag %%v5,%%v5,%%v4\n\t" "vag %%v5,%%v5,%%v4\n\t"
"vag %%v6,%%v6,%%v4\n\t" "vag %%v6,%%v6,%%v4\n\t"
/* fold this group's winner into the running result in v0 (indices in v1/v2) */
"vfchesb %%v7,%%v16,%%v0\n\t" "vfchesb %%v7,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v7\n\t" "vsel %%v0,%%v0,%%v16,%%v7\n\t"
"vsegf %%v8,%%v7\n\t" "vsegf %%v8,%%v7\n\t"
@ -137,15 +126,14 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin)
"vsel %%v1,%%v1,%%v5,%%v7\n\t" "vsel %%v1,%%v1,%%v5,%%v7\n\t"
"vsel %%v2,%%v2,%%v6,%%v8\n\t" "vsel %%v2,%%v2,%%v6,%%v8\n\t"
"vag %%v4,%%v4,%%v3\n\t" "vag %%v4,%%v4,%%v3\n\t"
/* same sequence again for byte offsets 128..240 of the current block */
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v16,128(%%r1,%3) \n\t" "vl %%v17,144(%%r1,%[x])\n\t"
"vl %%v17,144(%%r1,%3) \n\t" "vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v18,160(%%r1,%3) \n\t" "vl %%v19,176(%%r1,%[x])\n\t"
"vl %%v19,176(%%r1,%3) \n\t" "vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v20,192(%%r1,%3) \n\t" "vl %%v21,208(%%r1,%[x])\n\t"
"vl %%v21,208(%%r1,%3) \n\t" "vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v22,224(%%r1,%3) \n\t" "vl %%v23,240(%%r1,%[x])\n\t"
"vl %%v23,240(%%r1,%3) \n\t"
"vflpsb %%v16, %%v16\n\t" "vflpsb %%v16, %%v16\n\t"
"vflpsb %%v17, %%v17\n\t" "vflpsb %%v17, %%v17\n\t"
"vflpsb %%v18, %%v18\n\t" "vflpsb %%v18, %%v18\n\t"
@ -154,7 +142,6 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin)
"vflpsb %%v21, %%v21\n\t" "vflpsb %%v21, %%v21\n\t"
"vflpsb %%v22, %%v22\n\t" "vflpsb %%v22, %%v22\n\t"
"vflpsb %%v23, %%v23\n\t" "vflpsb %%v23, %%v23\n\t"
"vfchesb %%v5,%%v17,%%v16\n\t" "vfchesb %%v5,%%v17,%%v16\n\t"
"vfchesb %%v6,%%v19,%%v18\n\t" "vfchesb %%v6,%%v19,%%v18\n\t"
"vfchesb %%v7,%%v21,%%v20\n\t" "vfchesb %%v7,%%v21,%%v20\n\t"
@ -167,14 +154,12 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin)
"vsel %%v7,%%v28,%%v29,%%v7\n\t" "vsel %%v7,%%v28,%%v29,%%v7\n\t"
"vsel %%v19,%%v22,%%v23,%%v8\n\t" "vsel %%v19,%%v22,%%v23,%%v8\n\t"
"vsel %%v8,%%v30,%%v31,%%v8\n\t" "vsel %%v8,%%v30,%%v31,%%v8\n\t"
"vfchesb %%v20,%%v17,%%v16\n\t" "vfchesb %%v20,%%v17,%%v16\n\t"
"vfchesb %%v21,%%v19,%%v18\n\t" "vfchesb %%v21,%%v19,%%v18\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t" "vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v5,%%v5,%%v6,%%v20\n\t" "vsel %%v5,%%v5,%%v6,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t" "vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v6,%%v7,%%v8,%%v21\n\t" "vsel %%v6,%%v7,%%v8,%%v21\n\t"
"vfchesb %%v18,%%v17,%%v16\n\t" "vfchesb %%v18,%%v17,%%v16\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t" "vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v5,%%v5,%%v6,%%v18\n\t" "vsel %%v5,%%v5,%%v6,%%v18\n\t"
@ -182,7 +167,6 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin)
"vesrlg %%v5,%%v5,32\n\t" "vesrlg %%v5,%%v5,32\n\t"
"vag %%v5,%%v5,%%v4\n\t" "vag %%v5,%%v5,%%v4\n\t"
"vag %%v6,%%v6,%%v4\n\t" "vag %%v6,%%v6,%%v4\n\t"
"vfchesb %%v7,%%v16,%%v0\n\t" "vfchesb %%v7,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v7\n\t" "vsel %%v0,%%v0,%%v16,%%v7\n\t"
"vsegf %%v8,%%v7\n\t" "vsegf %%v8,%%v7\n\t"
@ -191,10 +175,8 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin)
"vsel %%v1,%%v1,%%v5,%%v7\n\t" "vsel %%v1,%%v1,%%v5,%%v7\n\t"
"vsel %%v2,%%v2,%%v6,%%v8\n\t" "vsel %%v2,%%v2,%%v6,%%v8\n\t"
"vag %%v4,%%v4,%%v3\n\t" "vag %%v4,%%v4,%%v3\n\t"
/* advance the byte offset by 256 and repeat until the trip count reaches 0 */
"agfi %%r1, 256\n\t" "agfi %%r1, 256\n\t"
"brctg %%r0, 0b \n\t" "brctg %[n], 0b\n\t"
/* cross-lane reduction of v0 (values) with v1/v2 (indices); v3 is overwritten here */
"veslg %%v3,%%v0,32\n\t" "veslg %%v3,%%v0,32\n\t"
"vfchsb %%v4,%%v3,%%v0\n\t" "vfchsb %%v4,%%v3,%%v0\n\t"
"vchlg %%v5,%%v2,%%v1\n\t" "vchlg %%v5,%%v2,%%v1\n\t"
@ -205,14 +187,13 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin)
"vesrlg %%v4,%%v4,32\n\t" "vesrlg %%v4,%%v4,32\n\t"
"vsegf %%v4,%%v4\n\t" "vsegf %%v4,%%v4\n\t"
"vsel %%v1,%%v1,%%v2,%%v4\n\t" "vsel %%v1,%%v1,%%v2,%%v4\n\t"
"vrepf %%v2,%%v0,2\n\t" "vrepf %%v2,%%v0,2\n\t"
"vrepg %%v3,%%v1,1\n\t" "vrepg %%v3,%%v1,1\n\t"
/* last two candidates equal: store the value and keep the smaller index (vmnlg) */
"wfcsb %%v2,%%v0\n\t" "wfcsb %%v2,%%v0\n\t"
"jne 1f\n\t" "jne 1f\n\t"
"vstef %%v0,%1,0 \n\t" "vstef %%v0,%[amin],0\n\t"
"vmnlg %%v0,%%v1,%%v3\n\t" "vmnlg %%v0,%%v1,%%v3\n\t"
"vlgvg %0,%%v0,0 \n\t" "vlgvg %[iamin],%%v0,0\n\t"
"j 2f\n\t" "j 2f\n\t"
/* candidates differ: wfchsb builds the mask that selects value and index together */
"1:\n\t" "1:\n\t"
"wfchsb %%v4,%%v0,%%v2\n\t" "wfchsb %%v4,%%v0,%%v2\n\t"
@ -220,14 +201,15 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin)
"vsegf %%v4,%%v4\n\t" "vsegf %%v4,%%v4\n\t"
"vsel %%v1,%%v3,%%v1,%%v4\n\t" "vsel %%v1,%%v3,%%v1,%%v4\n\t"
"vsel %%v0,%%v2,%%v0,%%v4\n\t" "vsel %%v0,%%v2,%%v0,%%v4\n\t"
"ste %%f0,%1 \n\t" "ste %%f0,%[amin]\n\t"
"vlgvg %0,%%v1,0 \n\t" "vlgvg %[iamin],%%v1,0\n\t"
"2:\n\t" "2:\n\t"
"nop" "nop"
/* NOTE(review): the clobber list below omits "v3" although the asm writes it */
:"=r"(iamin),"=m"(*amin) : [iamin] "=r"(iamin),[amin] "=Q"(*amin),[n] "+&r"(n)
:"r"(n),"ZR"((const FLOAT (*)[n])x) : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
:"memory","cc","r0","r1","v0","v1","v2","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" : "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16",
); "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");
return iamin; return iamin;
} }
@ -238,7 +220,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
FLOAT minf = 0.0; FLOAT minf = 0.0;
BLASLONG min = 0; BLASLONG min = 0;
if (n <= 0 || inc_x <= 0) return (min); if (n <= 0 || inc_x <= 0)
return (min);
if (inc_x == 1) { if (inc_x == 1) {
@ -248,9 +231,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
min = isamin_kernel_64(n1, x, &minf); min = isamin_kernel_64(n1, x, &minf);
i = n1; i = n1;
} } else {
else
{
minf = ABS(x[0]); minf = ABS(x[0]);
i++; i++;
} }
@ -295,7 +276,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
} }
while (j < n) { while (j < n) {
if (ABS(x[i]) < minf) { if (ABS(x[i]) < minf) {
min = j; min = j;

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -27,12 +27,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
/*
 * ismax_kernel_64 - z/Architecture vector inline-assembly kernel used by the
 * single-precision "index of maximum value" routine (no absolute value is
 * taken in the visible code).
 *
 * n   : element count. The asm uses n >> 6 as the loop trip count and every
 *       trip reads 256 bytes of x (vl at offsets 0..240 from the running
 *       byte offset kept in r1).
 *       NOTE(review): presumably the caller only passes a positive multiple
 *       of 64; a zero trip count is not guarded before brctg - confirm.
 * x   : input vector; only read.
 * max : receives the selected value (vstef / ste from v0).
 * Returns imax, moved out of the index vector with vlgvg.
 *
 * The newer column drops "volatile" and the "memory" clobber and instead
 * describes x through an "m" input operand and *max through a "=Q" output.
 *
 * NOTE(review): v3 is written inside the asm (veslg, vrepg) but is missing
 * from the clobber list in both columns shown here - confirm whether "v3"
 * should be added.
 */
static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max) static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max) {
{
BLASLONG imax; BLASLONG imax;
/* v0 starts as x[0..3]; v1/v2 (and the elided setup) hold lane indices */
__asm__ volatile ( __asm__("vl %%v0,0(%[x])\n\t"
"vl %%v0,0(%3) \n\t"
"vleig %%v1,0,0\n\t" "vleig %%v1,0,0\n\t"
"vleig %%v1,2,1\n\t" "vleig %%v1,2,1\n\t"
"vleig %%v2,1,0\n\t" "vleig %%v2,1,0\n\t"
@ -71,20 +69,18 @@ static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max)
"vleif %%v31,29,1\n\t" "vleif %%v31,29,1\n\t"
"vleif %%v31,30,2\n\t" "vleif %%v31,30,2\n\t"
"vleif %%v31,31,3\n\t" "vleif %%v31,31,3\n\t"
/* trip count = n >> 6; r1 = byte offset into x, cleared to 0 */
"srlg %%r0,%2,6 \n\t" "srlg %[n],%[n],6\n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"0:\n\t" "0:\n\t"
"pfd 1, 1024(%%r1,%3) \n\t" "pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%3) \n\t" "vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%3) \n\t" "vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%3) \n\t" "vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%3) \n\t" "vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%3) \n\t" "vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%3) \n\t" "vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%3) \n\t" "vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%3) \n\t"
/* compare pairs; each mask drives one vsel on values and one on index vectors */
"vfchesb %%v5,%%v16,%%v17\n\t" "vfchesb %%v5,%%v16,%%v17\n\t"
"vfchesb %%v6,%%v18,%%v19\n\t" "vfchesb %%v6,%%v18,%%v19\n\t"
"vfchesb %%v7,%%v20,%%v21\n\t" "vfchesb %%v7,%%v20,%%v21\n\t"
@ -97,14 +93,12 @@ static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max)
"vsel %%v7,%%v28,%%v29,%%v7\n\t" "vsel %%v7,%%v28,%%v29,%%v7\n\t"
"vsel %%v19,%%v22,%%v23,%%v8\n\t" "vsel %%v19,%%v22,%%v23,%%v8\n\t"
"vsel %%v8,%%v30,%%v31,%%v8\n\t" "vsel %%v8,%%v30,%%v31,%%v8\n\t"
"vfchesb %%v20,%%v16,%%v17\n\t" "vfchesb %%v20,%%v16,%%v17\n\t"
"vfchesb %%v21,%%v18,%%v19\n\t" "vfchesb %%v21,%%v18,%%v19\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t" "vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v5,%%v5,%%v6,%%v20\n\t" "vsel %%v5,%%v5,%%v6,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t" "vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v6,%%v7,%%v8,%%v21\n\t" "vsel %%v6,%%v7,%%v8,%%v21\n\t"
"vfchesb %%v18,%%v16,%%v17\n\t" "vfchesb %%v18,%%v16,%%v17\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t" "vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v5,%%v5,%%v6,%%v18\n\t" "vsel %%v5,%%v5,%%v6,%%v18\n\t"
@ -112,7 +106,6 @@ static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max)
"vesrlg %%v5,%%v5,32\n\t" "vesrlg %%v5,%%v5,32\n\t"
"vag %%v5,%%v5,%%v4\n\t" "vag %%v5,%%v5,%%v4\n\t"
"vag %%v6,%%v6,%%v4\n\t" "vag %%v6,%%v6,%%v4\n\t"
/* fold this group's winner into the running result in v0 (indices in v1/v2) */
"vfchesb %%v7,%%v0,%%v16\n\t" "vfchesb %%v7,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v7\n\t" "vsel %%v0,%%v0,%%v16,%%v7\n\t"
"vsegf %%v8,%%v7\n\t" "vsegf %%v8,%%v7\n\t"
@ -121,16 +114,14 @@ static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max)
"vsel %%v1,%%v1,%%v5,%%v7\n\t" "vsel %%v1,%%v1,%%v5,%%v7\n\t"
"vsel %%v2,%%v2,%%v6,%%v8\n\t" "vsel %%v2,%%v2,%%v6,%%v8\n\t"
"vag %%v4,%%v4,%%v3\n\t" "vag %%v4,%%v4,%%v3\n\t"
/* same sequence again for byte offsets 128..240 of the current block */
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v16,128(%%r1,%3) \n\t" "vl %%v17,144(%%r1,%[x])\n\t"
"vl %%v17,144(%%r1,%3) \n\t" "vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v18,160(%%r1,%3) \n\t" "vl %%v19,176(%%r1,%[x])\n\t"
"vl %%v19,176(%%r1,%3) \n\t" "vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v20,192(%%r1,%3) \n\t" "vl %%v21,208(%%r1,%[x])\n\t"
"vl %%v21,208(%%r1,%3) \n\t" "vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v22,224(%%r1,%3) \n\t" "vl %%v23,240(%%r1,%[x])\n\t"
"vl %%v23,240(%%r1,%3) \n\t"
"vfchesb %%v5,%%v16,%%v17\n\t" "vfchesb %%v5,%%v16,%%v17\n\t"
"vfchesb %%v6,%%v18,%%v19\n\t" "vfchesb %%v6,%%v18,%%v19\n\t"
"vfchesb %%v7,%%v20,%%v21\n\t" "vfchesb %%v7,%%v20,%%v21\n\t"
@ -143,14 +134,12 @@ static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max)
"vsel %%v7,%%v28,%%v29,%%v7\n\t" "vsel %%v7,%%v28,%%v29,%%v7\n\t"
"vsel %%v19,%%v22,%%v23,%%v8\n\t" "vsel %%v19,%%v22,%%v23,%%v8\n\t"
"vsel %%v8,%%v30,%%v31,%%v8\n\t" "vsel %%v8,%%v30,%%v31,%%v8\n\t"
"vfchesb %%v20,%%v16,%%v17\n\t" "vfchesb %%v20,%%v16,%%v17\n\t"
"vfchesb %%v21,%%v18,%%v19\n\t" "vfchesb %%v21,%%v18,%%v19\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t" "vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v5,%%v5,%%v6,%%v20\n\t" "vsel %%v5,%%v5,%%v6,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t" "vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v6,%%v7,%%v8,%%v21\n\t" "vsel %%v6,%%v7,%%v8,%%v21\n\t"
"vfchesb %%v18,%%v16,%%v17\n\t" "vfchesb %%v18,%%v16,%%v17\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t" "vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v5,%%v5,%%v6,%%v18\n\t" "vsel %%v5,%%v5,%%v6,%%v18\n\t"
@ -158,7 +147,6 @@ static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max)
"vesrlg %%v5,%%v5,32\n\t" "vesrlg %%v5,%%v5,32\n\t"
"vag %%v5,%%v5,%%v4\n\t" "vag %%v5,%%v5,%%v4\n\t"
"vag %%v6,%%v6,%%v4\n\t" "vag %%v6,%%v6,%%v4\n\t"
"vfchesb %%v7,%%v0,%%v16\n\t" "vfchesb %%v7,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v7\n\t" "vsel %%v0,%%v0,%%v16,%%v7\n\t"
"vsegf %%v8,%%v7\n\t" "vsegf %%v8,%%v7\n\t"
@ -167,10 +155,8 @@ static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max)
"vsel %%v1,%%v1,%%v5,%%v7\n\t" "vsel %%v1,%%v1,%%v5,%%v7\n\t"
"vsel %%v2,%%v2,%%v6,%%v8\n\t" "vsel %%v2,%%v2,%%v6,%%v8\n\t"
"vag %%v4,%%v4,%%v3\n\t" "vag %%v4,%%v4,%%v3\n\t"
/* advance the byte offset by 256 and repeat until the trip count reaches 0 */
"agfi %%r1, 256\n\t" "agfi %%r1, 256\n\t"
"brctg %%r0, 0b \n\t" "brctg %[n], 0b\n\t"
/* cross-lane reduction of v0 (values) with v1/v2 (indices); v3 is overwritten here */
"veslg %%v3,%%v0,32\n\t" "veslg %%v3,%%v0,32\n\t"
"vfchsb %%v4,%%v0,%%v3\n\t" "vfchsb %%v4,%%v0,%%v3\n\t"
"vchlg %%v5,%%v2,%%v1\n\t" "vchlg %%v5,%%v2,%%v1\n\t"
@ -181,14 +167,13 @@ static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max)
"vesrlg %%v4,%%v4,32\n\t" "vesrlg %%v4,%%v4,32\n\t"
"vsegf %%v4,%%v4\n\t" "vsegf %%v4,%%v4\n\t"
"vsel %%v1,%%v1,%%v2,%%v4\n\t" "vsel %%v1,%%v1,%%v2,%%v4\n\t"
"vrepf %%v2,%%v0,2\n\t" "vrepf %%v2,%%v0,2\n\t"
"vrepg %%v3,%%v1,1\n\t" "vrepg %%v3,%%v1,1\n\t"
/* last two candidates equal: store the value and keep the smaller index (vmnlg) */
"wfcsb %%v2,%%v0\n\t" "wfcsb %%v2,%%v0\n\t"
"jne 1f\n\t" "jne 1f\n\t"
"vstef %%v0,%1,0 \n\t" "vstef %%v0,%[max],0\n\t"
"vmnlg %%v0,%%v1,%%v3\n\t" "vmnlg %%v0,%%v1,%%v3\n\t"
"vlgvg %0,%%v0,0 \n\t" "vlgvg %[imax],%%v0,0\n\t"
"j 2f\n\t" "j 2f\n\t"
/* candidates differ: wfchsb builds the mask that selects value and index together */
"1:\n\t" "1:\n\t"
"wfchsb %%v4,%%v2,%%v0\n\t" "wfchsb %%v4,%%v2,%%v0\n\t"
@ -196,14 +181,15 @@ static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max)
"vsegf %%v4,%%v4\n\t" "vsegf %%v4,%%v4\n\t"
"vsel %%v1,%%v3,%%v1,%%v4\n\t" "vsel %%v1,%%v3,%%v1,%%v4\n\t"
"vsel %%v0,%%v2,%%v0,%%v4\n\t" "vsel %%v0,%%v2,%%v0,%%v4\n\t"
"ste %%f0,%1 \n\t" "ste %%f0,%[max]\n\t"
"vlgvg %0,%%v1,0 \n\t" "vlgvg %[imax],%%v1,0\n\t"
"2:\n\t" "2:\n\t"
"nop" "nop"
/* NOTE(review): the clobber list below omits "v3" although the asm writes it */
:"=r"(imax),"=m"(*max) : [imax] "=r"(imax),[max] "=Q"(*max),[n] "+&r"(n)
:"r"(n),"ZR"((const FLOAT (*)[n])x) : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
:"memory","cc","r0","r1","v0","v1","v2","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" : "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16",
); "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");
return imax; return imax;
} }
@ -214,7 +200,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
FLOAT maxf = 0.0; FLOAT maxf = 0.0;
BLASLONG max = 0; BLASLONG max = 0;
if (n <= 0 || inc_x <= 0) return (max); if (n <= 0 || inc_x <= 0)
return (max);
if (inc_x == 1) { if (inc_x == 1) {
@ -224,9 +211,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
max = ismax_kernel_64(n1, x, &maxf); max = ismax_kernel_64(n1, x, &maxf);
i = n1; i = n1;
} } else {
else
{
maxf = x[0]; maxf = x[0];
i++; i++;
} }
@ -271,7 +256,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
} }
while (j < n) { while (j < n) {
if (x[i] > maxf) { if (x[i] > maxf) {
max = j; max = j;

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -27,12 +27,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
/*
 * ismin_kernel_64 - z/Architecture vector inline-assembly kernel used by the
 * single-precision "index of minimum value" routine (no absolute value is
 * taken in the visible code).
 *
 * n   : element count. The asm uses n >> 6 as the loop trip count and every
 *       trip reads 256 bytes of x (vl at offsets 0..240 from the running
 *       byte offset kept in r1).
 *       NOTE(review): presumably the caller only passes a positive multiple
 *       of 64; a zero trip count is not guarded before brctg - confirm.
 * x   : input vector; only read.
 * min : receives the selected value (vstef / ste from v0).
 * Returns imin, moved out of the index vector with vlgvg.
 *
 * The newer column drops "volatile" and the "memory" clobber and instead
 * describes x through an "m" input operand and *min through a "=Q" output.
 *
 * NOTE(review): v3 is written inside the asm (veslg, vrepg) but is missing
 * from the clobber list in both columns shown here - confirm whether "v3"
 * should be added.
 */
static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min) static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min) {
{
BLASLONG imin; BLASLONG imin;
/* v0 starts as x[0..3]; v1/v2 (and the elided setup) hold lane indices */
__asm__ volatile ( __asm__("vl %%v0,0(%[x])\n\t"
"vl %%v0,0(%3) \n\t"
"vleig %%v1,0,0\n\t" "vleig %%v1,0,0\n\t"
"vleig %%v1,2,1\n\t" "vleig %%v1,2,1\n\t"
"vleig %%v2,1,0\n\t" "vleig %%v2,1,0\n\t"
@ -71,20 +69,18 @@ static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min)
"vleif %%v31,29,1\n\t" "vleif %%v31,29,1\n\t"
"vleif %%v31,30,2\n\t" "vleif %%v31,30,2\n\t"
"vleif %%v31,31,3\n\t" "vleif %%v31,31,3\n\t"
/* trip count = n >> 6; r1 = byte offset into x, cleared to 0 */
"srlg %%r0,%2,6 \n\t" "srlg %[n],%[n],6\n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"0:\n\t" "0:\n\t"
"pfd 1, 1024(%%r1,%3) \n\t" "pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%3) \n\t" "vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%3) \n\t" "vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%3) \n\t" "vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%3) \n\t" "vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%3) \n\t" "vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%3) \n\t" "vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%3) \n\t" "vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%3) \n\t"
/* compare pairs; each mask drives one vsel on values and one on index vectors */
"vfchesb %%v5,%%v17,%%v16\n\t" "vfchesb %%v5,%%v17,%%v16\n\t"
"vfchesb %%v6,%%v19,%%v18\n\t" "vfchesb %%v6,%%v19,%%v18\n\t"
"vfchesb %%v7,%%v21,%%v20\n\t" "vfchesb %%v7,%%v21,%%v20\n\t"
@ -97,14 +93,12 @@ static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min)
"vsel %%v7,%%v28,%%v29,%%v7\n\t" "vsel %%v7,%%v28,%%v29,%%v7\n\t"
"vsel %%v19,%%v22,%%v23,%%v8\n\t" "vsel %%v19,%%v22,%%v23,%%v8\n\t"
"vsel %%v8,%%v30,%%v31,%%v8\n\t" "vsel %%v8,%%v30,%%v31,%%v8\n\t"
"vfchesb %%v20,%%v17,%%v16\n\t" "vfchesb %%v20,%%v17,%%v16\n\t"
"vfchesb %%v21,%%v19,%%v18\n\t" "vfchesb %%v21,%%v19,%%v18\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t" "vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v5,%%v5,%%v6,%%v20\n\t" "vsel %%v5,%%v5,%%v6,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t" "vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v6,%%v7,%%v8,%%v21\n\t" "vsel %%v6,%%v7,%%v8,%%v21\n\t"
"vfchesb %%v18,%%v17,%%v16\n\t" "vfchesb %%v18,%%v17,%%v16\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t" "vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v5,%%v5,%%v6,%%v18\n\t" "vsel %%v5,%%v5,%%v6,%%v18\n\t"
@ -112,7 +106,6 @@ static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min)
"vesrlg %%v5,%%v5,32\n\t" "vesrlg %%v5,%%v5,32\n\t"
"vag %%v5,%%v5,%%v4\n\t" "vag %%v5,%%v5,%%v4\n\t"
"vag %%v6,%%v6,%%v4\n\t" "vag %%v6,%%v6,%%v4\n\t"
/* fold this group's winner into the running result in v0 (indices in v1/v2) */
"vfchesb %%v7,%%v16,%%v0\n\t" "vfchesb %%v7,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v7\n\t" "vsel %%v0,%%v0,%%v16,%%v7\n\t"
"vsegf %%v8,%%v7\n\t" "vsegf %%v8,%%v7\n\t"
@ -121,16 +114,14 @@ static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min)
"vsel %%v1,%%v1,%%v5,%%v7\n\t" "vsel %%v1,%%v1,%%v5,%%v7\n\t"
"vsel %%v2,%%v2,%%v6,%%v8\n\t" "vsel %%v2,%%v2,%%v6,%%v8\n\t"
"vag %%v4,%%v4,%%v3\n\t" "vag %%v4,%%v4,%%v3\n\t"
/* same sequence again for byte offsets 128..240 of the current block */
"vl %%v16,128(%%r1,%[x])\n\t"
"vl %%v16,128(%%r1,%3) \n\t" "vl %%v17,144(%%r1,%[x])\n\t"
"vl %%v17,144(%%r1,%3) \n\t" "vl %%v18,160(%%r1,%[x])\n\t"
"vl %%v18,160(%%r1,%3) \n\t" "vl %%v19,176(%%r1,%[x])\n\t"
"vl %%v19,176(%%r1,%3) \n\t" "vl %%v20,192(%%r1,%[x])\n\t"
"vl %%v20,192(%%r1,%3) \n\t" "vl %%v21,208(%%r1,%[x])\n\t"
"vl %%v21,208(%%r1,%3) \n\t" "vl %%v22,224(%%r1,%[x])\n\t"
"vl %%v22,224(%%r1,%3) \n\t" "vl %%v23,240(%%r1,%[x])\n\t"
"vl %%v23,240(%%r1,%3) \n\t"
"vfchesb %%v5,%%v17,%%v16\n\t" "vfchesb %%v5,%%v17,%%v16\n\t"
"vfchesb %%v6,%%v19,%%v18\n\t" "vfchesb %%v6,%%v19,%%v18\n\t"
"vfchesb %%v7,%%v21,%%v20\n\t" "vfchesb %%v7,%%v21,%%v20\n\t"
@ -143,14 +134,12 @@ static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min)
"vsel %%v7,%%v28,%%v29,%%v7\n\t" "vsel %%v7,%%v28,%%v29,%%v7\n\t"
"vsel %%v19,%%v22,%%v23,%%v8\n\t" "vsel %%v19,%%v22,%%v23,%%v8\n\t"
"vsel %%v8,%%v30,%%v31,%%v8\n\t" "vsel %%v8,%%v30,%%v31,%%v8\n\t"
"vfchesb %%v20,%%v17,%%v16\n\t" "vfchesb %%v20,%%v17,%%v16\n\t"
"vfchesb %%v21,%%v19,%%v18\n\t" "vfchesb %%v21,%%v19,%%v18\n\t"
"vsel %%v16,%%v16,%%v17,%%v20\n\t" "vsel %%v16,%%v16,%%v17,%%v20\n\t"
"vsel %%v5,%%v5,%%v6,%%v20\n\t" "vsel %%v5,%%v5,%%v6,%%v20\n\t"
"vsel %%v17,%%v18,%%v19,%%v21\n\t" "vsel %%v17,%%v18,%%v19,%%v21\n\t"
"vsel %%v6,%%v7,%%v8,%%v21\n\t" "vsel %%v6,%%v7,%%v8,%%v21\n\t"
"vfchesb %%v18,%%v17,%%v16\n\t" "vfchesb %%v18,%%v17,%%v16\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t" "vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v5,%%v5,%%v6,%%v18\n\t" "vsel %%v5,%%v5,%%v6,%%v18\n\t"
@ -158,7 +147,6 @@ static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min)
"vesrlg %%v5,%%v5,32\n\t" "vesrlg %%v5,%%v5,32\n\t"
"vag %%v5,%%v5,%%v4\n\t" "vag %%v5,%%v5,%%v4\n\t"
"vag %%v6,%%v6,%%v4\n\t" "vag %%v6,%%v6,%%v4\n\t"
"vfchesb %%v7,%%v16,%%v0\n\t" "vfchesb %%v7,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v7\n\t" "vsel %%v0,%%v0,%%v16,%%v7\n\t"
"vsegf %%v8,%%v7\n\t" "vsegf %%v8,%%v7\n\t"
@ -167,10 +155,8 @@ static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min)
"vsel %%v1,%%v1,%%v5,%%v7\n\t" "vsel %%v1,%%v1,%%v5,%%v7\n\t"
"vsel %%v2,%%v2,%%v6,%%v8\n\t" "vsel %%v2,%%v2,%%v6,%%v8\n\t"
"vag %%v4,%%v4,%%v3\n\t" "vag %%v4,%%v4,%%v3\n\t"
/* advance the byte offset by 256 and repeat until the trip count reaches 0 */
"agfi %%r1, 256\n\t" "agfi %%r1, 256\n\t"
"brctg %%r0, 0b \n\t" "brctg %[n], 0b\n\t"
/* cross-lane reduction of v0 (values) with v1/v2 (indices); v3 is overwritten here */
"veslg %%v3,%%v0,32\n\t" "veslg %%v3,%%v0,32\n\t"
"vfchsb %%v4,%%v3,%%v0\n\t" "vfchsb %%v4,%%v3,%%v0\n\t"
"vchlg %%v5,%%v2,%%v1\n\t" "vchlg %%v5,%%v2,%%v1\n\t"
@ -181,14 +167,13 @@ static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min)
"vesrlg %%v4,%%v4,32\n\t" "vesrlg %%v4,%%v4,32\n\t"
"vsegf %%v4,%%v4\n\t" "vsegf %%v4,%%v4\n\t"
"vsel %%v1,%%v1,%%v2,%%v4\n\t" "vsel %%v1,%%v1,%%v2,%%v4\n\t"
"vrepf %%v2,%%v0,2\n\t" "vrepf %%v2,%%v0,2\n\t"
"vrepg %%v3,%%v1,1\n\t" "vrepg %%v3,%%v1,1\n\t"
/* last two candidates equal: store the value and keep the smaller index (vmnlg) */
"wfcsb %%v2,%%v0\n\t" "wfcsb %%v2,%%v0\n\t"
"jne 1f\n\t" "jne 1f\n\t"
"vstef %%v0,%1,0 \n\t" "vstef %%v0,%[min],0\n\t"
"vmnlg %%v0,%%v1,%%v3\n\t" "vmnlg %%v0,%%v1,%%v3\n\t"
"vlgvg %0,%%v0,0 \n\t" "vlgvg %[imin],%%v0,0\n\t"
"j 2f\n\t" "j 2f\n\t"
/* candidates differ: wfchsb builds the mask that selects value and index together */
"1:\n\t" "1:\n\t"
"wfchsb %%v4,%%v0,%%v2\n\t" "wfchsb %%v4,%%v0,%%v2\n\t"
@ -196,14 +181,15 @@ static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min)
"vsegf %%v4,%%v4\n\t" "vsegf %%v4,%%v4\n\t"
"vsel %%v1,%%v3,%%v1,%%v4\n\t" "vsel %%v1,%%v3,%%v1,%%v4\n\t"
"vsel %%v0,%%v2,%%v0,%%v4\n\t" "vsel %%v0,%%v2,%%v0,%%v4\n\t"
"ste %%f0,%1 \n\t" "ste %%f0,%[min]\n\t"
"vlgvg %0,%%v1,0 \n\t" "vlgvg %[imin],%%v1,0\n\t"
"2:\n\t" "2:\n\t"
"nop" "nop"
/* NOTE(review): the clobber list below omits "v3" although the asm writes it */
:"=r"(imin),"=m"(*min) : [imin] "=r"(imin),[min] "=Q"(*min),[n] "+&r"(n)
:"r"(n),"ZR"((const FLOAT (*)[n])x) : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
:"memory","cc","r0","r1","v0","v1","v2","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" : "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16",
); "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");
return imin; return imin;
} }
@ -214,7 +200,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
FLOAT minf = 0.0; FLOAT minf = 0.0;
BLASLONG min = 0; BLASLONG min = 0;
if (n <= 0 || inc_x <= 0) return (min); if (n <= 0 || inc_x <= 0)
return (min);
if (inc_x == 1) { if (inc_x == 1) {
@ -224,9 +211,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
min = ismin_kernel_64(n1, x, &minf); min = ismin_kernel_64(n1, x, &minf);
i = n1; i = n1;
} } else {
else
{
minf = x[0]; minf = x[0];
i++; i++;
} }
@ -271,7 +256,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
} }
while (j < n) { while (j < n) {
if (x[i] < minf) { if (x[i] < minf) {
min = j; min = j;

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2017, The OpenBLAS Project Copyright (c) 2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -28,22 +28,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>
#if defined(DOUBLE) #define CABS1(x,i) (fabs(x[i]) + fabs(x[i + 1]))
#define ABS fabs
#else
#define ABS fabsf
#endif
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))
static BLASLONG izamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amax) static BLASLONG izamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amax) {
{
BLASLONG iamax; BLASLONG iamax;
__asm__ volatile ( __asm__("vleg %%v0,0(%[x]),0\n\t"
"vleg %%v0,0(%3),0 \n\t" "vleg %%v1,8(%[x]),0\n\t"
"vleg %%v1,8(%3),0 \n\t" "vleg %%v0,16(%[x]),1\n\t"
"vleg %%v0,16(%3),1 \n\t" "vleg %%v1,24(%[x]),1\n\t"
"vleg %%v1,24(%3),1 \n\t"
"vflpdb %%v0,%%v0\n\t" "vflpdb %%v0,%%v0\n\t"
"vflpdb %%v1,%%v1\n\t" "vflpdb %%v1,%%v1\n\t"
"vfadb %%v0,%%v0,%%v1\n\t" "vfadb %%v0,%%v0,%%v1\n\t"
@ -59,27 +52,26 @@ static BLASLONG izamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amax)
"vleig %%v26,5,1\n\t" "vleig %%v26,5,1\n\t"
"vleig %%v27,6,0\n\t" "vleig %%v27,6,0\n\t"
"vleig %%v27,7,1\n\t" "vleig %%v27,7,1\n\t"
"srlg %%r0,%2,4 \n\t" "srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"0:\n\t" "0:\n\t"
"pfd 1, 1024(%%r1,%3) \n\t" "pfd 1, 1024(%%r1,%[x])\n\t"
"vleg %%v16,0(%%r1,%[x]),0\n\t"
"vleg %%v16,0(%%r1,%3),0 \n\t" "vleg %%v17,8(%%r1,%[x]),0\n\t"
"vleg %%v17,8(%%r1,%3),0 \n\t" "vleg %%v16,16(%%r1,%[x]),1\n\t"
"vleg %%v16,16(%%r1,%3),1 \n\t" "vleg %%v17,24(%%r1,%[x]),1\n\t"
"vleg %%v17,24(%%r1,%3),1 \n\t" "vleg %%v18,32(%%r1,%[x]),0\n\t"
"vleg %%v18,32(%%r1,%3),0 \n\t" "vleg %%v19,40(%%r1,%[x]),0\n\t"
"vleg %%v19,40(%%r1,%3),0 \n\t" "vleg %%v18,48(%%r1,%[x]),1\n\t"
"vleg %%v18,48(%%r1,%3),1 \n\t" "vleg %%v19,56(%%r1,%[x]),1\n\t"
"vleg %%v19,56(%%r1,%3),1 \n\t" "vleg %%v20,64(%%r1,%[x]),0\n\t"
"vleg %%v20,64(%%r1,%3),0 \n\t" "vleg %%v21,72(%%r1,%[x]),0\n\t"
"vleg %%v21,72(%%r1,%3),0 \n\t" "vleg %%v20,80(%%r1,%[x]),1\n\t"
"vleg %%v20,80(%%r1,%3),1 \n\t" "vleg %%v21,88(%%r1,%[x]),1\n\t"
"vleg %%v21,88(%%r1,%3),1 \n\t" "vleg %%v22,96(%%r1,%[x]),0\n\t"
"vleg %%v22,96(%%r1,%3),0 \n\t" "vleg %%v23,104(%%r1,%[x]),0\n\t"
"vleg %%v23,104(%%r1,%3),0 \n\t" "vleg %%v22,112(%%r1,%[x]),1\n\t"
"vleg %%v22,112(%%r1,%3),1 \n\t" "vleg %%v23,120(%%r1,%[x]),1\n\t"
"vleg %%v23,120(%%r1,%3),1 \n\t"
"vflpdb %%v16, %%v16\n\t" "vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t" "vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t" "vflpdb %%v18, %%v18\n\t"
@ -92,40 +84,36 @@ static BLASLONG izamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amax)
"vfadb %%v17,%%v18,%%v19\n\t" "vfadb %%v17,%%v18,%%v19\n\t"
"vfadb %%v18,%%v20,%%v21\n\t" "vfadb %%v18,%%v20,%%v21\n\t"
"vfadb %%v19,%%v22,%%v23\n\t" "vfadb %%v19,%%v22,%%v23\n\t"
"vfchedb %%v4,%%v16,%%v17\n\t" "vfchedb %%v4,%%v16,%%v17\n\t"
"vfchedb %%v5,%%v18,%%v19\n\t" "vfchedb %%v5,%%v18,%%v19\n\t"
"vsel %%v16,%%v16,%%v17,%%v4\n\t" "vsel %%v16,%%v16,%%v17,%%v4\n\t"
"vsel %%v4,%%v24,%%v25,%%v4\n\t" "vsel %%v4,%%v24,%%v25,%%v4\n\t"
"vsel %%v17,%%v18,%%v19,%%v5\n\t" "vsel %%v17,%%v18,%%v19,%%v5\n\t"
"vsel %%v5,%%v26,%%v27,%%v5\n\t" "vsel %%v5,%%v26,%%v27,%%v5\n\t"
"vfchedb %%v18,%%v16,%%v17\n\t" "vfchedb %%v18,%%v16,%%v17\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t" "vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v4,%%v4,%%v5,%%v18\n\t" "vsel %%v4,%%v4,%%v5,%%v18\n\t"
"vag %%v4,%%v4,%%v3\n\t" "vag %%v4,%%v4,%%v3\n\t"
"vfchedb %%v5,%%v0,%%v16\n\t" "vfchedb %%v5,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v5\n\t" "vsel %%v0,%%v0,%%v16,%%v5\n\t"
"vsel %%v1,%%v1,%%v4,%%v5\n\t" "vsel %%v1,%%v1,%%v4,%%v5\n\t"
"vag %%v3,%%v3,%%v2\n\t" "vag %%v3,%%v3,%%v2\n\t"
"vleg %%v16,128(%%r1,%[x]),0\n\t"
"vleg %%v16,128(%%r1,%3),0 \n\t" "vleg %%v17,136(%%r1,%[x]),0\n\t"
"vleg %%v17,136(%%r1,%3),0 \n\t" "vleg %%v16,144(%%r1,%[x]),1\n\t"
"vleg %%v16,144(%%r1,%3),1 \n\t" "vleg %%v17,152(%%r1,%[x]),1\n\t"
"vleg %%v17,152(%%r1,%3),1 \n\t" "vleg %%v18,160(%%r1,%[x]),0\n\t"
"vleg %%v18,160(%%r1,%3),0 \n\t" "vleg %%v19,168(%%r1,%[x]),0\n\t"
"vleg %%v19,168(%%r1,%3),0 \n\t" "vleg %%v18,176(%%r1,%[x]),1\n\t"
"vleg %%v18,176(%%r1,%3),1 \n\t" "vleg %%v19,184(%%r1,%[x]),1\n\t"
"vleg %%v19,184(%%r1,%3),1 \n\t" "vleg %%v20,192(%%r1,%[x]),0\n\t"
"vleg %%v20,192(%%r1,%3),0 \n\t" "vleg %%v21,200(%%r1,%[x]),0\n\t"
"vleg %%v21,200(%%r1,%3),0 \n\t" "vleg %%v20,208(%%r1,%[x]),1\n\t"
"vleg %%v20,208(%%r1,%3),1 \n\t" "vleg %%v21,216(%%r1,%[x]),1\n\t"
"vleg %%v21,216(%%r1,%3),1 \n\t" "vleg %%v22,224(%%r1,%[x]),0\n\t"
"vleg %%v22,224(%%r1,%3),0 \n\t" "vleg %%v23,232(%%r1,%[x]),0\n\t"
"vleg %%v23,232(%%r1,%3),0 \n\t" "vleg %%v22,240(%%r1,%[x]),1\n\t"
"vleg %%v22,240(%%r1,%3),1 \n\t" "vleg %%v23,248(%%r1,%[x]),1\n\t"
"vleg %%v23,248(%%r1,%3),1 \n\t"
"vflpdb %%v16, %%v16\n\t" "vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t" "vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t" "vflpdb %%v18, %%v18\n\t"
@ -138,60 +126,55 @@ static BLASLONG izamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amax)
"vfadb %%v17,%%v18,%%v19\n\t" "vfadb %%v17,%%v18,%%v19\n\t"
"vfadb %%v18,%%v20,%%v21\n\t" "vfadb %%v18,%%v20,%%v21\n\t"
"vfadb %%v19,%%v22,%%v23\n\t" "vfadb %%v19,%%v22,%%v23\n\t"
"vfchedb %%v4,%%v16,%%v17\n\t" "vfchedb %%v4,%%v16,%%v17\n\t"
"vfchedb %%v5,%%v18,%%v19\n\t" "vfchedb %%v5,%%v18,%%v19\n\t"
"vsel %%v16,%%v16,%%v17,%%v4\n\t" "vsel %%v16,%%v16,%%v17,%%v4\n\t"
"vsel %%v4,%%v24,%%v25,%%v4\n\t" "vsel %%v4,%%v24,%%v25,%%v4\n\t"
"vsel %%v17,%%v18,%%v19,%%v5\n\t" "vsel %%v17,%%v18,%%v19,%%v5\n\t"
"vsel %%v5,%%v26,%%v27,%%v5\n\t" "vsel %%v5,%%v26,%%v27,%%v5\n\t"
"vfchedb %%v18,%%v16,%%v17\n\t" "vfchedb %%v18,%%v16,%%v17\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t" "vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v4,%%v4,%%v5,%%v18\n\t" "vsel %%v4,%%v4,%%v5,%%v18\n\t"
"vag %%v4,%%v4,%%v3\n\t" "vag %%v4,%%v4,%%v3\n\t"
"vfchedb %%v5,%%v0,%%v16\n\t" "vfchedb %%v5,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v5\n\t" "vsel %%v0,%%v0,%%v16,%%v5\n\t"
"vsel %%v1,%%v1,%%v4,%%v5\n\t" "vsel %%v1,%%v1,%%v4,%%v5\n\t"
"vag %%v3,%%v3,%%v2\n\t" "vag %%v3,%%v3,%%v2\n\t"
"agfi %%r1, 256\n\t" "agfi %%r1, 256\n\t"
"brctg %%r0, 0b \n\t" "brctg %[n], 0b\n\t"
"vrepg %%v2,%%v0,1\n\t" "vrepg %%v2,%%v0,1\n\t"
"vrepg %%v3,%%v1,1\n\t" "vrepg %%v3,%%v1,1\n\t"
"wfcdb %%v2,%%v0\n\t" "wfcdb %%v2,%%v0\n\t"
"jne 1f\n\t" "jne 1f\n\t"
"vsteg %%v0,%1,0 \n\t" "vsteg %%v0,%[amax],0\n\t"
"vmnlg %%v0,%%v1,%%v3\n\t" "vmnlg %%v0,%%v1,%%v3\n\t"
"vlgvg %0,%%v0,0 \n\t" "vlgvg %[iamax],%%v0,0\n\t"
"j 2f\n\t" "j 2f\n\t"
"1:\n\t" "1:\n\t"
"wfchdb %%v4,%%v2,%%v0\n\t" "wfchdb %%v4,%%v2,%%v0\n\t"
"vsel %%v1,%%v3,%%v1,%%v4\n\t" "vsel %%v1,%%v3,%%v1,%%v4\n\t"
"vsel %%v0,%%v2,%%v0,%%v4\n\t" "vsel %%v0,%%v2,%%v0,%%v4\n\t"
"std %%f0,%1 \n\t" "std %%f0,%[amax]\n\t"
"vlgvg %0,%%v1,0 \n\t" "vlgvg %[iamax],%%v1,0\n\t"
"2:\n\t" "2:\n\t"
"nop" "nop"
:"=r"(iamax),"=m"(*amax) : [iamax] "=r"(iamax),[amax] "=Q"(*amax),[n] "+&r"(n)
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x) : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18",
); "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27");
return iamax; return iamax;
} }
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
{
BLASLONG i = 0; BLASLONG i = 0;
BLASLONG ix = 0; BLASLONG ix = 0;
FLOAT maxf = 0; FLOAT maxf = 0;
BLASLONG max = 0; BLASLONG max = 0;
BLASLONG inc_x2; BLASLONG inc_x2;
if (n <= 0 || inc_x <= 0) return(max); if (n <= 0 || inc_x <= 0)
return (max);
if (inc_x == 1) { if (inc_x == 1) {
@ -201,18 +184,14 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
max = izamax_kernel_16(n1, x, &maxf); max = izamax_kernel_16(n1, x, &maxf);
ix = n1 * 2; ix = n1 * 2;
i = n1; i = n1;
} } else {
else
{
maxf = CABS1(x, 0); maxf = CABS1(x, 0);
ix += 2; ix += 2;
i++; i++;
} }
while(i < n) while (i < n) {
{ if (CABS1(x, ix) > maxf) {
if( CABS1(x,ix) > maxf )
{
max = i; max = i;
maxf = CABS1(x, ix); maxf = CABS1(x, ix);
} }
@ -226,13 +205,35 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
max = 0; max = 0;
maxf = CABS1(x, 0); maxf = CABS1(x, 0);
inc_x2 = 2 * inc_x; inc_x2 = 2 * inc_x;
ix += inc_x2;
i++;
while(i < n) BLASLONG n1 = n & -4;
{ while (i < n1) {
if( CABS1(x,ix) > maxf )
{ if (CABS1(x, ix) > maxf) {
max = i;
maxf = CABS1(x, ix);
}
if (CABS1(x, ix + inc_x2) > maxf) {
max = i + 1;
maxf = CABS1(x, ix + inc_x2);
}
if (CABS1(x, ix + 2 * inc_x2) > maxf) {
max = i + 2;
maxf = CABS1(x, ix + 2 * inc_x2);
}
if (CABS1(x, ix + 3 * inc_x2) > maxf) {
max = i + 3;
maxf = CABS1(x, ix + 3 * inc_x2);
}
ix += inc_x2 * 4;
i += 4;
}
while (i < n) {
if (CABS1(x, ix) > maxf) {
max = i; max = i;
maxf = CABS1(x, ix); maxf = CABS1(x, ix);
} }
@ -242,5 +243,3 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
return (max + 1); return (max + 1);
} }
} }

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2017, The OpenBLAS Project Copyright (c) 2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -28,22 +28,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>
#if defined(DOUBLE) #define CABS1(x,i) (fabs(x[i]) + fabs(x[i + 1]))
#define ABS fabs
#else
#define ABS fabsf
#endif
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))
static BLASLONG izamin_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amin) static BLASLONG izamin_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amin) {
{
BLASLONG iamin; BLASLONG iamin;
__asm__ volatile ( __asm__("vleg %%v0,0(%[x]),0\n\t"
"vleg %%v0,0(%3),0 \n\t" "vleg %%v1,8(%[x]),0\n\t"
"vleg %%v1,8(%3),0 \n\t" "vleg %%v0,16(%[x]),1\n\t"
"vleg %%v0,16(%3),1 \n\t" "vleg %%v1,24(%[x]),1\n\t"
"vleg %%v1,24(%3),1 \n\t"
"vflpdb %%v0,%%v0\n\t" "vflpdb %%v0,%%v0\n\t"
"vflpdb %%v1,%%v1\n\t" "vflpdb %%v1,%%v1\n\t"
"vfadb %%v0,%%v0,%%v1\n\t" "vfadb %%v0,%%v0,%%v1\n\t"
@ -59,27 +52,26 @@ static BLASLONG izamin_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amin)
"vleig %%v26,5,1\n\t" "vleig %%v26,5,1\n\t"
"vleig %%v27,6,0\n\t" "vleig %%v27,6,0\n\t"
"vleig %%v27,7,1\n\t" "vleig %%v27,7,1\n\t"
"srlg %%r0,%2,4 \n\t" "srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"0:\n\t" "0:\n\t"
"pfd 1, 1024(%%r1,%3) \n\t" "pfd 1, 1024(%%r1,%[x])\n\t"
"vleg %%v16,0(%%r1,%[x]),0\n\t"
"vleg %%v16,0(%%r1,%3),0 \n\t" "vleg %%v17,8(%%r1,%[x]),0\n\t"
"vleg %%v17,8(%%r1,%3),0 \n\t" "vleg %%v16,16(%%r1,%[x]),1\n\t"
"vleg %%v16,16(%%r1,%3),1 \n\t" "vleg %%v17,24(%%r1,%[x]),1\n\t"
"vleg %%v17,24(%%r1,%3),1 \n\t" "vleg %%v18,32(%%r1,%[x]),0\n\t"
"vleg %%v18,32(%%r1,%3),0 \n\t" "vleg %%v19,40(%%r1,%[x]),0\n\t"
"vleg %%v19,40(%%r1,%3),0 \n\t" "vleg %%v18,48(%%r1,%[x]),1\n\t"
"vleg %%v18,48(%%r1,%3),1 \n\t" "vleg %%v19,56(%%r1,%[x]),1\n\t"
"vleg %%v19,56(%%r1,%3),1 \n\t" "vleg %%v20,64(%%r1,%[x]),0\n\t"
"vleg %%v20,64(%%r1,%3),0 \n\t" "vleg %%v21,72(%%r1,%[x]),0\n\t"
"vleg %%v21,72(%%r1,%3),0 \n\t" "vleg %%v20,80(%%r1,%[x]),1\n\t"
"vleg %%v20,80(%%r1,%3),1 \n\t" "vleg %%v21,88(%%r1,%[x]),1\n\t"
"vleg %%v21,88(%%r1,%3),1 \n\t" "vleg %%v22,96(%%r1,%[x]),0\n\t"
"vleg %%v22,96(%%r1,%3),0 \n\t" "vleg %%v23,104(%%r1,%[x]),0\n\t"
"vleg %%v23,104(%%r1,%3),0 \n\t" "vleg %%v22,112(%%r1,%[x]),1\n\t"
"vleg %%v22,112(%%r1,%3),1 \n\t" "vleg %%v23,120(%%r1,%[x]),1\n\t"
"vleg %%v23,120(%%r1,%3),1 \n\t"
"vflpdb %%v16, %%v16\n\t" "vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t" "vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t" "vflpdb %%v18, %%v18\n\t"
@ -92,40 +84,36 @@ static BLASLONG izamin_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amin)
"vfadb %%v17,%%v18,%%v19\n\t" "vfadb %%v17,%%v18,%%v19\n\t"
"vfadb %%v18,%%v20,%%v21\n\t" "vfadb %%v18,%%v20,%%v21\n\t"
"vfadb %%v19,%%v22,%%v23\n\t" "vfadb %%v19,%%v22,%%v23\n\t"
"vfchedb %%v4,%%v17,%%v16\n\t" "vfchedb %%v4,%%v17,%%v16\n\t"
"vfchedb %%v5,%%v19,%%v18\n\t" "vfchedb %%v5,%%v19,%%v18\n\t"
"vsel %%v16,%%v16,%%v17,%%v4\n\t" "vsel %%v16,%%v16,%%v17,%%v4\n\t"
"vsel %%v4,%%v24,%%v25,%%v4\n\t" "vsel %%v4,%%v24,%%v25,%%v4\n\t"
"vsel %%v17,%%v18,%%v19,%%v5\n\t" "vsel %%v17,%%v18,%%v19,%%v5\n\t"
"vsel %%v5,%%v26,%%v27,%%v5\n\t" "vsel %%v5,%%v26,%%v27,%%v5\n\t"
"vfchedb %%v18,%%v17,%%v16\n\t" "vfchedb %%v18,%%v17,%%v16\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t" "vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v4,%%v4,%%v5,%%v18\n\t" "vsel %%v4,%%v4,%%v5,%%v18\n\t"
"vag %%v4,%%v4,%%v3\n\t" "vag %%v4,%%v4,%%v3\n\t"
"vfchedb %%v5,%%v16,%%v0\n\t" "vfchedb %%v5,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v5\n\t" "vsel %%v0,%%v0,%%v16,%%v5\n\t"
"vsel %%v1,%%v1,%%v4,%%v5\n\t" "vsel %%v1,%%v1,%%v4,%%v5\n\t"
"vag %%v3,%%v3,%%v2\n\t" "vag %%v3,%%v3,%%v2\n\t"
"vleg %%v16,128(%%r1,%[x]),0\n\t"
"vleg %%v16,128(%%r1,%3),0 \n\t" "vleg %%v17,136(%%r1,%[x]),0\n\t"
"vleg %%v17,136(%%r1,%3),0 \n\t" "vleg %%v16,144(%%r1,%[x]),1\n\t"
"vleg %%v16,144(%%r1,%3),1 \n\t" "vleg %%v17,152(%%r1,%[x]),1\n\t"
"vleg %%v17,152(%%r1,%3),1 \n\t" "vleg %%v18,160(%%r1,%[x]),0\n\t"
"vleg %%v18,160(%%r1,%3),0 \n\t" "vleg %%v19,168(%%r1,%[x]),0\n\t"
"vleg %%v19,168(%%r1,%3),0 \n\t" "vleg %%v18,176(%%r1,%[x]),1\n\t"
"vleg %%v18,176(%%r1,%3),1 \n\t" "vleg %%v19,184(%%r1,%[x]),1\n\t"
"vleg %%v19,184(%%r1,%3),1 \n\t" "vleg %%v20,192(%%r1,%[x]),0\n\t"
"vleg %%v20,192(%%r1,%3),0 \n\t" "vleg %%v21,200(%%r1,%[x]),0\n\t"
"vleg %%v21,200(%%r1,%3),0 \n\t" "vleg %%v20,208(%%r1,%[x]),1\n\t"
"vleg %%v20,208(%%r1,%3),1 \n\t" "vleg %%v21,216(%%r1,%[x]),1\n\t"
"vleg %%v21,216(%%r1,%3),1 \n\t" "vleg %%v22,224(%%r1,%[x]),0\n\t"
"vleg %%v22,224(%%r1,%3),0 \n\t" "vleg %%v23,232(%%r1,%[x]),0\n\t"
"vleg %%v23,232(%%r1,%3),0 \n\t" "vleg %%v22,240(%%r1,%[x]),1\n\t"
"vleg %%v22,240(%%r1,%3),1 \n\t" "vleg %%v23,248(%%r1,%[x]),1\n\t"
"vleg %%v23,248(%%r1,%3),1 \n\t"
"vflpdb %%v16, %%v16\n\t" "vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t" "vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t" "vflpdb %%v18, %%v18\n\t"
@ -138,60 +126,55 @@ static BLASLONG izamin_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amin)
"vfadb %%v17,%%v18,%%v19\n\t" "vfadb %%v17,%%v18,%%v19\n\t"
"vfadb %%v18,%%v20,%%v21\n\t" "vfadb %%v18,%%v20,%%v21\n\t"
"vfadb %%v19,%%v22,%%v23\n\t" "vfadb %%v19,%%v22,%%v23\n\t"
"vfchedb %%v4,%%v17,%%v16\n\t" "vfchedb %%v4,%%v17,%%v16\n\t"
"vfchedb %%v5,%%v19,%%v18\n\t" "vfchedb %%v5,%%v19,%%v18\n\t"
"vsel %%v16,%%v16,%%v17,%%v4\n\t" "vsel %%v16,%%v16,%%v17,%%v4\n\t"
"vsel %%v4,%%v24,%%v25,%%v4\n\t" "vsel %%v4,%%v24,%%v25,%%v4\n\t"
"vsel %%v17,%%v18,%%v19,%%v5\n\t" "vsel %%v17,%%v18,%%v19,%%v5\n\t"
"vsel %%v5,%%v26,%%v27,%%v5\n\t" "vsel %%v5,%%v26,%%v27,%%v5\n\t"
"vfchedb %%v18,%%v17,%%v16\n\t" "vfchedb %%v18,%%v17,%%v16\n\t"
"vsel %%v16,%%v16,%%v17,%%v18\n\t" "vsel %%v16,%%v16,%%v17,%%v18\n\t"
"vsel %%v4,%%v4,%%v5,%%v18\n\t" "vsel %%v4,%%v4,%%v5,%%v18\n\t"
"vag %%v4,%%v4,%%v3\n\t" "vag %%v4,%%v4,%%v3\n\t"
"vfchedb %%v5,%%v16,%%v0\n\t" "vfchedb %%v5,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v5\n\t" "vsel %%v0,%%v0,%%v16,%%v5\n\t"
"vsel %%v1,%%v1,%%v4,%%v5\n\t" "vsel %%v1,%%v1,%%v4,%%v5\n\t"
"vag %%v3,%%v3,%%v2\n\t" "vag %%v3,%%v3,%%v2\n\t"
"agfi %%r1, 256\n\t" "agfi %%r1, 256\n\t"
"brctg %%r0, 0b \n\t" "brctg %[n], 0b\n\t"
"vrepg %%v2,%%v0,1\n\t" "vrepg %%v2,%%v0,1\n\t"
"vrepg %%v3,%%v1,1\n\t" "vrepg %%v3,%%v1,1\n\t"
"wfcdb %%v2,%%v0\n\t" "wfcdb %%v2,%%v0\n\t"
"jne 1f\n\t" "jne 1f\n\t"
"vsteg %%v0,%1,0 \n\t" "vsteg %%v0,%[amin],0\n\t"
"vmnlg %%v0,%%v1,%%v3\n\t" "vmnlg %%v0,%%v1,%%v3\n\t"
"vlgvg %0,%%v0,0 \n\t" "vlgvg %[iamin],%%v0,0\n\t"
"j 2f\n\t" "j 2f\n\t"
"1:\n\t" "1:\n\t"
"wfchdb %%v4,%%v0,%%v2\n\t" "wfchdb %%v4,%%v0,%%v2\n\t"
"vsel %%v1,%%v3,%%v1,%%v4\n\t" "vsel %%v1,%%v3,%%v1,%%v4\n\t"
"vsel %%v0,%%v2,%%v0,%%v4\n\t" "vsel %%v0,%%v2,%%v0,%%v4\n\t"
"std %%f0,%1 \n\t" "std %%f0,%[amin]\n\t"
"vlgvg %0,%%v1,0 \n\t" "vlgvg %[iamin],%%v1,0\n\t"
"2:\n\t" "2:\n\t"
"nop" "nop"
:"=r"(iamin),"=m"(*amin) : [iamin] "=r"(iamin),[amin] "=Q"(*amin),[n] "+&r"(n)
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x) : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18",
); "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27");
return iamin; return iamin;
} }
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
{
BLASLONG i = 0; BLASLONG i = 0;
BLASLONG ix = 0; BLASLONG ix = 0;
FLOAT minf = 0; FLOAT minf = 0;
BLASLONG min = 0; BLASLONG min = 0;
BLASLONG inc_x2; BLASLONG inc_x2;
if (n <= 0 || inc_x <= 0) return(min); if (n <= 0 || inc_x <= 0)
return (min);
if (inc_x == 1) { if (inc_x == 1) {
@ -201,18 +184,14 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
min = izamin_kernel_16(n1, x, &minf); min = izamin_kernel_16(n1, x, &minf);
ix = n1 * 2; ix = n1 * 2;
i = n1; i = n1;
} } else {
else
{
minf = CABS1(x, 0); minf = CABS1(x, 0);
ix += 2; ix += 2;
i++; i++;
} }
while(i < n) while (i < n) {
{ if (CABS1(x, ix) < minf) {
if( CABS1(x,ix) < minf )
{
min = i; min = i;
minf = CABS1(x, ix); minf = CABS1(x, ix);
} }
@ -226,13 +205,35 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
min = 0; min = 0;
minf = CABS1(x, 0); minf = CABS1(x, 0);
inc_x2 = 2 * inc_x; inc_x2 = 2 * inc_x;
ix += inc_x2;
i++;
while(i < n) BLASLONG n1 = n & -4;
{ while (i < n1) {
if( CABS1(x,ix) < minf )
{ if (CABS1(x, ix) < minf) {
min = i;
minf = CABS1(x, ix);
}
if (CABS1(x, ix + inc_x2) < minf) {
min = i + 1;
minf = CABS1(x, ix + inc_x2);
}
if (CABS1(x, ix + 2 * inc_x2) < minf) {
min = i + 2;
minf = CABS1(x, ix + 2 * inc_x2);
}
if (CABS1(x, ix + 3 * inc_x2) < minf) {
min = i + 3;
minf = CABS1(x, ix + 3 * inc_x2);
}
ix += inc_x2 * 4;
i += 4;
}
while (i < n) {
if (CABS1(x, ix) < minf) {
min = i; min = i;
minf = CABS1(x, ix); minf = CABS1(x, ix);
} }
@ -242,5 +243,3 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
return (min + 1); return (min + 1);
} }
} }

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -28,40 +28,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf #define ABS fabsf
#endif
static FLOAT samax_kernel_64(BLASLONG n, FLOAT *x) static FLOAT samax_kernel_64(BLASLONG n, FLOAT *x) {
{
FLOAT amax; FLOAT amax;
__asm__ volatile ( __asm__("vl %%v0,0(%[x])\n\t"
"vl %%v0,0(%2) \n\t" "srlg %[n],%[n],6\n\t"
"srlg %%r0,%1,6 \n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"0:\n\t" "0:\n\t"
"pfd 1, 1024(%%r1,%2) \n\t" "pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%2) \n\t" "vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%2) \n\t" "vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%2) \n\t" "vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%2) \n\t" "vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%2) \n\t" "vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%2) \n\t" "vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%2) \n\t" "vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%2) \n\t" "vl %%v24,128(%%r1,%[x])\n\t"
"vl %%v24,128(%%r1,%2) \n\t" "vl %%v25,144(%%r1,%[x])\n\t"
"vl %%v25,144(%%r1,%2) \n\t" "vl %%v26,160(%%r1,%[x])\n\t"
"vl %%v26,160(%%r1,%2) \n\t" "vl %%v27,176(%%r1,%[x])\n\t"
"vl %%v27,176(%%r1,%2) \n\t" "vl %%v28,192(%%r1,%[x])\n\t"
"vl %%v28,192(%%r1,%2) \n\t" "vl %%v29,208(%%r1,%[x])\n\t"
"vl %%v29,208(%%r1,%2) \n\t" "vl %%v30,224(%%r1,%[x])\n\t"
"vl %%v30,224(%%r1,%2) \n\t" "vl %%v31,240(%%r1,%[x])\n\t"
"vl %%v31,240(%%r1,%2) \n\t"
"vfmaxsb %%v16,%%v16,%%v24,8\n\t" "vfmaxsb %%v16,%%v16,%%v24,8\n\t"
"vfmaxsb %%v17,%%v17,%%v25,8\n\t" "vfmaxsb %%v17,%%v17,%%v25,8\n\t"
"vfmaxsb %%v18,%%v18,%%v26,8\n\t" "vfmaxsb %%v18,%%v18,%%v26,8\n\t"
@ -70,32 +62,25 @@ static FLOAT samax_kernel_64(BLASLONG n, FLOAT *x)
"vfmaxsb %%v21,%%v21,%%v29,8\n\t" "vfmaxsb %%v21,%%v21,%%v29,8\n\t"
"vfmaxsb %%v22,%%v22,%%v30,8\n\t" "vfmaxsb %%v22,%%v22,%%v30,8\n\t"
"vfmaxsb %%v23,%%v23,%%v31,8\n\t" "vfmaxsb %%v23,%%v23,%%v31,8\n\t"
"vfmaxsb %%v16,%%v16,%%v20,8\n\t" "vfmaxsb %%v16,%%v16,%%v20,8\n\t"
"vfmaxsb %%v17,%%v17,%%v21,8\n\t" "vfmaxsb %%v17,%%v17,%%v21,8\n\t"
"vfmaxsb %%v18,%%v18,%%v22,8\n\t" "vfmaxsb %%v18,%%v18,%%v22,8\n\t"
"vfmaxsb %%v19,%%v19,%%v23,8\n\t" "vfmaxsb %%v19,%%v19,%%v23,8\n\t"
"vfmaxsb %%v16,%%v16,%%v18,8\n\t" "vfmaxsb %%v16,%%v16,%%v18,8\n\t"
"vfmaxsb %%v17,%%v17,%%v19,8\n\t" "vfmaxsb %%v17,%%v17,%%v19,8\n\t"
"vfmaxsb %%v16,%%v16,%%v17,8\n\t" "vfmaxsb %%v16,%%v16,%%v17,8\n\t"
"vfmaxsb %%v0,%%v0,%%v16,8\n\t" "vfmaxsb %%v0,%%v0,%%v16,8\n\t"
"agfi %%r1, 256\n\t" "agfi %%r1, 256\n\t"
"brctg %%r0, 0b \n\t" "brctg %[n], 0b\n\t"
"veslg %%v16,%%v0,32\n\t" "veslg %%v16,%%v0,32\n\t"
"vfmaxsb %%v0,%%v0,%%v16,8\n\t" "vfmaxsb %%v0,%%v0,%%v16,8\n\t"
"vrepf %%v16,%%v0,2\n\t" "vrepf %%v16,%%v0,2\n\t"
"wfmaxsb %%v0,%%v0,%%v16,8\n\t" "wfmaxsb %%v0,%%v0,%%v16,8\n\t"
"lper %0,%%f0 " "lper %[amax],%%f0"
:"=f"(amax) : [amax] "=f"(amax),[n] "+&r"(n)
:"r"(n),"ZR"((const FLOAT (*)[n])x) : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
); "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
return amax; return amax;
} }
@ -105,7 +90,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG j = 0; BLASLONG j = 0;
FLOAT maxf = 0.0; FLOAT maxf = 0.0;
if (n <= 0 || inc_x <= 0) return (maxf); if (n <= 0 || inc_x <= 0)
return (maxf);
if (inc_x == 1) { if (inc_x == 1) {
@ -115,9 +101,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
maxf = samax_kernel_64(n1, x); maxf = samax_kernel_64(n1, x);
i = n1; i = n1;
} } else {
else
{
maxf = ABS(x[0]); maxf = ABS(x[0]);
i++; i++;
} }
@ -156,7 +140,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
} }
while (j < n) { while (j < n) {
if (ABS(x[i]) > maxf) { if (ABS(x[i]) > maxf) {
maxf = ABS(x[i]); maxf = ABS(x[i]);

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -28,40 +28,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf #define ABS fabsf
#endif
static FLOAT samin_kernel_64(BLASLONG n, FLOAT *x) static FLOAT samin_kernel_64(BLASLONG n, FLOAT *x) {
{
FLOAT amin; FLOAT amin;
__asm__ volatile ( __asm__("vl %%v0,0(%[x])\n\t"
"vl %%v0,0(%2) \n\t" "srlg %[n],%[n],6\n\t"
"srlg %%r0,%1,6 \n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"0:\n\t" "0:\n\t"
"pfd 1, 1024(%%r1,%2) \n\t" "pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%2) \n\t" "vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%2) \n\t" "vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%2) \n\t" "vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%2) \n\t" "vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%2) \n\t" "vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%2) \n\t" "vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%2) \n\t" "vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%2) \n\t" "vl %%v24,128(%%r1,%[x])\n\t"
"vl %%v24,128(%%r1,%2) \n\t" "vl %%v25,144(%%r1,%[x])\n\t"
"vl %%v25,144(%%r1,%2) \n\t" "vl %%v26,160(%%r1,%[x])\n\t"
"vl %%v26,160(%%r1,%2) \n\t" "vl %%v27,176(%%r1,%[x])\n\t"
"vl %%v27,176(%%r1,%2) \n\t" "vl %%v28,192(%%r1,%[x])\n\t"
"vl %%v28,192(%%r1,%2) \n\t" "vl %%v29,208(%%r1,%[x])\n\t"
"vl %%v29,208(%%r1,%2) \n\t" "vl %%v30,224(%%r1,%[x])\n\t"
"vl %%v30,224(%%r1,%2) \n\t" "vl %%v31,240(%%r1,%[x])\n\t"
"vl %%v31,240(%%r1,%2) \n\t"
"vfminsb %%v16,%%v16,%%v24,8\n\t" "vfminsb %%v16,%%v16,%%v24,8\n\t"
"vfminsb %%v17,%%v17,%%v25,8\n\t" "vfminsb %%v17,%%v17,%%v25,8\n\t"
"vfminsb %%v18,%%v18,%%v26,8\n\t" "vfminsb %%v18,%%v18,%%v26,8\n\t"
@ -70,32 +62,25 @@ static FLOAT samin_kernel_64(BLASLONG n, FLOAT *x)
"vfminsb %%v21,%%v21,%%v29,8\n\t" "vfminsb %%v21,%%v21,%%v29,8\n\t"
"vfminsb %%v22,%%v22,%%v30,8\n\t" "vfminsb %%v22,%%v22,%%v30,8\n\t"
"vfminsb %%v23,%%v23,%%v31,8\n\t" "vfminsb %%v23,%%v23,%%v31,8\n\t"
"vfminsb %%v16,%%v16,%%v20,8\n\t" "vfminsb %%v16,%%v16,%%v20,8\n\t"
"vfminsb %%v17,%%v17,%%v21,8\n\t" "vfminsb %%v17,%%v17,%%v21,8\n\t"
"vfminsb %%v18,%%v18,%%v22,8\n\t" "vfminsb %%v18,%%v18,%%v22,8\n\t"
"vfminsb %%v19,%%v19,%%v23,8\n\t" "vfminsb %%v19,%%v19,%%v23,8\n\t"
"vfminsb %%v16,%%v16,%%v18,8\n\t" "vfminsb %%v16,%%v16,%%v18,8\n\t"
"vfminsb %%v17,%%v17,%%v19,8\n\t" "vfminsb %%v17,%%v17,%%v19,8\n\t"
"vfminsb %%v16,%%v16,%%v17,8\n\t" "vfminsb %%v16,%%v16,%%v17,8\n\t"
"vfminsb %%v0,%%v0,%%v16,8\n\t" "vfminsb %%v0,%%v0,%%v16,8\n\t"
"agfi %%r1, 256\n\t" "agfi %%r1, 256\n\t"
"brctg %%r0, 0b \n\t" "brctg %[n], 0b\n\t"
"veslg %%v16,%%v0,32\n\t" "veslg %%v16,%%v0,32\n\t"
"vfminsb %%v0,%%v0,%%v16,8\n\t" "vfminsb %%v0,%%v0,%%v16,8\n\t"
"vrepf %%v16,%%v0,2\n\t" "vrepf %%v16,%%v0,2\n\t"
"wfminsb %%v0,%%v0,%%v16,8\n\t" "wfminsb %%v0,%%v0,%%v16,8\n\t"
"lper %0,%%f0 " "lper %[amin],%%f0"
:"=f"(amin) : [amin] "=f"(amin),[n] "+&r"(n)
:"r"(n),"ZR"((const FLOAT (*)[n])x) : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
); "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
return amin; return amin;
} }
@ -105,7 +90,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG j = 0; BLASLONG j = 0;
FLOAT minf = 0.0; FLOAT minf = 0.0;
if (n <= 0 || inc_x <= 0) return (minf); if (n <= 0 || inc_x <= 0)
return (minf);
if (inc_x == 1) { if (inc_x == 1) {
@ -115,9 +101,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
minf = samin_kernel_64(n1, x); minf = samin_kernel_64(n1, x);
i = n1; i = n1;
} } else {
else
{
minf = ABS(x[0]); minf = ABS(x[0]);
i++; i++;
} }
@ -156,7 +140,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
} }
while (j < n) { while (j < n) {
if (ABS(x[i]) < minf) { if (ABS(x[i]) < minf) {
minf = ABS(x[i]); minf = ABS(x[i]);

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -28,34 +28,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf #define ABS fabsf
#endif
static FLOAT sasum_kernel_64(BLASLONG n, FLOAT *x) static FLOAT sasum_kernel_64(BLASLONG n, FLOAT *x) {
{
FLOAT asum; FLOAT asum;
__asm__ ( __asm__("vzero %%v24\n\t"
"vzero %%v0 \n\t" "vzero %%v25\n\t"
"vzero %%v1 \n\t" "vzero %%v26\n\t"
"vzero %%v2 \n\t" "vzero %%v27\n\t"
"vzero %%v3 \n\t" "vzero %%v28\n\t"
"srlg %%r0,%1,6 \n\t" "vzero %%v29\n\t"
"vzero %%v30\n\t"
"vzero %%v31\n\t"
"srlg %[n],%[n],6\n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"0:\n\t" "0:\n\t"
"pfd 1, 1024(%%r1,%2) \n\t" "pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16, 0(%%r1,%2) \n\t" "vl %%v16, 0(%%r1,%[x])\n\t"
"vl %%v17, 16(%%r1,%2) \n\t" "vl %%v17, 16(%%r1,%[x])\n\t"
"vl %%v18, 32(%%r1,%2) \n\t" "vl %%v18, 32(%%r1,%[x])\n\t"
"vl %%v19, 48(%%r1,%2) \n\t" "vl %%v19, 48(%%r1,%[x])\n\t"
"vl %%v20, 64(%%r1,%2) \n\t" "vl %%v20, 64(%%r1,%[x])\n\t"
"vl %%v21, 80(%%r1,%2) \n\t" "vl %%v21, 80(%%r1,%[x])\n\t"
"vl %%v22, 96(%%r1,%2) \n\t" "vl %%v22, 96(%%r1,%[x])\n\t"
"vl %%v23, 112(%%r1,%2) \n\t" "vl %%v23, 112(%%r1,%[x])\n\t"
"vflpsb %%v16, %%v16\n\t" "vflpsb %%v16, %%v16\n\t"
"vflpsb %%v17, %%v17\n\t" "vflpsb %%v17, %%v17\n\t"
"vflpsb %%v18, %%v18\n\t" "vflpsb %%v18, %%v18\n\t"
@ -64,25 +61,22 @@ static FLOAT sasum_kernel_64(BLASLONG n, FLOAT *x)
"vflpsb %%v21, %%v21\n\t" "vflpsb %%v21, %%v21\n\t"
"vflpsb %%v22, %%v22\n\t" "vflpsb %%v22, %%v22\n\t"
"vflpsb %%v23, %%v23\n\t" "vflpsb %%v23, %%v23\n\t"
"vfasb %%v24,%%v24,%%v16\n\t"
"vfasb %%v0,%%v0,%%v16 \n\t" "vfasb %%v25,%%v25,%%v17\n\t"
"vfasb %%v1,%%v1,%%v17 \n\t" "vfasb %%v26,%%v26,%%v18\n\t"
"vfasb %%v2,%%v2,%%v18 \n\t" "vfasb %%v27,%%v27,%%v19\n\t"
"vfasb %%v3,%%v3,%%v19 \n\t" "vfasb %%v28,%%v28,%%v20\n\t"
"vfasb %%v0,%%v0,%%v20 \n\t" "vfasb %%v29,%%v29,%%v21\n\t"
"vfasb %%v1,%%v1,%%v21 \n\t" "vfasb %%v30,%%v30,%%v22\n\t"
"vfasb %%v2,%%v2,%%v22 \n\t" "vfasb %%v31,%%v31,%%v23\n\t"
"vfasb %%v3,%%v3,%%v23 \n\t" "vl %%v16, 128(%%r1,%[x])\n\t"
"vl %%v17, 144(%%r1,%[x])\n\t"
"vl %%v16, 128(%%r1,%2) \n\t" "vl %%v18, 160(%%r1,%[x])\n\t"
"vl %%v17, 144(%%r1,%2) \n\t" "vl %%v19, 176(%%r1,%[x])\n\t"
"vl %%v18, 160(%%r1,%2) \n\t" "vl %%v20, 192(%%r1,%[x])\n\t"
"vl %%v19, 176(%%r1,%2) \n\t" "vl %%v21, 208(%%r1,%[x])\n\t"
"vl %%v20, 192(%%r1,%2) \n\t" "vl %%v22, 224(%%r1,%[x])\n\t"
"vl %%v21, 208(%%r1,%2) \n\t" "vl %%v23, 240(%%r1,%[x])\n\t"
"vl %%v22, 224(%%r1,%2) \n\t"
"vl %%v23, 240(%%r1,%2) \n\t"
"vflpsb %%v16, %%v16\n\t" "vflpsb %%v16, %%v16\n\t"
"vflpsb %%v17, %%v17\n\t" "vflpsb %%v17, %%v17\n\t"
"vflpsb %%v18, %%v18\n\t" "vflpsb %%v18, %%v18\n\t"
@ -91,30 +85,32 @@ static FLOAT sasum_kernel_64(BLASLONG n, FLOAT *x)
"vflpsb %%v21, %%v21\n\t" "vflpsb %%v21, %%v21\n\t"
"vflpsb %%v22, %%v22\n\t" "vflpsb %%v22, %%v22\n\t"
"vflpsb %%v23, %%v23\n\t" "vflpsb %%v23, %%v23\n\t"
"vfasb %%v24,%%v24,%%v16\n\t"
"vfasb %%v0,%%v0,%%v16 \n\t" "vfasb %%v25,%%v25,%%v17\n\t"
"vfasb %%v1,%%v1,%%v17 \n\t" "vfasb %%v26,%%v26,%%v18\n\t"
"vfasb %%v2,%%v2,%%v18 \n\t" "vfasb %%v27,%%v27,%%v19\n\t"
"vfasb %%v3,%%v3,%%v19 \n\t" "vfasb %%v28,%%v28,%%v20\n\t"
"vfasb %%v0,%%v0,%%v20 \n\t" "vfasb %%v29,%%v29,%%v21\n\t"
"vfasb %%v1,%%v1,%%v21 \n\t" "vfasb %%v30,%%v30,%%v22\n\t"
"vfasb %%v2,%%v2,%%v22 \n\t" "vfasb %%v31,%%v31,%%v23\n\t"
"vfasb %%v3,%%v3,%%v23 \n\t"
"agfi %%r1,256\n\t" "agfi %%r1,256\n\t"
"brctg %%r0,0b \n\t" "brctg %[n],0b\n\t"
"vfasb %%v0,%%v0,%%v1 \n\t" "vfasb %%v24,%%v24,%%v25\n\t"
"vfasb %%v0,%%v0,%%v2 \n\t" "vfasb %%v24,%%v24,%%v26\n\t"
"vfasb %%v0,%%v0,%%v3 \n\t" "vfasb %%v24,%%v24,%%v27\n\t"
"veslg %%v1,%%v0,32 \n\t" "vfasb %%v24,%%v24,%%v28\n\t"
"vfasb %%v0,%%v0,%%v1 \n\t" "vfasb %%v24,%%v24,%%v29\n\t"
"vrepf %%v1,%%v0,2 \n\t" "vfasb %%v24,%%v24,%%v30\n\t"
"aebr %%f0,%%f1 \n\t" "vfasb %%v24,%%v24,%%v31\n\t"
"ler %0,%%f0 " "veslg %%v25,%%v24,32\n\t"
:"=f"(asum) "vfasb %%v24,%%v24,%%v25\n\t"
:"r"(n),"ZR"((const FLOAT (*)[n])x) "vrepf %%v25,%%v24,2\n\t"
:"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23" "vfasb %%v24,%%v24,%%v25\n\t"
); "vstef %%v24,%[asum],0"
: [asum] "=Q"(asum),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
: "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
"v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
return asum; return asum;
} }
@ -125,7 +121,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
FLOAT sumf = 0.0; FLOAT sumf = 0.0;
BLASLONG n1; BLASLONG n1;
if (n <= 0 || inc_x <= 0) return sumf; if (n <= 0 || inc_x <= 0)
return sumf;
if (inc_x == 1) { if (inc_x == 1) {
@ -166,9 +163,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
j++; j++;
} }
} }
return sumf; return sumf;
} }

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -27,107 +27,96 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) {
{ __asm__("vlrepf %%v0,%[alpha]\n\t"
__asm__ volatile( "srlg %[n],%[n],6\n\t"
"vlrepf %%v0,%3 \n\t"
"srlg %%r0,%0,6 \n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"0:\n\t" "0:\n\t"
"pfd 1, 1024(%%r1,%1) \n\t" "pfd 1, 1024(%%r1,%[x])\n\t"
"pfd 2, 1024(%%r1,%2) \n\t" "pfd 2, 1024(%%r1,%[y])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%1) \n\t" "vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%1) \n\t" "vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%1) \n\t" "vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%1) \n\t" "vl %%v20,0(%%r1,%[y])\n\t"
"vl %%v20,0(%%r1,%2) \n\t" "vl %%v21,16(%%r1,%[y])\n\t"
"vl %%v21,16(%%r1,%2) \n\t" "vl %%v22,32(%%r1,%[y])\n\t"
"vl %%v22,32(%%r1,%2) \n\t" "vl %%v23,48(%%r1,%[y])\n\t"
"vl %%v23,48(%%r1,%2) \n\t" "vl %%v24,64(%%r1,%[x])\n\t"
"vl %%v25,80(%%r1,%[x])\n\t"
"vl %%v26,96(%%r1,%[x])\n\t"
"vl %%v27,112(%%r1,%[x])\n\t"
"vl %%v28,64(%%r1,%[y])\n\t"
"vl %%v29,80(%%r1,%[y])\n\t"
"vl %%v30,96(%%r1,%[y])\n\t"
"vl %%v31,112(%%r1,%[y])\n\t"
"vfmasb %%v16,%%v0,%%v16,%%v20\n\t" "vfmasb %%v16,%%v0,%%v16,%%v20\n\t"
"vfmasb %%v17,%%v0,%%v17,%%v21\n\t" "vfmasb %%v17,%%v0,%%v17,%%v21\n\t"
"vfmasb %%v18,%%v0,%%v18,%%v22\n\t" "vfmasb %%v18,%%v0,%%v18,%%v22\n\t"
"vfmasb %%v19,%%v0,%%v19,%%v23\n\t" "vfmasb %%v19,%%v0,%%v19,%%v23\n\t"
"vfmasb %%v24,%%v0,%%v24,%%v28\n\t"
"vl %%v24,64(%%r1,%1) \n\t" "vfmasb %%v25,%%v0,%%v25,%%v29\n\t"
"vl %%v25,80(%%r1,%1) \n\t" "vfmasb %%v26,%%v0,%%v26,%%v30\n\t"
"vl %%v26,96(%%r1,%1) \n\t" "vfmasb %%v27,%%v0,%%v27,%%v31\n\t"
"vl %%v27,112(%%r1,%1) \n\t" "vst %%v16,0(%%r1,%[y])\n\t"
"vl %%v28,64(%%r1,%2) \n\t" "vst %%v17,16(%%r1,%[y])\n\t"
"vl %%v29,80(%%r1,%2) \n\t" "vst %%v18,32(%%r1,%[y])\n\t"
"vl %%v30,96(%%r1,%2) \n\t" "vst %%v19,48(%%r1,%[y])\n\t"
"vl %%v31,112(%%r1,%2) \n\t" "vst %%v24,64(%%r1,%[y])\n\t"
"vst %%v25,80(%%r1,%[y])\n\t"
"vfmasb %%v20,%%v0,%%v24,%%v28 \n\t" "vst %%v26,96(%%r1,%[y])\n\t"
"vfmasb %%v21,%%v0,%%v25,%%v29 \n\t" "vst %%v27,112(%%r1,%[y])\n\t"
"vfmasb %%v22,%%v0,%%v26,%%v30 \n\t" "vl %%v16,128(%%r1,%[x])\n\t"
"vfmasb %%v23,%%v0,%%v27,%%v31 \n\t" "vl %%v17,144(%%r1,%[x])\n\t"
"vl %%v18,160(%%r1,%[x])\n\t"
"vst %%v16,0(%%r1,%2) \n\t" "vl %%v19,176(%%r1,%[x])\n\t"
"vst %%v17,16(%%r1,%2) \n\t" "vl %%v20,128(%%r1,%[y])\n\t"
"vst %%v18,32(%%r1,%2) \n\t" "vl %%v21,144(%%r1,%[y])\n\t"
"vst %%v19,48(%%r1,%2) \n\t" "vl %%v22,160(%%r1,%[y])\n\t"
"vst %%v20,64(%%r1,%2) \n\t" "vl %%v23,176(%%r1,%[y])\n\t"
"vst %%v21,80(%%r1,%2) \n\t" "vl %%v24,192(%%r1,%[x])\n\t"
"vst %%v22,96(%%r1,%2) \n\t" "vl %%v25,208(%%r1,%[x])\n\t"
"vst %%v23,112(%%r1,%2) \n\t" "vl %%v26,224(%%r1,%[x])\n\t"
"vl %%v27,240(%%r1,%[x])\n\t"
"vl %%v16,128(%%r1,%1) \n\t" "vl %%v28,192(%%r1,%[y])\n\t"
"vl %%v17,144(%%r1,%1) \n\t" "vl %%v29,208(%%r1,%[y])\n\t"
"vl %%v18,160(%%r1,%1) \n\t" "vl %%v30,224(%%r1,%[y])\n\t"
"vl %%v19,176(%%r1,%1) \n\t" "vl %%v31,240(%%r1,%[y])\n\t"
"vl %%v20,128(%%r1,%2) \n\t"
"vl %%v21,144(%%r1,%2) \n\t"
"vl %%v22,160(%%r1,%2) \n\t"
"vl %%v23,176(%%r1,%2) \n\t"
"vfmasb %%v16,%%v0,%%v16,%%v20\n\t" "vfmasb %%v16,%%v0,%%v16,%%v20\n\t"
"vfmasb %%v17,%%v0,%%v17,%%v21\n\t" "vfmasb %%v17,%%v0,%%v17,%%v21\n\t"
"vfmasb %%v18,%%v0,%%v18,%%v22\n\t" "vfmasb %%v18,%%v0,%%v18,%%v22\n\t"
"vfmasb %%v19,%%v0,%%v19,%%v23\n\t" "vfmasb %%v19,%%v0,%%v19,%%v23\n\t"
"vfmasb %%v24,%%v0,%%v24,%%v28\n\t"
"vl %%v24,192(%%r1,%1) \n\t" "vfmasb %%v25,%%v0,%%v25,%%v29\n\t"
"vl %%v25,208(%%r1,%1) \n\t" "vfmasb %%v26,%%v0,%%v26,%%v30\n\t"
"vl %%v26,224(%%r1,%1) \n\t" "vfmasb %%v27,%%v0,%%v27,%%v31\n\t"
"vl %%v27,240(%%r1,%1) \n\t" "vst %%v16,128(%%r1,%[y])\n\t"
"vl %%v28,192(%%r1,%2) \n\t" "vst %%v17,144(%%r1,%[y])\n\t"
"vl %%v29,208(%%r1,%2) \n\t" "vst %%v18,160(%%r1,%[y])\n\t"
"vl %%v30,224(%%r1,%2) \n\t" "vst %%v19,176(%%r1,%[y])\n\t"
"vl %%v31,240(%%r1,%2) \n\t" "vst %%v24,192(%%r1,%[y])\n\t"
"vst %%v25,208(%%r1,%[y])\n\t"
"vfmasb %%v20,%%v0,%%v24,%%v28 \n\t" "vst %%v26,224(%%r1,%[y])\n\t"
"vfmasb %%v21,%%v0,%%v25,%%v29 \n\t" "vst %%v27,240(%%r1,%[y])\n\t"
"vfmasb %%v22,%%v0,%%v26,%%v30 \n\t"
"vfmasb %%v23,%%v0,%%v27,%%v31 \n\t"
"vst %%v16,128(%%r1,%2) \n\t"
"vst %%v17,144(%%r1,%2) \n\t"
"vst %%v18,160(%%r1,%2) \n\t"
"vst %%v19,176(%%r1,%2) \n\t"
"vst %%v20,192(%%r1,%2) \n\t"
"vst %%v21,208(%%r1,%2) \n\t"
"vst %%v22,224(%%r1,%2) \n\t"
"vst %%v23,240(%%r1,%2) \n\t"
"agfi %%r1,256\n\t" "agfi %%r1,256\n\t"
"brctg %%r0,0b " "brctg %[n],0b"
: : "+m"(*(struct { FLOAT x[n]; } *) y),[n] "+&r"(n)
:"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y),"m"(*alpha) : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" [alpha] "Q"(*alpha)
); : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
} }
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x,
{ BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
BLASLONG dummy2) {
BLASLONG i = 0; BLASLONG i = 0;
BLASLONG ix = 0, iy = 0; BLASLONG ix = 0, iy = 0;
if ( n <= 0 ) return 0 ; if (n <= 0)
return 0;
if ( (inc_x == 1) && (inc_y == 1) ) if ((inc_x == 1) && (inc_y == 1)) {
{
BLASLONG n1 = n & -64; BLASLONG n1 = n & -64;
@ -135,8 +124,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
saxpy_kernel_64(n1, x, y, &da); saxpy_kernel_64(n1, x, y, &da);
i = n1; i = n1;
while(i < n) while (i < n) {
{
y[i] += da * x[i]; y[i] += da * x[i];
i++; i++;
@ -144,13 +132,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
} }
return 0; return 0;
} }
BLASLONG n1 = n & -4; BLASLONG n1 = n & -4;
while(i < n1) while (i < n1) {
{
FLOAT m1 = da * x[ix]; FLOAT m1 = da * x[ix];
FLOAT m2 = da * x[ix + inc_x]; FLOAT m2 = da * x[ix + inc_x];
@ -168,8 +154,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
} }
while(i < n) while (i < n) {
{
y[iy] += da * x[ix]; y[iy] += da * x[ix];
ix += inc_x; ix += inc_x;
@ -180,5 +165,3 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
return 0; return 0;
} }

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -27,30 +27,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
static void scopy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y) static void scopy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y) {
{ __asm__("srlg %[n],%[n],6\n\t"
__asm__ volatile (
"lgr %%r1,%1 \n\t"
"lgr %%r2,%2 \n\t"
"srlg %%r0,%0,6 \n\t"
"0:\n\t" "0:\n\t"
"pfd 1, 1024(%%r1) \n\t" "pfd 1, 1024(%[x])\n\t"
"pfd 2, 1024(%%r2) \n\t" "pfd 2, 1024(%[y])\n\t"
"mvc 0(256,%%r2),0(%%r1) \n\t" "mvc 0(256,%[y]),0(%[x])\n\t"
"agfi %%r1,256 \n\t" "la %[x],256(%[x])\n\t"
"agfi %%r2,256 \n\t" "la %[y],256(%[y])\n\t"
"brctg %%r0,0b " "brctg %[n],0b"
: : "=m"(*(struct { FLOAT x[n]; } *) y),[x] "+&a"(x),[y] "+&a"(y),[n] "+&r"(n)
:"r"(n),"a"((const FLOAT (*)[n])x),"a"((FLOAT (*)[n])y) : "m"(*(const struct { FLOAT x[n]; } *) x)
:"memory","cc","r0","r1","r2" : "cc");
);
} }
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
BLASLONG i = 0; BLASLONG i = 0;
BLASLONG ix = 0, iy = 0; BLASLONG ix = 0, iy = 0;
if (n <= 0) return 0; if (n <= 0)
return 0;
if ((inc_x == 1) && (inc_y == 1)) { if ((inc_x == 1) && (inc_y == 1)) {
@ -66,7 +62,6 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
} }
} else { } else {
while (i < n) { while (i < n) {
@ -81,5 +76,4 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
} }
return 0; return 0;
} }

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2018,The OpenBLAS Project Copyright (c) 2013-2019,The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms,with or without Redistribution and use in source and binary forms,with or without
modification,are permitted provided that the following conditions are modification,are permitted provided that the following conditions are
@ -27,72 +27,82 @@ USE OF THIS SOFTWARE,EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
static FLOAT sdot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) static FLOAT sdot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) {
{
FLOAT dot; FLOAT dot;
__asm__ volatile ( __asm__("vzero %%v0\n\t"
"vzero %%v0 \n\t" "vzero %%v1\n\t"
"srlg %%r0,%1,5 \n\t" "vzero %%v2\n\t"
"vzero %%v3\n\t"
"vzero %%v4\n\t"
"vzero %%v5\n\t"
"vzero %%v6\n\t"
"vzero %%v7\n\t"
"srlg %[n],%[n],5\n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"0:\n\t" "0:\n\t"
"pfd 1,1024(%%r1,%2) \n\t" "pfd 1,1024(%%r1,%[x])\n\t"
"pfd 1,1024(%%r1,%3) \n\t" "pfd 1,1024(%%r1,%[y])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%2) \n\t" "vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%2) \n\t" "vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%2) \n\t" "vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%2) \n\t" "vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%2) \n\t" "vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%2) \n\t" "vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%2) \n\t" "vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%2) \n\t" "vl %%v24,0(%%r1,%[y])\n\t"
"vl %%v25,16(%%r1,%[y])\n\t"
"vl %%v24,0(%%r1,%3) \n\t" "vl %%v26,32(%%r1,%[y])\n\t"
"vl %%v27,48(%%r1,%[y])\n\t"
"vl %%v28,64(%%r1,%[y])\n\t"
"vl %%v29,80(%%r1,%[y])\n\t"
"vl %%v30,96(%%r1,%[y])\n\t"
"vl %%v31,112(%%r1,%[y])\n\t"
"vfmasb %%v0,%%v16,%%v24,%%v0\n\t" "vfmasb %%v0,%%v16,%%v24,%%v0\n\t"
"vl %%v25,16(%%r1,%3) \n\t" "vfmasb %%v1,%%v17,%%v25,%%v1\n\t"
"vfmasb %%v0,%%v17,%%v25,%%v0 \n\t" "vfmasb %%v2,%%v18,%%v26,%%v2\n\t"
"vl %%v26,32(%%r1,%3) \n\t" "vfmasb %%v3,%%v19,%%v27,%%v3\n\t"
"vfmasb %%v0,%%v18,%%v26,%%v0 \n\t" "vfmasb %%v4,%%v20,%%v28,%%v4\n\t"
"vl %%v27,48(%%r1,%3) \n\t" "vfmasb %%v5,%%v21,%%v29,%%v5\n\t"
"vfmasb %%v0,%%v19,%%v27,%%v0 \n\t" "vfmasb %%v6,%%v22,%%v30,%%v6\n\t"
"vl %%v28,64(%%r1,%3) \n\t" "vfmasb %%v7,%%v23,%%v31,%%v7\n\t"
"vfmasb %%v0,%%v20,%%v28,%%v0 \n\t"
"vl %%v29,80(%%r1,%3) \n\t"
"vfmasb %%v0,%%v21,%%v29,%%v0 \n\t"
"vl %%v30,96(%%r1,%3) \n\t"
"vfmasb %%v0,%%v22,%%v30,%%v0 \n\t"
"vl %%v31,112(%%r1,%3) \n\t"
"vfmasb %%v0,%%v23,%%v31,%%v0 \n\t"
"agfi %%r1,128\n\t" "agfi %%r1,128\n\t"
"brctg %%r0,0b \n\t" "brctg %[n],0b\n\t"
"vfasb %%v0,%%v0,%%v1\n\t"
"vfasb %%v0,%%v0,%%v2\n\t"
"vfasb %%v0,%%v0,%%v3\n\t"
"vfasb %%v0,%%v0,%%v4\n\t"
"vfasb %%v0,%%v0,%%v5\n\t"
"vfasb %%v0,%%v0,%%v6\n\t"
"vfasb %%v0,%%v0,%%v7\n\t"
"vrepf %%v1,%%v0,1\n\t" "vrepf %%v1,%%v0,1\n\t"
"vrepf %%v2,%%v0,2\n\t" "vrepf %%v2,%%v0,2\n\t"
"vrepf %%v3,%%v0,3\n\t" "vrepf %%v3,%%v0,3\n\t"
"aebr %%f0,%%f1\n\t" "aebr %%f0,%%f1\n\t"
"aebr %%f0,%%f2\n\t" "aebr %%f0,%%f2\n\t"
"aebr %%f0,%%f3\n\t" "aebr %%f0,%%f3\n\t"
"ler %0,%%f0 " "ler %[dot],%%f0"
:"=f"(dot) : [dot] "=f"(dot),[n] "+&r"(n)
:"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((const FLOAT (*)[n])y) : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),
:"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" "m"(*(const struct { FLOAT x[n]; } *) y),[y] "a"(y)
); : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");
return dot; return dot;
} }
FLOAT CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y) FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
{
BLASLONG i = 0; BLASLONG i = 0;
BLASLONG ix = 0, iy = 0; BLASLONG ix = 0, iy = 0;
FLOAT dot = 0.0; FLOAT dot = 0.0;
if ( n <= 0 ) return(dot); if (n <= 0)
return (dot);
if ( (inc_x == 1) && (inc_y == 1) ) if ((inc_x == 1) && (inc_y == 1)) {
{
BLASLONG n1 = n & -32; BLASLONG n1 = n & -32;
@ -100,8 +110,7 @@ FLOAT CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y)
dot = sdot_kernel_32(n1, x, y); dot = sdot_kernel_32(n1, x, y);
i = n1; i = n1;
while(i < n) while (i < n) {
{
dot += y[i] * x[i]; dot += y[i] * x[i];
i++; i++;
@ -109,13 +118,11 @@ FLOAT CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y)
} }
return (dot); return (dot);
} }
BLASLONG n1 = n & -2; BLASLONG n1 = n & -2;
while(i < n1) while (i < n1) {
{
dot += y[iy] * x[ix] + y[iy + inc_y] * x[ix + inc_x]; dot += y[iy] * x[ix] + y[iy + inc_y] * x[ix + inc_x];
ix += inc_x * 2; ix += inc_x * 2;
@ -124,8 +131,7 @@ FLOAT CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y)
} }
while(i < n) while (i < n) {
{
dot += y[iy] * x[ix]; dot += y[iy] * x[ix];
ix += inc_x; ix += inc_x;
@ -136,5 +142,3 @@ FLOAT CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y)
return (dot); return (dot);
} }

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2017, The OpenBLAS Project Copyright (c) 2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -29,364 +29,329 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define NBMAX 2048 #define NBMAX 2048
static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y,
{ FLOAT *alpha) {
__asm__ volatile ( register FLOAT *ap0 = ap[0];
"vlrepf %%v0,0(%5) \n\t" register FLOAT *ap1 = ap[1];
"vlrepf %%v1,4(%5) \n\t" register FLOAT *ap2 = ap[2];
"vlrepf %%v2,8(%5) \n\t" register FLOAT *ap3 = ap[3];
"vlrepf %%v3,12(%5) \n\t"
"vlrepf %%v4,%7 \n\t" __asm__("vlrepf %%v0,0(%[x])\n\t"
"vlrepf %%v1,4(%[x])\n\t"
"vlrepf %%v2,8(%[x])\n\t"
"vlrepf %%v3,12(%[x])\n\t"
"vlrepf %%v4,%[alpha]\n\t"
"vfmsb %%v0,%%v0,%%v4\n\t" "vfmsb %%v0,%%v0,%%v4\n\t"
"vfmsb %%v1,%%v1,%%v4\n\t" "vfmsb %%v1,%%v1,%%v4\n\t"
"vfmsb %%v2,%%v2,%%v4\n\t" "vfmsb %%v2,%%v2,%%v4\n\t"
"vfmsb %%v3,%%v3,%%v4\n\t" "vfmsb %%v3,%%v3,%%v4\n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"lghi %%r0,-32\n\t" "lghi %%r0,-32\n\t"
"ngr %%r0,%0 \n\t" "ngr %%r0,%[n]\n\t"
"ltgr %%r0,%%r0\n\t" "ltgr %%r0,%%r0\n\t"
"jz 1f\n\t" "jz 1f\n\t"
"srlg %%r0,%%r0,5\n\t" "srlg %%r0,%%r0,5\n\t"
"0:\n\t" "0:\n\t"
"pfd 1,1024(%%r1,%1) \n\t" "pfd 1,1024(%%r1,%[ap0])\n\t"
"pfd 1,1024(%%r1,%2) \n\t" "pfd 1,1024(%%r1,%[ap1])\n\t"
"pfd 1,1024(%%r1,%3) \n\t" "pfd 1,1024(%%r1,%[ap2])\n\t"
"pfd 1,1024(%%r1,%4) \n\t" "pfd 1,1024(%%r1,%[ap3])\n\t"
"pfd 2,1024(%%r1,%6) \n\t" "pfd 2,1024(%%r1,%[y])\n\t"
"vl %%v16,0(%%r1,%[ap0])\n\t"
"vl %%v16,0(%%r1,%1) \n\t" "vl %%v17,0(%%r1,%[ap1])\n\t"
"vl %%v17,0(%%r1,%2) \n\t" "vl %%v18,0(%%r1,%[ap2])\n\t"
"vl %%v18,0(%%r1,%3) \n\t" "vl %%v19,0(%%r1,%[ap3])\n\t"
"vl %%v19,0(%%r1,%4) \n\t" "vl %%v20,16(%%r1,%[ap0])\n\t"
"vl %%v20,16(%%r1,%1) \n\t" "vl %%v21,16(%%r1,%[ap1])\n\t"
"vl %%v21,16(%%r1,%2) \n\t" "vl %%v22,16(%%r1,%[ap2])\n\t"
"vl %%v22,16(%%r1,%3) \n\t" "vl %%v23,16(%%r1,%[ap3])\n\t"
"vl %%v23,16(%%r1,%4) \n\t" "vl %%v24,32(%%r1,%[ap0])\n\t"
"vl %%v24,32(%%r1,%1) \n\t" "vl %%v25,32(%%r1,%[ap1])\n\t"
"vl %%v25,32(%%r1,%2) \n\t" "vl %%v26,32(%%r1,%[ap2])\n\t"
"vl %%v26,32(%%r1,%3) \n\t" "vl %%v27,32(%%r1,%[ap3])\n\t"
"vl %%v27,32(%%r1,%4) \n\t" "vl %%v28,48(%%r1,%[ap0])\n\t"
"vl %%v28,48(%%r1,%1) \n\t" "vl %%v29,48(%%r1,%[ap1])\n\t"
"vl %%v29,48(%%r1,%2) \n\t" "vl %%v30,48(%%r1,%[ap2])\n\t"
"vl %%v30,48(%%r1,%3) \n\t" "vl %%v31,48(%%r1,%[ap3])\n\t"
"vl %%v31,48(%%r1,%4) \n\t" "vl %%v4,0(%%r1,%[y])\n\t"
"vl %%v5,16(%%r1,%[y])\n\t"
"vl %%v4,0(%%r1,%6) \n\t" "vl %%v6,32(%%r1,%[y])\n\t"
"vl %%v7,48(%%r1,%[y])\n\t"
"vfmasb %%v4,%%v16,%%v0,%%v4\n\t" "vfmasb %%v4,%%v16,%%v0,%%v4\n\t"
"vfmasb %%v5,%%v20,%%v0,%%v5\n\t"
"vfmasb %%v6,%%v24,%%v0,%%v6\n\t"
"vfmasb %%v7,%%v28,%%v0,%%v7\n\t"
"vfmasb %%v4,%%v17,%%v1,%%v4\n\t" "vfmasb %%v4,%%v17,%%v1,%%v4\n\t"
"vfmasb %%v5,%%v21,%%v1,%%v5\n\t"
"vfmasb %%v6,%%v25,%%v1,%%v6\n\t"
"vfmasb %%v7,%%v29,%%v1,%%v7\n\t"
"vfmasb %%v4,%%v18,%%v2,%%v4\n\t" "vfmasb %%v4,%%v18,%%v2,%%v4\n\t"
"vfmasb %%v5,%%v22,%%v2,%%v5\n\t"
"vfmasb %%v6,%%v26,%%v2,%%v6\n\t"
"vfmasb %%v7,%%v30,%%v2,%%v7\n\t"
"vfmasb %%v4,%%v19,%%v3,%%v4\n\t" "vfmasb %%v4,%%v19,%%v3,%%v4\n\t"
"vst %%v4,0(%%r1,%6) \n\t" "vfmasb %%v5,%%v23,%%v3,%%v5\n\t"
"vfmasb %%v6,%%v27,%%v3,%%v6\n\t"
"vl %%v4,16(%%r1,%6) \n\t" "vfmasb %%v7,%%v31,%%v3,%%v7\n\t"
"vfmasb %%v4,%%v20,%%v0,%%v4 \n\t" "vst %%v4,0(%%r1,%[y])\n\t"
"vfmasb %%v4,%%v21,%%v1,%%v4 \n\t" "vst %%v5,16(%%r1,%[y])\n\t"
"vfmasb %%v4,%%v22,%%v2,%%v4 \n\t" "vst %%v6,32(%%r1,%[y])\n\t"
"vfmasb %%v4,%%v23,%%v3,%%v4 \n\t" "vst %%v7,48(%%r1,%[y])\n\t"
"vst %%v4,16(%%r1,%6) \n\t" "vl %%v16,64(%%r1,%[ap0])\n\t"
"vl %%v17,64(%%r1,%[ap1])\n\t"
"vl %%v4,32(%%r1,%6) \n\t" "vl %%v18,64(%%r1,%[ap2])\n\t"
"vfmasb %%v4,%%v24,%%v0,%%v4 \n\t" "vl %%v19,64(%%r1,%[ap3])\n\t"
"vfmasb %%v4,%%v25,%%v1,%%v4 \n\t" "vl %%v20,80(%%r1,%[ap0])\n\t"
"vfmasb %%v4,%%v26,%%v2,%%v4 \n\t" "vl %%v21,80(%%r1,%[ap1])\n\t"
"vfmasb %%v4,%%v27,%%v3,%%v4 \n\t" "vl %%v22,80(%%r1,%[ap2])\n\t"
"vst %%v4,32(%%r1,%6) \n\t" "vl %%v23,80(%%r1,%[ap3])\n\t"
"vl %%v24,96(%%r1,%[ap0])\n\t"
"vl %%v4,48(%%r1,%6) \n\t" "vl %%v25,96(%%r1,%[ap1])\n\t"
"vfmasb %%v4,%%v28,%%v0,%%v4 \n\t" "vl %%v26,96(%%r1,%[ap2])\n\t"
"vfmasb %%v4,%%v29,%%v1,%%v4 \n\t" "vl %%v27,96(%%r1,%[ap3])\n\t"
"vfmasb %%v4,%%v30,%%v2,%%v4 \n\t" "vl %%v28,112(%%r1,%[ap0])\n\t"
"vfmasb %%v4,%%v31,%%v3,%%v4 \n\t" "vl %%v29,112(%%r1,%[ap1])\n\t"
"vst %%v4,48(%%r1,%6) \n\t" "vl %%v30,112(%%r1,%[ap2])\n\t"
"vl %%v31,112(%%r1,%[ap3])\n\t"
"vl %%v16,64(%%r1,%1) \n\t" "vl %%v4,64(%%r1,%[y])\n\t"
"vl %%v17,64(%%r1,%2) \n\t" "vl %%v5,80(%%r1,%[y])\n\t"
"vl %%v18,64(%%r1,%3) \n\t" "vl %%v6,96(%%r1,%[y])\n\t"
"vl %%v19,64(%%r1,%4) \n\t" "vl %%v7,112(%%r1,%[y])\n\t"
"vl %%v20,80(%%r1,%1) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,80(%%r1,%3) \n\t"
"vl %%v23,80(%%r1,%4) \n\t"
"vl %%v24,96(%%r1,%1) \n\t"
"vl %%v25,96(%%r1,%2) \n\t"
"vl %%v26,96(%%r1,%3) \n\t"
"vl %%v27,96(%%r1,%4) \n\t"
"vl %%v28,112(%%r1,%1) \n\t"
"vl %%v29,112(%%r1,%2) \n\t"
"vl %%v30,112(%%r1,%3) \n\t"
"vl %%v31,112(%%r1,%4) \n\t"
"vl %%v4,64(%%r1,%6) \n\t"
"vfmasb %%v4,%%v16,%%v0,%%v4\n\t" "vfmasb %%v4,%%v16,%%v0,%%v4\n\t"
"vfmasb %%v5,%%v20,%%v0,%%v5\n\t"
"vfmasb %%v6,%%v24,%%v0,%%v6\n\t"
"vfmasb %%v7,%%v28,%%v0,%%v7\n\t"
"vfmasb %%v4,%%v17,%%v1,%%v4\n\t" "vfmasb %%v4,%%v17,%%v1,%%v4\n\t"
"vfmasb %%v5,%%v21,%%v1,%%v5\n\t"
"vfmasb %%v6,%%v25,%%v1,%%v6\n\t"
"vfmasb %%v7,%%v29,%%v1,%%v7\n\t"
"vfmasb %%v4,%%v18,%%v2,%%v4\n\t" "vfmasb %%v4,%%v18,%%v2,%%v4\n\t"
"vfmasb %%v5,%%v22,%%v2,%%v5\n\t"
"vfmasb %%v6,%%v26,%%v2,%%v6\n\t"
"vfmasb %%v7,%%v30,%%v2,%%v7\n\t"
"vfmasb %%v4,%%v19,%%v3,%%v4\n\t" "vfmasb %%v4,%%v19,%%v3,%%v4\n\t"
"vst %%v4,64(%%r1,%6) \n\t" "vfmasb %%v5,%%v23,%%v3,%%v5\n\t"
"vfmasb %%v6,%%v27,%%v3,%%v6\n\t"
"vl %%v4,80(%%r1,%6) \n\t" "vfmasb %%v7,%%v31,%%v3,%%v7\n\t"
"vfmasb %%v4,%%v20,%%v0,%%v4 \n\t" "vst %%v4,64(%%r1,%[y])\n\t"
"vfmasb %%v4,%%v21,%%v1,%%v4 \n\t" "vst %%v5,80(%%r1,%[y])\n\t"
"vfmasb %%v4,%%v22,%%v2,%%v4 \n\t" "vst %%v6,96(%%r1,%[y])\n\t"
"vfmasb %%v4,%%v23,%%v3,%%v4 \n\t" "vst %%v7,112(%%r1,%[y])\n\t"
"vst %%v4,80(%%r1,%6) \n\t"
"vl %%v4,96(%%r1,%6) \n\t"
"vfmasb %%v4,%%v24,%%v0,%%v4 \n\t"
"vfmasb %%v4,%%v25,%%v1,%%v4 \n\t"
"vfmasb %%v4,%%v26,%%v2,%%v4 \n\t"
"vfmasb %%v4,%%v27,%%v3,%%v4 \n\t"
"vst %%v4,96(%%r1,%6) \n\t"
"vl %%v4,112(%%r1,%6) \n\t"
"vfmasb %%v4,%%v28,%%v0,%%v4 \n\t"
"vfmasb %%v4,%%v29,%%v1,%%v4 \n\t"
"vfmasb %%v4,%%v30,%%v2,%%v4 \n\t"
"vfmasb %%v4,%%v31,%%v3,%%v4 \n\t"
"vst %%v4,112(%%r1,%6) \n\t"
"agfi %%r1,128\n\t" "agfi %%r1,128\n\t"
"brctg %%r0,0b\n\t" "brctg %%r0,0b\n\t"
"1:\n\t" "1:\n\t"
"lghi %%r0,28\n\t" "lghi %%r0,28\n\t"
"ngr %%r0,%0 \n\t" "ngr %%r0,%[n]\n\t"
"ltgr %%r0,%%r0\n\t" "ltgr %%r0,%%r0\n\t"
"jz 3f\n\t" "jz 3f\n\t"
"srlg %%r0,%%r0,2\n\t" "srlg %%r0,%%r0,2\n\t"
"2:\n\t" "2:\n\t"
"vl %%v16,0(%%r1,%1) \n\t" "vl %%v16,0(%%r1,%[ap0])\n\t"
"vl %%v17,0(%%r1,%2) \n\t" "vl %%v17,0(%%r1,%[ap1])\n\t"
"vl %%v18,0(%%r1,%3) \n\t" "vl %%v18,0(%%r1,%[ap2])\n\t"
"vl %%v19,0(%%r1,%4) \n\t" "vl %%v19,0(%%r1,%[ap3])\n\t"
"vl %%v4,0(%%r1,%[y])\n\t"
"vl %%v4,0(%%r1,%6) \n\t"
"vfmasb %%v4,%%v16,%%v0,%%v4\n\t" "vfmasb %%v4,%%v16,%%v0,%%v4\n\t"
"vfmasb %%v4,%%v17,%%v1,%%v4\n\t" "vfmasb %%v4,%%v17,%%v1,%%v4\n\t"
"vfmasb %%v4,%%v18,%%v2,%%v4\n\t" "vfmasb %%v4,%%v18,%%v2,%%v4\n\t"
"vfmasb %%v4,%%v19,%%v3,%%v4\n\t" "vfmasb %%v4,%%v19,%%v3,%%v4\n\t"
"vst %%v4,0(%%r1,%6) \n\t" "vst %%v4,0(%%r1,%[y])\n\t"
"agfi %%r1,16\n\t" "agfi %%r1,16\n\t"
"brctg %%r0,2b\n\t" "brctg %%r0,2b\n\t"
"3:\n\t" "3:\n\t"
"nop" "nop"
: : "+m"(*(struct { FLOAT x[n]; } *) y)
:"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])ap[2]),"ZR"((const FLOAT (*)[n])ap[3]),"ZQ"((const FLOAT (*)[4])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha) : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0),
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" "m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1),
); "m"(*(const struct { FLOAT x[n]; } *) ap2),[ap2] "a"(ap2),
"m"(*(const struct { FLOAT x[n]; } *) ap3),[ap3] "a"(ap3),
"m"(*(const struct { FLOAT x[4]; } *) x),[x] "a"(x),[alpha] "Q"(*alpha),
[n] "r"(n)
: "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
"v26", "v27", "v28", "v29", "v30", "v31");
} }
static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y,
{ FLOAT *alpha) {
__asm__ volatile ( register FLOAT *ap0 = ap[0];
"vlrepf %%v0,0(%3) \n\t" register FLOAT *ap1 = ap[1];
"vlrepf %%v1,4(%3) \n\t"
"vlrepf %%v2,%5 \n\t" __asm__("vlrepf %%v0,0(%[x])\n\t"
"vlrepf %%v1,4(%[x])\n\t"
"vlrepf %%v2,%[alpha]\n\t"
"vfmsb %%v0,%%v0,%%v2\n\t" "vfmsb %%v0,%%v0,%%v2\n\t"
"vfmsb %%v1,%%v1,%%v2\n\t" "vfmsb %%v1,%%v1,%%v2\n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"lghi %%r0,-32\n\t" "lghi %%r0,-32\n\t"
"ngr %%r0,%0 \n\t" "ngr %%r0,%[n]\n\t"
"ltgr %%r0,%%r0\n\t" "ltgr %%r0,%%r0\n\t"
"jz 1f\n\t" "jz 1f\n\t"
"srlg %%r0,%%r0,5\n\t" "srlg %%r0,%%r0,5\n\t"
"0:\n\t" "0:\n\t"
"pfd 1,1024(%%r1,%1) \n\t" "pfd 1,1024(%%r1,%[ap0])\n\t"
"pfd 1,1024(%%r1,%2) \n\t" "pfd 1,1024(%%r1,%[ap1])\n\t"
"pfd 2,1024(%%r1,%4) \n\t" "pfd 2,1024(%%r1,%[y])\n\t"
"vl %%v16,0(%%r1,%[ap0])\n\t"
"vl %%v16,0(%%r1,%1) \n\t" "vl %%v17,0(%%r1,%[ap1])\n\t"
"vl %%v17,0(%%r1,%2) \n\t" "vl %%v18,16(%%r1,%[ap0])\n\t"
"vl %%v18,16(%%r1,%1) \n\t" "vl %%v19,16(%%r1,%[ap1])\n\t"
"vl %%v19,16(%%r1,%2) \n\t" "vl %%v20,32(%%r1,%[ap0])\n\t"
"vl %%v20,32(%%r1,%1) \n\t" "vl %%v21,32(%%r1,%[ap1])\n\t"
"vl %%v21,32(%%r1,%2) \n\t" "vl %%v22,48(%%r1,%[ap0])\n\t"
"vl %%v22,48(%%r1,%1) \n\t" "vl %%v23,48(%%r1,%[ap1])\n\t"
"vl %%v23,48(%%r1,%2) \n\t" "vl %%v24,64(%%r1,%[ap0])\n\t"
"vl %%v24,64(%%r1,%1) \n\t" "vl %%v25,64(%%r1,%[ap1])\n\t"
"vl %%v25,64(%%r1,%2) \n\t" "vl %%v26,80(%%r1,%[ap0])\n\t"
"vl %%v26,80(%%r1,%1) \n\t" "vl %%v27,80(%%r1,%[ap1])\n\t"
"vl %%v27,80(%%r1,%2) \n\t" "vl %%v28,96(%%r1,%[ap0])\n\t"
"vl %%v28,96(%%r1,%1) \n\t" "vl %%v29,96(%%r1,%[ap1])\n\t"
"vl %%v29,96(%%r1,%2) \n\t" "vl %%v30,112(%%r1,%[ap0])\n\t"
"vl %%v30,112(%%r1,%1) \n\t" "vl %%v31,112(%%r1,%[ap1])\n\t"
"vl %%v31,112(%%r1,%2) \n\t" "vl %%v2,0(%%r1,%[y])\n\t"
"vl %%v3,16(%%r1,%[y])\n\t"
"vl %%v2,0(%%r1,%4) \n\t" "vl %%v4,32(%%r1,%[y])\n\t"
"vl %%v5,48(%%r1,%[y])\n\t"
"vl %%v6,64(%%r1,%[y])\n\t"
"vl %%v7,80(%%r1,%[y])\n\t"
"vl %%v8,96(%%r1,%[y])\n\t"
"vl %%v9,112(%%r1,%[y])\n\t"
"vfmasb %%v2,%%v16,%%v0,%%v2\n\t" "vfmasb %%v2,%%v16,%%v0,%%v2\n\t"
"vfmasb %%v3,%%v18,%%v0,%%v3\n\t"
"vfmasb %%v4,%%v20,%%v0,%%v4\n\t"
"vfmasb %%v5,%%v22,%%v0,%%v5\n\t"
"vfmasb %%v6,%%v24,%%v0,%%v6\n\t"
"vfmasb %%v7,%%v26,%%v0,%%v7\n\t"
"vfmasb %%v8,%%v28,%%v0,%%v8\n\t"
"vfmasb %%v9,%%v30,%%v0,%%v9\n\t"
"vfmasb %%v2,%%v17,%%v1,%%v2\n\t" "vfmasb %%v2,%%v17,%%v1,%%v2\n\t"
"vst %%v2,0(%%r1,%4) \n\t" "vfmasb %%v3,%%v19,%%v1,%%v3\n\t"
"vfmasb %%v4,%%v21,%%v1,%%v4\n\t"
"vl %%v2,16(%%r1,%4) \n\t" "vfmasb %%v5,%%v23,%%v1,%%v5\n\t"
"vfmasb %%v2,%%v18,%%v0,%%v2 \n\t" "vfmasb %%v6,%%v25,%%v1,%%v6\n\t"
"vfmasb %%v2,%%v19,%%v1,%%v2 \n\t" "vfmasb %%v7,%%v27,%%v1,%%v7\n\t"
"vst %%v2,16(%%r1,%4) \n\t" "vfmasb %%v8,%%v29,%%v1,%%v8\n\t"
"vfmasb %%v9,%%v31,%%v1,%%v9\n\t"
"vl %%v2,32(%%r1,%4) \n\t" "vst %%v2,0(%%r1,%[y])\n\t"
"vfmasb %%v2,%%v20,%%v0,%%v2 \n\t" "vst %%v3,16(%%r1,%[y])\n\t"
"vfmasb %%v2,%%v21,%%v1,%%v2 \n\t" "vst %%v4,32(%%r1,%[y])\n\t"
"vst %%v2,32(%%r1,%4) \n\t" "vst %%v5,48(%%r1,%[y])\n\t"
"vst %%v6,64(%%r1,%[y])\n\t"
"vl %%v2,48(%%r1,%4) \n\t" "vst %%v7,80(%%r1,%[y])\n\t"
"vfmasb %%v2,%%v22,%%v0,%%v2 \n\t" "vst %%v8,96(%%r1,%[y])\n\t"
"vfmasb %%v2,%%v23,%%v1,%%v2 \n\t" "vst %%v9,112(%%r1,%[y])\n\t"
"vst %%v2,48(%%r1,%4) \n\t"
"vl %%v2,64(%%r1,%4) \n\t"
"vfmasb %%v2,%%v24,%%v0,%%v2 \n\t"
"vfmasb %%v2,%%v25,%%v1,%%v2 \n\t"
"vst %%v2,64(%%r1,%4) \n\t"
"vl %%v2,80(%%r1,%4) \n\t"
"vfmasb %%v2,%%v26,%%v0,%%v2 \n\t"
"vfmasb %%v2,%%v27,%%v1,%%v2 \n\t"
"vst %%v2,80(%%r1,%4) \n\t"
"vl %%v2,96(%%r1,%4) \n\t"
"vfmasb %%v2,%%v28,%%v0,%%v2 \n\t"
"vfmasb %%v2,%%v29,%%v1,%%v2 \n\t"
"vst %%v2,96(%%r1,%4) \n\t"
"vl %%v2,112(%%r1,%4) \n\t"
"vfmasb %%v2,%%v30,%%v0,%%v2 \n\t"
"vfmasb %%v2,%%v31,%%v1,%%v2 \n\t"
"vst %%v2,112(%%r1,%4) \n\t"
"agfi %%r1,128\n\t" "agfi %%r1,128\n\t"
"brctg %%r0,0b\n\t" "brctg %%r0,0b\n\t"
"1:\n\t" "1:\n\t"
"lghi %%r0,28\n\t" "lghi %%r0,28\n\t"
"ngr %%r0,%0 \n\t" "ngr %%r0,%[n]\n\t"
"ltgr %%r0,%%r0\n\t" "ltgr %%r0,%%r0\n\t"
"jz 3f\n\t" "jz 3f\n\t"
"srlg %%r0,%%r0,2\n\t" "srlg %%r0,%%r0,2\n\t"
"2:\n\t" "2:\n\t"
"vl %%v16,0(%%r1,%1) \n\t" "vl %%v16,0(%%r1,%[ap0])\n\t"
"vl %%v17,0(%%r1,%2) \n\t" "vl %%v17,0(%%r1,%[ap1])\n\t"
"vl %%v2,0(%%r1,%[y])\n\t"
"vl %%v2,0(%%r1,%4) \n\t"
"vfmasb %%v2,%%v16,%%v0,%%v2\n\t" "vfmasb %%v2,%%v16,%%v0,%%v2\n\t"
"vfmasb %%v2,%%v17,%%v1,%%v2\n\t" "vfmasb %%v2,%%v17,%%v1,%%v2\n\t"
"vst %%v2,0(%%r1,%4) \n\t" "vst %%v2,0(%%r1,%[y])\n\t"
"agfi %%r1,16\n\t" "agfi %%r1,16\n\t"
"brctg %%r0,2b\n\t" "brctg %%r0,2b\n\t"
"3:\n\t" "3:\n\t"
"nop" "nop"
: : "+m"(*(struct { FLOAT x[n]; } *) y)
:"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZQ"((const FLOAT (*)[2])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha) : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0),
:"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" "m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1),
); "m"(*(const struct { FLOAT x[2]; } *) x),[x] "a"(x),[alpha] "Q"(*alpha),
[n] "r"(n)
: "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
"v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
} }
static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *xo, FLOAT *y, FLOAT *alpha) static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y,
{ FLOAT *alpha) {
__asm__ volatile ( __asm__("vlrepf %%v0,0(%[x])\n\t"
"vlrepf %%v0,0(%2) \n\t" "vlrepf %%v16,%[alpha]\n\t"
"vlrepf %%v1,%4 \n\t" "vfmsb %%v0,%%v0,%%v16\n\t"
"vfmsb %%v0,%%v0,%%v1 \n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"lghi %%r0,-32\n\t" "lghi %%r0,-32\n\t"
"ngr %%r0,%0 \n\t" "ngr %%r0,%[n]\n\t"
"ltgr %%r0,%%r0\n\t" "ltgr %%r0,%%r0\n\t"
"jz 1f\n\t" "jz 1f\n\t"
"srlg %%r0,%%r0,5\n\t" "srlg %%r0,%%r0,5\n\t"
"0:\n\t" "0:\n\t"
"pfd 1,1024(%%r1,%1) \n\t" "pfd 1,1024(%%r1,%[a0])\n\t"
"pfd 2,1024(%%r1,%3) \n\t" "pfd 2,1024(%%r1,%[y])\n\t"
"vl %%v16,0(%%r1,%[a0])\n\t"
"vl %%v16,0(%%r1,%1) \n\t" "vl %%v17,16(%%r1,%[a0])\n\t"
"vl %%v17,16(%%r1,%1) \n\t" "vl %%v18,32(%%r1,%[a0])\n\t"
"vl %%v18,32(%%r1,%1) \n\t" "vl %%v19,48(%%r1,%[a0])\n\t"
"vl %%v19,48(%%r1,%1) \n\t" "vl %%v20,64(%%r1,%[a0])\n\t"
"vl %%v20,64(%%r1,%1) \n\t" "vl %%v21,80(%%r1,%[a0])\n\t"
"vl %%v21,80(%%r1,%1) \n\t" "vl %%v22,96(%%r1,%[a0])\n\t"
"vl %%v22,96(%%r1,%1) \n\t" "vl %%v23,112(%%r1,%[a0])\n\t"
"vl %%v23,112(%%r1,%1) \n\t" "vl %%v24,0(%%r1,%[y])\n\t"
"vl %%v25,16(%%r1,%[y])\n\t"
"vl %%v1,0(%%r1,%3) \n\t" "vl %%v26,32(%%r1,%[y])\n\t"
"vfmasb %%v1,%%v16,%%v0,%%v1 \n\t" "vl %%v27,48(%%r1,%[y])\n\t"
"vst %%v1,0(%%r1,%3) \n\t" "vl %%v28,64(%%r1,%[y])\n\t"
"vl %%v29,80(%%r1,%[y])\n\t"
"vl %%v1,16(%%r1,%3) \n\t" "vl %%v30,96(%%r1,%[y])\n\t"
"vfmasb %%v1,%%v17,%%v0,%%v1 \n\t" "vl %%v31,112(%%r1,%[y])\n\t"
"vst %%v1,16(%%r1,%3) \n\t" "vfmasb %%v24,%%v16,%%v0,%%v24\n\t"
"vfmasb %%v25,%%v17,%%v0,%%v25\n\t"
"vl %%v1,32(%%r1,%3) \n\t" "vfmasb %%v26,%%v18,%%v0,%%v26\n\t"
"vfmasb %%v1,%%v18,%%v0,%%v1 \n\t" "vfmasb %%v27,%%v19,%%v0,%%v27\n\t"
"vst %%v1,32(%%r1,%3) \n\t" "vfmasb %%v28,%%v20,%%v0,%%v28\n\t"
"vfmasb %%v29,%%v21,%%v0,%%v29\n\t"
"vl %%v1,48(%%r1,%3) \n\t" "vfmasb %%v30,%%v22,%%v0,%%v30\n\t"
"vfmasb %%v1,%%v19,%%v0,%%v1 \n\t" "vfmasb %%v31,%%v23,%%v0,%%v31\n\t"
"vst %%v1,48(%%r1,%3) \n\t" "vst %%v24,0(%%r1,%[y])\n\t"
"vst %%v25,16(%%r1,%[y])\n\t"
"vl %%v1,64(%%r1,%3) \n\t" "vst %%v26,32(%%r1,%[y])\n\t"
"vfmasb %%v1,%%v20,%%v0,%%v1 \n\t" "vst %%v27,48(%%r1,%[y])\n\t"
"vst %%v1,64(%%r1,%3) \n\t" "vst %%v28,64(%%r1,%[y])\n\t"
"vst %%v29,80(%%r1,%[y])\n\t"
"vl %%v1,80(%%r1,%3) \n\t" "vst %%v30,96(%%r1,%[y])\n\t"
"vfmasb %%v1,%%v21,%%v0,%%v1 \n\t" "vst %%v31,112(%%r1,%[y])\n\t"
"vst %%v1,80(%%r1,%3) \n\t"
"vl %%v1,96(%%r1,%3) \n\t"
"vfmasb %%v1,%%v22,%%v0,%%v1 \n\t"
"vst %%v1,96(%%r1,%3) \n\t"
"vl %%v1,112(%%r1,%3) \n\t"
"vfmasb %%v1,%%v23,%%v0,%%v1 \n\t"
"vst %%v1,112(%%r1,%3) \n\t"
"agfi %%r1,128\n\t" "agfi %%r1,128\n\t"
"brctg %%r0,0b\n\t" "brctg %%r0,0b\n\t"
"1:\n\t" "1:\n\t"
"lghi %%r0,28\n\t" "lghi %%r0,28\n\t"
"ngr %%r0,%0 \n\t" "ngr %%r0,%[n]\n\t"
"ltgr %%r0,%%r0\n\t" "ltgr %%r0,%%r0\n\t"
"jz 3f\n\t" "jz 3f\n\t"
"srlg %%r0,%%r0,2\n\t" "srlg %%r0,%%r0,2\n\t"
"2:\n\t" "2:\n\t"
"vl %%v16,0(%%r1,%1) \n\t" "vl %%v16,0(%%r1,%[a0])\n\t"
"vl %%v17,0(%%r1,%[y])\n\t"
"vl %%v1,0(%%r1,%3) \n\t" "vfmasb %%v17,%%v16,%%v0,%%v17\n\t"
"vfmasb %%v1,%%v16,%%v0,%%v1 \n\t" "vst %%v17,0(%%r1,%[y])\n\t"
"vst %%v1,0(%%r1,%3) \n\t"
"agfi %%r1,16\n\t" "agfi %%r1,16\n\t"
"brctg %%r0,2b\n\t" "brctg %%r0,2b\n\t"
"3:\n\t" "3:\n\t"
"nop" "nop"
: : "+m"(*(struct { FLOAT x[n]; } *) y)
:"r"(n),"ZR"((const FLOAT (*)[n])a0),"ZQ"((const FLOAT (*)[1])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha) : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) a0),[a0] "a"(a0),
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" "m"(*(const FLOAT (*)[1]) x),[x] "a"(x),[alpha] "Q"(*alpha),
); [n] "r"(n)
: "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21",
"v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
} }
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) {
{
BLASLONG i; BLASLONG i;
for (i = 0; i < n; i++) for (i = 0; i < n; i++) {
{
*dest += src[i]; *dest += src[i];
dest += inc_dest; dest += inc_dest;
} }
} }
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a,
{ BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
FLOAT *buffer) {
BLASLONG i; BLASLONG i;
FLOAT *a_ptr; FLOAT *a_ptr;
FLOAT *x_ptr; FLOAT *x_ptr;
@ -400,8 +365,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
BLASLONG lda4 = lda << 2; BLASLONG lda4 = lda << 2;
FLOAT xbuffer[8], *ybuffer; FLOAT xbuffer[8], *ybuffer;
if ( m < 1 ) return(0); if (m < 1)
if ( n < 1 ) return(0); return (0);
if (n < 1)
return (0);
ybuffer = buffer; ybuffer = buffer;
@ -416,13 +383,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
BLASLONG NB = NBMAX; BLASLONG NB = NBMAX;
while ( NB == NBMAX ) while (NB == NBMAX) {
{
m1 -= NB; m1 -= NB;
if ( m1 < 0) if (m1 < 0) {
{ if (m2 == 0)
if ( m2 == 0 ) break; break;
NB = m2; NB = m2;
} }
@ -439,12 +405,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
else else
ybuffer = y_ptr; ybuffer = y_ptr;
if ( inc_x == 1 ) if (inc_x == 1) {
{
for (i = 0; i < n1; i++) {
for( i = 0; i < n1 ; i++)
{
sgemv_kernel_4x4(NB, ap, x_ptr, ybuffer, &alpha); sgemv_kernel_4x4(NB, ap, x_ptr, ybuffer, &alpha);
ap[0] += lda4; ap[0] += lda4;
ap[1] += lda4; ap[1] += lda4;
@ -454,29 +417,22 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
x_ptr += 4; x_ptr += 4;
} }
if ( n2 & 2 ) if (n2 & 2) {
{
sgemv_kernel_4x2(NB, ap, x_ptr, ybuffer, &alpha); sgemv_kernel_4x2(NB, ap, x_ptr, ybuffer, &alpha);
a_ptr += lda * 2; a_ptr += lda * 2;
x_ptr += 2; x_ptr += 2;
} }
if (n2 & 1) {
if ( n2 & 1 )
{
sgemv_kernel_4x1(NB, a_ptr, x_ptr, ybuffer, &alpha); sgemv_kernel_4x1(NB, a_ptr, x_ptr, ybuffer, &alpha);
/* a_ptr += lda; /* a_ptr += lda;
x_ptr += 1; */ x_ptr += 1; */
} }
} else {
} for (i = 0; i < n1; i++) {
else
{
for( i = 0; i < n1 ; i++)
{
xbuffer[0] = x_ptr[0]; xbuffer[0] = x_ptr[0];
x_ptr += inc_x; x_ptr += inc_x;
xbuffer[1] = x_ptr[0]; xbuffer[1] = x_ptr[0];
@ -493,8 +449,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
a_ptr += lda4; a_ptr += lda4;
} }
for( i = 0; i < n2 ; i++) for (i = 0; i < n2; i++) {
{
xbuffer[0] = x_ptr[0]; xbuffer[0] = x_ptr[0];
x_ptr += inc_x; x_ptr += inc_x;
sgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, &alpha); sgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, &alpha);
@ -505,30 +460,26 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
} }
a += NB; a += NB;
if ( inc_y != 1 ) if (inc_y != 1) {
{
add_y(NB, ybuffer, y_ptr, inc_y); add_y(NB, ybuffer, y_ptr, inc_y);
y_ptr += NB * inc_y; y_ptr += NB * inc_y;
} } else
else
y_ptr += NB; y_ptr += NB;
} }
if ( m3 == 0 ) return(0); if (m3 == 0)
return (0);
if ( m3 == 3 ) if (m3 == 3) {
{
a_ptr = a; a_ptr = a;
x_ptr = x; x_ptr = x;
FLOAT temp0 = 0.0; FLOAT temp0 = 0.0;
FLOAT temp1 = 0.0; FLOAT temp1 = 0.0;
FLOAT temp2 = 0.0; FLOAT temp2 = 0.0;
if ( lda == 3 && inc_x ==1 ) if (lda == 3 && inc_x == 1) {
{
for( i = 0; i < ( n & -4 ); i+=4 ) for (i = 0; i < (n & -4); i += 4) {
{
temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1];
temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1];
@ -542,8 +493,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
x_ptr += 4; x_ptr += 4;
} }
for( ; i < n; i++ ) for (; i < n; i++) {
{
temp0 += a_ptr[0] * x_ptr[0]; temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0]; temp1 += a_ptr[1] * x_ptr[0];
temp2 += a_ptr[2] * x_ptr[0]; temp2 += a_ptr[2] * x_ptr[0];
@ -551,19 +501,15 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
x_ptr++; x_ptr++;
} }
} } else {
else
{
for( i = 0; i < n; i++ ) for (i = 0; i < n; i++) {
{
temp0 += a_ptr[0] * x_ptr[0]; temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0]; temp1 += a_ptr[1] * x_ptr[0];
temp2 += a_ptr[2] * x_ptr[0]; temp2 += a_ptr[2] * x_ptr[0];
a_ptr += lda; a_ptr += lda;
x_ptr += inc_x; x_ptr += inc_x;
} }
} }
@ -575,18 +521,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
return (0); return (0);
} }
if (m3 == 2) {
if ( m3 == 2 )
{
a_ptr = a; a_ptr = a;
x_ptr = x; x_ptr = x;
FLOAT temp0 = 0.0; FLOAT temp0 = 0.0;
FLOAT temp1 = 0.0; FLOAT temp1 = 0.0;
if ( lda == 2 && inc_x ==1 ) if (lda == 2 && inc_x == 1) {
{
for( i = 0; i < (n & -4) ; i+=4 ) for (i = 0; i < (n & -4); i += 4) {
{
temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1];
temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1];
temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3];
@ -596,27 +538,21 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
} }
for (; i < n; i++) {
for( ; i < n; i++ )
{
temp0 += a_ptr[0] * x_ptr[0]; temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0]; temp1 += a_ptr[1] * x_ptr[0];
a_ptr += 2; a_ptr += 2;
x_ptr++; x_ptr++;
} }
} } else {
else
{
for( i = 0; i < n; i++ ) for (i = 0; i < n; i++) {
{
temp0 += a_ptr[0] * x_ptr[0]; temp0 += a_ptr[0] * x_ptr[0];
temp1 += a_ptr[1] * x_ptr[0]; temp1 += a_ptr[1] * x_ptr[0];
a_ptr += lda; a_ptr += lda;
x_ptr += inc_x; x_ptr += inc_x;
} }
} }
@ -626,31 +562,27 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
return (0); return (0);
} }
if ( m3 == 1 ) if (m3 == 1) {
{
a_ptr = a; a_ptr = a;
x_ptr = x; x_ptr = x;
FLOAT temp = 0.0; FLOAT temp = 0.0;
if ( lda == 1 && inc_x ==1 ) if (lda == 1 && inc_x == 1) {
{
for( i = 0; i < (n & -4); i+=4 ) for (i = 0; i < (n & -4); i += 4) {
{ temp +=
temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; a_ptr[i] * x_ptr[i] + a_ptr[i + 1] * x_ptr[i + 1] + a_ptr[i +
2] *
x_ptr[i + 2] + a_ptr[i + 3] * x_ptr[i + 3];
} }
for( ; i < n; i++ ) for (; i < n; i++) {
{
temp += a_ptr[i] * x_ptr[i]; temp += a_ptr[i] * x_ptr[i];
} }
} } else {
else
{
for( i = 0; i < n; i++ ) for (i = 0; i < n; i++) {
{
temp += a_ptr[0] * x_ptr[0]; temp += a_ptr[0] * x_ptr[0];
a_ptr += lda; a_ptr += lda;
x_ptr += inc_x; x_ptr += inc_x;
@ -661,8 +593,5 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
return (0); return (0);
} }
return (0); return (0);
} }

File diff suppressed because it is too large Load Diff

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -27,34 +27,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
static FLOAT smax_kernel_64(BLASLONG n, FLOAT *x) static FLOAT smax_kernel_64(BLASLONG n, FLOAT *x) {
{
FLOAT max; FLOAT max;
__asm__ volatile ( __asm__("vl %%v0,0(%[x])\n\t"
"vl %%v0,0(%2) \n\t" "srlg %[n],%[n],6\n\t"
"srlg %%r0,%1,6 \n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"0:\n\t" "0:\n\t"
"pfd 1, 1024(%%r1,%2) \n\t" "pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%2) \n\t" "vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%2) \n\t" "vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%2) \n\t" "vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%2) \n\t" "vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%2) \n\t" "vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%2) \n\t" "vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%2) \n\t" "vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%2) \n\t" "vl %%v24,128(%%r1,%[x])\n\t"
"vl %%v24,128(%%r1,%2) \n\t" "vl %%v25,144(%%r1,%[x])\n\t"
"vl %%v25,144(%%r1,%2) \n\t" "vl %%v26,160(%%r1,%[x])\n\t"
"vl %%v26,160(%%r1,%2) \n\t" "vl %%v27,176(%%r1,%[x])\n\t"
"vl %%v27,176(%%r1,%2) \n\t" "vl %%v28,192(%%r1,%[x])\n\t"
"vl %%v28,192(%%r1,%2) \n\t" "vl %%v29,208(%%r1,%[x])\n\t"
"vl %%v29,208(%%r1,%2) \n\t" "vl %%v30,224(%%r1,%[x])\n\t"
"vl %%v30,224(%%r1,%2) \n\t" "vl %%v31,240(%%r1,%[x])\n\t"
"vl %%v31,240(%%r1,%2) \n\t"
"vfmaxsb %%v16,%%v16,%%v24,0\n\t" "vfmaxsb %%v16,%%v16,%%v24,0\n\t"
"vfmaxsb %%v17,%%v17,%%v25,0\n\t" "vfmaxsb %%v17,%%v17,%%v25,0\n\t"
"vfmaxsb %%v18,%%v18,%%v26,0\n\t" "vfmaxsb %%v18,%%v18,%%v26,0\n\t"
@ -63,32 +59,25 @@ static FLOAT smax_kernel_64(BLASLONG n, FLOAT *x)
"vfmaxsb %%v21,%%v21,%%v29,0\n\t" "vfmaxsb %%v21,%%v21,%%v29,0\n\t"
"vfmaxsb %%v22,%%v22,%%v30,0\n\t" "vfmaxsb %%v22,%%v22,%%v30,0\n\t"
"vfmaxsb %%v23,%%v23,%%v31,0\n\t" "vfmaxsb %%v23,%%v23,%%v31,0\n\t"
"vfmaxsb %%v16,%%v16,%%v20,0\n\t" "vfmaxsb %%v16,%%v16,%%v20,0\n\t"
"vfmaxsb %%v17,%%v17,%%v21,0\n\t" "vfmaxsb %%v17,%%v17,%%v21,0\n\t"
"vfmaxsb %%v18,%%v18,%%v22,0\n\t" "vfmaxsb %%v18,%%v18,%%v22,0\n\t"
"vfmaxsb %%v19,%%v19,%%v23,0\n\t" "vfmaxsb %%v19,%%v19,%%v23,0\n\t"
"vfmaxsb %%v16,%%v16,%%v18,0\n\t" "vfmaxsb %%v16,%%v16,%%v18,0\n\t"
"vfmaxsb %%v17,%%v17,%%v19,0\n\t" "vfmaxsb %%v17,%%v17,%%v19,0\n\t"
"vfmaxsb %%v16,%%v16,%%v17,0\n\t" "vfmaxsb %%v16,%%v16,%%v17,0\n\t"
"vfmaxsb %%v0,%%v0,%%v16,0\n\t" "vfmaxsb %%v0,%%v0,%%v16,0\n\t"
"agfi %%r1, 256\n\t" "agfi %%r1, 256\n\t"
"brctg %%r0, 0b \n\t" "brctg %[n], 0b\n\t"
"veslg %%v16,%%v0,32\n\t" "veslg %%v16,%%v0,32\n\t"
"vfmaxsb %%v0,%%v0,%%v16,0\n\t" "vfmaxsb %%v0,%%v0,%%v16,0\n\t"
"vrepf %%v16,%%v0,2\n\t" "vrepf %%v16,%%v0,2\n\t"
"wfmaxsb %%v0,%%v0,%%v16,0\n\t" "wfmaxsb %%v0,%%v0,%%v16,0\n\t"
"ler %0,%%f0 " "ler %[max],%%f0"
:"=f"(max) : [max] "=f"(max),[n] "+&r"(n)
:"r"(n),"ZR"((const FLOAT (*)[n])x) : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
); "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
return max; return max;
} }
@ -98,7 +87,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG j = 0; BLASLONG j = 0;
FLOAT maxf = 0.0; FLOAT maxf = 0.0;
if (n <= 0 || inc_x <= 0) return (maxf); if (n <= 0 || inc_x <= 0)
return (maxf);
if (inc_x == 1) { if (inc_x == 1) {
@ -108,9 +98,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
maxf = smax_kernel_64(n1, x); maxf = smax_kernel_64(n1, x);
i = n1; i = n1;
} } else {
else
{
maxf = x[0]; maxf = x[0];
i++; i++;
} }
@ -149,7 +137,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
} }
while (j < n) { while (j < n) {
if (x[i] > maxf) { if (x[i] > maxf) {
maxf = x[i]; maxf = x[i];

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -27,34 +27,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
static FLOAT smin_kernel_64(BLASLONG n, FLOAT *x) static FLOAT smin_kernel_64(BLASLONG n, FLOAT *x) {
{
FLOAT min; FLOAT min;
__asm__ volatile ( __asm__("vl %%v0,0(%[x])\n\t"
"vl %%v0,0(%2) \n\t" "srlg %[n],%[n],6\n\t"
"srlg %%r0,%1,6 \n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"0:\n\t" "0:\n\t"
"pfd 1, 1024(%%r1,%2) \n\t" "pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%2) \n\t" "vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%2) \n\t" "vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%2) \n\t" "vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%2) \n\t" "vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%2) \n\t" "vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%2) \n\t" "vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%2) \n\t" "vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%2) \n\t" "vl %%v24,128(%%r1,%[x])\n\t"
"vl %%v24,128(%%r1,%2) \n\t" "vl %%v25,144(%%r1,%[x])\n\t"
"vl %%v25,144(%%r1,%2) \n\t" "vl %%v26,160(%%r1,%[x])\n\t"
"vl %%v26,160(%%r1,%2) \n\t" "vl %%v27,176(%%r1,%[x])\n\t"
"vl %%v27,176(%%r1,%2) \n\t" "vl %%v28,192(%%r1,%[x])\n\t"
"vl %%v28,192(%%r1,%2) \n\t" "vl %%v29,208(%%r1,%[x])\n\t"
"vl %%v29,208(%%r1,%2) \n\t" "vl %%v30,224(%%r1,%[x])\n\t"
"vl %%v30,224(%%r1,%2) \n\t" "vl %%v31,240(%%r1,%[x])\n\t"
"vl %%v31,240(%%r1,%2) \n\t"
"vfminsb %%v16,%%v16,%%v24,0\n\t" "vfminsb %%v16,%%v16,%%v24,0\n\t"
"vfminsb %%v17,%%v17,%%v25,0\n\t" "vfminsb %%v17,%%v17,%%v25,0\n\t"
"vfminsb %%v18,%%v18,%%v26,0\n\t" "vfminsb %%v18,%%v18,%%v26,0\n\t"
@ -63,32 +59,25 @@ static FLOAT smin_kernel_64(BLASLONG n, FLOAT *x)
"vfminsb %%v21,%%v21,%%v29,0\n\t" "vfminsb %%v21,%%v21,%%v29,0\n\t"
"vfminsb %%v22,%%v22,%%v30,0\n\t" "vfminsb %%v22,%%v22,%%v30,0\n\t"
"vfminsb %%v23,%%v23,%%v31,0\n\t" "vfminsb %%v23,%%v23,%%v31,0\n\t"
"vfminsb %%v16,%%v16,%%v20,0\n\t" "vfminsb %%v16,%%v16,%%v20,0\n\t"
"vfminsb %%v17,%%v17,%%v21,0\n\t" "vfminsb %%v17,%%v17,%%v21,0\n\t"
"vfminsb %%v18,%%v18,%%v22,0\n\t" "vfminsb %%v18,%%v18,%%v22,0\n\t"
"vfminsb %%v19,%%v19,%%v23,0\n\t" "vfminsb %%v19,%%v19,%%v23,0\n\t"
"vfminsb %%v16,%%v16,%%v18,0\n\t" "vfminsb %%v16,%%v16,%%v18,0\n\t"
"vfminsb %%v17,%%v17,%%v19,0\n\t" "vfminsb %%v17,%%v17,%%v19,0\n\t"
"vfminsb %%v16,%%v16,%%v17,0\n\t" "vfminsb %%v16,%%v16,%%v17,0\n\t"
"vfminsb %%v0,%%v0,%%v16,0\n\t" "vfminsb %%v0,%%v0,%%v16,0\n\t"
"agfi %%r1, 256\n\t" "agfi %%r1, 256\n\t"
"brctg %%r0, 0b \n\t" "brctg %[n], 0b\n\t"
"veslg %%v16,%%v0,32\n\t" "veslg %%v16,%%v0,32\n\t"
"vfminsb %%v0,%%v0,%%v16,0\n\t" "vfminsb %%v0,%%v0,%%v16,0\n\t"
"vrepf %%v16,%%v0,2\n\t" "vrepf %%v16,%%v0,2\n\t"
"wfminsb %%v0,%%v0,%%v16,0\n\t" "wfminsb %%v0,%%v0,%%v16,0\n\t"
"ler %0,%%f0 " "ler %[min],%%f0"
:"=f"(min) : [min] "=f"(min),[n] "+&r"(n)
:"r"(n),"ZR"((const FLOAT (*)[n])x) : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
); "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
return min; return min;
} }
@ -98,7 +87,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG j = 0; BLASLONG j = 0;
FLOAT minf = 0.0; FLOAT minf = 0.0;
if (n <= 0 || inc_x <= 0) return (minf); if (n <= 0 || inc_x <= 0)
return (minf);
if (inc_x == 1) { if (inc_x == 1) {
@ -108,9 +98,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
minf = smin_kernel_64(n1, x); minf = smin_kernel_64(n1, x);
i = n1; i = n1;
} } else {
else
{
minf = x[0]; minf = x[0];
i++; i++;
} }
@ -149,7 +137,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
} }
while (j < n) { while (j < n) {
if (x[i] < minf) { if (x[i] < minf) {
minf = x[i]; minf = x[i];

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -27,25 +27,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
static void srot_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) static void srot_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) {
{ __asm__("vlrepf %%v0,%[c]\n\t"
__asm__ ( "vlrepf %%v1,%[s]\n\t"
"vlrepf %%v0,%3 \n\t" "srlg %[n],%[n],6\n\t"
"vlrepf %%v1,%4 \n\t"
"srlg %%r0,%0,6 \n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"0:\n\t" "0:\n\t"
"pfd 2, 1024(%%r1,%1) \n\t" "pfd 2, 1024(%%r1,%[x])\n\t"
"pfd 2, 1024(%%r1,%2) \n\t" "pfd 2, 1024(%%r1,%[y])\n\t"
"vl %%v24, 0(%%r1,%1) \n\t" "vl %%v24, 0(%%r1,%[x])\n\t"
"vl %%v25, 16(%%r1,%1) \n\t" "vl %%v25, 16(%%r1,%[x])\n\t"
"vl %%v26, 32(%%r1,%1) \n\t" "vl %%v26, 32(%%r1,%[x])\n\t"
"vl %%v27, 48(%%r1,%1) \n\t" "vl %%v27, 48(%%r1,%[x])\n\t"
"vl %%v16, 0(%%r1,%2) \n\t" "vl %%v16, 0(%%r1,%[y])\n\t"
"vl %%v17, 16(%%r1,%2) \n\t" "vl %%v17, 16(%%r1,%[y])\n\t"
"vl %%v18, 32(%%r1,%2) \n\t" "vl %%v18, 32(%%r1,%[y])\n\t"
"vl %%v19, 48(%%r1,%2) \n\t" "vl %%v19, 48(%%r1,%[y])\n\t"
"vfmsb %%v28,%%v24,%%v0\n\t" "vfmsb %%v28,%%v24,%%v0\n\t"
"vfmsb %%v29,%%v25,%%v0\n\t" "vfmsb %%v29,%%v25,%%v0\n\t"
"vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
@ -63,25 +60,22 @@ static void srot_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
"vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmasb %%v31,%%v19,%%v1,%%v31\n\t" "vfmasb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vst %%v28, 0(%%r1,%[x])\n\t"
"vst %%v28, 0(%%r1,%1) \n\t" "vst %%v29, 16(%%r1,%[x])\n\t"
"vst %%v29, 16(%%r1,%1) \n\t" "vst %%v30, 32(%%r1,%[x])\n\t"
"vst %%v30, 32(%%r1,%1) \n\t" "vst %%v31, 48(%%r1,%[x])\n\t"
"vst %%v31, 48(%%r1,%1) \n\t" "vst %%v20, 0(%%r1,%[y])\n\t"
"vst %%v20, 0(%%r1,%2) \n\t" "vst %%v21, 16(%%r1,%[y])\n\t"
"vst %%v21, 16(%%r1,%2) \n\t" "vst %%v22, 32(%%r1,%[y])\n\t"
"vst %%v22, 32(%%r1,%2) \n\t" "vst %%v23, 48(%%r1,%[y])\n\t"
"vst %%v23, 48(%%r1,%2) \n\t" "vl %%v24, 64(%%r1,%[x])\n\t"
"vl %%v25, 80(%%r1,%[x])\n\t"
"vl %%v24, 64(%%r1,%1) \n\t" "vl %%v26, 96(%%r1,%[x])\n\t"
"vl %%v25, 80(%%r1,%1) \n\t" "vl %%v27, 112(%%r1,%[x])\n\t"
"vl %%v26, 96(%%r1,%1) \n\t" "vl %%v16, 64(%%r1,%[y])\n\t"
"vl %%v27, 112(%%r1,%1) \n\t" "vl %%v17, 80(%%r1,%[y])\n\t"
"vl %%v16, 64(%%r1,%2) \n\t" "vl %%v18, 96(%%r1,%[y])\n\t"
"vl %%v17, 80(%%r1,%2) \n\t" "vl %%v19, 112(%%r1,%[y])\n\t"
"vl %%v18, 96(%%r1,%2) \n\t"
"vl %%v19, 112(%%r1,%2) \n\t"
"vfmsb %%v28,%%v24,%%v0\n\t" "vfmsb %%v28,%%v24,%%v0\n\t"
"vfmsb %%v29,%%v25,%%v0\n\t" "vfmsb %%v29,%%v25,%%v0\n\t"
"vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
@ -99,25 +93,22 @@ static void srot_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
"vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmasb %%v31,%%v19,%%v1,%%v31\n\t" "vfmasb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vst %%v28, 64(%%r1,%[x])\n\t"
"vst %%v28, 64(%%r1,%1) \n\t" "vst %%v29, 80(%%r1,%[x])\n\t"
"vst %%v29, 80(%%r1,%1) \n\t" "vst %%v30, 96(%%r1,%[x])\n\t"
"vst %%v30, 96(%%r1,%1) \n\t" "vst %%v31, 112(%%r1,%[x])\n\t"
"vst %%v31, 112(%%r1,%1) \n\t" "vst %%v20, 64(%%r1,%[y])\n\t"
"vst %%v20, 64(%%r1,%2) \n\t" "vst %%v21, 80(%%r1,%[y])\n\t"
"vst %%v21, 80(%%r1,%2) \n\t" "vst %%v22, 96(%%r1,%[y])\n\t"
"vst %%v22, 96(%%r1,%2) \n\t" "vst %%v23, 112(%%r1,%[y])\n\t"
"vst %%v23, 112(%%r1,%2) \n\t" "vl %%v24, 128(%%r1,%[x])\n\t"
"vl %%v25, 144(%%r1,%[x])\n\t"
"vl %%v24, 128(%%r1,%1) \n\t" "vl %%v26, 160(%%r1,%[x])\n\t"
"vl %%v25, 144(%%r1,%1) \n\t" "vl %%v27, 176(%%r1,%[x])\n\t"
"vl %%v26, 160(%%r1,%1) \n\t" "vl %%v16, 128(%%r1,%[y])\n\t"
"vl %%v27, 176(%%r1,%1) \n\t" "vl %%v17, 144(%%r1,%[y])\n\t"
"vl %%v16, 128(%%r1,%2) \n\t" "vl %%v18, 160(%%r1,%[y])\n\t"
"vl %%v17, 144(%%r1,%2) \n\t" "vl %%v19, 176(%%r1,%[y])\n\t"
"vl %%v18, 160(%%r1,%2) \n\t"
"vl %%v19, 176(%%r1,%2) \n\t"
"vfmsb %%v28,%%v24,%%v0\n\t" "vfmsb %%v28,%%v24,%%v0\n\t"
"vfmsb %%v29,%%v25,%%v0\n\t" "vfmsb %%v29,%%v25,%%v0\n\t"
"vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
@ -135,25 +126,22 @@ static void srot_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
"vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmasb %%v31,%%v19,%%v1,%%v31\n\t" "vfmasb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vst %%v28, 128(%%r1,%[x])\n\t"
"vst %%v28, 128(%%r1,%1) \n\t" "vst %%v29, 144(%%r1,%[x])\n\t"
"vst %%v29, 144(%%r1,%1) \n\t" "vst %%v30, 160(%%r1,%[x])\n\t"
"vst %%v30, 160(%%r1,%1) \n\t" "vst %%v31, 176(%%r1,%[x])\n\t"
"vst %%v31, 176(%%r1,%1) \n\t" "vst %%v20, 128(%%r1,%[y])\n\t"
"vst %%v20, 128(%%r1,%2) \n\t" "vst %%v21, 144(%%r1,%[y])\n\t"
"vst %%v21, 144(%%r1,%2) \n\t" "vst %%v22, 160(%%r1,%[y])\n\t"
"vst %%v22, 160(%%r1,%2) \n\t" "vst %%v23, 176(%%r1,%[y])\n\t"
"vst %%v23, 176(%%r1,%2) \n\t" "vl %%v24, 192(%%r1,%[x])\n\t"
"vl %%v25, 208(%%r1,%[x])\n\t"
"vl %%v24, 192(%%r1,%1) \n\t" "vl %%v26, 224(%%r1,%[x])\n\t"
"vl %%v25, 208(%%r1,%1) \n\t" "vl %%v27, 240(%%r1,%[x])\n\t"
"vl %%v26, 224(%%r1,%1) \n\t" "vl %%v16, 192(%%r1,%[y])\n\t"
"vl %%v27, 240(%%r1,%1) \n\t" "vl %%v17, 208(%%r1,%[y])\n\t"
"vl %%v16, 192(%%r1,%2) \n\t" "vl %%v18, 224(%%r1,%[y])\n\t"
"vl %%v17, 208(%%r1,%2) \n\t" "vl %%v19, 240(%%r1,%[y])\n\t"
"vl %%v18, 224(%%r1,%2) \n\t"
"vl %%v19, 240(%%r1,%2) \n\t"
"vfmsb %%v28,%%v24,%%v0\n\t" "vfmsb %%v28,%%v24,%%v0\n\t"
"vfmsb %%v29,%%v25,%%v0\n\t" "vfmsb %%v29,%%v25,%%v0\n\t"
"vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
@ -171,39 +159,38 @@ static void srot_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
"vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmasb %%v31,%%v19,%%v1,%%v31\n\t" "vfmasb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vst %%v28, 192(%%r1,%[x])\n\t"
"vst %%v28, 192(%%r1,%1) \n\t" "vst %%v29, 208(%%r1,%[x])\n\t"
"vst %%v29, 208(%%r1,%1) \n\t" "vst %%v30, 224(%%r1,%[x])\n\t"
"vst %%v30, 224(%%r1,%1) \n\t" "vst %%v31, 240(%%r1,%[x])\n\t"
"vst %%v31, 240(%%r1,%1) \n\t" "vst %%v20, 192(%%r1,%[y])\n\t"
"vst %%v20, 192(%%r1,%2) \n\t" "vst %%v21, 208(%%r1,%[y])\n\t"
"vst %%v21, 208(%%r1,%2) \n\t" "vst %%v22, 224(%%r1,%[y])\n\t"
"vst %%v22, 224(%%r1,%2) \n\t" "vst %%v23, 240(%%r1,%[y])\n\t"
"vst %%v23, 240(%%r1,%2) \n\t"
"agfi %%r1,256\n\t" "agfi %%r1,256\n\t"
"brctg %%r0,0b " "brctg %[n],0b"
: : "+m"(*(struct { FLOAT x[n]; } *) x), "+m"(*(struct { FLOAT x[n]; } *) y),
:"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y),"m"(*c),"m"(*s) [n] "+&r"(n)
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" : [x] "a"(x),[y] "a"(y),[c] "Q"(*c),[s] "Q"(*s)
); : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
"v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
} }
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
{ FLOAT c, FLOAT s) {
BLASLONG i = 0; BLASLONG i = 0;
BLASLONG ix = 0, iy = 0; BLASLONG ix = 0, iy = 0;
FLOAT temp; FLOAT temp;
if ( n <= 0 ) return(0); if (n <= 0)
return (0);
if ( (inc_x == 1) && (inc_y == 1) ) if ((inc_x == 1) && (inc_y == 1)) {
{
BLASLONG n1 = n & -64; BLASLONG n1 = n & -64;
if ( n1 > 0 ) if (n1 > 0) {
{
FLOAT cosa, sina; FLOAT cosa, sina;
cosa = c; cosa = c;
sina = s; sina = s;
@ -211,8 +198,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
i = n1; i = n1;
} }
while(i < n) while (i < n) {
{
temp = c * x[i] + s * y[i]; temp = c * x[i] + s * y[i];
y[i] = c * y[i] - s * x[i]; y[i] = c * y[i] - s * x[i];
x[i] = temp; x[i] = temp;
@ -221,13 +207,9 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
} }
} else {
} while (i < n) {
else
{
while(i < n)
{
temp = c * x[ix] + s * y[iy]; temp = c * x[ix] + s * y[iy];
y[iy] = c * y[iy] - s * x[ix]; y[iy] = c * y[iy] - s * x[ix];
x[ix] = temp; x[ix] = temp;
@ -242,5 +224,3 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
return (0); return (0);
} }

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -27,128 +27,106 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
static void sscal_kernel_32(BLASLONG n, FLOAT da, FLOAT *x) static void sscal_kernel_32(BLASLONG n, FLOAT da, FLOAT *x) {
{ __asm__("vlrepf %%v0,%[da]\n\t"
__asm__ volatile ( "srlg %[n],%[n],5\n\t"
"vlrepf %%v0,%1 \n\t"
"srlg %%r0,%0,5 \n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"0:\n\t" "0:\n\t"
"pfd 2, 1024(%%r1,%2) \n\t" "pfd 2, 1024(%%r1,%[x])\n\t"
"vl %%v24, 0(%%r1,%2) \n\t" "vl %%v24,0(%%r1,%[x])\n\t"
"vfmsb %%v24,%%v24,%%v0\n\t" "vfmsb %%v24,%%v24,%%v0\n\t"
"vst %%v24, 0(%%r1,%2) \n\t" "vst %%v24,0(%%r1,%[x])\n\t"
"vl %%v25, 16(%%r1,%2) \n\t" "vl %%v25,16(%%r1,%[x])\n\t"
"vfmsb %%v25,%%v25,%%v0\n\t" "vfmsb %%v25,%%v25,%%v0\n\t"
"vst %%v25, 16(%%r1,%2) \n\t" "vst %%v25,16(%%r1,%[x])\n\t"
"vl %%v26, 32(%%r1,%2) \n\t" "vl %%v26,32(%%r1,%[x])\n\t"
"vfmsb %%v26,%%v26,%%v0\n\t" "vfmsb %%v26,%%v26,%%v0\n\t"
"vst %%v26, 32(%%r1,%2) \n\t" "vst %%v26,32(%%r1,%[x])\n\t"
"vl %%v27, 48(%%r1,%2) \n\t" "vl %%v27,48(%%r1,%[x])\n\t"
"vfmsb %%v27,%%v27,%%v0\n\t" "vfmsb %%v27,%%v27,%%v0\n\t"
"vst %%v27, 48(%%r1,%2) \n\t" "vst %%v27,48(%%r1,%[x])\n\t"
"vl %%v24, 64(%%r1,%2) \n\t" "vl %%v28,64(%%r1,%[x])\n\t"
"vfmsb %%v24,%%v24,%%v0 \n\t" "vfmsb %%v28,%%v28,%%v0\n\t"
"vst %%v24, 64(%%r1,%2) \n\t" "vst %%v28,64(%%r1,%[x])\n\t"
"vl %%v25, 80(%%r1,%2) \n\t" "vl %%v29,80(%%r1,%[x])\n\t"
"vfmsb %%v25,%%v25,%%v0 \n\t" "vfmsb %%v29,%%v29,%%v0\n\t"
"vst %%v25, 80(%%r1,%2) \n\t" "vst %%v29,80(%%r1,%[x])\n\t"
"vl %%v26, 96(%%r1,%2) \n\t" "vl %%v30,96(%%r1,%[x])\n\t"
"vfmsb %%v26,%%v26,%%v0 \n\t" "vfmsb %%v30,%%v30,%%v0\n\t"
"vst %%v26, 96(%%r1,%2) \n\t" "vst %%v30,96(%%r1,%[x])\n\t"
"vl %%v27, 112(%%r1,%2) \n\t" "vl %%v31,112(%%r1,%[x])\n\t"
"vfmsb %%v27,%%v27,%%v0 \n\t" "vfmsb %%v31,%%v31,%%v0\n\t"
"vst %%v27, 112(%%r1,%2) \n\t" "vst %%v31,112(%%r1,%[x])\n\t"
"agfi %%r1,128\n\t" "agfi %%r1,128\n\t"
"brctg %%r0,0b " "brctg %[n],0b"
: : "+m"(*(struct { FLOAT x[n]; } *) x),[n] "+&r"(n)
:"r"(n),"m"(da),"ZR"((FLOAT (*)[n])x) : [x] "a"(x),[da] "Q"(da)
:"memory","cc","r0","r1","v0","v24","v25","v26","v27" : "cc", "r1", "v0", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
); "v31");
} }
static void sscal_kernel_32_zero(BLASLONG n, FLOAT *x) static void sscal_kernel_32_zero(BLASLONG n, FLOAT *x) {
{ __asm__("vzero %%v0\n\t"
__asm__ volatile( "srlg %[n],%[n],5\n\t"
"vzero %%v24 \n\t"
"vzero %%v25 \n\t"
"vzero %%v26 \n\t"
"vzero %%v27 \n\t"
"srlg %%r0,%0,5 \n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"0:\n\t" "0:\n\t"
"pfd 2, 1024(%%r1,%1) \n\t" "pfd 2, 1024(%%r1,%[x])\n\t"
"vst %%v0,0(%%r1,%[x])\n\t"
"vst %%v24,0(%%r1,%1) \n\t" "vst %%v0,16(%%r1,%[x])\n\t"
"vst %%v25,16(%%r1,%1) \n\t" "vst %%v0,32(%%r1,%[x])\n\t"
"vst %%v26,32(%%r1,%1) \n\t" "vst %%v0,48(%%r1,%[x])\n\t"
"vst %%v27,48(%%r1,%1) \n\t" "vst %%v0,64(%%r1,%[x])\n\t"
"vst %%v24,64(%%r1,%1) \n\t" "vst %%v0,80(%%r1,%[x])\n\t"
"vst %%v25,80(%%r1,%1) \n\t" "vst %%v0,96(%%r1,%[x])\n\t"
"vst %%v26,96(%%r1,%1) \n\t" "vst %%v0,112(%%r1,%[x])\n\t"
"vst %%v27,112(%%r1,%1) \n\t"
"agfi %%r1,128\n\t" "agfi %%r1,128\n\t"
"brctg %%r0,0b " "brctg %[n],0b"
: : "=m"(*(struct { FLOAT x[n]; } *) x),[n] "+&r"(n)
:"r"(n),"ZR"((FLOAT (*)[n])x) : [x] "a"(x)
:"memory","cc","r0","r1","v24","v25","v26","v27" : "cc", "r1", "v0");
);
} }
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x,
{ BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
BLASLONG dummy2) {
BLASLONG i = 0, j = 0; BLASLONG i = 0, j = 0;
if (n <= 0 || inc_x <= 0) if (n <= 0 || inc_x <= 0)
return (0); return (0);
if (inc_x == 1) {
if ( inc_x == 1 ) if (da == 0.0) {
{
if ( da == 0.0 )
{
BLASLONG n1 = n & -32; BLASLONG n1 = n & -32;
if ( n1 > 0 ) if (n1 > 0) {
{
sscal_kernel_32_zero(n1, x); sscal_kernel_32_zero(n1, x);
j = n1; j = n1;
} }
while(j < n) while (j < n) {
{
x[j] = 0.0; x[j] = 0.0;
j++; j++;
} }
} } else {
else
{
BLASLONG n1 = n & -32; BLASLONG n1 = n & -32;
if ( n1 > 0 ) if (n1 > 0) {
{
sscal_kernel_32(n1, da, x); sscal_kernel_32(n1, da, x);
j = n1; j = n1;
} }
while(j < n) while (j < n) {
{
x[j] = da * x[j]; x[j] = da * x[j];
j++; j++;
} }
} }
} else {
} if (da == 0.0) {
else
{
if ( da == 0.0 )
{
BLASLONG n1 = n & -2; BLASLONG n1 = n & -2;
@ -161,17 +139,14 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
j += 2; j += 2;
} }
while(j < n) while (j < n) {
{
x[i] = 0.0; x[i] = 0.0;
i += inc_x; i += inc_x;
j++; j++;
} }
} } else {
else
{
BLASLONG n1 = n & -2; BLASLONG n1 = n & -2;
while (j < n1) { while (j < n1) {
@ -184,8 +159,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
} }
while(j < n) while (j < n) {
{
x[i] = da * x[i]; x[i] = da * x[i];
i += inc_x; i += inc_x;
@ -197,5 +171,3 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
return 0; return 0;
} }

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -27,111 +27,105 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
static void sswap_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y) static void sswap_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y) {
{ __asm__("srlg %[n],%[n],6\n\t"
__asm__ volatile(
"srlg %%r0,%0,6 \n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"0:\n\t" "0:\n\t"
"pfd 2, 1024(%%r1,%1) \n\t" "pfd 2, 1024(%%r1,%[x])\n\t"
"pfd 2, 1024(%%r1,%2) \n\t" "pfd 2, 1024(%%r1,%[y])\n\t"
"vl %%v16, 0(%%r1,%[x])\n\t"
"vl %%v16, 0(%%r1,%1) \n\t" "vl %%v17, 16(%%r1,%[x])\n\t"
"vl %%v17, 16(%%r1,%1) \n\t" "vl %%v18, 32(%%r1,%[x])\n\t"
"vl %%v18, 32(%%r1,%1) \n\t" "vl %%v19, 48(%%r1,%[x])\n\t"
"vl %%v19, 48(%%r1,%1) \n\t" "vl %%v20, 64(%%r1,%[x])\n\t"
"vl %%v20, 64(%%r1,%1) \n\t" "vl %%v21, 80(%%r1,%[x])\n\t"
"vl %%v21, 80(%%r1,%1) \n\t" "vl %%v22, 96(%%r1,%[x])\n\t"
"vl %%v22, 96(%%r1,%1) \n\t" "vl %%v23, 112(%%r1,%[x])\n\t"
"vl %%v23, 112(%%r1,%1) \n\t" "vl %%v24, 128(%%r1,%[x])\n\t"
"vl %%v24, 128(%%r1,%1) \n\t" "vl %%v25, 144(%%r1,%[x])\n\t"
"vl %%v25, 144(%%r1,%1) \n\t" "vl %%v26, 160(%%r1,%[x])\n\t"
"vl %%v26, 160(%%r1,%1) \n\t" "vl %%v27, 176(%%r1,%[x])\n\t"
"vl %%v27, 176(%%r1,%1) \n\t" "vl %%v28, 192(%%r1,%[x])\n\t"
"vl %%v28, 192(%%r1,%1) \n\t" "vl %%v29, 208(%%r1,%[x])\n\t"
"vl %%v29, 208(%%r1,%1) \n\t" "vl %%v30, 224(%%r1,%[x])\n\t"
"vl %%v30, 224(%%r1,%1) \n\t" "vl %%v31, 240(%%r1,%[x])\n\t"
"vl %%v31, 240(%%r1,%1) \n\t" "vl %%v0, 0(%%r1,%[y])\n\t"
"vl %%v1, 16(%%r1,%[y])\n\t"
"vl %%v0, 0(%%r1,%2) \n\t" "vl %%v2, 32(%%r1,%[y])\n\t"
"vl %%v1, 16(%%r1,%2) \n\t" "vl %%v3, 48(%%r1,%[y])\n\t"
"vl %%v2, 32(%%r1,%2) \n\t" "vl %%v4, 64(%%r1,%[y])\n\t"
"vl %%v3, 48(%%r1,%2) \n\t" "vl %%v5, 80(%%r1,%[y])\n\t"
"vl %%v4, 64(%%r1,%2) \n\t" "vl %%v6, 96(%%r1,%[y])\n\t"
"vl %%v5, 80(%%r1,%2) \n\t" "vl %%v7, 112(%%r1,%[y])\n\t"
"vl %%v6, 96(%%r1,%2) \n\t" "vst %%v0, 0(%%r1,%[x])\n\t"
"vl %%v7, 112(%%r1,%2) \n\t" "vst %%v1, 16(%%r1,%[x])\n\t"
"vst %%v0, 0(%%r1,%1) \n\t" "vst %%v2, 32(%%r1,%[x])\n\t"
"vst %%v1, 16(%%r1,%1) \n\t" "vst %%v3, 48(%%r1,%[x])\n\t"
"vst %%v2, 32(%%r1,%1) \n\t" "vst %%v4, 64(%%r1,%[x])\n\t"
"vst %%v3, 48(%%r1,%1) \n\t" "vst %%v5, 80(%%r1,%[x])\n\t"
"vst %%v4, 64(%%r1,%1) \n\t" "vst %%v6, 96(%%r1,%[x])\n\t"
"vst %%v5, 80(%%r1,%1) \n\t" "vst %%v7, 112(%%r1,%[x])\n\t"
"vst %%v6, 96(%%r1,%1) \n\t" "vl %%v0, 128(%%r1,%[y])\n\t"
"vst %%v7, 112(%%r1,%1) \n\t" "vl %%v1, 144(%%r1,%[y])\n\t"
"vl %%v2, 160(%%r1,%[y])\n\t"
"vl %%v0, 128(%%r1,%2) \n\t" "vl %%v3, 176(%%r1,%[y])\n\t"
"vl %%v1, 144(%%r1,%2) \n\t" "vl %%v4, 192(%%r1,%[y])\n\t"
"vl %%v2, 160(%%r1,%2) \n\t" "vl %%v5, 208(%%r1,%[y])\n\t"
"vl %%v3, 176(%%r1,%2) \n\t" "vl %%v6, 224(%%r1,%[y])\n\t"
"vl %%v4, 192(%%r1,%2) \n\t" "vl %%v7, 240(%%r1,%[y])\n\t"
"vl %%v5, 208(%%r1,%2) \n\t" "vst %%v0, 128(%%r1,%[x])\n\t"
"vl %%v6, 224(%%r1,%2) \n\t" "vst %%v1, 144(%%r1,%[x])\n\t"
"vl %%v7, 240(%%r1,%2) \n\t" "vst %%v2, 160(%%r1,%[x])\n\t"
"vst %%v0, 128(%%r1,%1) \n\t" "vst %%v3, 176(%%r1,%[x])\n\t"
"vst %%v1, 144(%%r1,%1) \n\t" "vst %%v4, 192(%%r1,%[x])\n\t"
"vst %%v2, 160(%%r1,%1) \n\t" "vst %%v5, 208(%%r1,%[x])\n\t"
"vst %%v3, 176(%%r1,%1) \n\t" "vst %%v6, 224(%%r1,%[x])\n\t"
"vst %%v4, 192(%%r1,%1) \n\t" "vst %%v7, 240(%%r1,%[x])\n\t"
"vst %%v5, 208(%%r1,%1) \n\t" "vst %%v16, 0(%%r1,%[y])\n\t"
"vst %%v6, 224(%%r1,%1) \n\t" "vst %%v17, 16(%%r1,%[y])\n\t"
"vst %%v7, 240(%%r1,%1) \n\t" "vst %%v18, 32(%%r1,%[y])\n\t"
"vst %%v19, 48(%%r1,%[y])\n\t"
"vst %%v16, 0(%%r1,%2) \n\t" "vst %%v20, 64(%%r1,%[y])\n\t"
"vst %%v17, 16(%%r1,%2) \n\t" "vst %%v21, 80(%%r1,%[y])\n\t"
"vst %%v18, 32(%%r1,%2) \n\t" "vst %%v22, 96(%%r1,%[y])\n\t"
"vst %%v19, 48(%%r1,%2) \n\t" "vst %%v23, 112(%%r1,%[y])\n\t"
"vst %%v20, 64(%%r1,%2) \n\t" "vst %%v24, 128(%%r1,%[y])\n\t"
"vst %%v21, 80(%%r1,%2) \n\t" "vst %%v25, 144(%%r1,%[y])\n\t"
"vst %%v22, 96(%%r1,%2) \n\t" "vst %%v26, 160(%%r1,%[y])\n\t"
"vst %%v23, 112(%%r1,%2) \n\t" "vst %%v27, 176(%%r1,%[y])\n\t"
"vst %%v24, 128(%%r1,%2) \n\t" "vst %%v28, 192(%%r1,%[y])\n\t"
"vst %%v25, 144(%%r1,%2) \n\t" "vst %%v29, 208(%%r1,%[y])\n\t"
"vst %%v26, 160(%%r1,%2) \n\t" "vst %%v30, 224(%%r1,%[y])\n\t"
"vst %%v27, 176(%%r1,%2) \n\t" "vst %%v31, 240(%%r1,%[y])\n\t"
"vst %%v28, 192(%%r1,%2) \n\t"
"vst %%v29, 208(%%r1,%2) \n\t"
"vst %%v30, 224(%%r1,%2) \n\t"
"vst %%v31, 240(%%r1,%2) \n\t"
"agfi %%r1,256\n\t" "agfi %%r1,256\n\t"
"brctg %%r0,0b " "brctg %[n],0b"
: : "+m"(*(struct { FLOAT x[n]; } *) x), "+m"(*(struct { FLOAT x[n]; } *) y),
:"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y) [n] "+&r"(n)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" : [x] "a"(x),[y] "a"(y)
); : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");
} }
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,
{ BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
BLASLONG dummy2) {
BLASLONG i = 0; BLASLONG i = 0;
BLASLONG ix = 0, iy = 0; BLASLONG ix = 0, iy = 0;
FLOAT temp; FLOAT temp;
if ( n <= 0 ) return(0); if (n <= 0)
return (0);
if ( (inc_x == 1) && (inc_y == 1 )) if ((inc_x == 1) && (inc_y == 1)) {
{
BLASLONG n1 = n & -64; BLASLONG n1 = n & -64;
if ( n1 > 0 ) if (n1 > 0) {
{
sswap_kernel_64(n1, x, y); sswap_kernel_64(n1, x, y);
i = n1; i = n1;
} }
while(i < n) while (i < n) {
{
temp = y[i]; temp = y[i];
y[i] = x[i]; y[i] = x[i];
x[i] = temp; x[i] = temp;
@ -139,13 +133,9 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,
} }
} else {
} while (i < n) {
else
{
while(i < n)
{
temp = y[iy]; temp = y[iy];
y[iy] = x[ix]; y[iy] = x[ix];
x[ix] = temp; x[ix] = temp;
@ -158,7 +148,4 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,
} }
return (0); return (0);
} }

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -28,64 +28,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>
#if defined(DOUBLE) #define CABS1(x,i) (fabs(x[i]) + fabs(x[i + 1]))
#define ABS fabs
#else
#define ABS fabsf
#endif
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x) {
static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x)
{
FLOAT amax; FLOAT amax;
__asm__ volatile ( __asm__("vleg %%v0,0(%[x]),0\n\t"
"vleg %%v0,0(%2),0 \n\t" "vleg %%v16,8(%[x]),0\n\t"
"vleg %%v16,8(%2),0 \n\t" "vleg %%v0,16(%[x]),1\n\t"
"vleg %%v0,16(%2),1 \n\t" "vleg %%v16,24(%[x]),1\n\t"
"vleg %%v16,24(%2),1 \n\t"
"vflpdb %%v0,%%v0\n\t" "vflpdb %%v0,%%v0\n\t"
"vflpdb %%v16,%%v16\n\t" "vflpdb %%v16,%%v16\n\t"
"vfadb %%v0,%%v0,%%v16\n\t" "vfadb %%v0,%%v0,%%v16\n\t"
"srlg %%r0,%1,4 \n\t" "srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"0:\n\t" "0:\n\t"
"pfd 1, 1024(%%r1,%2) \n\t" "pfd 1, 1024(%%r1,%[x])\n\t"
"vleg %%v16,0(%%r1,%[x]),0\n\t"
"vleg %%v16,0(%%r1,%2),0 \n\t" "vleg %%v17,8(%%r1,%[x]),0\n\t"
"vleg %%v17,8(%%r1,%2),0 \n\t" "vleg %%v16,16(%%r1,%[x]),1\n\t"
"vleg %%v16,16(%%r1,%2),1 \n\t" "vleg %%v17,24(%%r1,%[x]),1\n\t"
"vleg %%v17,24(%%r1,%2),1 \n\t" "vleg %%v18,32(%%r1,%[x]),0\n\t"
"vleg %%v18,32(%%r1,%2),0 \n\t" "vleg %%v19,40(%%r1,%[x]),0\n\t"
"vleg %%v19,40(%%r1,%2),0 \n\t" "vleg %%v18,48(%%r1,%[x]),1\n\t"
"vleg %%v18,48(%%r1,%2),1 \n\t" "vleg %%v19,56(%%r1,%[x]),1\n\t"
"vleg %%v19,56(%%r1,%2),1 \n\t" "vleg %%v20,64(%%r1,%[x]),0\n\t"
"vleg %%v20,64(%%r1,%2),0 \n\t" "vleg %%v21,72(%%r1,%[x]),0\n\t"
"vleg %%v21,72(%%r1,%2),0 \n\t" "vleg %%v20,80(%%r1,%[x]),1\n\t"
"vleg %%v20,80(%%r1,%2),1 \n\t" "vleg %%v21,88(%%r1,%[x]),1\n\t"
"vleg %%v21,88(%%r1,%2),1 \n\t" "vleg %%v22,96(%%r1,%[x]),0\n\t"
"vleg %%v22,96(%%r1,%2),0 \n\t" "vleg %%v23,104(%%r1,%[x]),0\n\t"
"vleg %%v23,104(%%r1,%2),0 \n\t" "vleg %%v22,112(%%r1,%[x]),1\n\t"
"vleg %%v22,112(%%r1,%2),1 \n\t" "vleg %%v23,120(%%r1,%[x]),1\n\t"
"vleg %%v23,120(%%r1,%2),1 \n\t" "vleg %%v24,128(%%r1,%[x]),0\n\t"
"vleg %%v24,128(%%r1,%2),0 \n\t" "vleg %%v25,136(%%r1,%[x]),0\n\t"
"vleg %%v25,136(%%r1,%2),0 \n\t" "vleg %%v24,144(%%r1,%[x]),1\n\t"
"vleg %%v24,144(%%r1,%2),1 \n\t" "vleg %%v25,152(%%r1,%[x]),1\n\t"
"vleg %%v25,152(%%r1,%2),1 \n\t" "vleg %%v26,160(%%r1,%[x]),0\n\t"
"vleg %%v26,160(%%r1,%2),0 \n\t" "vleg %%v27,168(%%r1,%[x]),0\n\t"
"vleg %%v27,168(%%r1,%2),0 \n\t" "vleg %%v26,176(%%r1,%[x]),1\n\t"
"vleg %%v26,176(%%r1,%2),1 \n\t" "vleg %%v27,184(%%r1,%[x]),1\n\t"
"vleg %%v27,184(%%r1,%2),1 \n\t" "vleg %%v28,192(%%r1,%[x]),0\n\t"
"vleg %%v28,192(%%r1,%2),0 \n\t" "vleg %%v29,200(%%r1,%[x]),0\n\t"
"vleg %%v29,200(%%r1,%2),0 \n\t" "vleg %%v28,208(%%r1,%[x]),1\n\t"
"vleg %%v28,208(%%r1,%2),1 \n\t" "vleg %%v29,216(%%r1,%[x]),1\n\t"
"vleg %%v29,216(%%r1,%2),1 \n\t" "vleg %%v30,224(%%r1,%[x]),0\n\t"
"vleg %%v30,224(%%r1,%2),0 \n\t" "vleg %%v31,232(%%r1,%[x]),0\n\t"
"vleg %%v31,232(%%r1,%2),0 \n\t" "vleg %%v30,240(%%r1,%[x]),1\n\t"
"vleg %%v30,240(%%r1,%2),1 \n\t" "vleg %%v31,248(%%r1,%[x]),1\n\t"
"vleg %%v31,248(%%r1,%2),1 \n\t"
"vflpdb %%v16,%%v16\n\t" "vflpdb %%v16,%%v16\n\t"
"vflpdb %%v17,%%v17\n\t" "vflpdb %%v17,%%v17\n\t"
"vflpdb %%v18,%%v18\n\t" "vflpdb %%v18,%%v18\n\t"
@ -102,7 +92,6 @@ static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x)
"vflpdb %%v29,%%v29\n\t" "vflpdb %%v29,%%v29\n\t"
"vflpdb %%v30,%%v30\n\t" "vflpdb %%v30,%%v30\n\t"
"vflpdb %%v31,%%v31\n\t" "vflpdb %%v31,%%v31\n\t"
"vfadb %%v16,%%v16,%%v17\n\t" "vfadb %%v16,%%v16,%%v17\n\t"
"vfadb %%v18,%%v18,%%v19\n\t" "vfadb %%v18,%%v18,%%v19\n\t"
"vfadb %%v20,%%v20,%%v21\n\t" "vfadb %%v20,%%v20,%%v21\n\t"
@ -111,29 +100,23 @@ static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x)
"vfadb %%v26,%%v26,%%v27\n\t" "vfadb %%v26,%%v26,%%v27\n\t"
"vfadb %%v28,%%v28,%%v29\n\t" "vfadb %%v28,%%v28,%%v29\n\t"
"vfadb %%v30,%%v30,%%v31\n\t" "vfadb %%v30,%%v30,%%v31\n\t"
"vfmaxdb %%v16,%%v16,%%v24,0\n\t" "vfmaxdb %%v16,%%v16,%%v24,0\n\t"
"vfmaxdb %%v18,%%v18,%%v26,0\n\t" "vfmaxdb %%v18,%%v18,%%v26,0\n\t"
"vfmaxdb %%v20,%%v20,%%v28,0\n\t" "vfmaxdb %%v20,%%v20,%%v28,0\n\t"
"vfmaxdb %%v22,%%v22,%%v30,0\n\t" "vfmaxdb %%v22,%%v22,%%v30,0\n\t"
"vfmaxdb %%v16,%%v16,%%v20,0\n\t" "vfmaxdb %%v16,%%v16,%%v20,0\n\t"
"vfmaxdb %%v18,%%v18,%%v22,0\n\t" "vfmaxdb %%v18,%%v18,%%v22,0\n\t"
"vfmaxdb %%v16,%%v16,%%v18,0\n\t" "vfmaxdb %%v16,%%v16,%%v18,0\n\t"
"vfmaxdb %%v0,%%v0,%%v16,0\n\t" "vfmaxdb %%v0,%%v0,%%v16,0\n\t"
"agfi %%r1, 256\n\t" "agfi %%r1, 256\n\t"
"brctg %%r0, 0b \n\t" "brctg %[n], 0b\n\t"
"vrepg %%v16,%%v0,1\n\t" "vrepg %%v16,%%v0,1\n\t"
"wfmaxdb %%v0,%%v0,%%v16,0\n\t" "wfmaxdb %%v0,%%v0,%%v16,0\n\t"
"ldr %0,%%f0 " "ldr %[amax],%%f0"
:"=f"(amax) : [amax] "=f"(amax),[n] "+&r"(n)
:"r"(n),"ZR"((const FLOAT (*)[n])x) : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
); "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
return amax; return amax;
} }
@ -144,7 +127,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
FLOAT maxf = 0.0; FLOAT maxf = 0.0;
BLASLONG inc_x2; BLASLONG inc_x2;
if (n <= 0 || inc_x <= 0) return (maxf); if (n <= 0 || inc_x <= 0)
return (maxf);
if (inc_x == 1) { if (inc_x == 1) {
@ -154,9 +138,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
maxf = zamax_kernel_16(n1, x); maxf = zamax_kernel_16(n1, x);
ix = n1 * 2; ix = n1 * 2;
i = n1; i = n1;
} } else {
else
{
maxf = CABS1(x, 0); maxf = CABS1(x, 0);
ix += 2; ix += 2;
i++; i++;
@ -198,7 +180,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
} }
while (i < n) { while (i < n) {
if (CABS1(x, ix) > maxf) { if (CABS1(x, ix) > maxf) {
maxf = CABS1(x, ix); maxf = CABS1(x, ix);

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -28,47 +28,38 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>
#if defined(DOUBLE) #define CABS1(x,i) (fabs(x[i]) + fabs(x[i + 1]))
#define ABS fabs
#else
#define ABS fabsf
#endif
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x) {
static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x)
{
FLOAT amax; FLOAT amax;
__asm__ volatile ( __asm__("vleg %%v0,0(%[x]),0\n\t"
"vleg %%v0,0(%2),0 \n\t" "vleg %%v16,8(%[x]),0\n\t"
"vleg %%v16,8(%2),0 \n\t" "vleg %%v0,16(%[x]),1\n\t"
"vleg %%v0,16(%2),1 \n\t" "vleg %%v16,24(%[x]),1\n\t"
"vleg %%v16,24(%2),1 \n\t"
"vflpdb %%v0,%%v0\n\t" "vflpdb %%v0,%%v0\n\t"
"vflpdb %%v16,%%v16\n\t" "vflpdb %%v16,%%v16\n\t"
"vfadb %%v0,%%v0,%%v16\n\t" "vfadb %%v0,%%v0,%%v16\n\t"
"srlg %%r0,%1,4 \n\t" "srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"0:\n\t" "0:\n\t"
"pfd 1, 1024(%%r1,%2) \n\t" "pfd 1, 1024(%%r1,%[x])\n\t"
"vleg %%v16,0(%%r1,%[x]),0\n\t"
"vleg %%v16,0(%%r1,%2),0 \n\t" "vleg %%v17,8(%%r1,%[x]),0\n\t"
"vleg %%v17,8(%%r1,%2),0 \n\t" "vleg %%v16,16(%%r1,%[x]),1\n\t"
"vleg %%v16,16(%%r1,%2),1 \n\t" "vleg %%v17,24(%%r1,%[x]),1\n\t"
"vleg %%v17,24(%%r1,%2),1 \n\t" "vleg %%v18,32(%%r1,%[x]),0\n\t"
"vleg %%v18,32(%%r1,%2),0 \n\t" "vleg %%v19,40(%%r1,%[x]),0\n\t"
"vleg %%v19,40(%%r1,%2),0 \n\t" "vleg %%v18,48(%%r1,%[x]),1\n\t"
"vleg %%v18,48(%%r1,%2),1 \n\t" "vleg %%v19,56(%%r1,%[x]),1\n\t"
"vleg %%v19,56(%%r1,%2),1 \n\t" "vleg %%v20,64(%%r1,%[x]),0\n\t"
"vleg %%v20,64(%%r1,%2),0 \n\t" "vleg %%v21,72(%%r1,%[x]),0\n\t"
"vleg %%v21,72(%%r1,%2),0 \n\t" "vleg %%v20,80(%%r1,%[x]),1\n\t"
"vleg %%v20,80(%%r1,%2),1 \n\t" "vleg %%v21,88(%%r1,%[x]),1\n\t"
"vleg %%v21,88(%%r1,%2),1 \n\t" "vleg %%v22,96(%%r1,%[x]),0\n\t"
"vleg %%v22,96(%%r1,%2),0 \n\t" "vleg %%v23,104(%%r1,%[x]),0\n\t"
"vleg %%v23,104(%%r1,%2),0 \n\t" "vleg %%v22,112(%%r1,%[x]),1\n\t"
"vleg %%v22,112(%%r1,%2),1 \n\t" "vleg %%v23,120(%%r1,%[x]),1\n\t"
"vleg %%v23,120(%%r1,%2),1 \n\t"
"vflpdb %%v16, %%v16\n\t" "vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t" "vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t" "vflpdb %%v18, %%v18\n\t"
@ -81,34 +72,30 @@ static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x)
"vfadb %%v17,%%v18,%%v19\n\t" "vfadb %%v17,%%v18,%%v19\n\t"
"vfadb %%v18,%%v20,%%v21\n\t" "vfadb %%v18,%%v20,%%v21\n\t"
"vfadb %%v19,%%v22,%%v23\n\t" "vfadb %%v19,%%v22,%%v23\n\t"
"vfchdb %%v24,%%v16,%%v17\n\t" "vfchdb %%v24,%%v16,%%v17\n\t"
"vfchdb %%v25,%%v18,%%v19\n\t" "vfchdb %%v25,%%v18,%%v19\n\t"
"vsel %%v24,%%v16,%%v17,%%v24\n\t" "vsel %%v24,%%v16,%%v17,%%v24\n\t"
"vsel %%v25,%%v18,%%v19,%%v25\n\t" "vsel %%v25,%%v18,%%v19,%%v25\n\t"
"vfchdb %%v26,%%v24,%%v25\n\t" "vfchdb %%v26,%%v24,%%v25\n\t"
"vsel %%v26,%%v24,%%v25,%%v26\n\t" "vsel %%v26,%%v24,%%v25,%%v26\n\t"
"vfchdb %%v27,%%v26,%%v0\n\t" "vfchdb %%v27,%%v26,%%v0\n\t"
"vsel %%v0,%%v26,%%v0,%%v27\n\t" "vsel %%v0,%%v26,%%v0,%%v27\n\t"
"vleg %%v16,128(%%r1,%[x]),0\n\t"
"vleg %%v16,128(%%r1,%2),0 \n\t" "vleg %%v17,136(%%r1,%[x]),0\n\t"
"vleg %%v17,136(%%r1,%2),0 \n\t" "vleg %%v16,144(%%r1,%[x]),1\n\t"
"vleg %%v16,144(%%r1,%2),1 \n\t" "vleg %%v17,152(%%r1,%[x]),1\n\t"
"vleg %%v17,152(%%r1,%2),1 \n\t" "vleg %%v18,160(%%r1,%[x]),0\n\t"
"vleg %%v18,160(%%r1,%2),0 \n\t" "vleg %%v19,168(%%r1,%[x]),0\n\t"
"vleg %%v19,168(%%r1,%2),0 \n\t" "vleg %%v18,176(%%r1,%[x]),1\n\t"
"vleg %%v18,176(%%r1,%2),1 \n\t" "vleg %%v19,184(%%r1,%[x]),1\n\t"
"vleg %%v19,184(%%r1,%2),1 \n\t" "vleg %%v20,192(%%r1,%[x]),0\n\t"
"vleg %%v20,192(%%r1,%2),0 \n\t" "vleg %%v21,200(%%r1,%[x]),0\n\t"
"vleg %%v21,200(%%r1,%2),0 \n\t" "vleg %%v20,208(%%r1,%[x]),1\n\t"
"vleg %%v20,208(%%r1,%2),1 \n\t" "vleg %%v21,216(%%r1,%[x]),1\n\t"
"vleg %%v21,216(%%r1,%2),1 \n\t" "vleg %%v22,224(%%r1,%[x]),0\n\t"
"vleg %%v22,224(%%r1,%2),0 \n\t" "vleg %%v23,232(%%r1,%[x]),0\n\t"
"vleg %%v23,232(%%r1,%2),0 \n\t" "vleg %%v22,240(%%r1,%[x]),1\n\t"
"vleg %%v22,240(%%r1,%2),1 \n\t" "vleg %%v23,248(%%r1,%[x]),1\n\t"
"vleg %%v23,248(%%r1,%2),1 \n\t"
"vflpdb %%v16, %%v16\n\t" "vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t" "vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t" "vflpdb %%v18, %%v18\n\t"
@ -121,29 +108,24 @@ static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x)
"vfadb %%v17,%%v18,%%v19\n\t" "vfadb %%v17,%%v18,%%v19\n\t"
"vfadb %%v18,%%v20,%%v21\n\t" "vfadb %%v18,%%v20,%%v21\n\t"
"vfadb %%v19,%%v22,%%v23\n\t" "vfadb %%v19,%%v22,%%v23\n\t"
"vfchdb %%v24,%%v16,%%v17\n\t" "vfchdb %%v24,%%v16,%%v17\n\t"
"vfchdb %%v25,%%v18,%%v19\n\t" "vfchdb %%v25,%%v18,%%v19\n\t"
"vsel %%v24,%%v16,%%v17,%%v24\n\t" "vsel %%v24,%%v16,%%v17,%%v24\n\t"
"vsel %%v25,%%v18,%%v19,%%v25\n\t" "vsel %%v25,%%v18,%%v19,%%v25\n\t"
"vfchdb %%v26,%%v24,%%v25\n\t" "vfchdb %%v26,%%v24,%%v25\n\t"
"vsel %%v26,%%v24,%%v25,%%v26\n\t" "vsel %%v26,%%v24,%%v25,%%v26\n\t"
"vfchdb %%v27,%%v26,%%v0\n\t" "vfchdb %%v27,%%v26,%%v0\n\t"
"vsel %%v0,%%v26,%%v0,%%v27\n\t" "vsel %%v0,%%v26,%%v0,%%v27\n\t"
"agfi %%r1, 256\n\t" "agfi %%r1, 256\n\t"
"brctg %%r0, 0b \n\t" "brctg %[n], 0b\n\t"
"vrepg %%v16,%%v0,1\n\t" "vrepg %%v16,%%v0,1\n\t"
"wfchdb %%v17,%%v0,%%v16\n\t" "wfchdb %%v17,%%v0,%%v16\n\t"
"vsel %%v0,%%v0,%%v16,%%v17\n\t" "vsel %%v0,%%v0,%%v16,%%v17\n\t"
"ldr %0,%%f0 " "ldr %[amax],%%f0"
:"=f"(amax) : [amax] "=f"(amax),[n] "+&r"(n)
:"r"(n),"ZR"((const FLOAT (*)[n])x) : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
); "v23", "v24", "v25", "v26", "v27");
return amax; return amax;
} }
@ -154,7 +136,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
FLOAT maxf = 0.0; FLOAT maxf = 0.0;
BLASLONG inc_x2; BLASLONG inc_x2;
if (n <= 0 || inc_x <= 0) return (maxf); if (n <= 0 || inc_x <= 0)
return (maxf);
if (inc_x == 1) { if (inc_x == 1) {
@ -164,9 +147,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
maxf = zamax_kernel_16(n1, x); maxf = zamax_kernel_16(n1, x);
ix = n1 * 2; ix = n1 * 2;
i = n1; i = n1;
} } else {
else
{
maxf = CABS1(x, 0); maxf = CABS1(x, 0);
ix += 2; ix += 2;
i++; i++;
@ -208,7 +189,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
} }
while (i < n) { while (i < n) {
if (CABS1(x, ix) > maxf) { if (CABS1(x, ix) > maxf) {
maxf = CABS1(x, ix); maxf = CABS1(x, ix);

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -28,64 +28,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>
#if defined(DOUBLE) #define CABS1(x,i) (fabs(x[i]) + fabs(x[i + 1]))
#define ABS fabs
#else
#define ABS fabsf
#endif
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x) {
static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x)
{
FLOAT amin; FLOAT amin;
__asm__ volatile ( __asm__("vleg %%v0,0(%[x]),0\n\t"
"vleg %%v0,0(%2),0 \n\t" "vleg %%v16,8(%[x]),0\n\t"
"vleg %%v16,8(%2),0 \n\t" "vleg %%v0,16(%[x]),1\n\t"
"vleg %%v0,16(%2),1 \n\t" "vleg %%v16,24(%[x]),1\n\t"
"vleg %%v16,24(%2),1 \n\t"
"vflpdb %%v0,%%v0\n\t" "vflpdb %%v0,%%v0\n\t"
"vflpdb %%v16,%%v16\n\t" "vflpdb %%v16,%%v16\n\t"
"vfadb %%v0,%%v0,%%v16\n\t" "vfadb %%v0,%%v0,%%v16\n\t"
"srlg %%r0,%1,4 \n\t" "srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"0:\n\t" "0:\n\t"
"pfd 1, 1024(%%r1,%2) \n\t" "pfd 1, 1024(%%r1,%[x])\n\t"
"vleg %%v16,0(%%r1,%[x]),0\n\t"
"vleg %%v16,0(%%r1,%2),0 \n\t" "vleg %%v17,8(%%r1,%[x]),0\n\t"
"vleg %%v17,8(%%r1,%2),0 \n\t" "vleg %%v16,16(%%r1,%[x]),1\n\t"
"vleg %%v16,16(%%r1,%2),1 \n\t" "vleg %%v17,24(%%r1,%[x]),1\n\t"
"vleg %%v17,24(%%r1,%2),1 \n\t" "vleg %%v18,32(%%r1,%[x]),0\n\t"
"vleg %%v18,32(%%r1,%2),0 \n\t" "vleg %%v19,40(%%r1,%[x]),0\n\t"
"vleg %%v19,40(%%r1,%2),0 \n\t" "vleg %%v18,48(%%r1,%[x]),1\n\t"
"vleg %%v18,48(%%r1,%2),1 \n\t" "vleg %%v19,56(%%r1,%[x]),1\n\t"
"vleg %%v19,56(%%r1,%2),1 \n\t" "vleg %%v20,64(%%r1,%[x]),0\n\t"
"vleg %%v20,64(%%r1,%2),0 \n\t" "vleg %%v21,72(%%r1,%[x]),0\n\t"
"vleg %%v21,72(%%r1,%2),0 \n\t" "vleg %%v20,80(%%r1,%[x]),1\n\t"
"vleg %%v20,80(%%r1,%2),1 \n\t" "vleg %%v21,88(%%r1,%[x]),1\n\t"
"vleg %%v21,88(%%r1,%2),1 \n\t" "vleg %%v22,96(%%r1,%[x]),0\n\t"
"vleg %%v22,96(%%r1,%2),0 \n\t" "vleg %%v23,104(%%r1,%[x]),0\n\t"
"vleg %%v23,104(%%r1,%2),0 \n\t" "vleg %%v22,112(%%r1,%[x]),1\n\t"
"vleg %%v22,112(%%r1,%2),1 \n\t" "vleg %%v23,120(%%r1,%[x]),1\n\t"
"vleg %%v23,120(%%r1,%2),1 \n\t" "vleg %%v24,128(%%r1,%[x]),0\n\t"
"vleg %%v24,128(%%r1,%2),0 \n\t" "vleg %%v25,136(%%r1,%[x]),0\n\t"
"vleg %%v25,136(%%r1,%2),0 \n\t" "vleg %%v24,144(%%r1,%[x]),1\n\t"
"vleg %%v24,144(%%r1,%2),1 \n\t" "vleg %%v25,152(%%r1,%[x]),1\n\t"
"vleg %%v25,152(%%r1,%2),1 \n\t" "vleg %%v26,160(%%r1,%[x]),0\n\t"
"vleg %%v26,160(%%r1,%2),0 \n\t" "vleg %%v27,168(%%r1,%[x]),0\n\t"
"vleg %%v27,168(%%r1,%2),0 \n\t" "vleg %%v26,176(%%r1,%[x]),1\n\t"
"vleg %%v26,176(%%r1,%2),1 \n\t" "vleg %%v27,184(%%r1,%[x]),1\n\t"
"vleg %%v27,184(%%r1,%2),1 \n\t" "vleg %%v28,192(%%r1,%[x]),0\n\t"
"vleg %%v28,192(%%r1,%2),0 \n\t" "vleg %%v29,200(%%r1,%[x]),0\n\t"
"vleg %%v29,200(%%r1,%2),0 \n\t" "vleg %%v28,208(%%r1,%[x]),1\n\t"
"vleg %%v28,208(%%r1,%2),1 \n\t" "vleg %%v29,216(%%r1,%[x]),1\n\t"
"vleg %%v29,216(%%r1,%2),1 \n\t" "vleg %%v30,224(%%r1,%[x]),0\n\t"
"vleg %%v30,224(%%r1,%2),0 \n\t" "vleg %%v31,232(%%r1,%[x]),0\n\t"
"vleg %%v31,232(%%r1,%2),0 \n\t" "vleg %%v30,240(%%r1,%[x]),1\n\t"
"vleg %%v30,240(%%r1,%2),1 \n\t" "vleg %%v31,248(%%r1,%[x]),1\n\t"
"vleg %%v31,248(%%r1,%2),1 \n\t"
"vflpdb %%v16,%%v16\n\t" "vflpdb %%v16,%%v16\n\t"
"vflpdb %%v17,%%v17\n\t" "vflpdb %%v17,%%v17\n\t"
"vflpdb %%v18,%%v18\n\t" "vflpdb %%v18,%%v18\n\t"
@ -102,7 +92,6 @@ static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x)
"vflpdb %%v29,%%v29\n\t" "vflpdb %%v29,%%v29\n\t"
"vflpdb %%v30,%%v30\n\t" "vflpdb %%v30,%%v30\n\t"
"vflpdb %%v31,%%v31\n\t" "vflpdb %%v31,%%v31\n\t"
"vfadb %%v16,%%v16,%%v17\n\t" "vfadb %%v16,%%v16,%%v17\n\t"
"vfadb %%v18,%%v18,%%v19\n\t" "vfadb %%v18,%%v18,%%v19\n\t"
"vfadb %%v20,%%v20,%%v21\n\t" "vfadb %%v20,%%v20,%%v21\n\t"
@ -111,29 +100,23 @@ static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x)
"vfadb %%v26,%%v26,%%v27\n\t" "vfadb %%v26,%%v26,%%v27\n\t"
"vfadb %%v28,%%v28,%%v29\n\t" "vfadb %%v28,%%v28,%%v29\n\t"
"vfadb %%v30,%%v30,%%v31\n\t" "vfadb %%v30,%%v30,%%v31\n\t"
"vfmindb %%v16,%%v16,%%v24,0\n\t" "vfmindb %%v16,%%v16,%%v24,0\n\t"
"vfmindb %%v18,%%v18,%%v26,0\n\t" "vfmindb %%v18,%%v18,%%v26,0\n\t"
"vfmindb %%v20,%%v20,%%v28,0\n\t" "vfmindb %%v20,%%v20,%%v28,0\n\t"
"vfmindb %%v22,%%v22,%%v30,0\n\t" "vfmindb %%v22,%%v22,%%v30,0\n\t"
"vfmindb %%v16,%%v16,%%v20,0\n\t" "vfmindb %%v16,%%v16,%%v20,0\n\t"
"vfmindb %%v18,%%v18,%%v22,0\n\t" "vfmindb %%v18,%%v18,%%v22,0\n\t"
"vfmindb %%v16,%%v16,%%v18,0\n\t" "vfmindb %%v16,%%v16,%%v18,0\n\t"
"vfmindb %%v0,%%v0,%%v16,0\n\t" "vfmindb %%v0,%%v0,%%v16,0\n\t"
"agfi %%r1, 256\n\t" "agfi %%r1, 256\n\t"
"brctg %%r0, 0b \n\t" "brctg %[n], 0b\n\t"
"vrepg %%v16,%%v0,1\n\t" "vrepg %%v16,%%v0,1\n\t"
"wfmindb %%v0,%%v0,%%v16,0\n\t" "wfmindb %%v0,%%v0,%%v16,0\n\t"
"ldr %0,%%f0 " "ldr %[amin],%%f0"
:"=f"(amin) : [amin] "=f"(amin),[n] "+&r"(n)
:"r"(n),"ZR"((const FLOAT (*)[n])x) : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
); "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
return amin; return amin;
} }
@ -144,7 +127,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
FLOAT minf = 0.0; FLOAT minf = 0.0;
BLASLONG inc_x2; BLASLONG inc_x2;
if (n <= 0 || inc_x <= 0) return (minf); if (n <= 0 || inc_x <= 0)
return (minf);
if (inc_x == 1) { if (inc_x == 1) {
@ -154,9 +138,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
minf = zamin_kernel_16(n1, x); minf = zamin_kernel_16(n1, x);
ix = n1 * 2; ix = n1 * 2;
i = n1; i = n1;
} } else {
else
{
minf = CABS1(x, 0); minf = CABS1(x, 0);
ix += 2; ix += 2;
i++; i++;
@ -198,7 +180,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
} }
while (i < n) { while (i < n) {
if (CABS1(x, ix) < minf) { if (CABS1(x, ix) < minf) {
minf = CABS1(x, ix); minf = CABS1(x, ix);

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -28,47 +28,38 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>
#if defined(DOUBLE) #define CABS1(x,i) (fabs(x[i]) + fabs(x[i + 1]))
#define ABS fabs
#else
#define ABS fabsf
#endif
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x) {
static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x)
{
FLOAT amin; FLOAT amin;
__asm__ volatile ( __asm__("vleg %%v0,0(%[x]),0\n\t"
"vleg %%v0,0(%2),0 \n\t" "vleg %%v16,8(%[x]),0\n\t"
"vleg %%v16,8(%2),0 \n\t" "vleg %%v0,16(%[x]),1\n\t"
"vleg %%v0,16(%2),1 \n\t" "vleg %%v16,24(%[x]),1\n\t"
"vleg %%v16,24(%2),1 \n\t"
"vflpdb %%v0,%%v0\n\t" "vflpdb %%v0,%%v0\n\t"
"vflpdb %%v16,%%v16\n\t" "vflpdb %%v16,%%v16\n\t"
"vfadb %%v0,%%v0,%%v16\n\t" "vfadb %%v0,%%v0,%%v16\n\t"
"srlg %%r0,%1,4 \n\t" "srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"0:\n\t" "0:\n\t"
"pfd 1, 1024(%%r1,%2) \n\t" "pfd 1, 1024(%%r1,%[x])\n\t"
"vleg %%v16,0(%%r1,%[x]),0\n\t"
"vleg %%v16,0(%%r1,%2),0 \n\t" "vleg %%v17,8(%%r1,%[x]),0\n\t"
"vleg %%v17,8(%%r1,%2),0 \n\t" "vleg %%v16,16(%%r1,%[x]),1\n\t"
"vleg %%v16,16(%%r1,%2),1 \n\t" "vleg %%v17,24(%%r1,%[x]),1\n\t"
"vleg %%v17,24(%%r1,%2),1 \n\t" "vleg %%v18,32(%%r1,%[x]),0\n\t"
"vleg %%v18,32(%%r1,%2),0 \n\t" "vleg %%v19,40(%%r1,%[x]),0\n\t"
"vleg %%v19,40(%%r1,%2),0 \n\t" "vleg %%v18,48(%%r1,%[x]),1\n\t"
"vleg %%v18,48(%%r1,%2),1 \n\t" "vleg %%v19,56(%%r1,%[x]),1\n\t"
"vleg %%v19,56(%%r1,%2),1 \n\t" "vleg %%v20,64(%%r1,%[x]),0\n\t"
"vleg %%v20,64(%%r1,%2),0 \n\t" "vleg %%v21,72(%%r1,%[x]),0\n\t"
"vleg %%v21,72(%%r1,%2),0 \n\t" "vleg %%v20,80(%%r1,%[x]),1\n\t"
"vleg %%v20,80(%%r1,%2),1 \n\t" "vleg %%v21,88(%%r1,%[x]),1\n\t"
"vleg %%v21,88(%%r1,%2),1 \n\t" "vleg %%v22,96(%%r1,%[x]),0\n\t"
"vleg %%v22,96(%%r1,%2),0 \n\t" "vleg %%v23,104(%%r1,%[x]),0\n\t"
"vleg %%v23,104(%%r1,%2),0 \n\t" "vleg %%v22,112(%%r1,%[x]),1\n\t"
"vleg %%v22,112(%%r1,%2),1 \n\t" "vleg %%v23,120(%%r1,%[x]),1\n\t"
"vleg %%v23,120(%%r1,%2),1 \n\t"
"vflpdb %%v16, %%v16\n\t" "vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t" "vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t" "vflpdb %%v18, %%v18\n\t"
@ -81,34 +72,30 @@ static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x)
"vfadb %%v17,%%v18,%%v19\n\t" "vfadb %%v17,%%v18,%%v19\n\t"
"vfadb %%v18,%%v20,%%v21\n\t" "vfadb %%v18,%%v20,%%v21\n\t"
"vfadb %%v19,%%v22,%%v23\n\t" "vfadb %%v19,%%v22,%%v23\n\t"
"vfchdb %%v24,%%v17,%%v16\n\t" "vfchdb %%v24,%%v17,%%v16\n\t"
"vfchdb %%v25,%%v19,%%v18\n\t" "vfchdb %%v25,%%v19,%%v18\n\t"
"vsel %%v24,%%v16,%%v17,%%v24\n\t" "vsel %%v24,%%v16,%%v17,%%v24\n\t"
"vsel %%v25,%%v18,%%v19,%%v25\n\t" "vsel %%v25,%%v18,%%v19,%%v25\n\t"
"vfchdb %%v26,%%v25,%%v24\n\t" "vfchdb %%v26,%%v25,%%v24\n\t"
"vsel %%v26,%%v24,%%v25,%%v26\n\t" "vsel %%v26,%%v24,%%v25,%%v26\n\t"
"vfchdb %%v27,%%v0,%%v26\n\t" "vfchdb %%v27,%%v0,%%v26\n\t"
"vsel %%v0,%%v26,%%v0,%%v27\n\t" "vsel %%v0,%%v26,%%v0,%%v27\n\t"
"vleg %%v16,128(%%r1,%[x]),0\n\t"
"vleg %%v16,128(%%r1,%2),0 \n\t" "vleg %%v17,136(%%r1,%[x]),0\n\t"
"vleg %%v17,136(%%r1,%2),0 \n\t" "vleg %%v16,144(%%r1,%[x]),1\n\t"
"vleg %%v16,144(%%r1,%2),1 \n\t" "vleg %%v17,152(%%r1,%[x]),1\n\t"
"vleg %%v17,152(%%r1,%2),1 \n\t" "vleg %%v18,160(%%r1,%[x]),0\n\t"
"vleg %%v18,160(%%r1,%2),0 \n\t" "vleg %%v19,168(%%r1,%[x]),0\n\t"
"vleg %%v19,168(%%r1,%2),0 \n\t" "vleg %%v18,176(%%r1,%[x]),1\n\t"
"vleg %%v18,176(%%r1,%2),1 \n\t" "vleg %%v19,184(%%r1,%[x]),1\n\t"
"vleg %%v19,184(%%r1,%2),1 \n\t" "vleg %%v20,192(%%r1,%[x]),0\n\t"
"vleg %%v20,192(%%r1,%2),0 \n\t" "vleg %%v21,200(%%r1,%[x]),0\n\t"
"vleg %%v21,200(%%r1,%2),0 \n\t" "vleg %%v20,208(%%r1,%[x]),1\n\t"
"vleg %%v20,208(%%r1,%2),1 \n\t" "vleg %%v21,216(%%r1,%[x]),1\n\t"
"vleg %%v21,216(%%r1,%2),1 \n\t" "vleg %%v22,224(%%r1,%[x]),0\n\t"
"vleg %%v22,224(%%r1,%2),0 \n\t" "vleg %%v23,232(%%r1,%[x]),0\n\t"
"vleg %%v23,232(%%r1,%2),0 \n\t" "vleg %%v22,240(%%r1,%[x]),1\n\t"
"vleg %%v22,240(%%r1,%2),1 \n\t" "vleg %%v23,248(%%r1,%[x]),1\n\t"
"vleg %%v23,248(%%r1,%2),1 \n\t"
"vflpdb %%v16, %%v16\n\t" "vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t" "vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t" "vflpdb %%v18, %%v18\n\t"
@ -121,29 +108,24 @@ static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x)
"vfadb %%v17,%%v18,%%v19\n\t" "vfadb %%v17,%%v18,%%v19\n\t"
"vfadb %%v18,%%v20,%%v21\n\t" "vfadb %%v18,%%v20,%%v21\n\t"
"vfadb %%v19,%%v22,%%v23\n\t" "vfadb %%v19,%%v22,%%v23\n\t"
"vfchdb %%v24,%%v17,%%v16\n\t" "vfchdb %%v24,%%v17,%%v16\n\t"
"vfchdb %%v25,%%v19,%%v18\n\t" "vfchdb %%v25,%%v19,%%v18\n\t"
"vsel %%v24,%%v16,%%v17,%%v24\n\t" "vsel %%v24,%%v16,%%v17,%%v24\n\t"
"vsel %%v25,%%v18,%%v19,%%v25\n\t" "vsel %%v25,%%v18,%%v19,%%v25\n\t"
"vfchdb %%v26,%%v25,%%v24\n\t" "vfchdb %%v26,%%v25,%%v24\n\t"
"vsel %%v26,%%v24,%%v25,%%v26\n\t" "vsel %%v26,%%v24,%%v25,%%v26\n\t"
"vfchdb %%v27,%%v0,%%v26\n\t" "vfchdb %%v27,%%v0,%%v26\n\t"
"vsel %%v0,%%v26,%%v0,%%v27\n\t" "vsel %%v0,%%v26,%%v0,%%v27\n\t"
"agfi %%r1, 256\n\t" "agfi %%r1, 256\n\t"
"brctg %%r0, 0b \n\t" "brctg %[n], 0b\n\t"
"vrepg %%v16,%%v0,1\n\t" "vrepg %%v16,%%v0,1\n\t"
"wfchdb %%v17,%%v16,%%v0\n\t" "wfchdb %%v17,%%v16,%%v0\n\t"
"vsel %%v0,%%v0,%%v16,%%v17\n\t" "vsel %%v0,%%v0,%%v16,%%v17\n\t"
"ldr %0,%%f0 " "ldr %[amin],%%f0"
:"=f"(amin) : [amin] "=f"(amin),[n] "+&r"(n)
:"r"(n),"ZR"((const FLOAT (*)[n])x) : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
); "v23", "v24", "v25", "v26", "v27");
return amin; return amin;
} }
@ -154,7 +136,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
FLOAT minf = 0.0; FLOAT minf = 0.0;
BLASLONG inc_x2; BLASLONG inc_x2;
if (n <= 0 || inc_x <= 0) return (minf); if (n <= 0 || inc_x <= 0)
return (minf);
if (inc_x == 1) { if (inc_x == 1) {
@ -164,9 +147,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
minf = zamin_kernel_16(n1, x); minf = zamin_kernel_16(n1, x);
ix = n1 * 2; ix = n1 * 2;
i = n1; i = n1;
} } else {
else
{
minf = CABS1(x, 0); minf = CABS1(x, 0);
ix += 2; ix += 2;
i++; i++;
@ -208,7 +189,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
} }
while (i < n) { while (i < n) {
if (CABS1(x, ix) < minf) { if (CABS1(x, ix) < minf) {
minf = CABS1(x, ix); minf = CABS1(x, ix);

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -28,34 +28,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#include <math.h> #include <math.h>
#if defined(DOUBLE)
#define ABS fabs #define ABS fabs
#else
#define ABS fabsf
#endif
static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x) static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x) {
{
FLOAT asum; FLOAT asum;
__asm__ ( __asm__("vzero %%v24\n\t"
"vzero %%v0 \n\t" "vzero %%v25\n\t"
"vzero %%v1 \n\t" "vzero %%v26\n\t"
"vzero %%v2 \n\t" "vzero %%v27\n\t"
"vzero %%v3 \n\t" "vzero %%v28\n\t"
"srlg %%r0,%1,4 \n\t" "vzero %%v29\n\t"
"vzero %%v30\n\t"
"vzero %%v31\n\t"
"srlg %[n],%[n],4\n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"0:\n\t" "0:\n\t"
"pfd 1, 1024(%%r1,%2) \n\t" "pfd 1, 1024(%%r1,%[x])\n\t"
"vl %%v16, 0(%%r1,%2) \n\t" "vl %%v16, 0(%%r1,%[x])\n\t"
"vl %%v17, 16(%%r1,%2) \n\t" "vl %%v17, 16(%%r1,%[x])\n\t"
"vl %%v18, 32(%%r1,%2) \n\t" "vl %%v18, 32(%%r1,%[x])\n\t"
"vl %%v19, 48(%%r1,%2) \n\t" "vl %%v19, 48(%%r1,%[x])\n\t"
"vl %%v20, 64(%%r1,%2) \n\t" "vl %%v20, 64(%%r1,%[x])\n\t"
"vl %%v21, 80(%%r1,%2) \n\t" "vl %%v21, 80(%%r1,%[x])\n\t"
"vl %%v22, 96(%%r1,%2) \n\t" "vl %%v22, 96(%%r1,%[x])\n\t"
"vl %%v23, 112(%%r1,%2) \n\t" "vl %%v23, 112(%%r1,%[x])\n\t"
"vflpdb %%v16, %%v16\n\t" "vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t" "vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t" "vflpdb %%v18, %%v18\n\t"
@ -64,25 +61,22 @@ static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x)
"vflpdb %%v21, %%v21\n\t" "vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t" "vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t" "vflpdb %%v23, %%v23\n\t"
"vfadb %%v24,%%v24,%%v16\n\t"
"vfadb %%v0,%%v0,%%v16 \n\t" "vfadb %%v25,%%v25,%%v17\n\t"
"vfadb %%v1,%%v1,%%v17 \n\t" "vfadb %%v26,%%v26,%%v18\n\t"
"vfadb %%v2,%%v2,%%v18 \n\t" "vfadb %%v27,%%v27,%%v19\n\t"
"vfadb %%v3,%%v3,%%v19 \n\t" "vfadb %%v28,%%v28,%%v20\n\t"
"vfadb %%v0,%%v0,%%v20 \n\t" "vfadb %%v29,%%v29,%%v21\n\t"
"vfadb %%v1,%%v1,%%v21 \n\t" "vfadb %%v30,%%v30,%%v22\n\t"
"vfadb %%v2,%%v2,%%v22 \n\t" "vfadb %%v31,%%v31,%%v23\n\t"
"vfadb %%v3,%%v3,%%v23 \n\t" "vl %%v16, 128(%%r1,%[x])\n\t"
"vl %%v17, 144(%%r1,%[x])\n\t"
"vl %%v16, 128(%%r1,%2) \n\t" "vl %%v18, 160(%%r1,%[x])\n\t"
"vl %%v17, 144(%%r1,%2) \n\t" "vl %%v19, 176(%%r1,%[x])\n\t"
"vl %%v18, 160(%%r1,%2) \n\t" "vl %%v20, 192(%%r1,%[x])\n\t"
"vl %%v19, 176(%%r1,%2) \n\t" "vl %%v21, 208(%%r1,%[x])\n\t"
"vl %%v20, 192(%%r1,%2) \n\t" "vl %%v22, 224(%%r1,%[x])\n\t"
"vl %%v21, 208(%%r1,%2) \n\t" "vl %%v23, 240(%%r1,%[x])\n\t"
"vl %%v22, 224(%%r1,%2) \n\t"
"vl %%v23, 240(%%r1,%2) \n\t"
"vflpdb %%v16, %%v16\n\t" "vflpdb %%v16, %%v16\n\t"
"vflpdb %%v17, %%v17\n\t" "vflpdb %%v17, %%v17\n\t"
"vflpdb %%v18, %%v18\n\t" "vflpdb %%v18, %%v18\n\t"
@ -91,68 +85,64 @@ static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x)
"vflpdb %%v21, %%v21\n\t" "vflpdb %%v21, %%v21\n\t"
"vflpdb %%v22, %%v22\n\t" "vflpdb %%v22, %%v22\n\t"
"vflpdb %%v23, %%v23\n\t" "vflpdb %%v23, %%v23\n\t"
"vfadb %%v24,%%v24,%%v16\n\t"
"vfadb %%v0,%%v0,%%v16 \n\t" "vfadb %%v25,%%v25,%%v17\n\t"
"vfadb %%v1,%%v1,%%v17 \n\t" "vfadb %%v26,%%v26,%%v18\n\t"
"vfadb %%v2,%%v2,%%v18 \n\t" "vfadb %%v27,%%v27,%%v19\n\t"
"vfadb %%v3,%%v3,%%v19 \n\t" "vfadb %%v28,%%v28,%%v20\n\t"
"vfadb %%v0,%%v0,%%v20 \n\t" "vfadb %%v29,%%v29,%%v21\n\t"
"vfadb %%v1,%%v1,%%v21 \n\t" "vfadb %%v30,%%v30,%%v22\n\t"
"vfadb %%v2,%%v2,%%v22 \n\t" "vfadb %%v31,%%v31,%%v23\n\t"
"vfadb %%v3,%%v3,%%v23 \n\t"
"agfi %%r1,256\n\t" "agfi %%r1,256\n\t"
"brctg %%r0,0b \n\t" "brctg %[n],0b\n\t"
"vfadb %%v0,%%v0,%%v1 \n\t" "vfadb %%v24,%%v24,%%v25\n\t"
"vfadb %%v0,%%v0,%%v2 \n\t" "vfadb %%v24,%%v24,%%v26\n\t"
"vfadb %%v0,%%v0,%%v3 \n\t" "vfadb %%v24,%%v24,%%v27\n\t"
"vrepg %%v1,%%v0,1 \n\t" "vfadb %%v24,%%v24,%%v28\n\t"
"adbr %%f0,%%f1 \n\t" "vfadb %%v24,%%v24,%%v29\n\t"
"ldr %0,%%f0 " "vfadb %%v24,%%v24,%%v30\n\t"
:"=f"(asum) "vfadb %%v24,%%v24,%%v31\n\t"
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x) "vrepg %%v25,%%v24,1\n\t"
:"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23" "vfadb %%v24,%%v24,%%v25\n\t"
); "vsteg %%v24,%[asum],0"
: [asum] "=Q"(asum),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
: "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
"v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
return asum; return asum;
} }
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
{
BLASLONG i = 0; BLASLONG i = 0;
BLASLONG ip = 0; BLASLONG ip = 0;
FLOAT sumf = 0.0; FLOAT sumf = 0.0;
BLASLONG n1; BLASLONG n1;
BLASLONG inc_x2; BLASLONG inc_x2;
if (n <= 0 || inc_x <= 0) return(sumf); if (n <= 0 || inc_x <= 0)
return (sumf);
if ( inc_x == 1 ) if (inc_x == 1) {
{
n1 = n & -16; n1 = n & -16;
if ( n1 > 0 ) if (n1 > 0) {
{
sumf = zasum_kernel_16(n1, x); sumf = zasum_kernel_16(n1, x);
i = n1; i = n1;
ip = 2 * n1; ip = 2 * n1;
} }
while(i < n) while (i < n) {
{
sumf += ABS(x[ip]) + ABS(x[ip + 1]); sumf += ABS(x[ip]) + ABS(x[ip + 1]);
i++; i++;
ip += 2; ip += 2;
} }
} } else {
else
{
inc_x2 = 2 * inc_x; inc_x2 = 2 * inc_x;
while(i < n) while (i < n) {
{
sumf += ABS(x[ip]) + ABS(x[ip + 1]); sumf += ABS(x[ip]) + ABS(x[ip + 1]);
ip += inc_x2; ip += inc_x2;
i++; i++;
@ -161,5 +151,3 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
} }
return (sumf); return (sumf);
} }

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -27,96 +27,91 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) {
{ __asm__(
__asm__ volatile(
#if !defined(CONJ) #if !defined(CONJ)
"vlrepg %%v0,0(%3) \n\t" "vlrepg %%v0,0(%[alpha])\n\t"
"vleg %%v1,8(%3),0 \n\t" "vleg %%v1,8(%[alpha]),0\n\t"
"wflcdb %%v1,%%v1\n\t" "wflcdb %%v1,%%v1\n\t"
"vleg %%v1,8(%3),1 \n\t" "vleg %%v1,8(%[alpha]),1\n\t"
#else #else
"vleg %%v0,0(%3),1 \n\t" "vleg %%v0,0(%[alpha]),1\n\t"
"vflcdb %%v0,%%v0\n\t" "vflcdb %%v0,%%v0\n\t"
"vleg %%v0,0(%3),0 \n\t" "vleg %%v0,0(%[alpha]),0\n\t"
"vlrepg %%v1,8(%3) \n\t" "vlrepg %%v1,8(%[alpha])\n\t"
#endif #endif
"srlg %%r0,%0,3 \n\t" "srlg %[n],%[n],3\n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"0:\n\t" "0:\n\t"
"pfd 1, 1024(%%r1,%1) \n\t" "pfd 1, 1024(%%r1,%[x])\n\t"
"pfd 2, 1024(%%r1,%2) \n\t" "pfd 2, 1024(%%r1,%[y])\n\t"
"vl %%v8,0(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%1) \n\t" "vl %%v9,16(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%1) \n\t" "vl %%v10,32(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%1) \n\t" "vl %%v11,48(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%1) \n\t" "vl %%v12,0(%%r1,%[y])\n\t"
"vl %%v20,0(%%r1,%2) \n\t" "vl %%v13,16(%%r1,%[y])\n\t"
"vl %%v21,16(%%r1,%2) \n\t" "vl %%v14,32(%%r1,%[y])\n\t"
"vl %%v22,32(%%r1,%2) \n\t" "vl %%v15,48(%%r1,%[y])\n\t"
"vl %%v23,48(%%r1,%2) \n\t" "vl %%v16,64(%%r1,%[x])\n\t"
"vpdi %%v24,%%v16,%%v16,4 \n\t" "vl %%v17,80(%%r1,%[x])\n\t"
"vpdi %%v25,%%v17,%%v17,4 \n\t" "vl %%v18,96(%%r1,%[x])\n\t"
"vpdi %%v26,%%v18,%%v18,4 \n\t" "vl %%v19,112(%%r1,%[x])\n\t"
"vpdi %%v27,%%v19,%%v19,4 \n\t" "vl %%v20,64(%%r1,%[y])\n\t"
"vl %%v21,80(%%r1,%[y])\n\t"
"vfmadb %%v28,%%v16,%%v0,%%v20 \n\t" "vl %%v22,96(%%r1,%[y])\n\t"
"vfmadb %%v29,%%v17,%%v0,%%v21 \n\t" "vl %%v23,112(%%r1,%[y])\n\t"
"vfmadb %%v30,%%v18,%%v0,%%v22 \n\t" "vpdi %%v24,%%v8,%%v8,4\n\t"
"vfmadb %%v31,%%v19,%%v0,%%v23 \n\t" "vpdi %%v25,%%v9,%%v9,4\n\t"
"vpdi %%v26,%%v10,%%v10,4\n\t"
"vfmadb %%v28,%%v24,%%v1,%%v28 \n\t" "vpdi %%v27,%%v11,%%v11,4\n\t"
"vfmadb %%v29,%%v25,%%v1,%%v29 \n\t" "vpdi %%v28,%%v16,%%v16,4\n\t"
"vfmadb %%v30,%%v26,%%v1,%%v30 \n\t" "vpdi %%v29,%%v17,%%v17,4\n\t"
"vfmadb %%v31,%%v27,%%v1,%%v31 \n\t" "vpdi %%v30,%%v18,%%v18,4\n\t"
"vpdi %%v31,%%v19,%%v19,4\n\t"
"vst %%v28,0(%%r1,%2) \n\t" "vfmadb %%v8,%%v8,%%v0,%%v12\n\t"
"vst %%v29,16(%%r1,%2) \n\t" "vfmadb %%v9,%%v9,%%v0,%%v13\n\t"
"vst %%v30,32(%%r1,%2) \n\t" "vfmadb %%v10,%%v10,%%v0,%%v14\n\t"
"vst %%v31,48(%%r1,%2) \n\t" "vfmadb %%v11,%%v11,%%v0,%%v15\n\t"
"vfmadb %%v16,%%v16,%%v0,%%v20\n\t"
"vl %%v16,64(%%r1,%1) \n\t" "vfmadb %%v17,%%v17,%%v0,%%v21\n\t"
"vl %%v17,80(%%r1,%1) \n\t" "vfmadb %%v18,%%v18,%%v0,%%v22\n\t"
"vl %%v18,96(%%r1,%1) \n\t" "vfmadb %%v19,%%v19,%%v0,%%v23\n\t"
"vl %%v19,112(%%r1,%1) \n\t" "vfmadb %%v8,%%v24,%%v1,%%v8\n\t"
"vl %%v20,64(%%r1,%2) \n\t" "vfmadb %%v9,%%v25,%%v1,%%v9\n\t"
"vl %%v21,80(%%r1,%2) \n\t" "vfmadb %%v10,%%v26,%%v1,%%v10\n\t"
"vl %%v22,96(%%r1,%2) \n\t" "vfmadb %%v11,%%v27,%%v1,%%v11\n\t"
"vl %%v23,112(%%r1,%2) \n\t" "vfmadb %%v16,%%v28,%%v1,%%v16\n\t"
"vpdi %%v24,%%v16,%%v16,4 \n\t" "vfmadb %%v17,%%v29,%%v1,%%v17\n\t"
"vpdi %%v25,%%v17,%%v17,4 \n\t" "vfmadb %%v18,%%v30,%%v1,%%v18\n\t"
"vpdi %%v26,%%v18,%%v18,4 \n\t" "vfmadb %%v19,%%v31,%%v1,%%v19\n\t"
"vpdi %%v27,%%v19,%%v19,4 \n\t" "vst %%v8,0(%%r1,%[y])\n\t"
"vst %%v9,16(%%r1,%[y])\n\t"
"vfmadb %%v28,%%v16,%%v0,%%v20 \n\t" "vst %%v10,32(%%r1,%[y])\n\t"
"vfmadb %%v29,%%v17,%%v0,%%v21 \n\t" "vst %%v11,48(%%r1,%[y])\n\t"
"vfmadb %%v30,%%v18,%%v0,%%v22 \n\t" "vst %%v16,64(%%r1,%[y])\n\t"
"vfmadb %%v31,%%v19,%%v0,%%v23 \n\t" "vst %%v17,80(%%r1,%[y])\n\t"
"vst %%v18,96(%%r1,%[y])\n\t"
"vfmadb %%v28,%%v24,%%v1,%%v28 \n\t" "vst %%v19,112(%%r1,%[y])\n\t"
"vfmadb %%v29,%%v25,%%v1,%%v29 \n\t"
"vfmadb %%v30,%%v26,%%v1,%%v30 \n\t"
"vfmadb %%v31,%%v27,%%v1,%%v31 \n\t"
"vst %%v28,64(%%r1,%2) \n\t"
"vst %%v29,80(%%r1,%2) \n\t"
"vst %%v30,96(%%r1,%2) \n\t"
"vst %%v31,112(%%r1,%2) \n\t"
"agfi %%r1,128\n\t" "agfi %%r1,128\n\t"
"brctg %%r0,0b " "brctg %[n],0b"
: : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n)
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"ZQ"((const FLOAT (*)[2])alpha) : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x),
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" "m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha)
); : "cc", "r1", "v0", "v1", "v8", "v9", "v10", "v11", "v12", "v13",
"v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
"v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
} }
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
BLASLONG dummy2) {
BLASLONG i = 0; BLASLONG i = 0;
BLASLONG ix = 0, iy = 0; BLASLONG ix = 0, iy = 0;
FLOAT da[2] __attribute__ ((aligned(16))); FLOAT da[2] __attribute__ ((aligned(16)));
if (n <= 0) return (0); if (n <= 0)
return (0);
if ((inc_x == 1) && (inc_y == 1)) { if ((inc_x == 1) && (inc_y == 1)) {
@ -143,7 +138,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
} }
return (0); return (0);
} }
inc_x *= 2; inc_x *= 2;
@ -166,5 +160,3 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
return (0); return (0);
} }

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -27,46 +27,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
static void zcopy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) static void zcopy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) {
{ __asm__("srlg %[n],%[n],4\n\t"
__asm__ volatile (
"lgr %%r1,%1 \n\t"
"lgr %%r2,%2 \n\t"
"srlg %%r0,%0,4 \n\t"
"0:\n\t" "0:\n\t"
"pfd 1, 1024(%%r1) \n\t" "pfd 1, 1024(%[x])\n\t"
"pfd 2, 1024(%%r2) \n\t" "pfd 2, 1024(%[y])\n\t"
"mvc 0(256,%%r2),0(%%r1) \n\t" "mvc 0(256,%[y]),0(%[x])\n\t"
"agfi %%r1,256 \n\t" "la %[x],256(%[x])\n\t"
"agfi %%r2,256 \n\t" "la %[y],256(%[y])\n\t"
"brctg %%r0,0b " "brctg %[n],0b"
: : "=m"(*(struct { FLOAT x[n * 2]; } *) y),[x] "+&a"(x),[y] "+&a"(y),
:"r"(n),"a"((const FLOAT (*)[n * 2])x),"a"((FLOAT (*)[n * 2])y) [n] "+&r"(n)
:"memory","cc","r0","r1","r2" : "m"(*(const struct { FLOAT x[n * 2]; } *) x)
); : "cc");
} }
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
{
BLASLONG i = 0; BLASLONG i = 0;
BLASLONG ix = 0, iy = 0; BLASLONG ix = 0, iy = 0;
if ( n <= 0 ) return(0); if (n <= 0)
return (0);
if ( (inc_x == 1) && (inc_y == 1 )) if ((inc_x == 1) && (inc_y == 1)) {
{
BLASLONG n1 = n & -16; BLASLONG n1 = n & -16;
if ( n1 > 0 ) if (n1 > 0) {
{
zcopy_kernel_16(n1, x, y); zcopy_kernel_16(n1, x, y);
i = n1; i = n1;
ix = n1 * 2; ix = n1 * 2;
iy = n1 * 2; iy = n1 * 2;
} }
while(i < n) while (i < n) {
{
y[iy] = x[iy]; y[iy] = x[iy];
y[iy + 1] = x[ix + 1]; y[iy + 1] = x[ix + 1];
ix += 2; ix += 2;
@ -75,16 +68,12 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
} }
} else {
}
else
{
BLASLONG inc_x2 = 2 * inc_x; BLASLONG inc_x2 = 2 * inc_x;
BLASLONG inc_y2 = 2 * inc_y; BLASLONG inc_y2 = 2 * inc_y;
while(i < n) while (i < n) {
{
y[iy] = x[ix]; y[iy] = x[ix];
y[iy + 1] = x[ix + 1]; y[iy + 1] = x[ix + 1];
ix += inc_x2; ix += inc_x2;

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -27,10 +27,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) {
{ __asm__("vzero %%v24\n\t"
__asm__ volatile(
"vzero %%v24 \n\t"
"vzero %%v25\n\t" "vzero %%v25\n\t"
"vzero %%v26\n\t" "vzero %%v26\n\t"
"vzero %%v27\n\t" "vzero %%v27\n\t"
@ -38,25 +36,23 @@ static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
"vzero %%v29\n\t" "vzero %%v29\n\t"
"vzero %%v30\n\t" "vzero %%v30\n\t"
"vzero %%v31\n\t" "vzero %%v31\n\t"
"srlg %%r0,%0,3 \n\t" "srlg %[n],%[n],3\n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"0:\n\t" "0:\n\t"
"pfd 1, 1024(%%r1,%1) \n\t" "pfd 1, 1024(%%r1,%[x])\n\t"
"pfd 1, 1024(%%r1,%2) \n\t" "pfd 1, 1024(%%r1,%[y])\n\t"
"vl %%v16, 0(%%r1,%[x])\n\t"
"vl %%v16, 0(%%r1,%1) \n\t" "vl %%v17, 16(%%r1,%[x])\n\t"
"vl %%v17, 16(%%r1,%1) \n\t" "vl %%v18, 32(%%r1,%[x])\n\t"
"vl %%v18, 32(%%r1,%1) \n\t" "vl %%v19, 48(%%r1,%[x])\n\t"
"vl %%v19, 48(%%r1,%1) \n\t" "vl %%v0, 0(%%r1,%[y])\n\t"
"vl %%v0, 0(%%r1,%2) \n\t" "vl %%v1, 16(%%r1,%[y])\n\t"
"vl %%v1, 16(%%r1,%2) \n\t" "vl %%v2, 32(%%r1,%[y])\n\t"
"vl %%v2, 32(%%r1,%2) \n\t" "vl %%v3, 48(%%r1,%[y])\n\t"
"vl %%v3, 48(%%r1,%2) \n\t"
"vpdi %%v20,%%v16,%%v16,4\n\t" "vpdi %%v20,%%v16,%%v16,4\n\t"
"vpdi %%v21,%%v17,%%v17,4\n\t" "vpdi %%v21,%%v17,%%v17,4\n\t"
"vpdi %%v22,%%v18,%%v18,4\n\t" "vpdi %%v22,%%v18,%%v18,4\n\t"
"vpdi %%v23,%%v19,%%v19,4\n\t" "vpdi %%v23,%%v19,%%v19,4\n\t"
"vfmadb %%v24,%%v16,%%v0,%%v24\n\t" "vfmadb %%v24,%%v16,%%v0,%%v24\n\t"
"vfmadb %%v25,%%v20,%%v0,%%v25\n\t" "vfmadb %%v25,%%v20,%%v0,%%v25\n\t"
"vfmadb %%v26,%%v17,%%v1,%%v26\n\t" "vfmadb %%v26,%%v17,%%v1,%%v26\n\t"
@ -65,20 +61,18 @@ static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
"vfmadb %%v29,%%v22,%%v2,%%v29\n\t" "vfmadb %%v29,%%v22,%%v2,%%v29\n\t"
"vfmadb %%v30,%%v19,%%v3,%%v30\n\t" "vfmadb %%v30,%%v19,%%v3,%%v30\n\t"
"vfmadb %%v31,%%v23,%%v3,%%v31\n\t" "vfmadb %%v31,%%v23,%%v3,%%v31\n\t"
"vl %%v16, 64(%%r1,%[x])\n\t"
"vl %%v16, 64(%%r1,%1) \n\t" "vl %%v17, 80(%%r1,%[x])\n\t"
"vl %%v17, 80(%%r1,%1) \n\t" "vl %%v18, 96(%%r1,%[x])\n\t"
"vl %%v18, 96(%%r1,%1) \n\t" "vl %%v19, 112(%%r1,%[x])\n\t"
"vl %%v19, 112(%%r1,%1) \n\t" "vl %%v0, 64(%%r1,%[y])\n\t"
"vl %%v0, 64(%%r1,%2) \n\t" "vl %%v1, 80(%%r1,%[y])\n\t"
"vl %%v1, 80(%%r1,%2) \n\t" "vl %%v2, 96(%%r1,%[y])\n\t"
"vl %%v2, 96(%%r1,%2) \n\t" "vl %%v3, 112(%%r1,%[y])\n\t"
"vl %%v3, 112(%%r1,%2) \n\t"
"vpdi %%v20,%%v16,%%v16,4\n\t" "vpdi %%v20,%%v16,%%v16,4\n\t"
"vpdi %%v21,%%v17,%%v17,4\n\t" "vpdi %%v21,%%v17,%%v17,4\n\t"
"vpdi %%v22,%%v18,%%v18,4\n\t" "vpdi %%v22,%%v18,%%v18,4\n\t"
"vpdi %%v23,%%v19,%%v19,4\n\t" "vpdi %%v23,%%v19,%%v19,4\n\t"
"vfmadb %%v24,%%v16,%%v0,%%v24\n\t" "vfmadb %%v24,%%v16,%%v0,%%v24\n\t"
"vfmadb %%v25,%%v20,%%v0,%%v25\n\t" "vfmadb %%v25,%%v20,%%v0,%%v25\n\t"
"vfmadb %%v26,%%v17,%%v1,%%v26\n\t" "vfmadb %%v26,%%v17,%%v1,%%v26\n\t"
@ -87,30 +81,33 @@ static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
"vfmadb %%v29,%%v22,%%v2,%%v29\n\t" "vfmadb %%v29,%%v22,%%v2,%%v29\n\t"
"vfmadb %%v30,%%v19,%%v3,%%v30\n\t" "vfmadb %%v30,%%v19,%%v3,%%v30\n\t"
"vfmadb %%v31,%%v23,%%v3,%%v31\n\t" "vfmadb %%v31,%%v23,%%v3,%%v31\n\t"
"agfi %%r1,128\n\t" "agfi %%r1,128\n\t"
"brctg %%r0,0b \n\t" "brctg %[n],0b\n\t"
"vfadb %%v24,%%v24,%%v26\n\t" "vfadb %%v24,%%v24,%%v26\n\t"
"vfadb %%v24,%%v24,%%v28\n\t" "vfadb %%v24,%%v24,%%v28\n\t"
"vfadb %%v24,%%v24,%%v30\n\t" "vfadb %%v24,%%v24,%%v30\n\t"
"vfadb %%v25,%%v25,%%v27\n\t" "vfadb %%v25,%%v25,%%v27\n\t"
"vfadb %%v25,%%v25,%%v29\n\t" "vfadb %%v25,%%v25,%%v29\n\t"
"vfadb %%v25,%%v25,%%v31\n\t" "vfadb %%v25,%%v25,%%v31\n\t"
"vsteg %%v24,0(%3),0 \n\t" "vsteg %%v24,0(%[d]),0\n\t"
"vsteg %%v24,8(%3),1 \n\t" "vsteg %%v24,8(%[d]),1\n\t"
"vsteg %%v25,16(%3),1 \n\t" "vsteg %%v25,16(%[d]),1\n\t"
"vsteg %%v25,24(%3),0 " "vsteg %%v25,24(%[d]),0"
: : "=m"(*(struct { FLOAT x[4]; } *) d),[n] "+&r"(n)
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((const FLOAT (*)[n * 2])y),"ZQ"((FLOAT (*)[4])d) : [d] "a"(d), "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x),
:"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" "m"(*(const struct { FLOAT x[n * 2]; } *) y),[y] "a"(y)
); : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
} }
OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y,
BLASLONG inc_y) {
BLASLONG i; BLASLONG i;
BLASLONG ix, iy; BLASLONG ix, iy;
OPENBLAS_COMPLEX_FLOAT result; OPENBLAS_COMPLEX_FLOAT result;
FLOAT dot[4] __attribute__ ((aligned(16))) = {0.0, 0.0, 0.0, 0.0}; FLOAT dot[4] __attribute__ ((aligned(16))) = {
0.0, 0.0, 0.0, 0.0};
if (n <= 0) { if (n <= 0) {
CREAL(result) = 0.0; CREAL(result) = 0.0;
@ -141,7 +138,6 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
} }
} else { } else {
i = 0; i = 0;
ix = 0; ix = 0;
@ -174,5 +170,3 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
return (result); return (result);
} }

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2014, The OpenBLAS Project Copyright (c) 2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -25,276 +25,259 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#include <stdlib.h>
#include <stdio.h>
#include "common.h" #include "common.h"
#define NBMAX 1024 #define NBMAX 1024
static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) {
{ register FLOAT *ap0 = ap[0];
__asm__ volatile ( register FLOAT *ap1 = ap[1];
"vl %%v16,0(%5) \n\t" register FLOAT *ap2 = ap[2];
"vl %%v17,16(%5) \n\t" register FLOAT *ap3 = ap[3];
"vl %%v18,32(%5) \n\t"
"vl %%v19,48(%5) \n\t" __asm__("vl %%v16,0(%[x])\n\t"
"vl %%v17,16(%[x])\n\t"
"vl %%v18,32(%[x])\n\t"
"vl %%v19,48(%[x])\n\t"
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
"vleg %%v20,8(%5),0 \n\t" "vleg %%v20,8(%[x]),0\n\t"
"wflcdb %%v20,%%v20\n\t" "wflcdb %%v20,%%v20\n\t"
"vleg %%v20,0(%5),1 \n\t" "vleg %%v20,0(%[x]),1\n\t"
"vleg %%v21,24(%5),0 \n\t" "vleg %%v21,24(%[x]),0\n\t"
"wflcdb %%v21,%%v21\n\t" "wflcdb %%v21,%%v21\n\t"
"vleg %%v21,16(%5),1 \n\t" "vleg %%v21,16(%[x]),1\n\t"
"vleg %%v22,40(%5),0 \n\t" "vleg %%v22,40(%[x]),0\n\t"
"wflcdb %%v22,%%v22\n\t" "wflcdb %%v22,%%v22\n\t"
"vleg %%v22,32(%5),1 \n\t" "vleg %%v22,32(%[x]),1\n\t"
"vleg %%v23,56(%5),0 \n\t" "vleg %%v23,56(%[x]),0\n\t"
"wflcdb %%v23,%%v23\n\t" "wflcdb %%v23,%%v23\n\t"
"vleg %%v23,48(%5),1 \n\t" "vleg %%v23,48(%[x]),1\n\t"
#else #else
"vleg %%v20,0(%5),1 \n\t" "vleg %%v20,0(%[x]),1\n\t"
"vflcdb %%v20,%%v20\n\t" "vflcdb %%v20,%%v20\n\t"
"vleg %%v20,8(%5),0 \n\t" "vleg %%v20,8(%[x]),0\n\t"
"vleg %%v21,16(%5),1 \n\t" "vleg %%v21,16(%[x]),1\n\t"
"vflcdb %%v21,%%v21\n\t" "vflcdb %%v21,%%v21\n\t"
"vleg %%v21,24(%5),0 \n\t" "vleg %%v21,24(%[x]),0\n\t"
"vleg %%v22,32(%5),1 \n\t" "vleg %%v22,32(%[x]),1\n\t"
"vflcdb %%v22,%%v22\n\t" "vflcdb %%v22,%%v22\n\t"
"vleg %%v22,40(%5),0 \n\t" "vleg %%v22,40(%[x]),0\n\t"
"vleg %%v23,48(%5),1 \n\t" "vleg %%v23,48(%[x]),1\n\t"
"vflcdb %%v23,%%v23\n\t" "vflcdb %%v23,%%v23\n\t"
"vleg %%v23,56(%5),0 \n\t" "vleg %%v23,56(%[x]),0\n\t"
#endif #endif
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"srlg %%r0,%0,1 \n\t" "srlg %[n],%[n],1\n\t"
"0:\n\t" "0:\n\t"
"pfd 1,1024(%%r1,%1) \n\t" "pfd 1,1024(%%r1,%[ap0])\n\t"
"pfd 1,1024(%%r1,%2) \n\t" "pfd 1,1024(%%r1,%[ap1])\n\t"
"pfd 1,1024(%%r1,%3) \n\t" "pfd 1,1024(%%r1,%[ap2])\n\t"
"pfd 1,1024(%%r1,%4) \n\t" "pfd 1,1024(%%r1,%[ap3])\n\t"
"pfd 2,1024(%%r1,%6) \n\t" "pfd 2,1024(%%r1,%[y])\n\t"
"vl %%v0,0(%%r1,%[y])\n\t"
"vlrepg %%v24,0(%%r1,%1) \n\t" "vl %%v1,16(%%r1,%[y])\n\t"
"vlrepg %%v25,8(%%r1,%1) \n\t" "vlrepg %%v24,0(%%r1,%[ap0])\n\t"
"vlrepg %%v26,0(%%r1,%2) \n\t" "vlrepg %%v25,8(%%r1,%[ap0])\n\t"
"vlrepg %%v27,8(%%r1,%2) \n\t" "vlrepg %%v26,0(%%r1,%[ap1])\n\t"
"vlrepg %%v27,8(%%r1,%[ap1])\n\t"
"vl %%v0,0(%%r1,%6) \n\t" "vlrepg %%v28,16(%%r1,%[ap0])\n\t"
"vlrepg %%v29,24(%%r1,%[ap0])\n\t"
"vlrepg %%v30,16(%%r1,%[ap1])\n\t"
"vlrepg %%v31,24(%%r1,%[ap1])\n\t"
"vfmadb %%v0,%%v24,%%v16,%%v0\n\t" "vfmadb %%v0,%%v24,%%v16,%%v0\n\t"
"vfmadb %%v1,%%v28,%%v16,%%v1\n\t"
"vfmadb %%v0,%%v25,%%v20,%%v0\n\t" "vfmadb %%v0,%%v25,%%v20,%%v0\n\t"
"vfmadb %%v1,%%v29,%%v20,%%v1\n\t"
"vfmadb %%v0,%%v26,%%v17,%%v0\n\t" "vfmadb %%v0,%%v26,%%v17,%%v0\n\t"
"vfmadb %%v1,%%v30,%%v17,%%v1\n\t"
"vfmadb %%v0,%%v27,%%v21,%%v0\n\t" "vfmadb %%v0,%%v27,%%v21,%%v0\n\t"
"vfmadb %%v1,%%v31,%%v21,%%v1\n\t"
"vlrepg %%v28,0(%%r1,%3) \n\t" "vlrepg %%v24,0(%%r1,%[ap2])\n\t"
"vlrepg %%v29,8(%%r1,%3) \n\t" "vlrepg %%v25,8(%%r1,%[ap2])\n\t"
"vlrepg %%v30,0(%%r1,%4) \n\t" "vlrepg %%v26,0(%%r1,%[ap3])\n\t"
"vlrepg %%v31,8(%%r1,%4) \n\t" "vlrepg %%v27,8(%%r1,%[ap3])\n\t"
"vlrepg %%v28,16(%%r1,%[ap2])\n\t"
"vfmadb %%v0,%%v28,%%v18,%%v0 \n\t" "vlrepg %%v29,24(%%r1,%[ap2])\n\t"
"vfmadb %%v0,%%v29,%%v22,%%v0 \n\t" "vlrepg %%v30,16(%%r1,%[ap3])\n\t"
"vfmadb %%v0,%%v30,%%v19,%%v0 \n\t" "vlrepg %%v31,24(%%r1,%[ap3])\n\t"
"vfmadb %%v0,%%v31,%%v23,%%v0 \n\t" "vfmadb %%v0,%%v24,%%v18,%%v0\n\t"
"vst %%v0,0(%%r1,%6) \n\t" "vfmadb %%v1,%%v28,%%v18,%%v1\n\t"
"vfmadb %%v0,%%v25,%%v22,%%v0\n\t"
"vlrepg %%v24,16(%%r1,%1) \n\t" "vfmadb %%v1,%%v29,%%v22,%%v1\n\t"
"vlrepg %%v25,24(%%r1,%1) \n\t" "vfmadb %%v0,%%v26,%%v19,%%v0\n\t"
"vlrepg %%v26,16(%%r1,%2) \n\t" "vfmadb %%v1,%%v30,%%v19,%%v1\n\t"
"vlrepg %%v27,24(%%r1,%2) \n\t" "vfmadb %%v0,%%v27,%%v23,%%v0\n\t"
"vfmadb %%v1,%%v31,%%v23,%%v1\n\t"
"vl %%v0,16(%%r1,%6) \n\t" "vst %%v0,0(%%r1,%[y])\n\t"
"vfmadb %%v0,%%v24,%%v16,%%v0 \n\t" "vst %%v1,16(%%r1,%[y])\n\t"
"vfmadb %%v0,%%v25,%%v20,%%v0 \n\t"
"vfmadb %%v0,%%v26,%%v17,%%v0 \n\t"
"vfmadb %%v0,%%v27,%%v21,%%v0 \n\t"
"vlrepg %%v28,16(%%r1,%3) \n\t"
"vlrepg %%v29,24(%%r1,%3) \n\t"
"vlrepg %%v30,16(%%r1,%4) \n\t"
"vlrepg %%v31,24(%%r1,%4) \n\t"
"vfmadb %%v0,%%v28,%%v18,%%v0 \n\t"
"vfmadb %%v0,%%v29,%%v22,%%v0 \n\t"
"vfmadb %%v0,%%v30,%%v19,%%v0 \n\t"
"vfmadb %%v0,%%v31,%%v23,%%v0 \n\t"
"vst %%v0,16(%%r1,%6) \n\t"
"agfi %%r1,32\n\t" "agfi %%r1,32\n\t"
"brctg %%r0,0b " "brctg %[n],0b"
: : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n)
:"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZR"((const FLOAT (*)[n * 2])ap[2]),"ZR"((const FLOAT (*)[n * 2])ap[3]),"ZQ"((const FLOAT (*)[8])x),"ZR"((FLOAT (*)[n * 2])y) : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0),
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" "m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1),
); "m"(*(const struct { FLOAT x[n * 2]; } *) ap2),[ap2] "a"(ap2),
"m"(*(const struct { FLOAT x[n * 2]; } *) ap3),[ap3] "a"(ap3),
"m"(*(const struct { FLOAT x[8]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
"v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
} }
static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) {
{ register FLOAT *ap0 = ap[0];
__asm__ volatile ( register FLOAT *ap1 = ap[1];
"vl %%v16,0(%3) \n\t"
"vl %%v17,16(%3) \n\t" __asm__("vl %%v16,0(%[x])\n\t"
"vl %%v17,16(%[x])\n\t"
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
"vleg %%v18,8(%3),0 \n\t" "vleg %%v18,8(%[x]),0\n\t"
"wflcdb %%v18,%%v18\n\t" "wflcdb %%v18,%%v18\n\t"
"vleg %%v18,0(%3),1 \n\t" "vleg %%v18,0(%[x]),1\n\t"
"vleg %%v19,24(%3),0 \n\t" "vleg %%v19,24(%[x]),0\n\t"
"wflcdb %%v19,%%v19\n\t" "wflcdb %%v19,%%v19\n\t"
"vleg %%v19,16(%3),1 \n\t" "vleg %%v19,16(%[x]),1\n\t"
#else #else
"vleg %%v18,0(%3),1 \n\t" "vleg %%v18,0(%[x]),1\n\t"
"vflcdb %%v18,%%v18\n\t" "vflcdb %%v18,%%v18\n\t"
"vleg %%v18,8(%3),0 \n\t" "vleg %%v18,8(%[x]),0\n\t"
"vleg %%v19,16(%3),1 \n\t" "vleg %%v19,16(%[x]),1\n\t"
"vflcdb %%v19,%%v19\n\t" "vflcdb %%v19,%%v19\n\t"
"vleg %%v19,24(%3),0 \n\t" "vleg %%v19,24(%[x]),0\n\t"
#endif #endif
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"srlg %%r0,%0,1 \n\t" "srlg %[n],%[n],1\n\t"
"0:\n\t" "0:\n\t"
"pfd 1,1024(%%r1,%1) \n\t" "pfd 1,1024(%%r1,%[ap0])\n\t"
"pfd 1,1024(%%r1,%2) \n\t" "pfd 1,1024(%%r1,%[ap1])\n\t"
"pfd 2,1024(%%r1,%4) \n\t" "pfd 2,1024(%%r1,%[y])\n\t"
"vl %%v0,0(%%r1,%[y])\n\t"
"vlrepg %%v20,0(%%r1,%1) \n\t" "vl %%v1,16(%%r1,%[y])\n\t"
"vlrepg %%v21,8(%%r1,%1) \n\t" "vlrepg %%v20,0(%%r1,%[ap0])\n\t"
"vlrepg %%v22,0(%%r1,%2) \n\t" "vlrepg %%v21,8(%%r1,%[ap0])\n\t"
"vlrepg %%v23,8(%%r1,%2) \n\t" "vlrepg %%v22,0(%%r1,%[ap1])\n\t"
"vlrepg %%v23,8(%%r1,%[ap1])\n\t"
"vl %%v0,0(%%r1,%4) \n\t" "vlrepg %%v24,16(%%r1,%[ap0])\n\t"
"vlrepg %%v25,24(%%r1,%[ap0])\n\t"
"vlrepg %%v26,16(%%r1,%[ap1])\n\t"
"vlrepg %%v27,24(%%r1,%[ap1])\n\t"
"vfmadb %%v0,%%v20,%%v16,%%v0\n\t" "vfmadb %%v0,%%v20,%%v16,%%v0\n\t"
"vfmadb %%v1,%%v24,%%v16,%%v1\n\t"
"vfmadb %%v0,%%v21,%%v18,%%v0\n\t" "vfmadb %%v0,%%v21,%%v18,%%v0\n\t"
"vfmadb %%v1,%%v25,%%v18,%%v1\n\t"
"vfmadb %%v0,%%v22,%%v17,%%v0\n\t" "vfmadb %%v0,%%v22,%%v17,%%v0\n\t"
"vfmadb %%v1,%%v26,%%v17,%%v1\n\t"
"vfmadb %%v0,%%v23,%%v19,%%v0\n\t" "vfmadb %%v0,%%v23,%%v19,%%v0\n\t"
"vst %%v0,0(%%r1,%4) \n\t" "vfmadb %%v1,%%v27,%%v19,%%v1\n\t"
"vst %%v0,0(%%r1,%[y])\n\t"
"vlrepg %%v20,16(%%r1,%1) \n\t" "vst %%v1,16(%%r1,%[y])\n\t"
"vlrepg %%v21,24(%%r1,%1) \n\t"
"vlrepg %%v22,16(%%r1,%2) \n\t"
"vlrepg %%v23,24(%%r1,%2) \n\t"
"vl %%v0,16(%%r1,%4) \n\t"
"vfmadb %%v0,%%v20,%%v16,%%v0 \n\t"
"vfmadb %%v0,%%v21,%%v18,%%v0 \n\t"
"vfmadb %%v0,%%v22,%%v17,%%v0 \n\t"
"vfmadb %%v0,%%v23,%%v19,%%v0 \n\t"
"vst %%v0,16(%%r1,%4) \n\t"
"agfi %%r1,32\n\t" "agfi %%r1,32\n\t"
"brctg %%r0,0b " "brctg %[n],0b"
: : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n)
:"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZQ"((const FLOAT (*)[4])x),"ZR"((FLOAT (*)[n * 2])y) : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0),
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23" "m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1),
); "m"(*(const struct { FLOAT x[4]; } *) x),[x] "a"(x)
: "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
"v22", "v23", "v24", "v25", "v26", "v27");
} }
static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) {
{ __asm__("vl %%v16,0(%[x])\n\t"
__asm__ volatile (
"vl %%v16,0(%2) \n\t"
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
"vleg %%v17,8(%2),0 \n\t" "vleg %%v17,8(%[x]),0\n\t"
"wflcdb %%v17,%%v17\n\t" "wflcdb %%v17,%%v17\n\t"
"vleg %%v17,0(%2),1 \n\t" "vleg %%v17,0(%[x]),1\n\t"
#else #else
"vleg %%v17,0(%2),1 \n\t" "vleg %%v17,0(%[x]),1\n\t"
"vflcdb %%v17,%%v17\n\t" "vflcdb %%v17,%%v17\n\t"
"vleg %%v17,8(%2),0 \n\t" "vleg %%v17,8(%[x]),0\n\t"
#endif #endif
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"srlg %%r0,%0,1 \n\t" "srlg %[n],%[n],1\n\t"
"0:\n\t" "0:\n\t"
"pfd 1,1024(%%r1,%1) \n\t" "pfd 1,1024(%%r1,%[ap])\n\t"
"pfd 2,1024(%%r1,%3) \n\t" "pfd 2,1024(%%r1,%[y])\n\t"
"vl %%v0,0(%%r1,%[y])\n\t"
"vlrepg %%v18,0(%%r1,%1) \n\t" "vl %%v1,16(%%r1,%[y])\n\t"
"vlrepg %%v19,8(%%r1,%1) \n\t" "vlrepg %%v18,0(%%r1,%[ap])\n\t"
"vlrepg %%v19,8(%%r1,%[ap])\n\t"
"vl %%v0,0(%%r1,%3) \n\t" "vlrepg %%v20,16(%%r1,%[ap])\n\t"
"vlrepg %%v21,24(%%r1,%[ap])\n\t"
"vfmadb %%v0,%%v18,%%v16,%%v0\n\t" "vfmadb %%v0,%%v18,%%v16,%%v0\n\t"
"vfmadb %%v1,%%v20,%%v16,%%v1\n\t"
"vfmadb %%v0,%%v19,%%v17,%%v0\n\t" "vfmadb %%v0,%%v19,%%v17,%%v0\n\t"
"vst %%v0,0(%%r1,%3) \n\t" "vfmadb %%v1,%%v21,%%v17,%%v1\n\t"
"vst %%v0,0(%%r1,%[y])\n\t"
"vlrepg %%v18,16(%%r1,%1) \n\t" "vst %%v1,16(%%r1,%[y])\n\t"
"vlrepg %%v19,24(%%r1,%1) \n\t"
"vl %%v0,16(%%r1,%3) \n\t"
"vfmadb %%v0,%%v18,%%v16,%%v0 \n\t"
"vfmadb %%v0,%%v19,%%v17,%%v0 \n\t"
"vst %%v0,16(%%r1,%3) \n\t"
"agfi %%r1,32\n\t" "agfi %%r1,32\n\t"
"brctg %%r0,0b " "brctg %[n],0b"
: : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n)
:"r"(n),"ZR"((const FLOAT (*)[n * 2])ap),"ZQ"((const FLOAT (*)[2])x),"ZR"((FLOAT (*)[n * 2])y) : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap),[ap] "a"(ap),
:"memory","cc","r0","r1","v0","v16","v17","v18","v19" "m"(*(const struct { FLOAT x[2]; } *) x),[x] "a"(x)
); : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21");
} }
static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, FLOAT alpha_i) static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r,
{ FLOAT alpha_i) {
__asm__ volatile ( __asm__(
#if !defined(XCONJ) #if !defined(XCONJ)
"vlrepg %%v0,%3 \n\t" "vlrepg %%v0,%[alpha_r]\n\t"
"vleg %%v1,%4,0 \n\t" "vleg %%v1,%[alpha_i],0\n\t"
"wflcdb %%v1,%%v1\n\t" "wflcdb %%v1,%%v1\n\t"
"vleg %%v1,%4,1 \n\t" "vleg %%v1,%[alpha_i],1\n\t"
#else #else
"vleg %%v0,%3,1 \n\t" "vleg %%v0,%[alpha_r],1\n\t"
"vflcdb %%v0,%%v0\n\t" "vflcdb %%v0,%%v0\n\t"
"vleg %%v0,%3,0 \n\t" "vleg %%v0,%[alpha_r],0\n\t"
"vlrepg %%v1,%4 \n\t" "vlrepg %%v1,%[alpha_i]\n\t"
#endif #endif
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"srlg %%r0,%0,2 \n\t" "srlg %[n],%[n],2\n\t"
"0:\n\t" "0:\n\t"
"pfd 1,1024(%%r1,%1) \n\t" "pfd 1,1024(%%r1,%[src])\n\t"
"pfd 2,1024(%%r1,%2) \n\t" "pfd 2,1024(%%r1,%[dest])\n\t"
"vl %%v16,0(%%r1,%[src])\n\t"
"vl %%v16,0(%%r1,%1) \n\t" "vl %%v17,16(%%r1,%[src])\n\t"
"vl %%v17,16(%%r1,%1) \n\t" "vl %%v18,32(%%r1,%[src])\n\t"
"vl %%v18,32(%%r1,%1) \n\t" "vl %%v19,48(%%r1,%[src])\n\t"
"vl %%v19,48(%%r1,%1) \n\t" "vl %%v20,0(%%r1,%[dest])\n\t"
"vl %%v20,0(%%r1,%2) \n\t" "vl %%v21,16(%%r1,%[dest])\n\t"
"vl %%v21,16(%%r1,%2) \n\t" "vl %%v22,32(%%r1,%[dest])\n\t"
"vl %%v22,32(%%r1,%2) \n\t" "vl %%v23,48(%%r1,%[dest])\n\t"
"vl %%v23,48(%%r1,%2) \n\t"
"vpdi %%v24,%%v16,%%v16,4\n\t" "vpdi %%v24,%%v16,%%v16,4\n\t"
"vpdi %%v25,%%v17,%%v17,4\n\t" "vpdi %%v25,%%v17,%%v17,4\n\t"
"vpdi %%v26,%%v18,%%v18,4\n\t" "vpdi %%v26,%%v18,%%v18,4\n\t"
"vpdi %%v27,%%v19,%%v19,4\n\t" "vpdi %%v27,%%v19,%%v19,4\n\t"
"vfmadb %%v28,%%v16,%%v0,%%v20\n\t" "vfmadb %%v28,%%v16,%%v0,%%v20\n\t"
"vfmadb %%v29,%%v17,%%v0,%%v21\n\t" "vfmadb %%v29,%%v17,%%v0,%%v21\n\t"
"vfmadb %%v30,%%v18,%%v0,%%v22\n\t" "vfmadb %%v30,%%v18,%%v0,%%v22\n\t"
"vfmadb %%v31,%%v19,%%v0,%%v23\n\t" "vfmadb %%v31,%%v19,%%v0,%%v23\n\t"
"vfmadb %%v28,%%v24,%%v1,%%v28\n\t" "vfmadb %%v28,%%v24,%%v1,%%v28\n\t"
"vfmadb %%v29,%%v25,%%v1,%%v29\n\t" "vfmadb %%v29,%%v25,%%v1,%%v29\n\t"
"vfmadb %%v30,%%v26,%%v1,%%v30\n\t" "vfmadb %%v30,%%v26,%%v1,%%v30\n\t"
"vfmadb %%v31,%%v27,%%v1,%%v31\n\t" "vfmadb %%v31,%%v27,%%v1,%%v31\n\t"
"vst %%v28,0(%%r1,%[dest])\n\t"
"vst %%v28,0(%%r1,%2) \n\t" "vst %%v29,16(%%r1,%[dest])\n\t"
"vst %%v29,16(%%r1,%2) \n\t" "vst %%v30,32(%%r1,%[dest])\n\t"
"vst %%v30,32(%%r1,%2) \n\t" "vst %%v31,48(%%r1,%[dest])\n\t"
"vst %%v31,48(%%r1,%2) \n\t"
"agfi %%r1,64\n\t" "agfi %%r1,64\n\t"
"brctg %%r0,0b " "brctg %[n],0b"
: : "+m"(*(struct { FLOAT x[n * 2]; } *) dest),[n] "+&r"(n)
:"r"(n),"ZR"((const FLOAT (*)[n * 2])src),"ZR"((FLOAT (*)[n * 2])dest),"m"(alpha_r),"m"(alpha_i) : [dest] "a"(dest), "m"(*(const struct { FLOAT x[n * 2]; } *) src),
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" [src] "a"(src),[alpha_r] "Q"(alpha_r),[alpha_i] "Q"(alpha_i)
); : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
"v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
} }
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,
{ FLOAT alpha_r, FLOAT alpha_i) {
BLASLONG i; BLASLONG i;
if ( inc_dest != 2 ) if (inc_dest != 2) {
{
FLOAT temp_r; FLOAT temp_r;
FLOAT temp_i; FLOAT temp_i;
for ( i=0; i<n; i++ ) for (i = 0; i < n; i++) {
{
#if !defined(XCONJ) #if !defined(XCONJ)
temp_r = alpha_r * src[0] - alpha_i * src[1]; temp_r = alpha_r * src[0] - alpha_i * src[1];
temp_i = alpha_r * src[1] + alpha_i * src[0]; temp_i = alpha_r * src[1] + alpha_i * src[0];
@ -315,8 +298,9 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT
add_y_4(n, src, dest, alpha_r, alpha_i); add_y_4(n, src, dest, alpha_r, alpha_i);
} }
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
{ FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y,
BLASLONG inc_y, FLOAT *buffer) {
BLASLONG i; BLASLONG i;
FLOAT *a_ptr; FLOAT *a_ptr;
FLOAT *x_ptr; FLOAT *x_ptr;
@ -330,8 +314,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
BLASLONG lda4; BLASLONG lda4;
FLOAT xbuffer[8], *ybuffer; FLOAT xbuffer[8], *ybuffer;
if ( m < 1 ) return(0); if (m < 1)
if ( n < 1 ) return(0); return (0);
if (n < 1)
return (0);
ybuffer = buffer; ybuffer = buffer;
@ -351,13 +337,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
BLASLONG NB = NBMAX; BLASLONG NB = NBMAX;
while ( NB == NBMAX ) while (NB == NBMAX) {
{
m1 -= NB; m1 -= NB;
if ( m1 < 0) if (m1 < 0) {
{ if (m2 == 0)
if ( m2 == 0 ) break; break;
NB = m2; NB = m2;
} }
@ -370,11 +355,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
//zero_y(NB,ybuffer); //zero_y(NB,ybuffer);
memset(ybuffer, 0, NB * 16); memset(ybuffer, 0, NB * 16);
if ( inc_x == 2 ) if (inc_x == 2) {
{
for( i = 0; i < n1 ; i++) for (i = 0; i < n1; i++) {
{
zgemv_kernel_4x4(NB, ap, x_ptr, ybuffer); zgemv_kernel_4x4(NB, ap, x_ptr, ybuffer);
ap[0] += lda4; ap[0] += lda4;
ap[1] += lda4; ap[1] += lda4;
@ -384,27 +367,22 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
x_ptr += 8; x_ptr += 8;
} }
if ( n2 & 2 ) if (n2 & 2) {
{
zgemv_kernel_4x2(NB, ap, x_ptr, ybuffer); zgemv_kernel_4x2(NB, ap, x_ptr, ybuffer);
x_ptr += 4; x_ptr += 4;
a_ptr += 2 * lda; a_ptr += 2 * lda;
} }
if ( n2 & 1 ) if (n2 & 1) {
{
zgemv_kernel_4x1(NB, a_ptr, x_ptr, ybuffer); zgemv_kernel_4x1(NB, a_ptr, x_ptr, ybuffer);
/* x_ptr += 2; /* x_ptr += 2;
a_ptr += lda; */ a_ptr += lda; */
} }
} } else {
else
{
for( i = 0; i < n1 ; i++) for (i = 0; i < n1; i++) {
{
xbuffer[0] = x_ptr[0]; xbuffer[0] = x_ptr[0];
xbuffer[1] = x_ptr[1]; xbuffer[1] = x_ptr[1];
@ -427,8 +405,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
a_ptr += lda4; a_ptr += lda4;
} }
for( i = 0; i < n2 ; i++) for (i = 0; i < n2; i++) {
{
xbuffer[0] = x_ptr[0]; xbuffer[0] = x_ptr[0];
xbuffer[1] = x_ptr[1]; xbuffer[1] = x_ptr[1];
x_ptr += inc_x; x_ptr += inc_x;
@ -444,21 +421,18 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
y_ptr += NB * inc_y; y_ptr += NB * inc_y;
} }
if ( m3 == 0 ) return(0); if (m3 == 0)
return (0);
if ( m3 == 1 ) if (m3 == 1) {
{
a_ptr = a; a_ptr = a;
x_ptr = x; x_ptr = x;
FLOAT temp_r = 0.0; FLOAT temp_r = 0.0;
FLOAT temp_i = 0.0; FLOAT temp_i = 0.0;
if ( lda == 2 && inc_x == 2 ) if (lda == 2 && inc_x == 2) {
{
for (i = 0; i < (n & -2); i += 2) {
for( i=0 ; i < (n & -2); i+=2 )
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
@ -475,10 +449,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
x_ptr += 4; x_ptr += 4;
} }
for (; i < n; i++) {
for( ; i < n; i++ )
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
@ -491,13 +462,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
x_ptr += 2; x_ptr += 2;
} }
} else {
} for (i = 0; i < n; i++) {
else
{
for( i = 0; i < n; i++ )
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
@ -521,8 +488,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
return (0); return (0);
} }
if ( m3 == 2 ) if (m3 == 2) {
{
a_ptr = a; a_ptr = a;
x_ptr = x; x_ptr = x;
FLOAT temp_r0 = 0.0; FLOAT temp_r0 = 0.0;
@ -530,11 +496,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
FLOAT temp_r1 = 0.0; FLOAT temp_r1 = 0.0;
FLOAT temp_i1 = 0.0; FLOAT temp_i1 = 0.0;
if ( lda == 4 && inc_x == 2 ) if (lda == 4 && inc_x == 2) {
{
for( i = 0; i < (n & -2); i+=2 ) for (i = 0; i < (n & -2); i += 2) {
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
@ -564,9 +528,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
x_ptr += 4; x_ptr += 4;
} }
for (; i < n; i++) {
for( ; i < n; i++ )
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
@ -583,13 +545,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
x_ptr += 2; x_ptr += 2;
} }
} else {
} for (i = 0; i < n; i++) {
else
{
for( i=0 ; i < n; i++ )
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
@ -606,7 +564,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
x_ptr += inc_x; x_ptr += inc_x;
} }
} }
#if !defined(XCONJ) #if !defined(XCONJ)
y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0; y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
@ -624,9 +581,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
return (0); return (0);
} }
if (m3 == 3) {
if ( m3 == 3 )
{
a_ptr = a; a_ptr = a;
x_ptr = x; x_ptr = x;
FLOAT temp_r0 = 0.0; FLOAT temp_r0 = 0.0;
@ -636,11 +591,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
FLOAT temp_r2 = 0.0; FLOAT temp_r2 = 0.0;
FLOAT temp_i2 = 0.0; FLOAT temp_i2 = 0.0;
if ( lda == 6 && inc_x == 2 ) if (lda == 6 && inc_x == 2) {
{
for( i=0 ; i < n; i++ ) for (i = 0; i < n; i++) {
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
@ -661,13 +614,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
x_ptr += 2; x_ptr += 2;
} }
} else {
} for (i = 0; i < n; i++) {
else
{
for( i = 0; i < n; i++ )
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2014, The OpenBLAS Project Copyright (c) 2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -29,106 +29,106 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define NBMAX 1024 #define NBMAX 1024
static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y,
{ FLOAT *alpha) {
__asm__ volatile ( register FLOAT *ap0 = ap[0];
"vzero %%v16 \n\t" register FLOAT *ap1 = ap[1];
register FLOAT *ap2 = ap[2];
register FLOAT *ap3 = ap[3];
__asm__("vzero %%v16\n\t"
"vzero %%v17\n\t" "vzero %%v17\n\t"
"vzero %%v18\n\t" "vzero %%v18\n\t"
"vzero %%v19\n\t" "vzero %%v19\n\t"
"vzero %%v20\n\t"
"vzero %%v21\n\t"
"vzero %%v22\n\t"
"vzero %%v23\n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"srlg %%r0,%0,1 \n\t" "srlg %[n],%[n],1\n\t"
"0:\n\t" "0:\n\t"
"pfd 1,1024(%%r1,%1) \n\t" "pfd 1,1024(%%r1,%[ap0])\n\t"
"pfd 1,1024(%%r1,%2) \n\t" "pfd 1,1024(%%r1,%[ap1])\n\t"
"pfd 1,1024(%%r1,%3) \n\t" "pfd 1,1024(%%r1,%[ap2])\n\t"
"pfd 1,1024(%%r1,%4) \n\t" "pfd 1,1024(%%r1,%[ap3])\n\t"
"pfd 1,1024(%%r1,%5) \n\t" "pfd 1,1024(%%r1,%[x])\n\t"
"vl %%v0,0(%%r1,%[x])\n\t"
"vl %%v20,0(%%r1,%5) \n\t"
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
"vleg %%v21,8(%%r1,%5),0 \n\t" "vleg %%v1,8(%%r1,%[x]),0\n\t"
"wflcdb %%v21,%%v21 \n\t" "wflcdb %%v1,%%v1\n\t"
"vleg %%v21,0(%%r1,%5),1 \n\t" "vleg %%v1,0(%%r1,%[x]),1\n\t"
#else #else
"vleg %%v21,0(%%r1,%5),1 \n\t" "vleg %%v1,0(%%r1,%[x]),1\n\t"
"vflcdb %%v21,%%v21 \n\t" "vflcdb %%v1,%%v1\n\t"
"vleg %%v21,8(%%r1,%5),0 \n\t" "vleg %%v1,8(%%r1,%[x]),0\n\t"
#endif #endif
"vlrepg %%v24,0(%%r1,%[ap0])\n\t"
"vlrepg %%v24,0(%%r1,%1) \n\t" "vlrepg %%v25,8(%%r1,%[ap0])\n\t"
"vlrepg %%v25,8(%%r1,%1) \n\t" "vlrepg %%v26,0(%%r1,%[ap1])\n\t"
"vlrepg %%v26,0(%%r1,%2) \n\t" "vlrepg %%v27,8(%%r1,%[ap1])\n\t"
"vlrepg %%v27,8(%%r1,%2) \n\t" "vlrepg %%v28,0(%%r1,%[ap2])\n\t"
"vlrepg %%v29,8(%%r1,%[ap2])\n\t"
"vfmadb %%v16,%%v24,%%v20,%%v16 \n\t" "vlrepg %%v30,0(%%r1,%[ap3])\n\t"
"vfmadb %%v16,%%v25,%%v21,%%v16 \n\t" "vlrepg %%v31,8(%%r1,%[ap3])\n\t"
"vfmadb %%v17,%%v26,%%v20,%%v17 \n\t" "vfmadb %%v16,%%v24,%%v0,%%v16\n\t"
"vfmadb %%v17,%%v27,%%v21,%%v17 \n\t" "vfmadb %%v20,%%v25,%%v1,%%v20\n\t"
"vfmadb %%v17,%%v26,%%v0,%%v17\n\t"
"vlrepg %%v28,0(%%r1,%3) \n\t" "vfmadb %%v21,%%v27,%%v1,%%v21\n\t"
"vlrepg %%v29,8(%%r1,%3) \n\t" "vfmadb %%v18,%%v28,%%v0,%%v18\n\t"
"vlrepg %%v30,0(%%r1,%4) \n\t" "vfmadb %%v22,%%v29,%%v1,%%v22\n\t"
"vlrepg %%v31,8(%%r1,%4) \n\t" "vfmadb %%v19,%%v30,%%v0,%%v19\n\t"
"vfmadb %%v23,%%v31,%%v1,%%v23\n\t"
"vfmadb %%v18,%%v28,%%v20,%%v18 \n\t" "vl %%v0,16(%%r1,%[x])\n\t"
"vfmadb %%v18,%%v29,%%v21,%%v18 \n\t"
"vfmadb %%v19,%%v30,%%v20,%%v19 \n\t"
"vfmadb %%v19,%%v31,%%v21,%%v19 \n\t"
"vl %%v22,16(%%r1,%5) \n\t"
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
"vleg %%v23,24(%%r1,%5),0 \n\t" "vleg %%v1,24(%%r1,%[x]),0\n\t"
"wflcdb %%v23,%%v23 \n\t" "wflcdb %%v1,%%v1\n\t"
"vleg %%v23,16(%%r1,%5),1 \n\t" "vleg %%v1,16(%%r1,%[x]),1\n\t"
#else #else
"vleg %%v23,16(%%r1,%5),1 \n\t" "vleg %%v1,16(%%r1,%[x]),1\n\t"
"vflcdb %%v23,%%v23 \n\t" "vflcdb %%v1,%%v1\n\t"
"vleg %%v23,24(%%r1,%5),0 \n\t" "vleg %%v1,24(%%r1,%[x]),0\n\t"
#endif #endif
"vlrepg %%v24,16(%%r1,%[ap0])\n\t"
"vlrepg %%v24,16(%%r1,%1) \n\t" "vlrepg %%v25,24(%%r1,%[ap0])\n\t"
"vlrepg %%v25,24(%%r1,%1) \n\t" "vlrepg %%v26,16(%%r1,%[ap1])\n\t"
"vlrepg %%v26,16(%%r1,%2) \n\t" "vlrepg %%v27,24(%%r1,%[ap1])\n\t"
"vlrepg %%v27,24(%%r1,%2) \n\t" "vlrepg %%v28,16(%%r1,%[ap2])\n\t"
"vlrepg %%v29,24(%%r1,%[ap2])\n\t"
"vfmadb %%v16,%%v24,%%v22,%%v16 \n\t" "vlrepg %%v30,16(%%r1,%[ap3])\n\t"
"vfmadb %%v16,%%v25,%%v23,%%v16 \n\t" "vlrepg %%v31,24(%%r1,%[ap3])\n\t"
"vfmadb %%v17,%%v26,%%v22,%%v17 \n\t" "vfmadb %%v16,%%v24,%%v0,%%v16\n\t"
"vfmadb %%v17,%%v27,%%v23,%%v17 \n\t" "vfmadb %%v20,%%v25,%%v1,%%v20\n\t"
"vfmadb %%v17,%%v26,%%v0,%%v17\n\t"
"vlrepg %%v28,16(%%r1,%3) \n\t" "vfmadb %%v21,%%v27,%%v1,%%v21\n\t"
"vlrepg %%v29,24(%%r1,%3) \n\t" "vfmadb %%v18,%%v28,%%v0,%%v18\n\t"
"vlrepg %%v30,16(%%r1,%4) \n\t" "vfmadb %%v22,%%v29,%%v1,%%v22\n\t"
"vlrepg %%v31,24(%%r1,%4) \n\t" "vfmadb %%v19,%%v30,%%v0,%%v19\n\t"
"vfmadb %%v23,%%v31,%%v1,%%v23\n\t"
"vfmadb %%v18,%%v28,%%v22,%%v18 \n\t"
"vfmadb %%v18,%%v29,%%v23,%%v18 \n\t"
"vfmadb %%v19,%%v30,%%v22,%%v19 \n\t"
"vfmadb %%v19,%%v31,%%v23,%%v19 \n\t"
"agfi %%r1,32\n\t" "agfi %%r1,32\n\t"
"brctg %%r0,0b \n\t" "brctg %[n],0b\n\t"
"vfadb %%v16,%%v16,%%v20\n\t"
"vfadb %%v17,%%v17,%%v21\n\t"
"vfadb %%v18,%%v18,%%v22\n\t"
"vfadb %%v19,%%v19,%%v23\n\t"
"vpdi %%v20,%%v16,%%v16,4\n\t" "vpdi %%v20,%%v16,%%v16,4\n\t"
"vpdi %%v21,%%v17,%%v17,4\n\t" "vpdi %%v21,%%v17,%%v17,4\n\t"
"vpdi %%v22,%%v18,%%v18,4\n\t" "vpdi %%v22,%%v18,%%v18,4\n\t"
"vpdi %%v23,%%v19,%%v19,4\n\t" "vpdi %%v23,%%v19,%%v19,4\n\t"
#if !defined(XCONJ) #if !defined(XCONJ)
"vlrepg %%v24,0(%7) \n\t" "vlrepg %%v24,0(%[alpha])\n\t"
"vleg %%v25,8(%7),0 \n\t" "vleg %%v25,8(%[alpha]),0\n\t"
"wflcdb %%v25,%%v25\n\t" "wflcdb %%v25,%%v25\n\t"
"vleg %%v25,8(%7),1 \n\t" "vleg %%v25,8(%[alpha]),1\n\t"
#else #else
"vleg %%v24,0(%7),1 \n\t" "vleg %%v24,0(%[alpha]),1\n\t"
"vflcdb %%v24,%%v24\n\t" "vflcdb %%v24,%%v24\n\t"
"vleg %%v24,0(%7),0 \n\t" "vleg %%v24,0(%[alpha]),0\n\t"
"vlrepg %%v25,8(%7) \n\t" "vlrepg %%v25,8(%[alpha])\n\t"
#endif #endif
"vl %%v26,0(%6) \n\t" "vl %%v26,0(%[y])\n\t"
"vl %%v27,16(%6) \n\t" "vl %%v27,16(%[y])\n\t"
"vl %%v28,32(%6) \n\t" "vl %%v28,32(%[y])\n\t"
"vl %%v29,48(%6) \n\t" "vl %%v29,48(%[y])\n\t"
"vfmadb %%v26,%%v16,%%v24,%%v26\n\t" "vfmadb %%v26,%%v16,%%v24,%%v26\n\t"
"vfmadb %%v26,%%v20,%%v25,%%v26\n\t" "vfmadb %%v26,%%v20,%%v25,%%v26\n\t"
"vfmadb %%v27,%%v17,%%v24,%%v27\n\t" "vfmadb %%v27,%%v17,%%v24,%%v27\n\t"
@ -137,174 +137,173 @@ static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *
"vfmadb %%v28,%%v22,%%v25,%%v28\n\t" "vfmadb %%v28,%%v22,%%v25,%%v28\n\t"
"vfmadb %%v29,%%v19,%%v24,%%v29\n\t" "vfmadb %%v29,%%v19,%%v24,%%v29\n\t"
"vfmadb %%v29,%%v23,%%v25,%%v29\n\t" "vfmadb %%v29,%%v23,%%v25,%%v29\n\t"
"vst %%v26,0(%6) \n\t" "vst %%v26,0(%[y])\n\t"
"vst %%v27,16(%6) \n\t" "vst %%v27,16(%[y])\n\t"
"vst %%v28,32(%6) \n\t" "vst %%v28,32(%[y])\n\t"
"vst %%v29,48(%6) " "vst %%v29,48(%[y])"
: : "+m"(*(struct { FLOAT x[8]; } *) y),[n] "+&r"(n)
:"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZR"((const FLOAT (*)[n * 2])ap[2]),"ZR"((const FLOAT (*)[n * 2])ap[3]),"ZR"((const FLOAT (*)[n * 2])x),"ZQ"((FLOAT (*)[8])y),"ZQ"((const FLOAT (*)[2])alpha) : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0),
:"memory","cc","r0","r1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" "m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1),
); "m"(*(const struct { FLOAT x[n * 2]; } *) ap2),[ap2] "a"(ap2),
"m"(*(const struct { FLOAT x[n * 2]; } *) ap3),[ap3] "a"(ap3),
"m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x),
"m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha)
: "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
"v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
} }
static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y,
{ FLOAT *alpha) {
__asm__ volatile ( register FLOAT *ap0 = ap[0];
"vzero %%v16 \n\t" register FLOAT *ap1 = ap[1];
__asm__("vzero %%v16\n\t"
"vzero %%v17\n\t" "vzero %%v17\n\t"
"vzero %%v18\n\t"
"vzero %%v19\n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"srlg %%r0,%0,1 \n\t" "srlg %[n],%[n],1\n\t"
"0:\n\t" "0:\n\t"
"pfd 1,1024(%%r1,%1) \n\t" "pfd 1,1024(%%r1,%[ap0])\n\t"
"pfd 1,1024(%%r1,%2) \n\t" "pfd 1,1024(%%r1,%[ap1])\n\t"
"pfd 1,1024(%%r1,%3) \n\t" "pfd 1,1024(%%r1,%[x])\n\t"
"vl %%v0,0(%%r1,%[x])\n\t"
"vl %%v18,0(%%r1,%3) \n\t"
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
"vleg %%v19,8(%%r1,%3),0 \n\t" "vleg %%v1,8(%%r1,%[x]),0\n\t"
"wflcdb %%v19,%%v19 \n\t" "wflcdb %%v1,%%v1\n\t"
"vleg %%v19,0(%%r1,%3),1 \n\t" "vleg %%v1,0(%%r1,%[x]),1\n\t"
#else #else
"vleg %%v19,0(%%r1,%3),1 \n\t" "vleg %%v1,0(%%r1,%[x]),1\n\t"
"vflcdb %%v19,%%v19 \n\t" "vflcdb %%v1,%%v1\n\t"
"vleg %%v19,8(%%r1,%3),0 \n\t" "vleg %%v1,8(%%r1,%[x]),0\n\t"
#endif #endif
"vlrepg %%v20,0(%%r1,%[ap0])\n\t"
"vlrepg %%v20,0(%%r1,%1) \n\t" "vlrepg %%v21,8(%%r1,%[ap0])\n\t"
"vlrepg %%v21,8(%%r1,%1) \n\t" "vlrepg %%v22,0(%%r1,%[ap1])\n\t"
"vlrepg %%v22,0(%%r1,%2) \n\t" "vlrepg %%v23,8(%%r1,%[ap1])\n\t"
"vlrepg %%v23,8(%%r1,%2) \n\t" "vfmadb %%v16,%%v20,%%v0,%%v16\n\t"
"vfmadb %%v18,%%v21,%%v1,%%v18\n\t"
"vfmadb %%v16,%%v20,%%v18,%%v16 \n\t" "vfmadb %%v17,%%v22,%%v0,%%v17\n\t"
"vfmadb %%v16,%%v21,%%v19,%%v16 \n\t" "vfmadb %%v19,%%v23,%%v1,%%v19\n\t"
"vfmadb %%v17,%%v22,%%v18,%%v17 \n\t" "vl %%v0,16(%%r1,%[x])\n\t"
"vfmadb %%v17,%%v23,%%v19,%%v17 \n\t"
"vl %%v18,16(%%r1,%3) \n\t"
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
"vleg %%v19,24(%%r1,%3),0 \n\t" "vleg %%v1,24(%%r1,%[x]),0\n\t"
"wflcdb %%v19,%%v19 \n\t" "wflcdb %%v1,%%v1\n\t"
"vleg %%v19,16(%%r1,%3),1 \n\t" "vleg %%v1,16(%%r1,%[x]),1\n\t"
#else #else
"vleg %%v19,16(%%r1,%3),1 \n\t" "vleg %%v1,16(%%r1,%[x]),1\n\t"
"vflcdb %%v19,%%v19 \n\t" "vflcdb %%v1,%%v1\n\t"
"vleg %%v19,24(%%r1,%3),0 \n\t" "vleg %%v1,24(%%r1,%[x]),0\n\t"
#endif #endif
"vlrepg %%v20,16(%%r1,%[ap0])\n\t"
"vlrepg %%v20,16(%%r1,%1) \n\t" "vlrepg %%v21,24(%%r1,%[ap0])\n\t"
"vlrepg %%v21,24(%%r1,%1) \n\t" "vlrepg %%v22,16(%%r1,%[ap1])\n\t"
"vlrepg %%v22,16(%%r1,%2) \n\t" "vlrepg %%v23,24(%%r1,%[ap1])\n\t"
"vlrepg %%v23,24(%%r1,%2) \n\t" "vfmadb %%v16,%%v20,%%v0,%%v16\n\t"
"vfmadb %%v18,%%v21,%%v1,%%v18\n\t"
"vfmadb %%v16,%%v20,%%v18,%%v16 \n\t" "vfmadb %%v17,%%v22,%%v0,%%v17\n\t"
"vfmadb %%v16,%%v21,%%v19,%%v16 \n\t" "vfmadb %%v19,%%v23,%%v1,%%v19\n\t"
"vfmadb %%v17,%%v22,%%v18,%%v17 \n\t"
"vfmadb %%v17,%%v23,%%v19,%%v17 \n\t"
"agfi %%r1,32\n\t" "agfi %%r1,32\n\t"
"brctg %%r0,0b \n\t" "brctg %[n],0b\n\t"
"vfadb %%v16,%%v16,%%v18\n\t"
"vfadb %%v17,%%v17,%%v19\n\t"
"vpdi %%v18,%%v16,%%v16,4\n\t" "vpdi %%v18,%%v16,%%v16,4\n\t"
"vpdi %%v19,%%v17,%%v17,4\n\t" "vpdi %%v19,%%v17,%%v17,4\n\t"
#if !defined(XCONJ) #if !defined(XCONJ)
"vlrepg %%v20,0(%5) \n\t" "vlrepg %%v20,0(%[alpha])\n\t"
"vleg %%v21,8(%5),0 \n\t" "vleg %%v21,8(%[alpha]),0\n\t"
"wflcdb %%v21,%%v21\n\t" "wflcdb %%v21,%%v21\n\t"
"vleg %%v21,8(%5),1 \n\t" "vleg %%v21,8(%[alpha]),1\n\t"
#else #else
"vleg %%v20,0(%5),1 \n\t" "vleg %%v20,0(%[alpha]),1\n\t"
"vflcdb %%v20,%%v20\n\t" "vflcdb %%v20,%%v20\n\t"
"vleg %%v20,0(%5),0 \n\t" "vleg %%v20,0(%[alpha]),0\n\t"
"vlrepg %%v21,8(%5) \n\t" "vlrepg %%v21,8(%[alpha])\n\t"
#endif #endif
"vl %%v22,0(%4) \n\t" "vl %%v22,0(%[y])\n\t"
"vl %%v23,16(%4) \n\t" "vl %%v23,16(%[y])\n\t"
"vfmadb %%v22,%%v16,%%v20,%%v22\n\t" "vfmadb %%v22,%%v16,%%v20,%%v22\n\t"
"vfmadb %%v22,%%v18,%%v21,%%v22\n\t" "vfmadb %%v22,%%v18,%%v21,%%v22\n\t"
"vfmadb %%v23,%%v17,%%v20,%%v23\n\t" "vfmadb %%v23,%%v17,%%v20,%%v23\n\t"
"vfmadb %%v23,%%v19,%%v21,%%v23\n\t" "vfmadb %%v23,%%v19,%%v21,%%v23\n\t"
"vst %%v22,0(%4) \n\t" "vst %%v22,0(%[y])\n\t"
"vst %%v23,16(%4) \n\t" "vst %%v23,16(%[y])\n\t"
: : "+m"(*(struct { FLOAT x[4]; } *) y),[n] "+&r"(n)
:"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZR"((const FLOAT (*)[n * 2])x),"ZQ"((FLOAT (*)[4])y),"ZQ"((const FLOAT (*)[2])alpha) : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0),
:"memory","cc","r0","r1","v16","v17","v18","v19","v20","v21","v22","v23" "m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1),
); "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x),
"m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha)
: "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
"v22", "v23");
} }
static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y,
{ FLOAT *alpha) {
__asm__ volatile ( __asm__("vzero %%v16\n\t"
"vzero %%v16 \n\t" "vzero %%v17\n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"srlg %%r0,%0,1 \n\t" "srlg %[n],%[n],1\n\t"
"0:\n\t" "0:\n\t"
"pfd 1,1024(%%r1,%1) \n\t" "pfd 1,1024(%%r1,%[ap])\n\t"
"pfd 1,1024(%%r1,%2) \n\t" "pfd 1,1024(%%r1,%[x])\n\t"
"vl %%v0,0(%%r1,%[x])\n\t"
"vl %%v17,0(%%r1,%2) \n\t"
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
"vleg %%v18,8(%%r1,%2),0 \n\t" "vleg %%v1,8(%%r1,%[x]),0\n\t"
"wflcdb %%v18,%%v18 \n\t" "wflcdb %%v1,%%v1\n\t"
"vleg %%v18,0(%%r1,%2),1 \n\t" "vleg %%v1,0(%%r1,%[x]),1\n\t"
#else #else
"vleg %%v18,0(%%r1,%2),1 \n\t" "vleg %%v1,0(%%r1,%[x]),1\n\t"
"vflcdb %%v18,%%v18 \n\t" "vflcdb %%v1,%%v1\n\t"
"vleg %%v18,8(%%r1,%2),0 \n\t" "vleg %%v1,8(%%r1,%[x]),0\n\t"
#endif #endif
"vlrepg %%v18,0(%%r1,%[ap])\n\t"
"vlrepg %%v19,0(%%r1,%1) \n\t" "vlrepg %%v19,8(%%r1,%[ap])\n\t"
"vlrepg %%v20,8(%%r1,%1) \n\t" "vfmadb %%v16,%%v18,%%v0,%%v16\n\t"
"vfmadb %%v17,%%v19,%%v1,%%v17\n\t"
"vfmadb %%v16,%%v19,%%v17,%%v16 \n\t" "vl %%v0,16(%%r1,%[x])\n\t"
"vfmadb %%v16,%%v20,%%v18,%%v16 \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
"vleg %%v18,24(%%r1,%2),0 \n\t" "vleg %%v1,24(%%r1,%[x]),0\n\t"
"wflcdb %%v18,%%v18 \n\t" "wflcdb %%v1,%%v1\n\t"
"vleg %%v18,16(%%r1,%2),1 \n\t" "vleg %%v1,16(%%r1,%[x]),1\n\t"
#else #else
"vleg %%v18,16(%%r1,%2),1 \n\t" "vleg %%v1,16(%%r1,%[x]),1\n\t"
"vflcdb %%v18,%%v18 \n\t" "vflcdb %%v1,%%v1\n\t"
"vleg %%v18,24(%%r1,%2),0 \n\t" "vleg %%v1,24(%%r1,%[x]),0\n\t"
#endif #endif
"vlrepg %%v18,16(%%r1,%[ap])\n\t"
"vlrepg %%v19,16(%%r1,%1) \n\t" "vlrepg %%v19,24(%%r1,%[ap])\n\t"
"vlrepg %%v20,24(%%r1,%1) \n\t" "vfmadb %%v16,%%v18,%%v0,%%v16\n\t"
"vfmadb %%v17,%%v19,%%v1,%%v17\n\t"
"vfmadb %%v16,%%v19,%%v17,%%v16 \n\t"
"vfmadb %%v16,%%v20,%%v18,%%v16 \n\t"
"agfi %%r1,32\n\t" "agfi %%r1,32\n\t"
"brctg %%r0,0b \n\t" "brctg %[n],0b\n\t"
"vfadb %%v16,%%v16,%%v17\n\t"
"vpdi %%v17,%%v16,%%v16,4\n\t" "vpdi %%v17,%%v16,%%v16,4\n\t"
#if !defined(XCONJ) #if !defined(XCONJ)
"vlrepg %%v18,0(%4) \n\t" "vlrepg %%v18,0(%[alpha])\n\t"
"vleg %%v19,8(%4),0 \n\t" "vleg %%v19,8(%[alpha]),0\n\t"
"wflcdb %%v19,%%v19\n\t" "wflcdb %%v19,%%v19\n\t"
"vleg %%v19,8(%4),1 \n\t" "vleg %%v19,8(%[alpha]),1\n\t"
#else #else
"vleg %%v18,0(%4),1 \n\t" "vleg %%v18,0(%[alpha]),1\n\t"
"vflcdb %%v18,%%v18\n\t" "vflcdb %%v18,%%v18\n\t"
"vleg %%v18,0(%4),0 \n\t" "vleg %%v18,0(%[alpha]),0\n\t"
"vlrepg %%v19,8(%4) \n\t" "vlrepg %%v19,8(%[alpha])\n\t"
#endif #endif
"vl %%v20,0(%3) \n\t" "vl %%v0,0(%[y])\n\t"
"vfmadb %%v20,%%v16,%%v18,%%v20 \n\t" "vfmadb %%v0,%%v16,%%v18,%%v0\n\t"
"vfmadb %%v20,%%v17,%%v19,%%v20 \n\t" "vfmadb %%v0,%%v17,%%v19,%%v0\n\t"
"vst %%v20,0(%3) \n\t" "vst %%v0,0(%[y])\n\t"
: : "+m"(*(struct { FLOAT x[2]; } *) y),[n] "+&r"(n)
:"r"(n),"ZR"((const FLOAT (*)[n * 2])ap),"ZR"((const FLOAT (*)[n * 2])x),"ZQ"((FLOAT (*)[2])y),"ZQ"((const FLOAT (*)[2])alpha) : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap),[ap] "a"(ap),
:"memory","cc","r0","r1","v16","v17","v18","v19","v20" "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x),
); "m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha)
: "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19");
} }
static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) {
{
BLASLONG i; BLASLONG i;
for ( i=0; i<n; i++ ) for (i = 0; i < n; i++) {
{
*dest = *src; *dest = *src;
*(dest + 1) = *(src + 1); *(dest + 1) = *(src + 1);
dest += 2; dest += 2;
@ -312,8 +311,9 @@ static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src)
} }
} }
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
{ FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y,
BLASLONG inc_y, FLOAT *buffer) {
BLASLONG i; BLASLONG i;
BLASLONG j; BLASLONG j;
FLOAT *a_ptr; FLOAT *a_ptr;
@ -329,8 +329,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
FLOAT ybuffer[8], *xbuffer; FLOAT ybuffer[8], *xbuffer;
FLOAT alpha[2]; FLOAT alpha[2];
if ( m < 1 ) return(0); if (m < 1)
if ( n < 1 ) return(0); return (0);
if (n < 1)
return (0);
inc_x <<= 1; inc_x <<= 1;
inc_y <<= 1; inc_y <<= 1;
@ -351,13 +353,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
BLASLONG NB = NBMAX; BLASLONG NB = NBMAX;
while ( NB == NBMAX ) while (NB == NBMAX) {
{
m1 -= NB; m1 -= NB;
if ( m1 < 0) if (m1 < 0) {
{ if (m2 == 0)
if ( m2 == 0 ) break; break;
NB = m2; NB = m2;
} }
@ -373,11 +374,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
else else
xbuffer = x_ptr; xbuffer = x_ptr;
if ( inc_y == 2 ) if (inc_y == 2) {
{
for( i = 0; i < n1 ; i++) for (i = 0; i < n1; i++) {
{
zgemv_kernel_4x4(NB, ap, xbuffer, y_ptr, alpha); zgemv_kernel_4x4(NB, ap, xbuffer, y_ptr, alpha);
ap[0] += lda4; ap[0] += lda4;
ap[1] += lda4; ap[1] += lda4;
@ -388,28 +387,23 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
} }
if ( n2 & 2 ) if (n2 & 2) {
{
zgemv_kernel_4x2(NB, ap, xbuffer, y_ptr, alpha); zgemv_kernel_4x2(NB, ap, xbuffer, y_ptr, alpha);
a_ptr += lda * 2; a_ptr += lda * 2;
y_ptr += 4; y_ptr += 4;
} }
if ( n2 & 1 ) if (n2 & 1) {
{
zgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha); zgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha);
/* a_ptr += lda; /* a_ptr += lda;
y_ptr += 2; */ y_ptr += 2; */
} }
} } else {
else
{
for( i = 0; i < n1 ; i++) for (i = 0; i < n1; i++) {
{
memset(ybuffer, 0, sizeof(ybuffer)); memset(ybuffer, 0, sizeof(ybuffer));
zgemv_kernel_4x4(NB, ap, xbuffer, ybuffer, alpha); zgemv_kernel_4x4(NB, ap, xbuffer, ybuffer, alpha);
ap[0] += lda4; ap[0] += lda4;
@ -433,8 +427,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
} }
for( i = 0; i < n2 ; i++) for (i = 0; i < n2; i++) {
{
memset(ybuffer, 0, sizeof(ybuffer)); memset(ybuffer, 0, sizeof(ybuffer));
zgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, alpha); zgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, alpha);
a_ptr += lda; a_ptr += lda;
@ -449,17 +442,15 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
x += NB * inc_x; x += NB * inc_x;
} }
if (m3 == 0)
return (0);
if ( m3 == 0 ) return(0);
x_ptr = x; x_ptr = x;
j = 0; j = 0;
a_ptr = a; a_ptr = a;
y_ptr = y; y_ptr = y;
if ( m3 == 3 ) if (m3 == 3) {
{
FLOAT temp_r; FLOAT temp_r;
FLOAT temp_i; FLOAT temp_i;
@ -471,8 +462,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
x_ptr += inc_x; x_ptr += inc_x;
FLOAT x4 = x_ptr[0]; FLOAT x4 = x_ptr[0];
FLOAT x5 = x_ptr[1]; FLOAT x5 = x_ptr[1];
while ( j < n) while (j < n) {
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
@ -505,9 +495,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
return (0); return (0);
} }
if (m3 == 2) {
if ( m3 == 2 )
{
FLOAT temp_r; FLOAT temp_r;
FLOAT temp_i; FLOAT temp_i;
@ -521,8 +509,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
FLOAT ar = alpha[0]; FLOAT ar = alpha[0];
FLOAT ai = alpha[1]; FLOAT ai = alpha[1];
while ( j < ( n & -2 )) while (j < (n & -2)) {
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
@ -565,9 +552,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
j += 2; j += 2;
} }
while (j < n) {
while ( j < n)
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
@ -597,9 +582,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
return (0); return (0);
} }
if (m3 == 1) {
if ( m3 == 1 )
{
FLOAT temp_r; FLOAT temp_r;
FLOAT temp_i; FLOAT temp_i;
@ -610,8 +593,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
FLOAT ar = alpha[0]; FLOAT ar = alpha[0];
FLOAT ai = alpha[1]; FLOAT ai = alpha[1];
while ( j < ( n & -2 )) while (j < (n & -2)) {
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
@ -646,8 +628,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
j += 2; j += 2;
} }
while ( j < n) while (j < n) {
{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -27,25 +27,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) {
{ __asm__("vlrepg %%v0,%[c]\n\t"
__asm__ ( "vlrepg %%v1,%[s]\n\t"
"vlrepg %%v0,%3 \n\t" "srlg %[n],%[n],4\n\t"
"vlrepg %%v1,%4 \n\t"
"srlg %%r0,%0,4 \n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"0:\n\t" "0:\n\t"
"pfd 2, 1024(%%r1,%1) \n\t" "pfd 2, 1024(%%r1,%[x])\n\t"
"pfd 2, 1024(%%r1,%2) \n\t" "pfd 2, 1024(%%r1,%[y])\n\t"
"vl %%v24, 0(%%r1,%1) \n\t" "vl %%v24, 0(%%r1,%[x])\n\t"
"vl %%v25, 16(%%r1,%1) \n\t" "vl %%v25, 16(%%r1,%[x])\n\t"
"vl %%v26, 32(%%r1,%1) \n\t" "vl %%v26, 32(%%r1,%[x])\n\t"
"vl %%v27, 48(%%r1,%1) \n\t" "vl %%v27, 48(%%r1,%[x])\n\t"
"vl %%v16, 0(%%r1,%2) \n\t" "vl %%v16, 0(%%r1,%[y])\n\t"
"vl %%v17, 16(%%r1,%2) \n\t" "vl %%v17, 16(%%r1,%[y])\n\t"
"vl %%v18, 32(%%r1,%2) \n\t" "vl %%v18, 32(%%r1,%[y])\n\t"
"vl %%v19, 48(%%r1,%2) \n\t" "vl %%v19, 48(%%r1,%[y])\n\t"
"vfmdb %%v28,%%v24,%%v0\n\t" "vfmdb %%v28,%%v24,%%v0\n\t"
"vfmdb %%v29,%%v25,%%v0\n\t" "vfmdb %%v29,%%v25,%%v0\n\t"
"vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
@ -63,25 +60,22 @@ static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
"vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31\n\t" "vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vst %%v28, 0(%%r1,%[x])\n\t"
"vst %%v28, 0(%%r1,%1) \n\t" "vst %%v29, 16(%%r1,%[x])\n\t"
"vst %%v29, 16(%%r1,%1) \n\t" "vst %%v30, 32(%%r1,%[x])\n\t"
"vst %%v30, 32(%%r1,%1) \n\t" "vst %%v31, 48(%%r1,%[x])\n\t"
"vst %%v31, 48(%%r1,%1) \n\t" "vst %%v20, 0(%%r1,%[y])\n\t"
"vst %%v20, 0(%%r1,%2) \n\t" "vst %%v21, 16(%%r1,%[y])\n\t"
"vst %%v21, 16(%%r1,%2) \n\t" "vst %%v22, 32(%%r1,%[y])\n\t"
"vst %%v22, 32(%%r1,%2) \n\t" "vst %%v23, 48(%%r1,%[y])\n\t"
"vst %%v23, 48(%%r1,%2) \n\t" "vl %%v24, 64(%%r1,%[x])\n\t"
"vl %%v25, 80(%%r1,%[x])\n\t"
"vl %%v24, 64(%%r1,%1) \n\t" "vl %%v26, 96(%%r1,%[x])\n\t"
"vl %%v25, 80(%%r1,%1) \n\t" "vl %%v27, 112(%%r1,%[x])\n\t"
"vl %%v26, 96(%%r1,%1) \n\t" "vl %%v16, 64(%%r1,%[y])\n\t"
"vl %%v27, 112(%%r1,%1) \n\t" "vl %%v17, 80(%%r1,%[y])\n\t"
"vl %%v16, 64(%%r1,%2) \n\t" "vl %%v18, 96(%%r1,%[y])\n\t"
"vl %%v17, 80(%%r1,%2) \n\t" "vl %%v19, 112(%%r1,%[y])\n\t"
"vl %%v18, 96(%%r1,%2) \n\t"
"vl %%v19, 112(%%r1,%2) \n\t"
"vfmdb %%v28,%%v24,%%v0\n\t" "vfmdb %%v28,%%v24,%%v0\n\t"
"vfmdb %%v29,%%v25,%%v0\n\t" "vfmdb %%v29,%%v25,%%v0\n\t"
"vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
@ -99,25 +93,22 @@ static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
"vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31\n\t" "vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vst %%v28, 64(%%r1,%[x])\n\t"
"vst %%v28, 64(%%r1,%1) \n\t" "vst %%v29, 80(%%r1,%[x])\n\t"
"vst %%v29, 80(%%r1,%1) \n\t" "vst %%v30, 96(%%r1,%[x])\n\t"
"vst %%v30, 96(%%r1,%1) \n\t" "vst %%v31, 112(%%r1,%[x])\n\t"
"vst %%v31, 112(%%r1,%1) \n\t" "vst %%v20, 64(%%r1,%[y])\n\t"
"vst %%v20, 64(%%r1,%2) \n\t" "vst %%v21, 80(%%r1,%[y])\n\t"
"vst %%v21, 80(%%r1,%2) \n\t" "vst %%v22, 96(%%r1,%[y])\n\t"
"vst %%v22, 96(%%r1,%2) \n\t" "vst %%v23, 112(%%r1,%[y])\n\t"
"vst %%v23, 112(%%r1,%2) \n\t" "vl %%v24, 128(%%r1,%[x])\n\t"
"vl %%v25, 144(%%r1,%[x])\n\t"
"vl %%v24, 128(%%r1,%1) \n\t" "vl %%v26, 160(%%r1,%[x])\n\t"
"vl %%v25, 144(%%r1,%1) \n\t" "vl %%v27, 176(%%r1,%[x])\n\t"
"vl %%v26, 160(%%r1,%1) \n\t" "vl %%v16, 128(%%r1,%[y])\n\t"
"vl %%v27, 176(%%r1,%1) \n\t" "vl %%v17, 144(%%r1,%[y])\n\t"
"vl %%v16, 128(%%r1,%2) \n\t" "vl %%v18, 160(%%r1,%[y])\n\t"
"vl %%v17, 144(%%r1,%2) \n\t" "vl %%v19, 176(%%r1,%[y])\n\t"
"vl %%v18, 160(%%r1,%2) \n\t"
"vl %%v19, 176(%%r1,%2) \n\t"
"vfmdb %%v28,%%v24,%%v0\n\t" "vfmdb %%v28,%%v24,%%v0\n\t"
"vfmdb %%v29,%%v25,%%v0\n\t" "vfmdb %%v29,%%v25,%%v0\n\t"
"vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
@ -135,25 +126,22 @@ static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
"vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31\n\t" "vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vst %%v28, 128(%%r1,%[x])\n\t"
"vst %%v28, 128(%%r1,%1) \n\t" "vst %%v29, 144(%%r1,%[x])\n\t"
"vst %%v29, 144(%%r1,%1) \n\t" "vst %%v30, 160(%%r1,%[x])\n\t"
"vst %%v30, 160(%%r1,%1) \n\t" "vst %%v31, 176(%%r1,%[x])\n\t"
"vst %%v31, 176(%%r1,%1) \n\t" "vst %%v20, 128(%%r1,%[y])\n\t"
"vst %%v20, 128(%%r1,%2) \n\t" "vst %%v21, 144(%%r1,%[y])\n\t"
"vst %%v21, 144(%%r1,%2) \n\t" "vst %%v22, 160(%%r1,%[y])\n\t"
"vst %%v22, 160(%%r1,%2) \n\t" "vst %%v23, 176(%%r1,%[y])\n\t"
"vst %%v23, 176(%%r1,%2) \n\t" "vl %%v24, 192(%%r1,%[x])\n\t"
"vl %%v25, 208(%%r1,%[x])\n\t"
"vl %%v24, 192(%%r1,%1) \n\t" "vl %%v26, 224(%%r1,%[x])\n\t"
"vl %%v25, 208(%%r1,%1) \n\t" "vl %%v27, 240(%%r1,%[x])\n\t"
"vl %%v26, 224(%%r1,%1) \n\t" "vl %%v16, 192(%%r1,%[y])\n\t"
"vl %%v27, 240(%%r1,%1) \n\t" "vl %%v17, 208(%%r1,%[y])\n\t"
"vl %%v16, 192(%%r1,%2) \n\t" "vl %%v18, 224(%%r1,%[y])\n\t"
"vl %%v17, 208(%%r1,%2) \n\t" "vl %%v19, 240(%%r1,%[y])\n\t"
"vl %%v18, 224(%%r1,%2) \n\t"
"vl %%v19, 240(%%r1,%2) \n\t"
"vfmdb %%v28,%%v24,%%v0\n\t" "vfmdb %%v28,%%v24,%%v0\n\t"
"vfmdb %%v29,%%v25,%%v0\n\t" "vfmdb %%v29,%%v25,%%v0\n\t"
"vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
@ -171,40 +159,39 @@ static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
"vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31\n\t" "vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
"vst %%v28, 192(%%r1,%[x])\n\t"
"vst %%v28, 192(%%r1,%1) \n\t" "vst %%v29, 208(%%r1,%[x])\n\t"
"vst %%v29, 208(%%r1,%1) \n\t" "vst %%v30, 224(%%r1,%[x])\n\t"
"vst %%v30, 224(%%r1,%1) \n\t" "vst %%v31, 240(%%r1,%[x])\n\t"
"vst %%v31, 240(%%r1,%1) \n\t" "vst %%v20, 192(%%r1,%[y])\n\t"
"vst %%v20, 192(%%r1,%2) \n\t" "vst %%v21, 208(%%r1,%[y])\n\t"
"vst %%v21, 208(%%r1,%2) \n\t" "vst %%v22, 224(%%r1,%[y])\n\t"
"vst %%v22, 224(%%r1,%2) \n\t" "vst %%v23, 240(%%r1,%[y])\n\t"
"vst %%v23, 240(%%r1,%2) \n\t"
"agfi %%r1,256\n\t" "agfi %%r1,256\n\t"
"brctg %%r0,0b " "brctg %[n],0b"
: : "+m"(*(struct { FLOAT x[n * 2]; } *) x),
:"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"m"(*c),"m"(*s) "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n)
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" : [x] "a"(x),[y] "a"(y),[c] "Q"(*c),[s] "Q"(*s)
); : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
"v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
} }
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
{ FLOAT c, FLOAT s) {
BLASLONG i = 0; BLASLONG i = 0;
BLASLONG ix = 0, iy = 0; BLASLONG ix = 0, iy = 0;
FLOAT temp[2]; FLOAT temp[2];
BLASLONG inc_x2; BLASLONG inc_x2;
BLASLONG inc_y2; BLASLONG inc_y2;
if ( n <= 0 ) return(0); if (n <= 0)
return (0);
if ( (inc_x == 1) && (inc_y == 1) ) if ((inc_x == 1) && (inc_y == 1)) {
{
BLASLONG n1 = n & -16; BLASLONG n1 = n & -16;
if ( n1 > 0 ) if (n1 > 0) {
{
FLOAT cosa, sina; FLOAT cosa, sina;
cosa = c; cosa = c;
sina = s; sina = s;
@ -213,8 +200,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
ix = 2 * n1; ix = 2 * n1;
} }
while(i < n) while (i < n) {
{
temp[0] = c * x[ix] + s * y[ix]; temp[0] = c * x[ix] + s * y[ix];
temp[1] = c * x[ix + 1] + s * y[ix + 1]; temp[1] = c * x[ix + 1] + s * y[ix + 1];
y[ix] = c * y[ix] - s * x[ix]; y[ix] = c * y[ix] - s * x[ix];
@ -227,14 +213,10 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
} }
} else {
}
else
{
inc_x2 = 2 * inc_x; inc_x2 = 2 * inc_x;
inc_y2 = 2 * inc_y; inc_y2 = 2 * inc_y;
while(i < n) while (i < n) {
{
temp[0] = c * x[ix] + s * y[iy]; temp[0] = c * x[ix] + s * y[iy];
temp[1] = c * x[ix + 1] + s * y[iy + 1]; temp[1] = c * x[ix + 1] + s * y[iy + 1];
y[iy] = c * y[iy] - s * x[ix]; y[iy] = c * y[iy] - s * x[ix];
@ -252,5 +234,3 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
return (0); return (0);
} }

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013 - 2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -27,26 +27,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
static void zscal_kernel_8(BLASLONG n, FLOAT *alpha, FLOAT *x) static void zscal_kernel_8(BLASLONG n, FLOAT *alpha, FLOAT *x) {
{ __asm__("vlrepg %%v0,0(%[alpha])\n\t"
__asm__ volatile( "vleg %%v1,8(%[alpha]),0\n\t"
"vlrepg %%v0,0(%1) \n\t"
"vleg %%v1,8(%1),0 \n\t"
"wflcdb %%v1,%%v1\n\t" "wflcdb %%v1,%%v1\n\t"
"vleg %%v1,8(%1),1 \n\t" "vleg %%v1,8(%[alpha]),1\n\t"
"srlg %%r0,%0,3 \n\t" "srlg %[n],%[n],3\n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"0:\n\t" "0:\n\t"
"pfd 2, 1024(%%r1,%2) \n\t" "pfd 2, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%2) \n\t" "vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%2) \n\t" "vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%2) \n\t" "vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%2) \n\t" "vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%2) \n\t" "vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%2) \n\t" "vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%2) \n\t" "vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"vpdi %%v24,%%v16,%%v16,4\n\t" "vpdi %%v24,%%v16,%%v16,4\n\t"
"vpdi %%v25,%%v17,%%v17,4\n\t" "vpdi %%v25,%%v17,%%v17,4\n\t"
"vpdi %%v26,%%v18,%%v18,4\n\t" "vpdi %%v26,%%v18,%%v18,4\n\t"
@ -55,7 +52,6 @@ static void zscal_kernel_8(BLASLONG n, FLOAT *alpha, FLOAT *x)
"vpdi %%v29,%%v21,%%v21,4\n\t" "vpdi %%v29,%%v21,%%v21,4\n\t"
"vpdi %%v30,%%v22,%%v22,4\n\t" "vpdi %%v30,%%v22,%%v22,4\n\t"
"vpdi %%v31,%%v23,%%v23,4\n\t" "vpdi %%v31,%%v23,%%v23,4\n\t"
"vfmdb %%v16,%%v16,%%v0\n\t" "vfmdb %%v16,%%v16,%%v0\n\t"
"vfmdb %%v17,%%v17,%%v0\n\t" "vfmdb %%v17,%%v17,%%v0\n\t"
"vfmdb %%v18,%%v18,%%v0\n\t" "vfmdb %%v18,%%v18,%%v0\n\t"
@ -72,43 +68,40 @@ static void zscal_kernel_8(BLASLONG n, FLOAT *alpha, FLOAT *x)
"vfmadb %%v21,%%v29,%%v1,%%v21\n\t" "vfmadb %%v21,%%v29,%%v1,%%v21\n\t"
"vfmadb %%v22,%%v30,%%v1,%%v22\n\t" "vfmadb %%v22,%%v30,%%v1,%%v22\n\t"
"vfmadb %%v23,%%v31,%%v1,%%v23\n\t" "vfmadb %%v23,%%v31,%%v1,%%v23\n\t"
"vst %%v16,0(%%r1,%[x])\n\t"
"vst %%v16,0(%%r1,%2) \n\t" "vst %%v17,16(%%r1,%[x])\n\t"
"vst %%v17,16(%%r1,%2) \n\t" "vst %%v18,32(%%r1,%[x])\n\t"
"vst %%v18,32(%%r1,%2) \n\t" "vst %%v19,48(%%r1,%[x])\n\t"
"vst %%v19,48(%%r1,%2) \n\t" "vst %%v20,64(%%r1,%[x])\n\t"
"vst %%v20,64(%%r1,%2) \n\t" "vst %%v21,80(%%r1,%[x])\n\t"
"vst %%v21,80(%%r1,%2) \n\t" "vst %%v22,96(%%r1,%[x])\n\t"
"vst %%v22,96(%%r1,%2) \n\t" "vst %%v23,112(%%r1,%[x])\n\t"
"vst %%v23,112(%%r1,%2) \n\t"
"agfi %%r1,128\n\t" "agfi %%r1,128\n\t"
"brctg %%r0,0b " "brctg %[n],0b"
: : "+m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n)
:"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x) : [x] "a"(x), "m"(*(const struct { FLOAT x[2]; } *) alpha),
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" [alpha] "a"(alpha)
); : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
"v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31");
} }
static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) {
{ __asm__("vleg %%v0,8(%[alpha]),0\n\t"
__asm__ volatile(
"vleg %%v0,8(%1),0 \n\t"
"wflcdb %%v0,%%v0\n\t" "wflcdb %%v0,%%v0\n\t"
"vleg %%v0,8(%1),1 \n\t" "vleg %%v0,8(%[alpha]),1\n\t"
"srlg %%r0,%0,3 \n\t" "srlg %[n],%[n],3\n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"0:\n\t" "0:\n\t"
"pfd 2, 1024(%%r1,%2) \n\t" "pfd 2, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%2) \n\t" "vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%2) \n\t" "vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%2) \n\t" "vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%2) \n\t" "vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%2) \n\t" "vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%2) \n\t" "vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%2) \n\t" "vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"vpdi %%v16,%%v16,%%v16,4\n\t" "vpdi %%v16,%%v16,%%v16,4\n\t"
"vpdi %%v17,%%v17,%%v17,4\n\t" "vpdi %%v17,%%v17,%%v17,4\n\t"
"vpdi %%v18,%%v18,%%v18,4\n\t" "vpdi %%v18,%%v18,%%v18,4\n\t"
@ -117,7 +110,6 @@ static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x)
"vpdi %%v21,%%v21,%%v21,4\n\t" "vpdi %%v21,%%v21,%%v21,4\n\t"
"vpdi %%v22,%%v22,%%v22,4\n\t" "vpdi %%v22,%%v22,%%v22,4\n\t"
"vpdi %%v23,%%v23,%%v23,4\n\t" "vpdi %%v23,%%v23,%%v23,4\n\t"
"vfmdb %%v16,%%v16,%%v0\n\t" "vfmdb %%v16,%%v16,%%v0\n\t"
"vfmdb %%v17,%%v17,%%v0\n\t" "vfmdb %%v17,%%v17,%%v0\n\t"
"vfmdb %%v18,%%v18,%%v0\n\t" "vfmdb %%v18,%%v18,%%v0\n\t"
@ -126,42 +118,37 @@ static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x)
"vfmdb %%v21,%%v21,%%v0\n\t" "vfmdb %%v21,%%v21,%%v0\n\t"
"vfmdb %%v22,%%v22,%%v0\n\t" "vfmdb %%v22,%%v22,%%v0\n\t"
"vfmdb %%v23,%%v23,%%v0\n\t" "vfmdb %%v23,%%v23,%%v0\n\t"
"vst %%v16,0(%%r1,%[x])\n\t"
"vst %%v16,0(%%r1,%2) \n\t" "vst %%v17,16(%%r1,%[x])\n\t"
"vst %%v17,16(%%r1,%2) \n\t" "vst %%v18,32(%%r1,%[x])\n\t"
"vst %%v18,32(%%r1,%2) \n\t" "vst %%v19,48(%%r1,%[x])\n\t"
"vst %%v19,48(%%r1,%2) \n\t" "vst %%v20,64(%%r1,%[x])\n\t"
"vst %%v20,64(%%r1,%2) \n\t" "vst %%v21,80(%%r1,%[x])\n\t"
"vst %%v21,80(%%r1,%2) \n\t" "vst %%v22,96(%%r1,%[x])\n\t"
"vst %%v22,96(%%r1,%2) \n\t" "vst %%v23,112(%%r1,%[x])\n\t"
"vst %%v23,112(%%r1,%2) \n\t"
"agfi %%r1,128\n\t" "agfi %%r1,128\n\t"
"brctg %%r0,0b " "brctg %[n],0b"
: : "+m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n)
:"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x) : [x] "a"(x), "m"(*(const struct { FLOAT x[2]; } *) alpha),
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23" [alpha] "a"(alpha)
); : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23");
} }
static void zscal_kernel_8_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) static void zscal_kernel_8_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) {
{ __asm__("vlrepg %%v0,0(%[alpha])\n\t"
__asm__ volatile( "srlg %[n],%[n],3\n\t"
"vlrepg %%v0,0(%1) \n\t"
"srlg %%r0,%0,3 \n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"0:\n\t" "0:\n\t"
"pfd 2, 1024(%%r1,%2) \n\t" "pfd 2, 1024(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%[x])\n\t"
"vl %%v16,0(%%r1,%2) \n\t" "vl %%v17,16(%%r1,%[x])\n\t"
"vl %%v17,16(%%r1,%2) \n\t" "vl %%v18,32(%%r1,%[x])\n\t"
"vl %%v18,32(%%r1,%2) \n\t" "vl %%v19,48(%%r1,%[x])\n\t"
"vl %%v19,48(%%r1,%2) \n\t" "vl %%v20,64(%%r1,%[x])\n\t"
"vl %%v20,64(%%r1,%2) \n\t" "vl %%v21,80(%%r1,%[x])\n\t"
"vl %%v21,80(%%r1,%2) \n\t" "vl %%v22,96(%%r1,%[x])\n\t"
"vl %%v22,96(%%r1,%2) \n\t" "vl %%v23,112(%%r1,%[x])\n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"vfmdb %%v16,%%v16,%%v0\n\t" "vfmdb %%v16,%%v16,%%v0\n\t"
"vfmdb %%v17,%%v17,%%v0\n\t" "vfmdb %%v17,%%v17,%%v0\n\t"
"vfmdb %%v18,%%v18,%%v0\n\t" "vfmdb %%v18,%%v18,%%v0\n\t"
@ -170,55 +157,46 @@ static void zscal_kernel_8_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x)
"vfmdb %%v21,%%v21,%%v0\n\t" "vfmdb %%v21,%%v21,%%v0\n\t"
"vfmdb %%v22,%%v22,%%v0\n\t" "vfmdb %%v22,%%v22,%%v0\n\t"
"vfmdb %%v23,%%v23,%%v0\n\t" "vfmdb %%v23,%%v23,%%v0\n\t"
"vst %%v16,0(%%r1,%[x])\n\t"
"vst %%v16,0(%%r1,%2) \n\t" "vst %%v17,16(%%r1,%[x])\n\t"
"vst %%v17,16(%%r1,%2) \n\t" "vst %%v18,32(%%r1,%[x])\n\t"
"vst %%v18,32(%%r1,%2) \n\t" "vst %%v19,48(%%r1,%[x])\n\t"
"vst %%v19,48(%%r1,%2) \n\t" "vst %%v20,64(%%r1,%[x])\n\t"
"vst %%v20,64(%%r1,%2) \n\t" "vst %%v21,80(%%r1,%[x])\n\t"
"vst %%v21,80(%%r1,%2) \n\t" "vst %%v22,96(%%r1,%[x])\n\t"
"vst %%v22,96(%%r1,%2) \n\t" "vst %%v23,112(%%r1,%[x])\n\t"
"vst %%v23,112(%%r1,%2) \n\t"
"agfi %%r1,128\n\t" "agfi %%r1,128\n\t"
"brctg %%r0,0b " "brctg %[n],0b"
: : "+m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n)
:"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x) : [x] "a"(x), "m"(*(const struct { FLOAT x[2]; } *) alpha),
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23" [alpha] "a"(alpha)
); : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
"v23");
} }
static void zscal_kernel_8_zero(BLASLONG n, FLOAT *x) static void zscal_kernel_8_zero(BLASLONG n, FLOAT *x) {
{ __asm__("vzero %%v0\n\t"
__asm__ volatile( "srlg %[n],%[n],3\n\t"
"vzero %%v24 \n\t"
"vzero %%v25 \n\t"
"vzero %%v26 \n\t"
"vzero %%v27 \n\t"
"srlg %%r0,%0,3 \n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"0:\n\t" "0:\n\t"
"pfd 2, 1024(%%r1,%1) \n\t" "pfd 2, 1024(%%r1,%[x])\n\t"
"vst %%v0,0(%%r1,%[x])\n\t"
"vst %%v24,0(%%r1,%1) \n\t" "vst %%v0,16(%%r1,%[x])\n\t"
"vst %%v25,16(%%r1,%1) \n\t" "vst %%v0,32(%%r1,%[x])\n\t"
"vst %%v26,32(%%r1,%1) \n\t" "vst %%v0,48(%%r1,%[x])\n\t"
"vst %%v27,48(%%r1,%1) \n\t" "vst %%v0,64(%%r1,%[x])\n\t"
"vst %%v24,64(%%r1,%1) \n\t" "vst %%v0,80(%%r1,%[x])\n\t"
"vst %%v25,80(%%r1,%1) \n\t" "vst %%v0,96(%%r1,%[x])\n\t"
"vst %%v26,96(%%r1,%1) \n\t" "vst %%v0,112(%%r1,%[x])\n\t"
"vst %%v27,112(%%r1,%1) \n\t"
"agfi %%r1,128\n\t" "agfi %%r1,128\n\t"
"brctg %%r0,0b " "brctg %[n],0b"
: : "=m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n)
:"r"(n),"ZR"((FLOAT (*)[n * 2])x) : [x] "a"(x)
:"memory","cc","r0","r1","v24","v25","v26","v27" : "cc", "r1", "v0");
);
} }
static void zscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_x) static void zscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x,
{ BLASLONG inc_x) {
BLASLONG i; BLASLONG i;
BLASLONG inc_x2 = 2 * inc_x; BLASLONG inc_x2 = 2 * inc_x;
BLASLONG inc_x3 = inc_x2 + inc_x; BLASLONG inc_x3 = inc_x2 + inc_x;
@ -226,8 +204,7 @@ static void zscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_
FLOAT da_r = alpha[0]; FLOAT da_r = alpha[0];
FLOAT da_i = alpha[1]; FLOAT da_i = alpha[1];
for (i = 0; i < n; i += 4) for (i = 0; i < n; i += 4) {
{
t0 = da_r * x[0] - da_i * x[1]; t0 = da_r * x[0] - da_i * x[1];
t1 = da_r * x[inc_x] - da_i * x[inc_x + 1]; t1 = da_r * x[inc_x] - da_i * x[inc_x + 1];
t2 = da_r * x[inc_x2] - da_i * x[inc_x2 + 1]; t2 = da_r * x[inc_x2] - da_i * x[inc_x2 + 1];
@ -247,7 +224,9 @@ static void zscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_
} }
} }
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
BLASLONG dummy2) {
BLASLONG i = 0, j = 0; BLASLONG i = 0, j = 0;
FLOAT temp0; FLOAT temp0;
FLOAT temp1; FLOAT temp1;
@ -307,13 +286,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
} }
} }
} else { } else {
if (da_i == 0.0) { if (da_i == 0.0) {
BLASLONG n1 = n & -2; BLASLONG n1 = n & -2;
@ -368,7 +344,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
return (0); return (0);
} }
BLASLONG n1 = n & -8; BLASLONG n1 = n & -8;
if (n1 > 0) { if (n1 > 0) {
@ -380,8 +355,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
zscal_kernel_8_zero(n1, x); zscal_kernel_8_zero(n1, x);
else else
zscal_kernel_8_zero_r(n1, alpha, x); zscal_kernel_8_zero_r(n1, alpha, x);
else else if (da_i == 0)
if (da_i == 0)
zscal_kernel_8_zero_i(n1, alpha, x); zscal_kernel_8_zero_i(n1, alpha, x);
else else
zscal_kernel_8(n1, alpha, x); zscal_kernel_8(n1, alpha, x);
@ -390,7 +364,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
j = n1; j = n1;
} }
if (da_r == 0.0) { if (da_r == 0.0) {
if (da_i == 0.0) { if (da_i == 0.0) {

View File

@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are modification, are permitted provided that the following conditions are
@ -27,114 +27,108 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) {
{ __asm__("srlg %[n],%[n],4\n\t"
__asm__ volatile(
"srlg %%r0,%0,4 \n\t"
"xgr %%r1,%%r1\n\t" "xgr %%r1,%%r1\n\t"
"0:\n\t" "0:\n\t"
"pfd 2, 1024(%%r1,%1) \n\t" "pfd 2, 1024(%%r1,%[x])\n\t"
"pfd 2, 1024(%%r1,%2) \n\t" "pfd 2, 1024(%%r1,%[y])\n\t"
"vl %%v16, 0(%%r1,%[x])\n\t"
"vl %%v16, 0(%%r1,%1) \n\t" "vl %%v17, 16(%%r1,%[x])\n\t"
"vl %%v17, 16(%%r1,%1) \n\t" "vl %%v18, 32(%%r1,%[x])\n\t"
"vl %%v18, 32(%%r1,%1) \n\t" "vl %%v19, 48(%%r1,%[x])\n\t"
"vl %%v19, 48(%%r1,%1) \n\t" "vl %%v20, 64(%%r1,%[x])\n\t"
"vl %%v20, 64(%%r1,%1) \n\t" "vl %%v21, 80(%%r1,%[x])\n\t"
"vl %%v21, 80(%%r1,%1) \n\t" "vl %%v22, 96(%%r1,%[x])\n\t"
"vl %%v22, 96(%%r1,%1) \n\t" "vl %%v23, 112(%%r1,%[x])\n\t"
"vl %%v23, 112(%%r1,%1) \n\t" "vl %%v24, 128(%%r1,%[x])\n\t"
"vl %%v24, 128(%%r1,%1) \n\t" "vl %%v25, 144(%%r1,%[x])\n\t"
"vl %%v25, 144(%%r1,%1) \n\t" "vl %%v26, 160(%%r1,%[x])\n\t"
"vl %%v26, 160(%%r1,%1) \n\t" "vl %%v27, 176(%%r1,%[x])\n\t"
"vl %%v27, 176(%%r1,%1) \n\t" "vl %%v28, 192(%%r1,%[x])\n\t"
"vl %%v28, 192(%%r1,%1) \n\t" "vl %%v29, 208(%%r1,%[x])\n\t"
"vl %%v29, 208(%%r1,%1) \n\t" "vl %%v30, 224(%%r1,%[x])\n\t"
"vl %%v30, 224(%%r1,%1) \n\t" "vl %%v31, 240(%%r1,%[x])\n\t"
"vl %%v31, 240(%%r1,%1) \n\t" "vl %%v0, 0(%%r1,%[y])\n\t"
"vl %%v1, 16(%%r1,%[y])\n\t"
"vl %%v0, 0(%%r1,%2) \n\t" "vl %%v2, 32(%%r1,%[y])\n\t"
"vl %%v1, 16(%%r1,%2) \n\t" "vl %%v3, 48(%%r1,%[y])\n\t"
"vl %%v2, 32(%%r1,%2) \n\t" "vl %%v4, 64(%%r1,%[y])\n\t"
"vl %%v3, 48(%%r1,%2) \n\t" "vl %%v5, 80(%%r1,%[y])\n\t"
"vl %%v4, 64(%%r1,%2) \n\t" "vl %%v6, 96(%%r1,%[y])\n\t"
"vl %%v5, 80(%%r1,%2) \n\t" "vl %%v7, 112(%%r1,%[y])\n\t"
"vl %%v6, 96(%%r1,%2) \n\t" "vst %%v0, 0(%%r1,%[x])\n\t"
"vl %%v7, 112(%%r1,%2) \n\t" "vst %%v1, 16(%%r1,%[x])\n\t"
"vst %%v0, 0(%%r1,%1) \n\t" "vst %%v2, 32(%%r1,%[x])\n\t"
"vst %%v1, 16(%%r1,%1) \n\t" "vst %%v3, 48(%%r1,%[x])\n\t"
"vst %%v2, 32(%%r1,%1) \n\t" "vst %%v4, 64(%%r1,%[x])\n\t"
"vst %%v3, 48(%%r1,%1) \n\t" "vst %%v5, 80(%%r1,%[x])\n\t"
"vst %%v4, 64(%%r1,%1) \n\t" "vst %%v6, 96(%%r1,%[x])\n\t"
"vst %%v5, 80(%%r1,%1) \n\t" "vst %%v7, 112(%%r1,%[x])\n\t"
"vst %%v6, 96(%%r1,%1) \n\t" "vl %%v0, 128(%%r1,%[y])\n\t"
"vst %%v7, 112(%%r1,%1) \n\t" "vl %%v1, 144(%%r1,%[y])\n\t"
"vl %%v2, 160(%%r1,%[y])\n\t"
"vl %%v0, 128(%%r1,%2) \n\t" "vl %%v3, 176(%%r1,%[y])\n\t"
"vl %%v1, 144(%%r1,%2) \n\t" "vl %%v4, 192(%%r1,%[y])\n\t"
"vl %%v2, 160(%%r1,%2) \n\t" "vl %%v5, 208(%%r1,%[y])\n\t"
"vl %%v3, 176(%%r1,%2) \n\t" "vl %%v6, 224(%%r1,%[y])\n\t"
"vl %%v4, 192(%%r1,%2) \n\t" "vl %%v7, 240(%%r1,%[y])\n\t"
"vl %%v5, 208(%%r1,%2) \n\t" "vst %%v0, 128(%%r1,%[x])\n\t"
"vl %%v6, 224(%%r1,%2) \n\t" "vst %%v1, 144(%%r1,%[x])\n\t"
"vl %%v7, 240(%%r1,%2) \n\t" "vst %%v2, 160(%%r1,%[x])\n\t"
"vst %%v0, 128(%%r1,%1) \n\t" "vst %%v3, 176(%%r1,%[x])\n\t"
"vst %%v1, 144(%%r1,%1) \n\t" "vst %%v4, 192(%%r1,%[x])\n\t"
"vst %%v2, 160(%%r1,%1) \n\t" "vst %%v5, 208(%%r1,%[x])\n\t"
"vst %%v3, 176(%%r1,%1) \n\t" "vst %%v6, 224(%%r1,%[x])\n\t"
"vst %%v4, 192(%%r1,%1) \n\t" "vst %%v7, 240(%%r1,%[x])\n\t"
"vst %%v5, 208(%%r1,%1) \n\t" "vst %%v16, 0(%%r1,%[y])\n\t"
"vst %%v6, 224(%%r1,%1) \n\t" "vst %%v17, 16(%%r1,%[y])\n\t"
"vst %%v7, 240(%%r1,%1) \n\t" "vst %%v18, 32(%%r1,%[y])\n\t"
"vst %%v19, 48(%%r1,%[y])\n\t"
"vst %%v16, 0(%%r1,%2) \n\t" "vst %%v20, 64(%%r1,%[y])\n\t"
"vst %%v17, 16(%%r1,%2) \n\t" "vst %%v21, 80(%%r1,%[y])\n\t"
"vst %%v18, 32(%%r1,%2) \n\t" "vst %%v22, 96(%%r1,%[y])\n\t"
"vst %%v19, 48(%%r1,%2) \n\t" "vst %%v23, 112(%%r1,%[y])\n\t"
"vst %%v20, 64(%%r1,%2) \n\t" "vst %%v24, 128(%%r1,%[y])\n\t"
"vst %%v21, 80(%%r1,%2) \n\t" "vst %%v25, 144(%%r1,%[y])\n\t"
"vst %%v22, 96(%%r1,%2) \n\t" "vst %%v26, 160(%%r1,%[y])\n\t"
"vst %%v23, 112(%%r1,%2) \n\t" "vst %%v27, 176(%%r1,%[y])\n\t"
"vst %%v24, 128(%%r1,%2) \n\t" "vst %%v28, 192(%%r1,%[y])\n\t"
"vst %%v25, 144(%%r1,%2) \n\t" "vst %%v29, 208(%%r1,%[y])\n\t"
"vst %%v26, 160(%%r1,%2) \n\t" "vst %%v30, 224(%%r1,%[y])\n\t"
"vst %%v27, 176(%%r1,%2) \n\t" "vst %%v31, 240(%%r1,%[y])\n\t"
"vst %%v28, 192(%%r1,%2) \n\t"
"vst %%v29, 208(%%r1,%2) \n\t"
"vst %%v30, 224(%%r1,%2) \n\t"
"vst %%v31, 240(%%r1,%2) \n\t"
"agfi %%r1,256\n\t" "agfi %%r1,256\n\t"
"brctg %%r0,0b " "brctg %[n],0b"
: : "+m"(*(struct { FLOAT x[n * 2]; } *) x),
:"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y) "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" : [x] "a"(x),[y] "a"(y)
); : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");
} }
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3,
{ FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
FLOAT *dummy, BLASLONG dummy2) {
BLASLONG i = 0; BLASLONG i = 0;
BLASLONG ix = 0, iy = 0; BLASLONG ix = 0, iy = 0;
FLOAT temp[2]; FLOAT temp[2];
BLASLONG inc_x2, inc_y2; BLASLONG inc_x2, inc_y2;
if ( n <= 0 ) return(0); if (n <= 0)
return (0);
if ( (inc_x == 1) && (inc_y == 1 )) if ((inc_x == 1) && (inc_y == 1)) {
{
BLASLONG n1 = n & -16; BLASLONG n1 = n & -16;
if ( n1 > 0 ) if (n1 > 0) {
{
zswap_kernel_16(n1, x, y); zswap_kernel_16(n1, x, y);
i = n1; i = n1;
ix = 2 * n1; ix = 2 * n1;
iy = 2 * n1; iy = 2 * n1;
} }
while(i < n) while (i < n) {
{
temp[0] = x[ix]; temp[0] = x[ix];
temp[1] = x[ix + 1]; temp[1] = x[ix + 1];
@ -147,19 +141,14 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm
iy += 2; iy += 2;
i++; i++;
} }
} else {
}
else
{
inc_x2 = 2 * inc_x; inc_x2 = 2 * inc_x;
inc_y2 = 2 * inc_y; inc_y2 = 2 * inc_y;
while(i < n) while (i < n) {
{
temp[0] = x[ix]; temp[0] = x[ix];
temp[1] = x[ix + 1]; temp[1] = x[ix + 1];
@ -177,7 +166,4 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm
} }
return (0); return (0);
} }