commit
76bb74fcd4
|
|
@ -1,5 +1,5 @@
|
||||||
/***************************************************************************
|
/***************************************************************************
|
||||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
modification, are permitted provided that the following conditions are
|
modification, are permitted provided that the following conditions are
|
||||||
|
|
@ -28,27 +28,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
|
|
||||||
#if defined(DOUBLE)
|
#define CABS1(x,i) (fabsf(x[i]) + fabsf(x[i + 1]))
|
||||||
#define ABS fabs
|
|
||||||
#else
|
|
||||||
#define ABS fabsf
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))
|
static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x) {
|
||||||
|
|
||||||
static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x)
|
|
||||||
{
|
|
||||||
FLOAT amax;
|
FLOAT amax;
|
||||||
|
|
||||||
__asm__ volatile (
|
__asm__("vlef %%v0,0(%[x]),0\n\t"
|
||||||
"vlef %%v0,0(%2),0 \n\t"
|
"vlef %%v16,4(%[x]),0\n\t"
|
||||||
"vlef %%v16,4(%2),0 \n\t"
|
"vlef %%v0,8(%[x]),1\n\t"
|
||||||
"vlef %%v0,8(%2),1 \n\t"
|
"vlef %%v16,12(%[x]),1\n\t"
|
||||||
"vlef %%v16,12(%2),1 \n\t"
|
"vlef %%v0,16(%[x]),2\n\t"
|
||||||
"vlef %%v0,16(%2),2 \n\t"
|
"vlef %%v16,20(%[x]),2\n\t"
|
||||||
"vlef %%v16,20(%2),2 \n\t"
|
"vlef %%v0,24(%[x]),3\n\t"
|
||||||
"vlef %%v0,24(%2),3 \n\t"
|
"vlef %%v16,28(%[x]),3\n\t"
|
||||||
"vlef %%v16,28(%2),3 \n\t"
|
|
||||||
"vflpsb %%v0,%%v0\n\t"
|
"vflpsb %%v0,%%v0\n\t"
|
||||||
"vflpsb %%v16,%%v16\n\t"
|
"vflpsb %%v16,%%v16\n\t"
|
||||||
"vfasb %%v0,%%v0,%%v16\n\t"
|
"vfasb %%v0,%%v0,%%v16\n\t"
|
||||||
|
|
@ -68,51 +60,42 @@ static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x)
|
||||||
"vleib %%v1,25,13\n\t"
|
"vleib %%v1,25,13\n\t"
|
||||||
"vleib %%v1,26,14\n\t"
|
"vleib %%v1,26,14\n\t"
|
||||||
"vleib %%v1,27,15\n\t"
|
"vleib %%v1,27,15\n\t"
|
||||||
"srlg %%r0,%1,5 \n\t"
|
"srlg %[n],%[n],5\n\t"
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 1, 1024(%%r1,%2) \n\t"
|
"pfd 1, 1024(%%r1,%[x])\n\t"
|
||||||
|
"vl %%v16,0(%%r1,%[x])\n\t"
|
||||||
"vl %%v16,0(%%r1,%2) \n\t"
|
"vl %%v2,16(%%r1,%[x])\n\t"
|
||||||
"vl %%v2,16(%%r1,%2) \n\t"
|
|
||||||
"vpkg %%v17,%%v16,%%v2\n\t"
|
"vpkg %%v17,%%v16,%%v2\n\t"
|
||||||
"vperm %%v16,%%v16,%%v2,%%v1\n\t"
|
"vperm %%v16,%%v16,%%v2,%%v1\n\t"
|
||||||
|
"vl %%v18,32(%%r1,%[x])\n\t"
|
||||||
"vl %%v18,32(%%r1,%2) \n\t"
|
"vl %%v2,48(%%r1,%[x])\n\t"
|
||||||
"vl %%v2,48(%%r1,%2) \n\t"
|
|
||||||
"vpkg %%v19,%%v18,%%v2\n\t"
|
"vpkg %%v19,%%v18,%%v2\n\t"
|
||||||
"vperm %%v18,%%v18,%%v2,%%v1\n\t"
|
"vperm %%v18,%%v18,%%v2,%%v1\n\t"
|
||||||
|
"vl %%v20,64(%%r1,%[x])\n\t"
|
||||||
"vl %%v20,64(%%r1,%2) \n\t"
|
"vl %%v2,80(%%r1,%[x])\n\t"
|
||||||
"vl %%v2,80(%%r1,%2) \n\t"
|
|
||||||
"vpkg %%v21,%%v20,%%v2\n\t"
|
"vpkg %%v21,%%v20,%%v2\n\t"
|
||||||
"vperm %%v20,%%v20,%%v2,%%v1\n\t"
|
"vperm %%v20,%%v20,%%v2,%%v1\n\t"
|
||||||
|
"vl %%v22,96(%%r1,%[x])\n\t"
|
||||||
"vl %%v22,96(%%r1,%2) \n\t"
|
"vl %%v2,112(%%r1,%[x])\n\t"
|
||||||
"vl %%v2,112(%%r1,%2) \n\t"
|
|
||||||
"vpkg %%v23,%%v22,%%v2\n\t"
|
"vpkg %%v23,%%v22,%%v2\n\t"
|
||||||
"vperm %%v22,%%v22,%%v2,%%v1\n\t"
|
"vperm %%v22,%%v22,%%v2,%%v1\n\t"
|
||||||
|
"vl %%v24,128(%%r1,%[x])\n\t"
|
||||||
"vl %%v24,128(%%r1,%2) \n\t"
|
"vl %%v2,144(%%r1,%[x])\n\t"
|
||||||
"vl %%v2,144(%%r1,%2) \n\t"
|
|
||||||
"vpkg %%v25,%%v24,%%v2\n\t"
|
"vpkg %%v25,%%v24,%%v2\n\t"
|
||||||
"vperm %%v24,%%v24,%%v2,%%v1\n\t"
|
"vperm %%v24,%%v24,%%v2,%%v1\n\t"
|
||||||
|
"vl %%v26,160(%%r1,%[x])\n\t"
|
||||||
"vl %%v26,160(%%r1,%2) \n\t"
|
"vl %%v2,176(%%r1,%[x])\n\t"
|
||||||
"vl %%v2,176(%%r1,%2) \n\t"
|
|
||||||
"vpkg %%v27,%%v26,%%v2\n\t"
|
"vpkg %%v27,%%v26,%%v2\n\t"
|
||||||
"vperm %%v26,%%v26,%%v2,%%v1\n\t"
|
"vperm %%v26,%%v26,%%v2,%%v1\n\t"
|
||||||
|
"vl %%v28,192(%%r1,%[x])\n\t"
|
||||||
"vl %%v28,192(%%r1,%2) \n\t"
|
"vl %%v2,208(%%r1,%[x])\n\t"
|
||||||
"vl %%v2,208(%%r1,%2) \n\t"
|
|
||||||
"vpkg %%v29,%%v28,%%v2\n\t"
|
"vpkg %%v29,%%v28,%%v2\n\t"
|
||||||
"vperm %%v28,%%v28,%%v2,%%v1\n\t"
|
"vperm %%v28,%%v28,%%v2,%%v1\n\t"
|
||||||
|
"vl %%v30,224(%%r1,%[x])\n\t"
|
||||||
"vl %%v30,224(%%r1,%2) \n\t"
|
"vl %%v2,240(%%r1,%[x])\n\t"
|
||||||
"vl %%v2,240(%%r1,%2) \n\t"
|
|
||||||
"vpkg %%v31,%%v30,%%v2\n\t"
|
"vpkg %%v31,%%v30,%%v2\n\t"
|
||||||
"vperm %%v30,%%v30,%%v2,%%v1\n\t"
|
"vperm %%v30,%%v30,%%v2,%%v1\n\t"
|
||||||
|
|
||||||
"vflpsb %%v16,%%v16\n\t"
|
"vflpsb %%v16,%%v16\n\t"
|
||||||
"vflpsb %%v17,%%v17\n\t"
|
"vflpsb %%v17,%%v17\n\t"
|
||||||
"vflpsb %%v18,%%v18\n\t"
|
"vflpsb %%v18,%%v18\n\t"
|
||||||
|
|
@ -129,7 +112,6 @@ static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x)
|
||||||
"vflpsb %%v29,%%v29\n\t"
|
"vflpsb %%v29,%%v29\n\t"
|
||||||
"vflpsb %%v30,%%v30\n\t"
|
"vflpsb %%v30,%%v30\n\t"
|
||||||
"vflpsb %%v31,%%v31\n\t"
|
"vflpsb %%v31,%%v31\n\t"
|
||||||
|
|
||||||
"vfasb %%v16,%%v16,%%v17\n\t"
|
"vfasb %%v16,%%v16,%%v17\n\t"
|
||||||
"vfasb %%v18,%%v18,%%v19\n\t"
|
"vfasb %%v18,%%v18,%%v19\n\t"
|
||||||
"vfasb %%v20,%%v20,%%v21\n\t"
|
"vfasb %%v20,%%v20,%%v21\n\t"
|
||||||
|
|
@ -138,32 +120,26 @@ static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x)
|
||||||
"vfasb %%v26,%%v26,%%v27\n\t"
|
"vfasb %%v26,%%v26,%%v27\n\t"
|
||||||
"vfasb %%v28,%%v28,%%v29\n\t"
|
"vfasb %%v28,%%v28,%%v29\n\t"
|
||||||
"vfasb %%v30,%%v30,%%v31\n\t"
|
"vfasb %%v30,%%v30,%%v31\n\t"
|
||||||
|
|
||||||
"vfmaxsb %%v16,%%v16,%%v24,0\n\t"
|
"vfmaxsb %%v16,%%v16,%%v24,0\n\t"
|
||||||
"vfmaxsb %%v18,%%v18,%%v26,0\n\t"
|
"vfmaxsb %%v18,%%v18,%%v26,0\n\t"
|
||||||
"vfmaxsb %%v20,%%v20,%%v28,0\n\t"
|
"vfmaxsb %%v20,%%v20,%%v28,0\n\t"
|
||||||
"vfmaxsb %%v22,%%v22,%%v30,0\n\t"
|
"vfmaxsb %%v22,%%v22,%%v30,0\n\t"
|
||||||
|
|
||||||
"vfmaxsb %%v16,%%v16,%%v20,0\n\t"
|
"vfmaxsb %%v16,%%v16,%%v20,0\n\t"
|
||||||
"vfmaxsb %%v18,%%v18,%%v22,0\n\t"
|
"vfmaxsb %%v18,%%v18,%%v22,0\n\t"
|
||||||
|
|
||||||
"vfmaxsb %%v16,%%v16,%%v18,0\n\t"
|
"vfmaxsb %%v16,%%v16,%%v18,0\n\t"
|
||||||
|
|
||||||
"vfmaxsb %%v0,%%v0,%%v16,0\n\t"
|
"vfmaxsb %%v0,%%v0,%%v16,0\n\t"
|
||||||
|
|
||||||
"agfi %%r1, 256\n\t"
|
"agfi %%r1, 256\n\t"
|
||||||
"brctg %%r0, 0b \n\t"
|
"brctg %[n], 0b\n\t"
|
||||||
|
|
||||||
"veslg %%v16,%%v0,32\n\t"
|
"veslg %%v16,%%v0,32\n\t"
|
||||||
"vfmaxsb %%v0,%%v0,%%v16,0\n\t"
|
"vfmaxsb %%v0,%%v0,%%v16,0\n\t"
|
||||||
|
|
||||||
"vrepf %%v16,%%v0,2\n\t"
|
"vrepf %%v16,%%v0,2\n\t"
|
||||||
"wfmaxsb %%v0,%%v0,%%v16,0\n\t"
|
"wfmaxsb %%v0,%%v0,%%v16,0\n\t"
|
||||||
"ler %0,%%f0 "
|
"ler %[amax],%%f0"
|
||||||
:"=f"(amax)
|
: [amax] "=f"(amax),[n] "+&r"(n)
|
||||||
:"r"(n),"ZR"((const FLOAT (*)[n])x)
|
: "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
|
||||||
:"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
: "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20",
|
||||||
);
|
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
|
||||||
|
"v31");
|
||||||
|
|
||||||
return amax;
|
return amax;
|
||||||
}
|
}
|
||||||
|
|
@ -174,7 +150,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
FLOAT maxf = 0.0;
|
FLOAT maxf = 0.0;
|
||||||
BLASLONG inc_x2;
|
BLASLONG inc_x2;
|
||||||
|
|
||||||
if (n <= 0 || inc_x <= 0) return (maxf);
|
if (n <= 0 || inc_x <= 0)
|
||||||
|
return (maxf);
|
||||||
|
|
||||||
if (inc_x == 1) {
|
if (inc_x == 1) {
|
||||||
|
|
||||||
|
|
@ -184,9 +161,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
maxf = camax_kernel_32(n1, x);
|
maxf = camax_kernel_32(n1, x);
|
||||||
ix = n1 * 2;
|
ix = n1 * 2;
|
||||||
i = n1;
|
i = n1;
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
maxf = CABS1(x, 0);
|
maxf = CABS1(x, 0);
|
||||||
ix += 2;
|
ix += 2;
|
||||||
i++;
|
i++;
|
||||||
|
|
@ -228,7 +203,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
while (i < n) {
|
while (i < n) {
|
||||||
if (CABS1(x, ix) > maxf) {
|
if (CABS1(x, ix) > maxf) {
|
||||||
maxf = CABS1(x, ix);
|
maxf = CABS1(x, ix);
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/***************************************************************************
|
/***************************************************************************
|
||||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
modification, are permitted provided that the following conditions are
|
modification, are permitted provided that the following conditions are
|
||||||
|
|
@ -28,27 +28,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
|
|
||||||
#if defined(DOUBLE)
|
#define CABS1(x,i) (fabsf(x[i]) + fabsf(x[i + 1]))
|
||||||
#define ABS fabs
|
|
||||||
#else
|
|
||||||
#define ABS fabsf
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))
|
static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x) {
|
||||||
|
|
||||||
static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x)
|
|
||||||
{
|
|
||||||
FLOAT amin;
|
FLOAT amin;
|
||||||
|
|
||||||
__asm__ volatile (
|
__asm__("vlef %%v0,0(%[x]),0\n\t"
|
||||||
"vlef %%v0,0(%2),0 \n\t"
|
"vlef %%v16,4(%[x]),0\n\t"
|
||||||
"vlef %%v16,4(%2),0 \n\t"
|
"vlef %%v0,8(%[x]),1\n\t"
|
||||||
"vlef %%v0,8(%2),1 \n\t"
|
"vlef %%v16,12(%[x]),1\n\t"
|
||||||
"vlef %%v16,12(%2),1 \n\t"
|
"vlef %%v0,16(%[x]),2\n\t"
|
||||||
"vlef %%v0,16(%2),2 \n\t"
|
"vlef %%v16,20(%[x]),2\n\t"
|
||||||
"vlef %%v16,20(%2),2 \n\t"
|
"vlef %%v0,24(%[x]),3\n\t"
|
||||||
"vlef %%v0,24(%2),3 \n\t"
|
"vlef %%v16,28(%[x]),3\n\t"
|
||||||
"vlef %%v16,28(%2),3 \n\t"
|
|
||||||
"vflpsb %%v0,%%v0\n\t"
|
"vflpsb %%v0,%%v0\n\t"
|
||||||
"vflpsb %%v16,%%v16\n\t"
|
"vflpsb %%v16,%%v16\n\t"
|
||||||
"vfasb %%v0,%%v0,%%v16\n\t"
|
"vfasb %%v0,%%v0,%%v16\n\t"
|
||||||
|
|
@ -68,51 +60,42 @@ static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x)
|
||||||
"vleib %%v1,25,13\n\t"
|
"vleib %%v1,25,13\n\t"
|
||||||
"vleib %%v1,26,14\n\t"
|
"vleib %%v1,26,14\n\t"
|
||||||
"vleib %%v1,27,15\n\t"
|
"vleib %%v1,27,15\n\t"
|
||||||
"srlg %%r0,%1,5 \n\t"
|
"srlg %[n],%[n],5\n\t"
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 1, 1024(%%r1,%2) \n\t"
|
"pfd 1, 1024(%%r1,%[x])\n\t"
|
||||||
|
"vl %%v16,0(%%r1,%[x])\n\t"
|
||||||
"vl %%v16,0(%%r1,%2) \n\t"
|
"vl %%v2,16(%%r1,%[x])\n\t"
|
||||||
"vl %%v2,16(%%r1,%2) \n\t"
|
|
||||||
"vpkg %%v17,%%v16,%%v2\n\t"
|
"vpkg %%v17,%%v16,%%v2\n\t"
|
||||||
"vperm %%v16,%%v16,%%v2,%%v1\n\t"
|
"vperm %%v16,%%v16,%%v2,%%v1\n\t"
|
||||||
|
"vl %%v18,32(%%r1,%[x])\n\t"
|
||||||
"vl %%v18,32(%%r1,%2) \n\t"
|
"vl %%v2,48(%%r1,%[x])\n\t"
|
||||||
"vl %%v2,48(%%r1,%2) \n\t"
|
|
||||||
"vpkg %%v19,%%v18,%%v2\n\t"
|
"vpkg %%v19,%%v18,%%v2\n\t"
|
||||||
"vperm %%v18,%%v18,%%v2,%%v1\n\t"
|
"vperm %%v18,%%v18,%%v2,%%v1\n\t"
|
||||||
|
"vl %%v20,64(%%r1,%[x])\n\t"
|
||||||
"vl %%v20,64(%%r1,%2) \n\t"
|
"vl %%v2,80(%%r1,%[x])\n\t"
|
||||||
"vl %%v2,80(%%r1,%2) \n\t"
|
|
||||||
"vpkg %%v21,%%v20,%%v2\n\t"
|
"vpkg %%v21,%%v20,%%v2\n\t"
|
||||||
"vperm %%v20,%%v20,%%v2,%%v1\n\t"
|
"vperm %%v20,%%v20,%%v2,%%v1\n\t"
|
||||||
|
"vl %%v22,96(%%r1,%[x])\n\t"
|
||||||
"vl %%v22,96(%%r1,%2) \n\t"
|
"vl %%v2,112(%%r1,%[x])\n\t"
|
||||||
"vl %%v2,112(%%r1,%2) \n\t"
|
|
||||||
"vpkg %%v23,%%v22,%%v2\n\t"
|
"vpkg %%v23,%%v22,%%v2\n\t"
|
||||||
"vperm %%v22,%%v22,%%v2,%%v1\n\t"
|
"vperm %%v22,%%v22,%%v2,%%v1\n\t"
|
||||||
|
"vl %%v24,128(%%r1,%[x])\n\t"
|
||||||
"vl %%v24,128(%%r1,%2) \n\t"
|
"vl %%v2,144(%%r1,%[x])\n\t"
|
||||||
"vl %%v2,144(%%r1,%2) \n\t"
|
|
||||||
"vpkg %%v25,%%v24,%%v2\n\t"
|
"vpkg %%v25,%%v24,%%v2\n\t"
|
||||||
"vperm %%v24,%%v24,%%v2,%%v1\n\t"
|
"vperm %%v24,%%v24,%%v2,%%v1\n\t"
|
||||||
|
"vl %%v26,160(%%r1,%[x])\n\t"
|
||||||
"vl %%v26,160(%%r1,%2) \n\t"
|
"vl %%v2,176(%%r1,%[x])\n\t"
|
||||||
"vl %%v2,176(%%r1,%2) \n\t"
|
|
||||||
"vpkg %%v27,%%v26,%%v2\n\t"
|
"vpkg %%v27,%%v26,%%v2\n\t"
|
||||||
"vperm %%v26,%%v26,%%v2,%%v1\n\t"
|
"vperm %%v26,%%v26,%%v2,%%v1\n\t"
|
||||||
|
"vl %%v28,192(%%r1,%[x])\n\t"
|
||||||
"vl %%v28,192(%%r1,%2) \n\t"
|
"vl %%v2,208(%%r1,%[x])\n\t"
|
||||||
"vl %%v2,208(%%r1,%2) \n\t"
|
|
||||||
"vpkg %%v29,%%v28,%%v2\n\t"
|
"vpkg %%v29,%%v28,%%v2\n\t"
|
||||||
"vperm %%v28,%%v28,%%v2,%%v1\n\t"
|
"vperm %%v28,%%v28,%%v2,%%v1\n\t"
|
||||||
|
"vl %%v30,224(%%r1,%[x])\n\t"
|
||||||
"vl %%v30,224(%%r1,%2) \n\t"
|
"vl %%v2,240(%%r1,%[x])\n\t"
|
||||||
"vl %%v2,240(%%r1,%2) \n\t"
|
|
||||||
"vpkg %%v31,%%v30,%%v2\n\t"
|
"vpkg %%v31,%%v30,%%v2\n\t"
|
||||||
"vperm %%v30,%%v30,%%v2,%%v1\n\t"
|
"vperm %%v30,%%v30,%%v2,%%v1\n\t"
|
||||||
|
|
||||||
"vflpsb %%v16,%%v16\n\t"
|
"vflpsb %%v16,%%v16\n\t"
|
||||||
"vflpsb %%v17,%%v17\n\t"
|
"vflpsb %%v17,%%v17\n\t"
|
||||||
"vflpsb %%v18,%%v18\n\t"
|
"vflpsb %%v18,%%v18\n\t"
|
||||||
|
|
@ -129,7 +112,6 @@ static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x)
|
||||||
"vflpsb %%v29,%%v29\n\t"
|
"vflpsb %%v29,%%v29\n\t"
|
||||||
"vflpsb %%v30,%%v30\n\t"
|
"vflpsb %%v30,%%v30\n\t"
|
||||||
"vflpsb %%v31,%%v31\n\t"
|
"vflpsb %%v31,%%v31\n\t"
|
||||||
|
|
||||||
"vfasb %%v16,%%v16,%%v17\n\t"
|
"vfasb %%v16,%%v16,%%v17\n\t"
|
||||||
"vfasb %%v18,%%v18,%%v19\n\t"
|
"vfasb %%v18,%%v18,%%v19\n\t"
|
||||||
"vfasb %%v20,%%v20,%%v21\n\t"
|
"vfasb %%v20,%%v20,%%v21\n\t"
|
||||||
|
|
@ -138,32 +120,26 @@ static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x)
|
||||||
"vfasb %%v26,%%v26,%%v27\n\t"
|
"vfasb %%v26,%%v26,%%v27\n\t"
|
||||||
"vfasb %%v28,%%v28,%%v29\n\t"
|
"vfasb %%v28,%%v28,%%v29\n\t"
|
||||||
"vfasb %%v30,%%v30,%%v31\n\t"
|
"vfasb %%v30,%%v30,%%v31\n\t"
|
||||||
|
|
||||||
"vfminsb %%v16,%%v16,%%v24,0\n\t"
|
"vfminsb %%v16,%%v16,%%v24,0\n\t"
|
||||||
"vfminsb %%v18,%%v18,%%v26,0\n\t"
|
"vfminsb %%v18,%%v18,%%v26,0\n\t"
|
||||||
"vfminsb %%v20,%%v20,%%v28,0\n\t"
|
"vfminsb %%v20,%%v20,%%v28,0\n\t"
|
||||||
"vfminsb %%v22,%%v22,%%v30,0\n\t"
|
"vfminsb %%v22,%%v22,%%v30,0\n\t"
|
||||||
|
|
||||||
"vfminsb %%v16,%%v16,%%v20,0\n\t"
|
"vfminsb %%v16,%%v16,%%v20,0\n\t"
|
||||||
"vfminsb %%v18,%%v18,%%v22,0\n\t"
|
"vfminsb %%v18,%%v18,%%v22,0\n\t"
|
||||||
|
|
||||||
"vfminsb %%v16,%%v16,%%v18,0\n\t"
|
"vfminsb %%v16,%%v16,%%v18,0\n\t"
|
||||||
|
|
||||||
"vfminsb %%v0,%%v0,%%v16,0\n\t"
|
"vfminsb %%v0,%%v0,%%v16,0\n\t"
|
||||||
|
|
||||||
"agfi %%r1, 256\n\t"
|
"agfi %%r1, 256\n\t"
|
||||||
"brctg %%r0, 0b \n\t"
|
"brctg %[n], 0b\n\t"
|
||||||
|
|
||||||
"veslg %%v16,%%v0,32\n\t"
|
"veslg %%v16,%%v0,32\n\t"
|
||||||
"vfminsb %%v0,%%v0,%%v16,0\n\t"
|
"vfminsb %%v0,%%v0,%%v16,0\n\t"
|
||||||
|
|
||||||
"vrepf %%v16,%%v0,2\n\t"
|
"vrepf %%v16,%%v0,2\n\t"
|
||||||
"wfminsb %%v0,%%v0,%%v16,0\n\t"
|
"wfminsb %%v0,%%v0,%%v16,0\n\t"
|
||||||
"ler %0,%%f0 "
|
"ler %[amin],%%f0"
|
||||||
:"=f"(amin)
|
: [amin] "=f"(amin),[n] "+&r"(n)
|
||||||
:"r"(n),"ZR"((const FLOAT (*)[n])x)
|
: "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
|
||||||
:"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
: "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20",
|
||||||
);
|
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
|
||||||
|
"v31");
|
||||||
|
|
||||||
return amin;
|
return amin;
|
||||||
}
|
}
|
||||||
|
|
@ -174,7 +150,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
FLOAT minf = 0.0;
|
FLOAT minf = 0.0;
|
||||||
BLASLONG inc_x2;
|
BLASLONG inc_x2;
|
||||||
|
|
||||||
if (n <= 0 || inc_x <= 0) return (minf);
|
if (n <= 0 || inc_x <= 0)
|
||||||
|
return (minf);
|
||||||
|
|
||||||
if (inc_x == 1) {
|
if (inc_x == 1) {
|
||||||
|
|
||||||
|
|
@ -184,9 +161,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
minf = camin_kernel_32(n1, x);
|
minf = camin_kernel_32(n1, x);
|
||||||
ix = n1 * 2;
|
ix = n1 * 2;
|
||||||
i = n1;
|
i = n1;
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
minf = CABS1(x, 0);
|
minf = CABS1(x, 0);
|
||||||
ix += 2;
|
ix += 2;
|
||||||
i++;
|
i++;
|
||||||
|
|
@ -228,7 +203,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
while (i < n) {
|
while (i < n) {
|
||||||
if (CABS1(x, ix) < minf) {
|
if (CABS1(x, ix) < minf) {
|
||||||
minf = CABS1(x, ix);
|
minf = CABS1(x, ix);
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/***************************************************************************
|
/***************************************************************************
|
||||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
modification, are permitted provided that the following conditions are
|
modification, are permitted provided that the following conditions are
|
||||||
|
|
@ -28,34 +28,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
|
|
||||||
#if defined(DOUBLE)
|
|
||||||
#define ABS fabs
|
|
||||||
#else
|
|
||||||
#define ABS fabsf
|
#define ABS fabsf
|
||||||
#endif
|
|
||||||
|
|
||||||
static FLOAT casum_kernel_32(BLASLONG n, FLOAT *x)
|
static FLOAT casum_kernel_32(BLASLONG n, FLOAT *x) {
|
||||||
{
|
|
||||||
FLOAT asum;
|
FLOAT asum;
|
||||||
|
|
||||||
__asm__ (
|
__asm__("vzero %%v24\n\t"
|
||||||
"vzero %%v0 \n\t"
|
"vzero %%v25\n\t"
|
||||||
"vzero %%v1 \n\t"
|
"vzero %%v26\n\t"
|
||||||
"vzero %%v2 \n\t"
|
"vzero %%v27\n\t"
|
||||||
"vzero %%v3 \n\t"
|
"vzero %%v28\n\t"
|
||||||
"srlg %%r0,%1,5 \n\t"
|
"vzero %%v29\n\t"
|
||||||
|
"vzero %%v30\n\t"
|
||||||
|
"vzero %%v31\n\t"
|
||||||
|
"srlg %[n],%[n],5\n\t"
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 1, 1024(%%r1,%2) \n\t"
|
"pfd 1, 1024(%%r1,%[x])\n\t"
|
||||||
"vl %%v16, 0(%%r1,%2) \n\t"
|
"vl %%v16, 0(%%r1,%[x])\n\t"
|
||||||
"vl %%v17, 16(%%r1,%2) \n\t"
|
"vl %%v17, 16(%%r1,%[x])\n\t"
|
||||||
"vl %%v18, 32(%%r1,%2) \n\t"
|
"vl %%v18, 32(%%r1,%[x])\n\t"
|
||||||
"vl %%v19, 48(%%r1,%2) \n\t"
|
"vl %%v19, 48(%%r1,%[x])\n\t"
|
||||||
"vl %%v20, 64(%%r1,%2) \n\t"
|
"vl %%v20, 64(%%r1,%[x])\n\t"
|
||||||
"vl %%v21, 80(%%r1,%2) \n\t"
|
"vl %%v21, 80(%%r1,%[x])\n\t"
|
||||||
"vl %%v22, 96(%%r1,%2) \n\t"
|
"vl %%v22, 96(%%r1,%[x])\n\t"
|
||||||
"vl %%v23, 112(%%r1,%2) \n\t"
|
"vl %%v23, 112(%%r1,%[x])\n\t"
|
||||||
|
|
||||||
"vflpsb %%v16, %%v16\n\t"
|
"vflpsb %%v16, %%v16\n\t"
|
||||||
"vflpsb %%v17, %%v17\n\t"
|
"vflpsb %%v17, %%v17\n\t"
|
||||||
"vflpsb %%v18, %%v18\n\t"
|
"vflpsb %%v18, %%v18\n\t"
|
||||||
|
|
@ -64,25 +61,22 @@ static FLOAT casum_kernel_32(BLASLONG n, FLOAT *x)
|
||||||
"vflpsb %%v21, %%v21\n\t"
|
"vflpsb %%v21, %%v21\n\t"
|
||||||
"vflpsb %%v22, %%v22\n\t"
|
"vflpsb %%v22, %%v22\n\t"
|
||||||
"vflpsb %%v23, %%v23\n\t"
|
"vflpsb %%v23, %%v23\n\t"
|
||||||
|
"vfasb %%v24,%%v24,%%v16\n\t"
|
||||||
"vfasb %%v0,%%v0,%%v16 \n\t"
|
"vfasb %%v25,%%v25,%%v17\n\t"
|
||||||
"vfasb %%v1,%%v1,%%v17 \n\t"
|
"vfasb %%v26,%%v26,%%v18\n\t"
|
||||||
"vfasb %%v2,%%v2,%%v18 \n\t"
|
"vfasb %%v27,%%v27,%%v19\n\t"
|
||||||
"vfasb %%v3,%%v3,%%v19 \n\t"
|
"vfasb %%v28,%%v28,%%v20\n\t"
|
||||||
"vfasb %%v0,%%v0,%%v20 \n\t"
|
"vfasb %%v29,%%v29,%%v21\n\t"
|
||||||
"vfasb %%v1,%%v1,%%v21 \n\t"
|
"vfasb %%v30,%%v30,%%v22\n\t"
|
||||||
"vfasb %%v2,%%v2,%%v22 \n\t"
|
"vfasb %%v31,%%v31,%%v23\n\t"
|
||||||
"vfasb %%v3,%%v3,%%v23 \n\t"
|
"vl %%v16, 128(%%r1,%[x])\n\t"
|
||||||
|
"vl %%v17, 144(%%r1,%[x])\n\t"
|
||||||
"vl %%v16, 128(%%r1,%2) \n\t"
|
"vl %%v18, 160(%%r1,%[x])\n\t"
|
||||||
"vl %%v17, 144(%%r1,%2) \n\t"
|
"vl %%v19, 176(%%r1,%[x])\n\t"
|
||||||
"vl %%v18, 160(%%r1,%2) \n\t"
|
"vl %%v20, 192(%%r1,%[x])\n\t"
|
||||||
"vl %%v19, 176(%%r1,%2) \n\t"
|
"vl %%v21, 208(%%r1,%[x])\n\t"
|
||||||
"vl %%v20, 192(%%r1,%2) \n\t"
|
"vl %%v22, 224(%%r1,%[x])\n\t"
|
||||||
"vl %%v21, 208(%%r1,%2) \n\t"
|
"vl %%v23, 240(%%r1,%[x])\n\t"
|
||||||
"vl %%v22, 224(%%r1,%2) \n\t"
|
|
||||||
"vl %%v23, 240(%%r1,%2) \n\t"
|
|
||||||
|
|
||||||
"vflpsb %%v16, %%v16\n\t"
|
"vflpsb %%v16, %%v16\n\t"
|
||||||
"vflpsb %%v17, %%v17\n\t"
|
"vflpsb %%v17, %%v17\n\t"
|
||||||
"vflpsb %%v18, %%v18\n\t"
|
"vflpsb %%v18, %%v18\n\t"
|
||||||
|
|
@ -91,70 +85,66 @@ static FLOAT casum_kernel_32(BLASLONG n, FLOAT *x)
|
||||||
"vflpsb %%v21, %%v21\n\t"
|
"vflpsb %%v21, %%v21\n\t"
|
||||||
"vflpsb %%v22, %%v22\n\t"
|
"vflpsb %%v22, %%v22\n\t"
|
||||||
"vflpsb %%v23, %%v23\n\t"
|
"vflpsb %%v23, %%v23\n\t"
|
||||||
|
"vfasb %%v24,%%v24,%%v16\n\t"
|
||||||
"vfasb %%v0,%%v0,%%v16 \n\t"
|
"vfasb %%v25,%%v25,%%v17\n\t"
|
||||||
"vfasb %%v1,%%v1,%%v17 \n\t"
|
"vfasb %%v26,%%v26,%%v18\n\t"
|
||||||
"vfasb %%v2,%%v2,%%v18 \n\t"
|
"vfasb %%v27,%%v27,%%v19\n\t"
|
||||||
"vfasb %%v3,%%v3,%%v19 \n\t"
|
"vfasb %%v28,%%v28,%%v20\n\t"
|
||||||
"vfasb %%v0,%%v0,%%v20 \n\t"
|
"vfasb %%v29,%%v29,%%v21\n\t"
|
||||||
"vfasb %%v1,%%v1,%%v21 \n\t"
|
"vfasb %%v30,%%v30,%%v22\n\t"
|
||||||
"vfasb %%v2,%%v2,%%v22 \n\t"
|
"vfasb %%v31,%%v31,%%v23\n\t"
|
||||||
"vfasb %%v3,%%v3,%%v23 \n\t"
|
|
||||||
|
|
||||||
"agfi %%r1,256\n\t"
|
"agfi %%r1,256\n\t"
|
||||||
"brctg %%r0,0b \n\t"
|
"brctg %[n],0b\n\t"
|
||||||
"vfasb %%v0,%%v0,%%v1 \n\t"
|
"vfasb %%v24,%%v24,%%v25\n\t"
|
||||||
"vfasb %%v0,%%v0,%%v2 \n\t"
|
"vfasb %%v24,%%v24,%%v26\n\t"
|
||||||
"vfasb %%v0,%%v0,%%v3 \n\t"
|
"vfasb %%v24,%%v24,%%v27\n\t"
|
||||||
"veslg %%v1,%%v0,32 \n\t"
|
"vfasb %%v24,%%v24,%%v28\n\t"
|
||||||
"vfasb %%v0,%%v0,%%v1 \n\t"
|
"vfasb %%v24,%%v24,%%v29\n\t"
|
||||||
"vrepf %%v1,%%v0,2 \n\t"
|
"vfasb %%v24,%%v24,%%v30\n\t"
|
||||||
"aebr %%f0,%%f1 \n\t"
|
"vfasb %%v24,%%v24,%%v31\n\t"
|
||||||
"ler %0,%%f0 "
|
"veslg %%v25,%%v24,32\n\t"
|
||||||
:"=f"(asum)
|
"vfasb %%v24,%%v24,%%v25\n\t"
|
||||||
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x)
|
"vrepf %%v25,%%v24,2\n\t"
|
||||||
:"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23"
|
"vfasb %%v24,%%v24,%%v25\n\t"
|
||||||
);
|
"vstef %%v24,%[asum],0"
|
||||||
|
: [asum] "=Q"(asum),[n] "+&r"(n)
|
||||||
|
: "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
|
||||||
|
: "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
|
||||||
|
"v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
|
||||||
|
|
||||||
return asum;
|
return asum;
|
||||||
}
|
}
|
||||||
|
|
||||||
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
{
|
|
||||||
BLASLONG i = 0;
|
BLASLONG i = 0;
|
||||||
BLASLONG ip = 0;
|
BLASLONG ip = 0;
|
||||||
FLOAT sumf = 0.0;
|
FLOAT sumf = 0.0;
|
||||||
BLASLONG n1;
|
BLASLONG n1;
|
||||||
BLASLONG inc_x2;
|
BLASLONG inc_x2;
|
||||||
|
|
||||||
if (n <= 0 || inc_x <= 0) return(sumf);
|
if (n <= 0 || inc_x <= 0)
|
||||||
|
return (sumf);
|
||||||
|
|
||||||
if ( inc_x == 1 )
|
if (inc_x == 1) {
|
||||||
{
|
|
||||||
|
|
||||||
n1 = n & -32;
|
n1 = n & -32;
|
||||||
if ( n1 > 0 )
|
if (n1 > 0) {
|
||||||
{
|
|
||||||
|
|
||||||
sumf = casum_kernel_32(n1, x);
|
sumf = casum_kernel_32(n1, x);
|
||||||
i = n1;
|
i = n1;
|
||||||
ip = 2 * n1;
|
ip = 2 * n1;
|
||||||
}
|
}
|
||||||
|
|
||||||
while(i < n)
|
while (i < n) {
|
||||||
{
|
|
||||||
sumf += ABS(x[ip]) + ABS(x[ip + 1]);
|
sumf += ABS(x[ip]) + ABS(x[ip + 1]);
|
||||||
i++;
|
i++;
|
||||||
ip += 2;
|
ip += 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
inc_x2 = 2 * inc_x;
|
inc_x2 = 2 * inc_x;
|
||||||
|
|
||||||
while(i < n)
|
while (i < n) {
|
||||||
{
|
|
||||||
sumf += ABS(x[ip]) + ABS(x[ip + 1]);
|
sumf += ABS(x[ip]) + ABS(x[ip + 1]);
|
||||||
ip += inc_x2;
|
ip += inc_x2;
|
||||||
i++;
|
i++;
|
||||||
|
|
@ -163,5 +153,3 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||||
}
|
}
|
||||||
return (sumf);
|
return (sumf);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/***************************************************************************
|
/***************************************************************************
|
||||||
Copyright (c) 2017, The OpenBLAS Project
|
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
modification, are permitted provided that the following conditions are
|
modification, are permitted provided that the following conditions are
|
||||||
|
|
@ -27,100 +27,95 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) {
|
||||||
{
|
__asm__(
|
||||||
__asm__ volatile(
|
|
||||||
#if !defined(CONJ)
|
#if !defined(CONJ)
|
||||||
"vlrepf %%v0,0(%3) \n\t"
|
"vlrepf %%v0,0(%[alpha])\n\t"
|
||||||
"vlef %%v1,4(%3),0 \n\t"
|
"vlef %%v1,4(%[alpha]),0\n\t"
|
||||||
"vlef %%v1,4(%3),2 \n\t"
|
"vlef %%v1,4(%[alpha]),2\n\t"
|
||||||
"vflcsb %%v1,%%v1\n\t"
|
"vflcsb %%v1,%%v1\n\t"
|
||||||
"vlef %%v1,4(%3),1 \n\t"
|
"vlef %%v1,4(%[alpha]),1\n\t"
|
||||||
"vlef %%v1,4(%3),3 \n\t"
|
"vlef %%v1,4(%[alpha]),3\n\t"
|
||||||
#else
|
#else
|
||||||
"vlef %%v0,0(%3),1 \n\t"
|
"vlef %%v0,0(%[alpha]),1\n\t"
|
||||||
"vlef %%v0,0(%3),3 \n\t"
|
"vlef %%v0,0(%[alpha]),3\n\t"
|
||||||
"vflcsb %%v0,%%v0\n\t"
|
"vflcsb %%v0,%%v0\n\t"
|
||||||
"vlef %%v0,0(%3),0 \n\t"
|
"vlef %%v0,0(%[alpha]),0\n\t"
|
||||||
"vlef %%v0,0(%3),2 \n\t"
|
"vlef %%v0,0(%[alpha]),2\n\t"
|
||||||
"vlrepf %%v1,4(%3) \n\t"
|
"vlrepf %%v1,4(%[alpha])\n\t"
|
||||||
#endif
|
#endif
|
||||||
"srlg %%r0,%0,4 \n\t"
|
"srlg %[n],%[n],4\n\t"
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 1, 1024(%%r1,%1) \n\t"
|
"pfd 1, 1024(%%r1,%[x])\n\t"
|
||||||
"pfd 2, 1024(%%r1,%2) \n\t"
|
"pfd 2, 1024(%%r1,%[y])\n\t"
|
||||||
|
"vl %%v8,0(%%r1,%[x])\n\t"
|
||||||
"vl %%v16,0(%%r1,%1) \n\t"
|
"vl %%v9,16(%%r1,%[x])\n\t"
|
||||||
"vl %%v17,16(%%r1,%1) \n\t"
|
"vl %%v10,32(%%r1,%[x])\n\t"
|
||||||
"vl %%v18,32(%%r1,%1) \n\t"
|
"vl %%v11,48(%%r1,%[x])\n\t"
|
||||||
"vl %%v19,48(%%r1,%1) \n\t"
|
"vl %%v12,0(%%r1,%[y])\n\t"
|
||||||
"vl %%v20,0(%%r1,%2) \n\t"
|
"vl %%v13,16(%%r1,%[y])\n\t"
|
||||||
"vl %%v21,16(%%r1,%2) \n\t"
|
"vl %%v14,32(%%r1,%[y])\n\t"
|
||||||
"vl %%v22,32(%%r1,%2) \n\t"
|
"vl %%v15,48(%%r1,%[y])\n\t"
|
||||||
"vl %%v23,48(%%r1,%2) \n\t"
|
"vl %%v16,64(%%r1,%[x])\n\t"
|
||||||
"verllg %%v24,%%v16,32 \n\t"
|
"vl %%v17,80(%%r1,%[x])\n\t"
|
||||||
"verllg %%v25,%%v17,32 \n\t"
|
"vl %%v18,96(%%r1,%[x])\n\t"
|
||||||
"verllg %%v26,%%v18,32 \n\t"
|
"vl %%v19,112(%%r1,%[x])\n\t"
|
||||||
"verllg %%v27,%%v19,32 \n\t"
|
"vl %%v20,64(%%r1,%[y])\n\t"
|
||||||
|
"vl %%v21,80(%%r1,%[y])\n\t"
|
||||||
"vfmasb %%v28,%%v16,%%v0,%%v20 \n\t"
|
"vl %%v22,96(%%r1,%[y])\n\t"
|
||||||
"vfmasb %%v29,%%v17,%%v0,%%v21 \n\t"
|
"vl %%v23,112(%%r1,%[y])\n\t"
|
||||||
"vfmasb %%v30,%%v18,%%v0,%%v22 \n\t"
|
"verllg %%v24,%%v8,32\n\t"
|
||||||
"vfmasb %%v31,%%v19,%%v0,%%v23 \n\t"
|
"verllg %%v25,%%v9,32\n\t"
|
||||||
|
"verllg %%v26,%%v10,32\n\t"
|
||||||
"vfmasb %%v28,%%v24,%%v1,%%v28 \n\t"
|
"verllg %%v27,%%v11,32\n\t"
|
||||||
"vfmasb %%v29,%%v25,%%v1,%%v29 \n\t"
|
"verllg %%v28,%%v16,32\n\t"
|
||||||
"vfmasb %%v30,%%v26,%%v1,%%v30 \n\t"
|
"verllg %%v29,%%v17,32\n\t"
|
||||||
"vfmasb %%v31,%%v27,%%v1,%%v31 \n\t"
|
"verllg %%v30,%%v18,32\n\t"
|
||||||
|
"verllg %%v31,%%v19,32\n\t"
|
||||||
"vst %%v28,0(%%r1,%2) \n\t"
|
"vfmasb %%v8,%%v8,%%v0,%%v12\n\t"
|
||||||
"vst %%v29,16(%%r1,%2) \n\t"
|
"vfmasb %%v9,%%v9,%%v0,%%v13\n\t"
|
||||||
"vst %%v30,32(%%r1,%2) \n\t"
|
"vfmasb %%v10,%%v10,%%v0,%%v14\n\t"
|
||||||
"vst %%v31,48(%%r1,%2) \n\t"
|
"vfmasb %%v11,%%v11,%%v0,%%v15\n\t"
|
||||||
|
"vfmasb %%v16,%%v16,%%v0,%%v20\n\t"
|
||||||
"vl %%v16,64(%%r1,%1) \n\t"
|
"vfmasb %%v17,%%v17,%%v0,%%v21\n\t"
|
||||||
"vl %%v17,80(%%r1,%1) \n\t"
|
"vfmasb %%v18,%%v18,%%v0,%%v22\n\t"
|
||||||
"vl %%v18,96(%%r1,%1) \n\t"
|
"vfmasb %%v19,%%v19,%%v0,%%v23\n\t"
|
||||||
"vl %%v19,112(%%r1,%1) \n\t"
|
"vfmasb %%v8,%%v24,%%v1,%%v8\n\t"
|
||||||
"vl %%v20,64(%%r1,%2) \n\t"
|
"vfmasb %%v9,%%v25,%%v1,%%v9\n\t"
|
||||||
"vl %%v21,80(%%r1,%2) \n\t"
|
"vfmasb %%v10,%%v26,%%v1,%%v10\n\t"
|
||||||
"vl %%v22,96(%%r1,%2) \n\t"
|
"vfmasb %%v11,%%v27,%%v1,%%v11\n\t"
|
||||||
"vl %%v23,112(%%r1,%2) \n\t"
|
"vfmasb %%v16,%%v28,%%v1,%%v16\n\t"
|
||||||
"verllg %%v24,%%v16,32 \n\t"
|
"vfmasb %%v17,%%v29,%%v1,%%v17\n\t"
|
||||||
"verllg %%v25,%%v17,32 \n\t"
|
"vfmasb %%v18,%%v30,%%v1,%%v18\n\t"
|
||||||
"verllg %%v26,%%v18,32 \n\t"
|
"vfmasb %%v19,%%v31,%%v1,%%v19\n\t"
|
||||||
"verllg %%v27,%%v19,32 \n\t"
|
"vst %%v8,0(%%r1,%[y])\n\t"
|
||||||
|
"vst %%v9,16(%%r1,%[y])\n\t"
|
||||||
"vfmasb %%v28,%%v16,%%v0,%%v20 \n\t"
|
"vst %%v10,32(%%r1,%[y])\n\t"
|
||||||
"vfmasb %%v29,%%v17,%%v0,%%v21 \n\t"
|
"vst %%v11,48(%%r1,%[y])\n\t"
|
||||||
"vfmasb %%v30,%%v18,%%v0,%%v22 \n\t"
|
"vst %%v16,64(%%r1,%[y])\n\t"
|
||||||
"vfmasb %%v31,%%v19,%%v0,%%v23 \n\t"
|
"vst %%v17,80(%%r1,%[y])\n\t"
|
||||||
|
"vst %%v18,96(%%r1,%[y])\n\t"
|
||||||
"vfmasb %%v28,%%v24,%%v1,%%v28 \n\t"
|
"vst %%v19,112(%%r1,%[y])\n\t"
|
||||||
"vfmasb %%v29,%%v25,%%v1,%%v29 \n\t"
|
|
||||||
"vfmasb %%v30,%%v26,%%v1,%%v30 \n\t"
|
|
||||||
"vfmasb %%v31,%%v27,%%v1,%%v31 \n\t"
|
|
||||||
|
|
||||||
"vst %%v28,64(%%r1,%2) \n\t"
|
|
||||||
"vst %%v29,80(%%r1,%2) \n\t"
|
|
||||||
"vst %%v30,96(%%r1,%2) \n\t"
|
|
||||||
"vst %%v31,112(%%r1,%2) \n\t"
|
|
||||||
|
|
||||||
"agfi %%r1,128\n\t"
|
"agfi %%r1,128\n\t"
|
||||||
"brctg %%r0,0b "
|
"brctg %[n],0b"
|
||||||
:
|
: "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n)
|
||||||
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"ZQ"((const FLOAT (*)[2])alpha)
|
: [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x),
|
||||||
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
"m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha)
|
||||||
);
|
: "cc", "r1", "v0", "v1", "v8", "v9", "v10", "v11", "v12", "v13",
|
||||||
|
"v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
|
||||||
|
"v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
|
||||||
}
|
}
|
||||||
|
|
||||||
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) {
|
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
|
||||||
|
FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
|
||||||
|
BLASLONG dummy2) {
|
||||||
BLASLONG i = 0;
|
BLASLONG i = 0;
|
||||||
BLASLONG ix = 0, iy = 0;
|
BLASLONG ix = 0, iy = 0;
|
||||||
FLOAT da[2] __attribute__ ((aligned(16)));
|
FLOAT da[2] __attribute__ ((aligned(16)));
|
||||||
|
|
||||||
if (n <= 0) return (0);
|
if (n <= 0)
|
||||||
|
return (0);
|
||||||
|
|
||||||
if ((inc_x == 1) && (inc_y == 1)) {
|
if ((inc_x == 1) && (inc_y == 1)) {
|
||||||
|
|
||||||
|
|
@ -147,7 +142,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
|
||||||
}
|
}
|
||||||
return (0);
|
return (0);
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
inc_x *= 2;
|
inc_x *= 2;
|
||||||
|
|
@ -170,5 +164,3 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
|
||||||
return (0);
|
return (0);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/***************************************************************************
|
/***************************************************************************
|
||||||
Copyright (c) 2013-2018, The OpenBLAS Project
|
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
modification, are permitted provided that the following conditions are
|
modification, are permitted provided that the following conditions are
|
||||||
|
|
@ -27,46 +27,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
static void ccopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
|
static void ccopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) {
|
||||||
{
|
__asm__("srlg %[n],%[n],5\n\t"
|
||||||
__asm__ volatile (
|
|
||||||
"lgr %%r1,%1 \n\t"
|
|
||||||
"lgr %%r2,%2 \n\t"
|
|
||||||
"srlg %%r0,%0,5 \n\t"
|
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 1, 1024(%%r1) \n\t"
|
"pfd 1, 1024(%[x])\n\t"
|
||||||
"pfd 2, 1024(%%r2) \n\t"
|
"pfd 2, 1024(%[y])\n\t"
|
||||||
"mvc 0(256,%%r2),0(%%r1) \n\t"
|
"mvc 0(256,%[y]),0(%[x])\n\t"
|
||||||
"agfi %%r1,256 \n\t"
|
"la %[x],256(%[x])\n\t"
|
||||||
"agfi %%r2,256 \n\t"
|
"la %[y],256(%[y])\n\t"
|
||||||
"brctg %%r0,0b "
|
"brctg %[n],0b"
|
||||||
:
|
: "=m"(*(struct { FLOAT x[n * 2]; } *) y),[x] "+&a"(x),[y] "+&a"(y),
|
||||||
:"r"(n),"a"((const FLOAT (*)[n * 2])x),"a"((FLOAT (*)[n * 2])y)
|
[n] "+&r"(n)
|
||||||
:"memory","cc","r0","r1","r2"
|
: "m"(*(const struct { FLOAT x[n * 2]; } *) x)
|
||||||
);
|
: "cc");
|
||||||
}
|
}
|
||||||
|
|
||||||
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
|
||||||
{
|
|
||||||
BLASLONG i = 0;
|
BLASLONG i = 0;
|
||||||
BLASLONG ix = 0, iy = 0;
|
BLASLONG ix = 0, iy = 0;
|
||||||
|
|
||||||
if ( n <= 0 ) return(0);
|
if (n <= 0)
|
||||||
|
return (0);
|
||||||
|
|
||||||
if ( (inc_x == 1) && (inc_y == 1 ))
|
if ((inc_x == 1) && (inc_y == 1)) {
|
||||||
{
|
|
||||||
|
|
||||||
BLASLONG n1 = n & -32;
|
BLASLONG n1 = n & -32;
|
||||||
if ( n1 > 0 )
|
if (n1 > 0) {
|
||||||
{
|
|
||||||
ccopy_kernel_32(n1, x, y);
|
ccopy_kernel_32(n1, x, y);
|
||||||
i = n1;
|
i = n1;
|
||||||
ix = n1 * 2;
|
ix = n1 * 2;
|
||||||
iy = n1 * 2;
|
iy = n1 * 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
while(i < n)
|
while (i < n) {
|
||||||
{
|
|
||||||
y[iy] = x[iy];
|
y[iy] = x[iy];
|
||||||
y[iy + 1] = x[ix + 1];
|
y[iy + 1] = x[ix + 1];
|
||||||
ix += 2;
|
ix += 2;
|
||||||
|
|
@ -75,16 +68,12 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
} else {
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
|
|
||||||
BLASLONG inc_x2 = 2 * inc_x;
|
BLASLONG inc_x2 = 2 * inc_x;
|
||||||
BLASLONG inc_y2 = 2 * inc_y;
|
BLASLONG inc_y2 = 2 * inc_y;
|
||||||
|
|
||||||
while(i < n)
|
while (i < n) {
|
||||||
{
|
|
||||||
y[iy] = x[ix];
|
y[iy] = x[ix];
|
||||||
y[iy + 1] = x[ix + 1];
|
y[iy + 1] = x[ix + 1];
|
||||||
ix += inc_x2;
|
ix += inc_x2;
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/***************************************************************************
|
/***************************************************************************
|
||||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
modification, are permitted provided that the following conditions are
|
modification, are permitted provided that the following conditions are
|
||||||
|
|
@ -27,10 +27,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
static void cdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
|
static void cdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) {
|
||||||
{
|
__asm__("vzero %%v24\n\t"
|
||||||
__asm__ volatile(
|
|
||||||
"vzero %%v24 \n\t"
|
|
||||||
"vzero %%v25\n\t"
|
"vzero %%v25\n\t"
|
||||||
"vzero %%v26\n\t"
|
"vzero %%v26\n\t"
|
||||||
"vzero %%v27\n\t"
|
"vzero %%v27\n\t"
|
||||||
|
|
@ -38,25 +36,23 @@ static void cdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
|
||||||
"vzero %%v29\n\t"
|
"vzero %%v29\n\t"
|
||||||
"vzero %%v30\n\t"
|
"vzero %%v30\n\t"
|
||||||
"vzero %%v31\n\t"
|
"vzero %%v31\n\t"
|
||||||
"srlg %%r0,%0,4 \n\t"
|
"srlg %[n],%[n],4\n\t"
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 1, 1024(%%r1,%1) \n\t"
|
"pfd 1, 1024(%%r1,%[x])\n\t"
|
||||||
"pfd 1, 1024(%%r1,%2) \n\t"
|
"pfd 1, 1024(%%r1,%[y])\n\t"
|
||||||
|
"vl %%v16, 0(%%r1,%[x])\n\t"
|
||||||
"vl %%v16, 0(%%r1,%1) \n\t"
|
"vl %%v17, 16(%%r1,%[x])\n\t"
|
||||||
"vl %%v17, 16(%%r1,%1) \n\t"
|
"vl %%v18, 32(%%r1,%[x])\n\t"
|
||||||
"vl %%v18, 32(%%r1,%1) \n\t"
|
"vl %%v19, 48(%%r1,%[x])\n\t"
|
||||||
"vl %%v19, 48(%%r1,%1) \n\t"
|
"vl %%v0, 0(%%r1,%[y])\n\t"
|
||||||
"vl %%v0, 0(%%r1,%2) \n\t"
|
"vl %%v1, 16(%%r1,%[y])\n\t"
|
||||||
"vl %%v1, 16(%%r1,%2) \n\t"
|
"vl %%v2, 32(%%r1,%[y])\n\t"
|
||||||
"vl %%v2, 32(%%r1,%2) \n\t"
|
"vl %%v3, 48(%%r1,%[y])\n\t"
|
||||||
"vl %%v3, 48(%%r1,%2) \n\t"
|
|
||||||
"verllg %%v20,%%v16,32\n\t"
|
"verllg %%v20,%%v16,32\n\t"
|
||||||
"verllg %%v21,%%v17,32\n\t"
|
"verllg %%v21,%%v17,32\n\t"
|
||||||
"verllg %%v22,%%v18,32\n\t"
|
"verllg %%v22,%%v18,32\n\t"
|
||||||
"verllg %%v23,%%v19,32\n\t"
|
"verllg %%v23,%%v19,32\n\t"
|
||||||
|
|
||||||
"vfmasb %%v24,%%v16,%%v0,%%v24\n\t"
|
"vfmasb %%v24,%%v16,%%v0,%%v24\n\t"
|
||||||
"vfmasb %%v25,%%v20,%%v0,%%v25\n\t"
|
"vfmasb %%v25,%%v20,%%v0,%%v25\n\t"
|
||||||
"vfmasb %%v26,%%v17,%%v1,%%v26\n\t"
|
"vfmasb %%v26,%%v17,%%v1,%%v26\n\t"
|
||||||
|
|
@ -65,20 +61,18 @@ static void cdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
|
||||||
"vfmasb %%v29,%%v22,%%v2,%%v29\n\t"
|
"vfmasb %%v29,%%v22,%%v2,%%v29\n\t"
|
||||||
"vfmasb %%v30,%%v19,%%v3,%%v30\n\t"
|
"vfmasb %%v30,%%v19,%%v3,%%v30\n\t"
|
||||||
"vfmasb %%v31,%%v23,%%v3,%%v31\n\t"
|
"vfmasb %%v31,%%v23,%%v3,%%v31\n\t"
|
||||||
|
"vl %%v16, 64(%%r1,%[x])\n\t"
|
||||||
"vl %%v16, 64(%%r1,%1) \n\t"
|
"vl %%v17, 80(%%r1,%[x])\n\t"
|
||||||
"vl %%v17, 80(%%r1,%1) \n\t"
|
"vl %%v18, 96(%%r1,%[x])\n\t"
|
||||||
"vl %%v18, 96(%%r1,%1) \n\t"
|
"vl %%v19, 112(%%r1,%[x])\n\t"
|
||||||
"vl %%v19, 112(%%r1,%1) \n\t"
|
"vl %%v0, 64(%%r1,%[y])\n\t"
|
||||||
"vl %%v0, 64(%%r1,%2) \n\t"
|
"vl %%v1, 80(%%r1,%[y])\n\t"
|
||||||
"vl %%v1, 80(%%r1,%2) \n\t"
|
"vl %%v2, 96(%%r1,%[y])\n\t"
|
||||||
"vl %%v2, 96(%%r1,%2) \n\t"
|
"vl %%v3, 112(%%r1,%[y])\n\t"
|
||||||
"vl %%v3, 112(%%r1,%2) \n\t"
|
|
||||||
"verllg %%v20,%%v16,32\n\t"
|
"verllg %%v20,%%v16,32\n\t"
|
||||||
"verllg %%v21,%%v17,32\n\t"
|
"verllg %%v21,%%v17,32\n\t"
|
||||||
"verllg %%v22,%%v18,32\n\t"
|
"verllg %%v22,%%v18,32\n\t"
|
||||||
"verllg %%v23,%%v19,32\n\t"
|
"verllg %%v23,%%v19,32\n\t"
|
||||||
|
|
||||||
"vfmasb %%v24,%%v16,%%v0,%%v24\n\t"
|
"vfmasb %%v24,%%v16,%%v0,%%v24\n\t"
|
||||||
"vfmasb %%v25,%%v20,%%v0,%%v25\n\t"
|
"vfmasb %%v25,%%v20,%%v0,%%v25\n\t"
|
||||||
"vfmasb %%v26,%%v17,%%v1,%%v26\n\t"
|
"vfmasb %%v26,%%v17,%%v1,%%v26\n\t"
|
||||||
|
|
@ -87,9 +81,8 @@ static void cdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
|
||||||
"vfmasb %%v29,%%v22,%%v2,%%v29\n\t"
|
"vfmasb %%v29,%%v22,%%v2,%%v29\n\t"
|
||||||
"vfmasb %%v30,%%v19,%%v3,%%v30\n\t"
|
"vfmasb %%v30,%%v19,%%v3,%%v30\n\t"
|
||||||
"vfmasb %%v31,%%v23,%%v3,%%v31\n\t"
|
"vfmasb %%v31,%%v23,%%v3,%%v31\n\t"
|
||||||
|
|
||||||
"agfi %%r1,128\n\t"
|
"agfi %%r1,128\n\t"
|
||||||
"brctg %%r0,0b \n\t"
|
"brctg %[n],0b\n\t"
|
||||||
"vfasb %%v24,%%v24,%%v26\n\t"
|
"vfasb %%v24,%%v24,%%v26\n\t"
|
||||||
"vfasb %%v24,%%v24,%%v28\n\t"
|
"vfasb %%v24,%%v24,%%v28\n\t"
|
||||||
"vfasb %%v24,%%v24,%%v30\n\t"
|
"vfasb %%v24,%%v24,%%v30\n\t"
|
||||||
|
|
@ -100,21 +93,25 @@ static void cdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
|
||||||
"vfasb %%v25,%%v25,%%v31\n\t"
|
"vfasb %%v25,%%v25,%%v31\n\t"
|
||||||
"vrepg %%v27,%%v25,1\n\t"
|
"vrepg %%v27,%%v25,1\n\t"
|
||||||
"vfasb %%v25,%%v25,%%v27\n\t"
|
"vfasb %%v25,%%v25,%%v27\n\t"
|
||||||
"vstef %%v24,0(%3),0 \n\t"
|
"vstef %%v24,0(%[d]),0\n\t"
|
||||||
"vstef %%v24,4(%3),1 \n\t"
|
"vstef %%v24,4(%[d]),1\n\t"
|
||||||
"vstef %%v25,8(%3),1 \n\t"
|
"vstef %%v25,8(%[d]),1\n\t"
|
||||||
"vstef %%v25,12(%3),0 "
|
"vstef %%v25,12(%[d]),0"
|
||||||
:
|
: "=m"(*(struct { FLOAT x[4]; } *) d),[n] "+&r"(n)
|
||||||
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((const FLOAT (*)[n * 2])y),"ZQ"((FLOAT (*)[4])d)
|
: [d] "a"(d), "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x),
|
||||||
:"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
"m"(*(const struct { FLOAT x[n * 2]; } *) y),[y] "a"(y)
|
||||||
);
|
: "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20",
|
||||||
|
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
|
||||||
|
"v31");
|
||||||
}
|
}
|
||||||
|
|
||||||
OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
|
OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y,
|
||||||
|
BLASLONG inc_y) {
|
||||||
BLASLONG i;
|
BLASLONG i;
|
||||||
BLASLONG ix, iy;
|
BLASLONG ix, iy;
|
||||||
OPENBLAS_COMPLEX_FLOAT result;
|
OPENBLAS_COMPLEX_FLOAT result;
|
||||||
FLOAT dot[4] __attribute__ ((aligned(16))) = {0.0, 0.0, 0.0, 0.0};
|
FLOAT dot[4] __attribute__ ((aligned(16))) = {
|
||||||
|
0.0, 0.0, 0.0, 0.0};
|
||||||
|
|
||||||
if (n <= 0) {
|
if (n <= 0) {
|
||||||
CREAL(result) = 0.0;
|
CREAL(result) = 0.0;
|
||||||
|
|
@ -145,7 +142,6 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
i = 0;
|
i = 0;
|
||||||
ix = 0;
|
ix = 0;
|
||||||
|
|
@ -178,5 +174,3 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
|
||||||
return (result);
|
return (result);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/***************************************************************************
|
/***************************************************************************
|
||||||
Copyright (c) 2014, The OpenBLAS Project
|
Copyright (c) 2019, The OpenBLAS Project
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
modification, are permitted provided that the following conditions are
|
modification, are permitted provided that the following conditions are
|
||||||
|
|
@ -25,304 +25,347 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
*****************************************************************************/
|
*****************************************************************************/
|
||||||
|
|
||||||
#include <stdlib.h>
|
|
||||||
#include <stdio.h>
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
#define NBMAX 2048
|
#define NBMAX 2048
|
||||||
|
|
||||||
static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
|
static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) {
|
||||||
{
|
register FLOAT *ap0 = ap[0];
|
||||||
__asm__ volatile (
|
register FLOAT *ap1 = ap[1];
|
||||||
"vlrepg %%v16,0(%5) \n\t"
|
register FLOAT *ap2 = ap[2];
|
||||||
"vlrepg %%v17,8(%5) \n\t"
|
register FLOAT *ap3 = ap[3];
|
||||||
"vlrepg %%v18,16(%5) \n\t"
|
|
||||||
"vlrepg %%v19,24(%5) \n\t"
|
__asm__("vlrepg %%v16,0(%[x])\n\t"
|
||||||
|
"vlrepg %%v17,8(%[x])\n\t"
|
||||||
|
"vlrepg %%v18,16(%[x])\n\t"
|
||||||
|
"vlrepg %%v19,24(%[x])\n\t"
|
||||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||||
"vlef %%v20,4(%5),0 \n\t"
|
"vlef %%v20,4(%[x]),0\n\t"
|
||||||
"vlef %%v20,4(%5),2 \n\t"
|
"vlef %%v20,4(%[x]),2\n\t"
|
||||||
"vflcsb %%v20,%%v20\n\t"
|
"vflcsb %%v20,%%v20\n\t"
|
||||||
"vlef %%v20,0(%5),1 \n\t"
|
"vlef %%v20,0(%[x]),1\n\t"
|
||||||
"vlef %%v20,0(%5),3 \n\t"
|
"vlef %%v20,0(%[x]),3\n\t"
|
||||||
|
"vlef %%v21,12(%[x]),0\n\t"
|
||||||
"vlef %%v21,12(%5),0 \n\t"
|
"vlef %%v21,12(%[x]),2\n\t"
|
||||||
"vlef %%v21,12(%5),2 \n\t"
|
|
||||||
"vflcsb %%v21,%%v21\n\t"
|
"vflcsb %%v21,%%v21\n\t"
|
||||||
"vlef %%v21,8(%5),1 \n\t"
|
"vlef %%v21,8(%[x]),1\n\t"
|
||||||
"vlef %%v21,8(%5),3 \n\t"
|
"vlef %%v21,8(%[x]),3\n\t"
|
||||||
|
"vlef %%v22,20(%[x]),0\n\t"
|
||||||
"vlef %%v22,20(%5),0 \n\t"
|
"vlef %%v22,20(%[x]),2\n\t"
|
||||||
"vlef %%v22,20(%5),2 \n\t"
|
|
||||||
"vflcsb %%v22,%%v22\n\t"
|
"vflcsb %%v22,%%v22\n\t"
|
||||||
"vlef %%v22,16(%5),1 \n\t"
|
"vlef %%v22,16(%[x]),1\n\t"
|
||||||
"vlef %%v22,16(%5),3 \n\t"
|
"vlef %%v22,16(%[x]),3\n\t"
|
||||||
|
"vlef %%v23,28(%[x]),0\n\t"
|
||||||
"vlef %%v23,28(%5),0 \n\t"
|
"vlef %%v23,28(%[x]),2\n\t"
|
||||||
"vlef %%v23,28(%5),2 \n\t"
|
|
||||||
"vflcsb %%v23,%%v23\n\t"
|
"vflcsb %%v23,%%v23\n\t"
|
||||||
"vlef %%v23,24(%5),1 \n\t"
|
"vlef %%v23,24(%[x]),1\n\t"
|
||||||
"vlef %%v23,24(%5),3 \n\t"
|
"vlef %%v23,24(%[x]),3\n\t"
|
||||||
#else
|
#else
|
||||||
"vlef %%v20,0(%5),1 \n\t"
|
"vlef %%v20,0(%[x]),1\n\t"
|
||||||
"vlef %%v20,0(%5),3 \n\t"
|
"vlef %%v20,0(%[x]),3\n\t"
|
||||||
"vflcsb %%v20,%%v20\n\t"
|
"vflcsb %%v20,%%v20\n\t"
|
||||||
"vlef %%v20,4(%5),0 \n\t"
|
"vlef %%v20,4(%[x]),0\n\t"
|
||||||
"vlef %%v20,4(%5),2 \n\t"
|
"vlef %%v20,4(%[x]),2\n\t"
|
||||||
|
"vlef %%v21,8(%[x]),1\n\t"
|
||||||
"vlef %%v21,8(%5),1 \n\t"
|
"vlef %%v21,8(%[x]),3\n\t"
|
||||||
"vlef %%v21,8(%5),3 \n\t"
|
|
||||||
"vflcsb %%v21,%%v21\n\t"
|
"vflcsb %%v21,%%v21\n\t"
|
||||||
"vlef %%v21,12(%5),0 \n\t"
|
"vlef %%v21,12(%[x]),0\n\t"
|
||||||
"vlef %%v21,12(%5),2 \n\t"
|
"vlef %%v21,12(%[x]),2\n\t"
|
||||||
|
"vlef %%v22,16(%[x]),1\n\t"
|
||||||
"vlef %%v22,16(%5),1 \n\t"
|
"vlef %%v22,16(%[x]),3\n\t"
|
||||||
"vlef %%v22,16(%5),3 \n\t"
|
|
||||||
"vflcsb %%v22,%%v22\n\t"
|
"vflcsb %%v22,%%v22\n\t"
|
||||||
"vlef %%v22,20(%5),0 \n\t"
|
"vlef %%v22,20(%[x]),0\n\t"
|
||||||
"vlef %%v22,20(%5),2 \n\t"
|
"vlef %%v22,20(%[x]),2\n\t"
|
||||||
|
"vlef %%v23,24(%[x]),1\n\t"
|
||||||
"vlef %%v23,24(%5),1 \n\t"
|
"vlef %%v23,24(%[x]),3\n\t"
|
||||||
"vlef %%v23,24(%5),3 \n\t"
|
|
||||||
"vflcsb %%v23,%%v23\n\t"
|
"vflcsb %%v23,%%v23\n\t"
|
||||||
"vlef %%v23,28(%5),0 \n\t"
|
"vlef %%v23,28(%[x]),0\n\t"
|
||||||
"vlef %%v23,28(%5),2 \n\t"
|
"vlef %%v23,28(%[x]),2\n\t"
|
||||||
#endif
|
#endif
|
||||||
|
"vleib %%v1,0,0\n\t"
|
||||||
|
"vleib %%v1,1,1\n\t"
|
||||||
|
"vleib %%v1,2,2\n\t"
|
||||||
|
"vleib %%v1,3,3\n\t"
|
||||||
|
"vleib %%v1,0,4\n\t"
|
||||||
|
"vleib %%v1,1,5\n\t"
|
||||||
|
"vleib %%v1,2,6\n\t"
|
||||||
|
"vleib %%v1,3,7\n\t"
|
||||||
|
"vleib %%v1,8,8\n\t"
|
||||||
|
"vleib %%v1,9,9\n\t"
|
||||||
|
"vleib %%v1,10,10\n\t"
|
||||||
|
"vleib %%v1,11,11\n\t"
|
||||||
|
"vleib %%v1,8,12\n\t"
|
||||||
|
"vleib %%v1,9,13\n\t"
|
||||||
|
"vleib %%v1,10,14\n\t"
|
||||||
|
"vleib %%v1,11,15\n\t"
|
||||||
|
"vleib %%v2,4,0\n\t"
|
||||||
|
"vleib %%v2,5,1\n\t"
|
||||||
|
"vleib %%v2,6,2\n\t"
|
||||||
|
"vleib %%v2,7,3\n\t"
|
||||||
|
"vleib %%v2,4,4\n\t"
|
||||||
|
"vleib %%v2,5,5\n\t"
|
||||||
|
"vleib %%v2,6,6\n\t"
|
||||||
|
"vleib %%v2,7,7\n\t"
|
||||||
|
"vleib %%v2,12,8\n\t"
|
||||||
|
"vleib %%v2,13,9\n\t"
|
||||||
|
"vleib %%v2,14,10\n\t"
|
||||||
|
"vleib %%v2,15,11\n\t"
|
||||||
|
"vleib %%v2,12,12\n\t"
|
||||||
|
"vleib %%v2,13,13\n\t"
|
||||||
|
"vleib %%v2,14,14\n\t"
|
||||||
|
"vleib %%v2,15,15\n\t"
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"srlg %%r0,%0,1 \n\t"
|
"srlg %[n],%[n],1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 1,1024(%%r1,%1) \n\t"
|
"pfd 1,1024(%%r1,%[ap0])\n\t"
|
||||||
"pfd 1,1024(%%r1,%2) \n\t"
|
"pfd 1,1024(%%r1,%[ap1])\n\t"
|
||||||
"pfd 1,1024(%%r1,%3) \n\t"
|
"pfd 1,1024(%%r1,%[ap2])\n\t"
|
||||||
"pfd 1,1024(%%r1,%4) \n\t"
|
"pfd 1,1024(%%r1,%[ap3])\n\t"
|
||||||
"pfd 2,1024(%%r1,%6) \n\t"
|
"pfd 2,1024(%%r1,%[y])\n\t"
|
||||||
|
"vl %%v24,0(%%r1,%[ap0])\n\t"
|
||||||
"vlef %%v24,0(%%r1,%1),0 \n\t"
|
"vperm %%v25,%%v24,%%v24,%%v2\n\t"
|
||||||
"vlef %%v24,0(%%r1,%1),1 \n\t"
|
"vperm %%v24,%%v24,%%v24,%%v1\n\t"
|
||||||
"vlef %%v24,8(%%r1,%1),2 \n\t"
|
"vl %%v26,0(%%r1,%[ap1])\n\t"
|
||||||
"vlef %%v24,8(%%r1,%1),3 \n\t"
|
"vperm %%v27,%%v26,%%v26,%%v2\n\t"
|
||||||
"vlef %%v25,4(%%r1,%1),0 \n\t"
|
"vperm %%v26,%%v26,%%v26,%%v1\n\t"
|
||||||
"vlef %%v25,4(%%r1,%1),1 \n\t"
|
"vl %%v0,0(%%r1,%[y])\n\t"
|
||||||
"vlef %%v25,12(%%r1,%1),2 \n\t"
|
|
||||||
"vlef %%v25,12(%%r1,%1),3 \n\t"
|
|
||||||
"vlef %%v26,0(%%r1,%2),0 \n\t"
|
|
||||||
"vlef %%v26,0(%%r1,%2),1 \n\t"
|
|
||||||
"vlef %%v26,8(%%r1,%2),2 \n\t"
|
|
||||||
"vlef %%v26,8(%%r1,%2),3 \n\t"
|
|
||||||
"vlef %%v27,4(%%r1,%2),0 \n\t"
|
|
||||||
"vlef %%v27,4(%%r1,%2),1 \n\t"
|
|
||||||
"vlef %%v27,12(%%r1,%2),2 \n\t"
|
|
||||||
"vlef %%v27,12(%%r1,%2),3 \n\t"
|
|
||||||
|
|
||||||
"vl %%v0,0(%%r1,%6) \n\t"
|
|
||||||
"vfmasb %%v0,%%v24,%%v16,%%v0\n\t"
|
"vfmasb %%v0,%%v24,%%v16,%%v0\n\t"
|
||||||
"vfmasb %%v0,%%v25,%%v20,%%v0\n\t"
|
"vfmasb %%v0,%%v25,%%v20,%%v0\n\t"
|
||||||
"vfmasb %%v0,%%v26,%%v17,%%v0\n\t"
|
"vfmasb %%v0,%%v26,%%v17,%%v0\n\t"
|
||||||
"vfmasb %%v0,%%v27,%%v21,%%v0\n\t"
|
"vfmasb %%v0,%%v27,%%v21,%%v0\n\t"
|
||||||
|
"vl %%v28,0(%%r1,%[ap2])\n\t"
|
||||||
"vlef %%v28,0(%%r1,%3),0 \n\t"
|
"vperm %%v29,%%v28,%%v28,%%v2\n\t"
|
||||||
"vlef %%v28,0(%%r1,%3),1 \n\t"
|
"vperm %%v28,%%v28,%%v28,%%v1\n\t"
|
||||||
"vlef %%v28,8(%%r1,%3),2 \n\t"
|
"vl %%v30,0(%%r1,%[ap3])\n\t"
|
||||||
"vlef %%v28,8(%%r1,%3),3 \n\t"
|
"vperm %%v31,%%v30,%%v30,%%v2\n\t"
|
||||||
"vlef %%v29,4(%%r1,%3),0 \n\t"
|
"vperm %%v30,%%v30,%%v30,%%v1\n\t"
|
||||||
"vlef %%v29,4(%%r1,%3),1 \n\t"
|
|
||||||
"vlef %%v29,12(%%r1,%3),2 \n\t"
|
|
||||||
"vlef %%v29,12(%%r1,%3),3 \n\t"
|
|
||||||
"vlef %%v30,0(%%r1,%4),0 \n\t"
|
|
||||||
"vlef %%v30,0(%%r1,%4),1 \n\t"
|
|
||||||
"vlef %%v30,8(%%r1,%4),2 \n\t"
|
|
||||||
"vlef %%v30,8(%%r1,%4),3 \n\t"
|
|
||||||
"vlef %%v31,4(%%r1,%4),0 \n\t"
|
|
||||||
"vlef %%v31,4(%%r1,%4),1 \n\t"
|
|
||||||
"vlef %%v31,12(%%r1,%4),2 \n\t"
|
|
||||||
"vlef %%v31,12(%%r1,%4),3 \n\t"
|
|
||||||
|
|
||||||
"vfmasb %%v0,%%v28,%%v18,%%v0\n\t"
|
"vfmasb %%v0,%%v28,%%v18,%%v0\n\t"
|
||||||
"vfmasb %%v0,%%v29,%%v22,%%v0\n\t"
|
"vfmasb %%v0,%%v29,%%v22,%%v0\n\t"
|
||||||
"vfmasb %%v0,%%v30,%%v19,%%v0\n\t"
|
"vfmasb %%v0,%%v30,%%v19,%%v0\n\t"
|
||||||
"vfmasb %%v0,%%v31,%%v23,%%v0\n\t"
|
"vfmasb %%v0,%%v31,%%v23,%%v0\n\t"
|
||||||
"vst %%v0,0(%%r1,%6) \n\t"
|
"vst %%v0,0(%%r1,%[y])\n\t"
|
||||||
|
|
||||||
"agfi %%r1,16\n\t"
|
"agfi %%r1,16\n\t"
|
||||||
"brctg %%r0,0b \n\t"
|
"brctg %[n],0b\n\t"
|
||||||
:
|
: "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n)
|
||||||
:"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZR"((const FLOAT (*)[n * 2])ap[2]),"ZR"((const FLOAT (*)[n * 2])ap[3]),"ZQ"((const FLOAT (*)[8])x),"ZR"((FLOAT (*)[n * 2])y)
|
: [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0),
|
||||||
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
"m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1),
|
||||||
);
|
"m"(*(const struct { FLOAT x[n * 2]; } *) ap2),[ap2] "a"(ap2),
|
||||||
|
"m"(*(const struct { FLOAT x[n * 2]; } *) ap3),[ap3] "a"(ap3),
|
||||||
|
"m"(*(const struct { FLOAT x[8]; } *) x),[x] "a"(x)
|
||||||
|
: "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20",
|
||||||
|
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
|
||||||
|
"v31");
|
||||||
}
|
}
|
||||||
|
|
||||||
static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
|
static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) {
|
||||||
{
|
register FLOAT *ap0 = ap[0];
|
||||||
__asm__ volatile (
|
register FLOAT *ap1 = ap[1];
|
||||||
"vlrepg %%v16,0(%3) \n\t"
|
|
||||||
"vlrepg %%v17,8(%3) \n\t"
|
__asm__("vlrepg %%v16,0(%[x])\n\t"
|
||||||
|
"vlrepg %%v17,8(%[x])\n\t"
|
||||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||||
"vlef %%v18,4(%3),0 \n\t"
|
"vlef %%v18,4(%[x]),0\n\t"
|
||||||
"vlef %%v18,4(%3),2 \n\t"
|
"vlef %%v18,4(%[x]),2\n\t"
|
||||||
"vflcsb %%v18,%%v18\n\t"
|
"vflcsb %%v18,%%v18\n\t"
|
||||||
"vlef %%v18,0(%3),1 \n\t"
|
"vlef %%v18,0(%[x]),1\n\t"
|
||||||
"vlef %%v18,0(%3),3 \n\t"
|
"vlef %%v18,0(%[x]),3\n\t"
|
||||||
|
"vlef %%v19,12(%[x]),0\n\t"
|
||||||
"vlef %%v19,12(%3),0 \n\t"
|
"vlef %%v19,12(%[x]),2\n\t"
|
||||||
"vlef %%v19,12(%3),2 \n\t"
|
|
||||||
"vflcsb %%v19,%%v19\n\t"
|
"vflcsb %%v19,%%v19\n\t"
|
||||||
"vlef %%v19,8(%3),1 \n\t"
|
"vlef %%v19,8(%[x]),1\n\t"
|
||||||
"vlef %%v19,8(%3),3 \n\t"
|
"vlef %%v19,8(%[x]),3\n\t"
|
||||||
#else
|
#else
|
||||||
"vlef %%v18,0(%3),1 \n\t"
|
"vlef %%v18,0(%[x]),1\n\t"
|
||||||
"vlef %%v18,0(%3),3 \n\t"
|
"vlef %%v18,0(%[x]),3\n\t"
|
||||||
"vflcsb %%v18,%%v18\n\t"
|
"vflcsb %%v18,%%v18\n\t"
|
||||||
"vlef %%v18,4(%3),0 \n\t"
|
"vlef %%v18,4(%[x]),0\n\t"
|
||||||
"vlef %%v18,4(%3),2 \n\t"
|
"vlef %%v18,4(%[x]),2\n\t"
|
||||||
|
"vlef %%v19,8(%[x]),1\n\t"
|
||||||
"vlef %%v19,8(%3),1 \n\t"
|
"vlef %%v19,8(%[x]),3\n\t"
|
||||||
"vlef %%v19,8(%3),3 \n\t"
|
|
||||||
"vflcsb %%v19,%%v19\n\t"
|
"vflcsb %%v19,%%v19\n\t"
|
||||||
"vlef %%v19,12(%3),0 \n\t"
|
"vlef %%v19,12(%[x]),0\n\t"
|
||||||
"vlef %%v19,12(%3),2 \n\t"
|
"vlef %%v19,12(%[x]),2\n\t"
|
||||||
#endif
|
#endif
|
||||||
|
"vleib %%v1,0,0\n\t"
|
||||||
|
"vleib %%v1,1,1\n\t"
|
||||||
|
"vleib %%v1,2,2\n\t"
|
||||||
|
"vleib %%v1,3,3\n\t"
|
||||||
|
"vleib %%v1,0,4\n\t"
|
||||||
|
"vleib %%v1,1,5\n\t"
|
||||||
|
"vleib %%v1,2,6\n\t"
|
||||||
|
"vleib %%v1,3,7\n\t"
|
||||||
|
"vleib %%v1,8,8\n\t"
|
||||||
|
"vleib %%v1,9,9\n\t"
|
||||||
|
"vleib %%v1,10,10\n\t"
|
||||||
|
"vleib %%v1,11,11\n\t"
|
||||||
|
"vleib %%v1,8,12\n\t"
|
||||||
|
"vleib %%v1,9,13\n\t"
|
||||||
|
"vleib %%v1,10,14\n\t"
|
||||||
|
"vleib %%v1,11,15\n\t"
|
||||||
|
"vleib %%v2,4,0\n\t"
|
||||||
|
"vleib %%v2,5,1\n\t"
|
||||||
|
"vleib %%v2,6,2\n\t"
|
||||||
|
"vleib %%v2,7,3\n\t"
|
||||||
|
"vleib %%v2,4,4\n\t"
|
||||||
|
"vleib %%v2,5,5\n\t"
|
||||||
|
"vleib %%v2,6,6\n\t"
|
||||||
|
"vleib %%v2,7,7\n\t"
|
||||||
|
"vleib %%v2,12,8\n\t"
|
||||||
|
"vleib %%v2,13,9\n\t"
|
||||||
|
"vleib %%v2,14,10\n\t"
|
||||||
|
"vleib %%v2,15,11\n\t"
|
||||||
|
"vleib %%v2,12,12\n\t"
|
||||||
|
"vleib %%v2,13,13\n\t"
|
||||||
|
"vleib %%v2,14,14\n\t"
|
||||||
|
"vleib %%v2,15,15\n\t"
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"srlg %%r0,%0,1 \n\t"
|
"srlg %[n],%[n],1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 1,1024(%%r1,%1) \n\t"
|
"pfd 1,1024(%%r1,%[ap0])\n\t"
|
||||||
"pfd 1,1024(%%r1,%2) \n\t"
|
"pfd 1,1024(%%r1,%[ap1])\n\t"
|
||||||
"pfd 2,1024(%%r1,%4) \n\t"
|
"pfd 2,1024(%%r1,%[y])\n\t"
|
||||||
|
"vl %%v20,0(%%r1,%[ap0])\n\t"
|
||||||
"vlef %%v20,0(%%r1,%1),0 \n\t"
|
"vperm %%v21,%%v20,%%v20,%%v2\n\t"
|
||||||
"vlef %%v20,0(%%r1,%1),1 \n\t"
|
"vperm %%v20,%%v20,%%v20,%%v1\n\t"
|
||||||
"vlef %%v20,8(%%r1,%1),2 \n\t"
|
"vl %%v22,0(%%r1,%[ap1])\n\t"
|
||||||
"vlef %%v20,8(%%r1,%1),3 \n\t"
|
"vperm %%v23,%%v22,%%v22,%%v2\n\t"
|
||||||
"vlef %%v21,4(%%r1,%1),0 \n\t"
|
"vperm %%v22,%%v22,%%v22,%%v1\n\t"
|
||||||
"vlef %%v21,4(%%r1,%1),1 \n\t"
|
"vl %%v0,0(%%r1,%[y])\n\t"
|
||||||
"vlef %%v21,12(%%r1,%1),2 \n\t"
|
|
||||||
"vlef %%v21,12(%%r1,%1),3 \n\t"
|
|
||||||
"vlef %%v22,0(%%r1,%2),0 \n\t"
|
|
||||||
"vlef %%v22,0(%%r1,%2),1 \n\t"
|
|
||||||
"vlef %%v22,8(%%r1,%2),2 \n\t"
|
|
||||||
"vlef %%v22,8(%%r1,%2),3 \n\t"
|
|
||||||
"vlef %%v23,4(%%r1,%2),0 \n\t"
|
|
||||||
"vlef %%v23,4(%%r1,%2),1 \n\t"
|
|
||||||
"vlef %%v23,12(%%r1,%2),2 \n\t"
|
|
||||||
"vlef %%v23,12(%%r1,%2),3 \n\t"
|
|
||||||
|
|
||||||
"vl %%v0,0(%%r1,%4) \n\t"
|
|
||||||
"vfmasb %%v0,%%v20,%%v16,%%v0\n\t"
|
"vfmasb %%v0,%%v20,%%v16,%%v0\n\t"
|
||||||
"vfmasb %%v0,%%v21,%%v18,%%v0\n\t"
|
"vfmasb %%v0,%%v21,%%v18,%%v0\n\t"
|
||||||
"vfmasb %%v0,%%v22,%%v17,%%v0\n\t"
|
"vfmasb %%v0,%%v22,%%v17,%%v0\n\t"
|
||||||
"vfmasb %%v0,%%v23,%%v19,%%v0\n\t"
|
"vfmasb %%v0,%%v23,%%v19,%%v0\n\t"
|
||||||
"vst %%v0,0(%%r1,%4) \n\t"
|
"vst %%v0,0(%%r1,%[y])\n\t"
|
||||||
|
|
||||||
"agfi %%r1,16\n\t"
|
"agfi %%r1,16\n\t"
|
||||||
"brctg %%r0,0b \n\t"
|
"brctg %[n],0b\n\t"
|
||||||
:
|
: "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n)
|
||||||
:"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZQ"((const FLOAT (*)[4])x),"ZR"((FLOAT (*)[n * 2])y)
|
: [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0),
|
||||||
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23"
|
"m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1),
|
||||||
);
|
"m"(*(const struct { FLOAT x[4]; } *) x),[x] "a"(x)
|
||||||
|
: "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20",
|
||||||
|
"v21", "v22", "v23");
|
||||||
}
|
}
|
||||||
|
|
||||||
static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
|
static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) {
|
||||||
{
|
__asm__("vlrepg %%v16,0(%[x])\n\t"
|
||||||
__asm__ volatile (
|
|
||||||
"vlrepg %%v16,0(%2) \n\t"
|
|
||||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||||
"vlef %%v17,4(%2),0 \n\t"
|
"vlef %%v17,4(%[x]),0\n\t"
|
||||||
"vlef %%v17,4(%2),2 \n\t"
|
"vlef %%v17,4(%[x]),2\n\t"
|
||||||
"vflcsb %%v17,%%v17\n\t"
|
"vflcsb %%v17,%%v17\n\t"
|
||||||
"vlef %%v17,0(%2),1 \n\t"
|
"vlef %%v17,0(%[x]),1\n\t"
|
||||||
"vlef %%v17,0(%2),3 \n\t"
|
"vlef %%v17,0(%[x]),3\n\t"
|
||||||
#else
|
#else
|
||||||
"vlef %%v17,0(%2),1 \n\t"
|
"vlef %%v17,0(%[x]),1\n\t"
|
||||||
"vlef %%v17,0(%2),3 \n\t"
|
"vlef %%v17,0(%[x]),3\n\t"
|
||||||
"vflcsb %%v17,%%v17\n\t"
|
"vflcsb %%v17,%%v17\n\t"
|
||||||
"vlef %%v17,4(%2),0 \n\t"
|
"vlef %%v17,4(%[x]),0\n\t"
|
||||||
"vlef %%v17,4(%2),2 \n\t"
|
"vlef %%v17,4(%[x]),2\n\t"
|
||||||
#endif
|
#endif
|
||||||
|
"vleib %%v1,0,0\n\t"
|
||||||
|
"vleib %%v1,1,1\n\t"
|
||||||
|
"vleib %%v1,2,2\n\t"
|
||||||
|
"vleib %%v1,3,3\n\t"
|
||||||
|
"vleib %%v1,0,4\n\t"
|
||||||
|
"vleib %%v1,1,5\n\t"
|
||||||
|
"vleib %%v1,2,6\n\t"
|
||||||
|
"vleib %%v1,3,7\n\t"
|
||||||
|
"vleib %%v1,8,8\n\t"
|
||||||
|
"vleib %%v1,9,9\n\t"
|
||||||
|
"vleib %%v1,10,10\n\t"
|
||||||
|
"vleib %%v1,11,11\n\t"
|
||||||
|
"vleib %%v1,8,12\n\t"
|
||||||
|
"vleib %%v1,9,13\n\t"
|
||||||
|
"vleib %%v1,10,14\n\t"
|
||||||
|
"vleib %%v1,11,15\n\t"
|
||||||
|
"vleib %%v2,4,0\n\t"
|
||||||
|
"vleib %%v2,5,1\n\t"
|
||||||
|
"vleib %%v2,6,2\n\t"
|
||||||
|
"vleib %%v2,7,3\n\t"
|
||||||
|
"vleib %%v2,4,4\n\t"
|
||||||
|
"vleib %%v2,5,5\n\t"
|
||||||
|
"vleib %%v2,6,6\n\t"
|
||||||
|
"vleib %%v2,7,7\n\t"
|
||||||
|
"vleib %%v2,12,8\n\t"
|
||||||
|
"vleib %%v2,13,9\n\t"
|
||||||
|
"vleib %%v2,14,10\n\t"
|
||||||
|
"vleib %%v2,15,11\n\t"
|
||||||
|
"vleib %%v2,12,12\n\t"
|
||||||
|
"vleib %%v2,13,13\n\t"
|
||||||
|
"vleib %%v2,14,14\n\t"
|
||||||
|
"vleib %%v2,15,15\n\t"
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"srlg %%r0,%0,1 \n\t"
|
"srlg %[n],%[n],1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 1,1024(%%r1,%1) \n\t"
|
"pfd 1,1024(%%r1,%[ap])\n\t"
|
||||||
"pfd 2,1024(%%r1,%3) \n\t"
|
"pfd 2,1024(%%r1,%[y])\n\t"
|
||||||
|
"vl %%v18,0(%%r1,%[ap])\n\t"
|
||||||
"vlef %%v18,0(%%r1,%1),0 \n\t"
|
"vperm %%v19,%%v18,%%v18,%%v2\n\t"
|
||||||
"vlef %%v18,0(%%r1,%1),1 \n\t"
|
"vperm %%v18,%%v18,%%v18,%%v1\n\t"
|
||||||
"vlef %%v18,8(%%r1,%1),2 \n\t"
|
"vl %%v0,0(%%r1,%[y])\n\t"
|
||||||
"vlef %%v18,8(%%r1,%1),3 \n\t"
|
|
||||||
"vlef %%v19,4(%%r1,%1),0 \n\t"
|
|
||||||
"vlef %%v19,4(%%r1,%1),1 \n\t"
|
|
||||||
"vlef %%v19,12(%%r1,%1),2 \n\t"
|
|
||||||
"vlef %%v19,12(%%r1,%1),3 \n\t"
|
|
||||||
|
|
||||||
"vl %%v0,0(%%r1,%3) \n\t"
|
|
||||||
"vfmasb %%v0,%%v18,%%v16,%%v0\n\t"
|
"vfmasb %%v0,%%v18,%%v16,%%v0\n\t"
|
||||||
"vfmasb %%v0,%%v19,%%v17,%%v0\n\t"
|
"vfmasb %%v0,%%v19,%%v17,%%v0\n\t"
|
||||||
"vst %%v0,0(%%r1,%3) \n\t"
|
"vst %%v0,0(%%r1,%[y])\n\t"
|
||||||
|
|
||||||
"agfi %%r1,16\n\t"
|
"agfi %%r1,16\n\t"
|
||||||
"brctg %%r0,0b \n\t"
|
"brctg %[n],0b\n\t"
|
||||||
:
|
: "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n)
|
||||||
:"r"(n),"ZR"((const FLOAT (*)[n * 2])ap),"ZQ"((const FLOAT (*)[2])x),"ZR"((FLOAT (*)[n * 2])y)
|
: [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap),[ap] "a"(ap),
|
||||||
:"memory","cc","r0","r1","v0","v16","v17","v18","v19"
|
"m"(*(const struct { FLOAT x[2]; } *) x),[x] "a"(x)
|
||||||
);
|
: "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19");
|
||||||
}
|
}
|
||||||
|
|
||||||
static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, FLOAT alpha_i)
|
/*
 * add_y_4: dest += alpha * src, computed with z/Architecture vector
 * instructions on 32-bit elements.
 *
 * src and dest are read as interleaved pairs of 32-bit values; the lane
 * constants built below combine each pair as one complex number
 * (first = real, second = imaginary) multiplied by alpha_r + i*alpha_i.
 *
 *   n        number of pairs; the loop runs n >> 2 times and each pass
 *            covers 32 bytes (4 pairs) of src and of dest
 *   src      input, read only
 *   dest     accumulator, read and written in place
 *   alpha_r, alpha_i  scalar factor, read from memory by vlrepf/vlef
 *
 * Per pair, as built from the lane constants:
 *   !XCONJ: dest_re += alpha_r*src_re - alpha_i*src_im
 *           dest_im += alpha_r*src_im + alpha_i*src_re
 *   XCONJ:  dest_re += alpha_r*src_re + alpha_i*src_im
 *           dest_im += alpha_i*src_re - alpha_r*src_im
 *
 * NOTE(review): the loop is bottom-tested (brctg) with count n >> 2, so it
 * looks like n must be at least 4, and any n % 4 trailing pairs are not
 * processed here -- confirm that callers guarantee this.
 */
static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r,
|
||||||
{
|
FLOAT alpha_i) {
|
||||||
__asm__ volatile (
|
__asm__(
|
||||||
#if !defined(XCONJ)
|
/*
 * Lane constants.
 * !XCONJ: v0 = alpha_r in every lane; v1 = (-alpha_i, alpha_i, -alpha_i,
 *         alpha_i): lanes 0 and 2 are loaded first, vflcsb negates the whole
 *         register, then lanes 1 and 3 are overwritten with +alpha_i.
 * XCONJ:  v0 = (alpha_r, -alpha_r, alpha_r, -alpha_r) built the same way;
 *         v1 = alpha_i in every lane.
 */
#if !defined(XCONJ)
|
||||||
"vlrepf %%v0,%3 \n\t"
|
"vlrepf %%v0,%[alpha_r]\n\t"
|
||||||
"vlef %%v1,%4,0 \n\t"
|
"vlef %%v1,%[alpha_i],0\n\t"
|
||||||
"vlef %%v1,%4,2 \n\t"
|
"vlef %%v1,%[alpha_i],2\n\t"
|
||||||
"vflcsb %%v1,%%v1\n\t"
|
"vflcsb %%v1,%%v1\n\t"
|
||||||
"vlef %%v1,%4,1 \n\t"
|
"vlef %%v1,%[alpha_i],1\n\t"
|
||||||
"vlef %%v1,%4,3 \n\t"
|
"vlef %%v1,%[alpha_i],3\n\t"
|
||||||
#else
|
#else
|
||||||
"vlef %%v0,%3,1 \n\t"
|
"vlef %%v0,%[alpha_r],1\n\t"
|
||||||
"vlef %%v0,%3,3 \n\t"
|
"vlef %%v0,%[alpha_r],3\n\t"
|
||||||
"vflcsb %%v0,%%v0\n\t"
|
"vflcsb %%v0,%%v0\n\t"
|
||||||
"vlef %%v0,%3,0 \n\t"
|
"vlef %%v0,%[alpha_r],0\n\t"
|
||||||
"vlef %%v0,%3,2 \n\t"
|
"vlef %%v0,%[alpha_r],2\n\t"
|
||||||
"vlrepf %%v1,%4 \n\t"
|
"vlrepf %%v1,%[alpha_i]\n\t"
|
||||||
#endif
|
#endif
|
||||||
"xgr %%r1,%%r1\n\t"
|
/* r1 = 0: running byte offset used as the index for src and dest. */
"xgr %%r1,%%r1\n\t"
|
||||||
"srlg %%r0,%0,2 \n\t"
|
/* Loop count = n >> 2. This version shifts n in place (hence its "+&r"
   operand); the earlier version put the count in r0 instead. */
"srlg %[n],%[n],2\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 1,1024(%%r1,%1) \n\t"
|
/* Prefetch hints 1024 bytes ahead of the current offset
   (code 1 on src, code 2 on dest). */
"pfd 1,1024(%%r1,%[src])\n\t"
|
||||||
"pfd 2,1024(%%r1,%2) \n\t"
|
"pfd 2,1024(%%r1,%[dest])\n\t"
|
||||||
|
/* v16,v17 = 32 bytes of src; v18,v19 = the matching 32 bytes of dest. */
"vl %%v16,0(%%r1,%[src])\n\t"
|
||||||
"vl %%v16,0(%%r1,%1) \n\t"
|
"vl %%v17,16(%%r1,%[src])\n\t"
|
||||||
"vl %%v17,16(%%r1,%1) \n\t"
|
"vl %%v18,0(%%r1,%[dest])\n\t"
|
||||||
"vl %%v18,0(%%r1,%2) \n\t"
|
"vl %%v19,16(%%r1,%[dest])\n\t"
|
||||||
"vl %%v19,16(%%r1,%2) \n\t"
|
|
||||||
"verllg %%v20,%%v16,32\n\t"
|
/* Rotating each 64-bit element by 32 bits swaps the two 32-bit halves of
   every pair: v20,v21 = src with real and imaginary parts exchanged. */
"verllg %%v20,%%v16,32\n\t"
|
||||||
"verllg %%v21,%%v17,32\n\t"
|
"verllg %%v21,%%v17,32\n\t"
|
||||||
|
|
||||||
"vfmasb %%v22,%%v16,%%v0,%%v18\n\t"
|
/* First multiply-add: v22,v23 = src * v0 + dest. */
"vfmasb %%v22,%%v16,%%v0,%%v18\n\t"
|
||||||
"vfmasb %%v23,%%v17,%%v0,%%v19\n\t"
|
"vfmasb %%v23,%%v17,%%v0,%%v19\n\t"
|
||||||
|
|
||||||
"vfmasb %%v22,%%v20,%%v1,%%v22\n\t"
|
/* Second multiply-add: v22,v23 += swapped(src) * v1 (the cross terms). */
"vfmasb %%v22,%%v20,%%v1,%%v22\n\t"
|
||||||
"vfmasb %%v23,%%v21,%%v1,%%v23\n\t"
|
"vfmasb %%v23,%%v21,%%v1,%%v23\n\t"
|
||||||
|
/* Write the updated 32 bytes back to dest. */
"vst %%v22,0(%%r1,%[dest])\n\t"
|
||||||
"vst %%v22,0(%%r1,%2) \n\t"
|
"vst %%v23,16(%%r1,%[dest])\n\t"
|
||||||
"vst %%v23,16(%%r1,%2) \n\t"
|
|
||||||
|
|
||||||
"agfi %%r1,32\n\t"
|
/* Advance the byte offset by 32, then decrement the count and loop
   while it is non-zero. */
"agfi %%r1,32\n\t"
|
||||||
"brctg %%r0,0b "
|
"brctg %[n],0b"
|
||||||
:
|
/*
 * Operands: dest is an in/out memory operand and src an input memory
 * operand, each sized as n * 2 FLOATs, so the compiler is told which memory
 * the asm touches; the earlier version used a blanket "memory" clobber
 * instead. n is "+&r" because the asm overwrites it. src and dest are also
 * passed as address registers ("a") for the indexed addressing above.
 */
: "+m"(*(struct { FLOAT x[n * 2]; } *) dest),[n] "+&r"(n)
|
||||||
:"r"(n),"ZR"((const FLOAT (*)[n * 2])src),"ZR"((FLOAT (*)[n * 2])dest),"m"(alpha_r),"m"(alpha_i)
|
: [dest] "a"(dest), "m"(*(const struct { FLOAT x[n * 2]; } *) src),
|
||||||
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23"
|
[src] "a"(src),[alpha_r] "Q"(alpha_r),[alpha_i] "Q"(alpha_i)
|
||||||
);
|
/* Clobbers: condition code, the offset register r1, and every vector
   register written above. */
: "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
|
||||||
|
"v22", "v23");
|
||||||
}
|
}
|
||||||
|
|
||||||
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i)
|
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,
|
||||||
{
|
FLOAT alpha_r, FLOAT alpha_i) {
|
||||||
BLASLONG i;
|
BLASLONG i;
|
||||||
|
|
||||||
if ( inc_dest != 2 )
|
if (inc_dest != 2) {
|
||||||
{
|
|
||||||
|
|
||||||
FLOAT temp_r;
|
FLOAT temp_r;
|
||||||
FLOAT temp_i;
|
FLOAT temp_i;
|
||||||
for ( i=0; i<n; i++ )
|
for (i = 0; i < n; i++) {
|
||||||
{
|
|
||||||
#if !defined(XCONJ)
|
#if !defined(XCONJ)
|
||||||
temp_r = alpha_r * src[0] - alpha_i * src[1];
|
temp_r = alpha_r * src[0] - alpha_i * src[1];
|
||||||
temp_i = alpha_r * src[1] + alpha_i * src[0];
|
temp_i = alpha_r * src[1] + alpha_i * src[0];
|
||||||
|
|
@ -343,8 +386,9 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT
|
||||||
add_y_4(n, src, dest, alpha_r, alpha_i);
|
add_y_4(n, src, dest, alpha_r, alpha_i);
|
||||||
}
|
}
|
||||||
|
|
||||||
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
|
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
|
||||||
{
|
FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y,
|
||||||
|
BLASLONG inc_y, FLOAT *buffer) {
|
||||||
BLASLONG i;
|
BLASLONG i;
|
||||||
FLOAT *a_ptr;
|
FLOAT *a_ptr;
|
||||||
FLOAT *x_ptr;
|
FLOAT *x_ptr;
|
||||||
|
|
@ -358,8 +402,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
|
||||||
BLASLONG lda4;
|
BLASLONG lda4;
|
||||||
FLOAT xbuffer[8], *ybuffer;
|
FLOAT xbuffer[8], *ybuffer;
|
||||||
|
|
||||||
if ( m < 1 ) return(0);
|
if (m < 1)
|
||||||
if ( n < 1 ) return(0);
|
return (0);
|
||||||
|
if (n < 1)
|
||||||
|
return (0);
|
||||||
|
|
||||||
ybuffer = buffer;
|
ybuffer = buffer;
|
||||||
|
|
||||||
|
|
@ -379,13 +425,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
|
||||||
|
|
||||||
BLASLONG NB = NBMAX;
|
BLASLONG NB = NBMAX;
|
||||||
|
|
||||||
while ( NB == NBMAX )
|
while (NB == NBMAX) {
|
||||||
{
|
|
||||||
|
|
||||||
m1 -= NB;
|
m1 -= NB;
|
||||||
if ( m1 < 0)
|
if (m1 < 0) {
|
||||||
{
|
if (m2 == 0)
|
||||||
if ( m2 == 0 ) break;
|
break;
|
||||||
NB = m2;
|
NB = m2;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -398,11 +443,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
|
||||||
//zero_y(NB,ybuffer);
|
//zero_y(NB,ybuffer);
|
||||||
memset(ybuffer, 0, NB * 8);
|
memset(ybuffer, 0, NB * 8);
|
||||||
|
|
||||||
if ( inc_x == 2 )
|
if (inc_x == 2) {
|
||||||
{
|
|
||||||
|
|
||||||
for( i = 0; i < n1 ; i++)
|
for (i = 0; i < n1; i++) {
|
||||||
{
|
|
||||||
cgemv_kernel_4x4(NB, ap, x_ptr, ybuffer);
|
cgemv_kernel_4x4(NB, ap, x_ptr, ybuffer);
|
||||||
ap[0] += lda4;
|
ap[0] += lda4;
|
||||||
ap[1] += lda4;
|
ap[1] += lda4;
|
||||||
|
|
@ -412,27 +455,22 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
|
||||||
x_ptr += 8;
|
x_ptr += 8;
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( n2 & 2 )
|
if (n2 & 2) {
|
||||||
{
|
|
||||||
cgemv_kernel_4x2(NB, ap, x_ptr, ybuffer);
|
cgemv_kernel_4x2(NB, ap, x_ptr, ybuffer);
|
||||||
x_ptr += 4;
|
x_ptr += 4;
|
||||||
a_ptr += 2 * lda;
|
a_ptr += 2 * lda;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( n2 & 1 )
|
if (n2 & 1) {
|
||||||
{
|
|
||||||
cgemv_kernel_4x1(NB, a_ptr, x_ptr, ybuffer);
|
cgemv_kernel_4x1(NB, a_ptr, x_ptr, ybuffer);
|
||||||
/* x_ptr += 2;
|
/* x_ptr += 2;
|
||||||
a_ptr += lda; */
|
a_ptr += lda; */
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
|
|
||||||
for( i = 0; i < n1 ; i++)
|
for (i = 0; i < n1; i++) {
|
||||||
{
|
|
||||||
|
|
||||||
xbuffer[0] = x_ptr[0];
|
xbuffer[0] = x_ptr[0];
|
||||||
xbuffer[1] = x_ptr[1];
|
xbuffer[1] = x_ptr[1];
|
||||||
|
|
@ -455,8 +493,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
|
||||||
a_ptr += lda4;
|
a_ptr += lda4;
|
||||||
}
|
}
|
||||||
|
|
||||||
for( i = 0; i < n2 ; i++)
|
for (i = 0; i < n2; i++) {
|
||||||
{
|
|
||||||
xbuffer[0] = x_ptr[0];
|
xbuffer[0] = x_ptr[0];
|
||||||
xbuffer[1] = x_ptr[1];
|
xbuffer[1] = x_ptr[1];
|
||||||
x_ptr += inc_x;
|
x_ptr += inc_x;
|
||||||
|
|
@ -472,21 +509,18 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
|
||||||
y_ptr += NB * inc_y;
|
y_ptr += NB * inc_y;
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( m3 == 0 ) return(0);
|
if (m3 == 0)
|
||||||
|
return (0);
|
||||||
|
|
||||||
if ( m3 == 1 )
|
if (m3 == 1) {
|
||||||
{
|
|
||||||
a_ptr = a;
|
a_ptr = a;
|
||||||
x_ptr = x;
|
x_ptr = x;
|
||||||
FLOAT temp_r = 0.0;
|
FLOAT temp_r = 0.0;
|
||||||
FLOAT temp_i = 0.0;
|
FLOAT temp_i = 0.0;
|
||||||
|
|
||||||
if ( lda == 2 && inc_x == 2 )
|
if (lda == 2 && inc_x == 2) {
|
||||||
{
|
|
||||||
|
|
||||||
|
for (i = 0; i < (n & -2); i += 2) {
|
||||||
for( i=0 ; i < (n & -2); i+=2 )
|
|
||||||
{
|
|
||||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||||
temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
|
temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
|
||||||
temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
|
temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
|
||||||
|
|
@ -503,10 +537,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
|
||||||
x_ptr += 4;
|
x_ptr += 4;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
for (; i < n; i++) {
|
||||||
|
|
||||||
for( ; i < n; i++ )
|
|
||||||
{
|
|
||||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||||
temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
|
temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
|
||||||
temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
|
temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
|
||||||
|
|
@ -519,13 +550,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
|
||||||
x_ptr += 2;
|
x_ptr += 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
} else {
|
||||||
|
|
||||||
}
|
for (i = 0; i < n; i++) {
|
||||||
else
|
|
||||||
{
|
|
||||||
|
|
||||||
for( i = 0; i < n; i++ )
|
|
||||||
{
|
|
||||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||||
temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
|
temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
|
||||||
temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
|
temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
|
||||||
|
|
@ -549,8 +576,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
|
||||||
return (0);
|
return (0);
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( m3 == 2 )
|
if (m3 == 2) {
|
||||||
{
|
|
||||||
a_ptr = a;
|
a_ptr = a;
|
||||||
x_ptr = x;
|
x_ptr = x;
|
||||||
FLOAT temp_r0 = 0.0;
|
FLOAT temp_r0 = 0.0;
|
||||||
|
|
@ -558,11 +584,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
|
||||||
FLOAT temp_r1 = 0.0;
|
FLOAT temp_r1 = 0.0;
|
||||||
FLOAT temp_i1 = 0.0;
|
FLOAT temp_i1 = 0.0;
|
||||||
|
|
||||||
if ( lda == 4 && inc_x == 2 )
|
if (lda == 4 && inc_x == 2) {
|
||||||
{
|
|
||||||
|
|
||||||
for( i = 0; i < (n & -2); i+=2 )
|
for (i = 0; i < (n & -2); i += 2) {
|
||||||
{
|
|
||||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||||
|
|
||||||
temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
|
temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
|
||||||
|
|
@ -592,9 +616,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
|
||||||
x_ptr += 4;
|
x_ptr += 4;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
for (; i < n; i++) {
|
||||||
for( ; i < n; i++ )
|
|
||||||
{
|
|
||||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||||
temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
|
temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
|
||||||
temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
|
temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
|
||||||
|
|
@ -611,13 +633,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
|
||||||
x_ptr += 2;
|
x_ptr += 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
} else {
|
||||||
|
|
||||||
}
|
for (i = 0; i < n; i++) {
|
||||||
else
|
|
||||||
{
|
|
||||||
|
|
||||||
for( i=0 ; i < n; i++ )
|
|
||||||
{
|
|
||||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||||
temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
|
temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
|
||||||
temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
|
temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
|
||||||
|
|
@ -634,7 +652,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
|
||||||
x_ptr += inc_x;
|
x_ptr += inc_x;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
#if !defined(XCONJ)
|
#if !defined(XCONJ)
|
||||||
y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
|
y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
|
||||||
|
|
@ -652,9 +669,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
|
||||||
return (0);
|
return (0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (m3 == 3) {
|
||||||
if ( m3 == 3 )
|
|
||||||
{
|
|
||||||
a_ptr = a;
|
a_ptr = a;
|
||||||
x_ptr = x;
|
x_ptr = x;
|
||||||
FLOAT temp_r0 = 0.0;
|
FLOAT temp_r0 = 0.0;
|
||||||
|
|
@ -664,11 +679,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
|
||||||
FLOAT temp_r2 = 0.0;
|
FLOAT temp_r2 = 0.0;
|
||||||
FLOAT temp_i2 = 0.0;
|
FLOAT temp_i2 = 0.0;
|
||||||
|
|
||||||
if ( lda == 6 && inc_x == 2 )
|
if (lda == 6 && inc_x == 2) {
|
||||||
{
|
|
||||||
|
|
||||||
for( i=0 ; i < n; i++ )
|
for (i = 0; i < n; i++) {
|
||||||
{
|
|
||||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||||
temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
|
temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
|
||||||
temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
|
temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
|
||||||
|
|
@ -689,13 +702,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
|
||||||
x_ptr += 2;
|
x_ptr += 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
} else {
|
||||||
|
|
||||||
}
|
for (i = 0; i < n; i++) {
|
||||||
else
|
|
||||||
{
|
|
||||||
|
|
||||||
for( i = 0; i < n; i++ )
|
|
||||||
{
|
|
||||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||||
temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
|
temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
|
||||||
temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
|
temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/***************************************************************************
|
/***************************************************************************
|
||||||
Copyright (c) 2014, The OpenBLAS Project
|
Copyright (c) 2019, The OpenBLAS Project
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
modification, are permitted provided that the following conditions are
|
modification, are permitted provided that the following conditions are
|
||||||
|
|
@ -29,84 +29,101 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#define NBMAX 2048
|
#define NBMAX 2048
|
||||||
|
|
||||||
static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y,
|
||||||
{
|
FLOAT *alpha) {
|
||||||
__asm__ volatile (
|
register FLOAT *ap0 = ap[0];
|
||||||
"vzero %%v16 \n\t"
|
register FLOAT *ap1 = ap[1];
|
||||||
|
register FLOAT *ap2 = ap[2];
|
||||||
|
register FLOAT *ap3 = ap[3];
|
||||||
|
|
||||||
|
__asm__("vzero %%v16\n\t"
|
||||||
"vzero %%v17\n\t"
|
"vzero %%v17\n\t"
|
||||||
"vzero %%v18\n\t"
|
"vzero %%v18\n\t"
|
||||||
"vzero %%v19\n\t"
|
"vzero %%v19\n\t"
|
||||||
|
"vzero %%v20\n\t"
|
||||||
|
"vzero %%v21\n\t"
|
||||||
|
"vzero %%v22\n\t"
|
||||||
|
"vzero %%v23\n\t"
|
||||||
|
"vleib %%v2,0,0\n\t"
|
||||||
|
"vleib %%v2,1,1\n\t"
|
||||||
|
"vleib %%v2,2,2\n\t"
|
||||||
|
"vleib %%v2,3,3\n\t"
|
||||||
|
"vleib %%v2,0,4\n\t"
|
||||||
|
"vleib %%v2,1,5\n\t"
|
||||||
|
"vleib %%v2,2,6\n\t"
|
||||||
|
"vleib %%v2,3,7\n\t"
|
||||||
|
"vleib %%v2,8,8\n\t"
|
||||||
|
"vleib %%v2,9,9\n\t"
|
||||||
|
"vleib %%v2,10,10\n\t"
|
||||||
|
"vleib %%v2,11,11\n\t"
|
||||||
|
"vleib %%v2,8,12\n\t"
|
||||||
|
"vleib %%v2,9,13\n\t"
|
||||||
|
"vleib %%v2,10,14\n\t"
|
||||||
|
"vleib %%v2,11,15\n\t"
|
||||||
|
"vleib %%v3,4,0\n\t"
|
||||||
|
"vleib %%v3,5,1\n\t"
|
||||||
|
"vleib %%v3,6,2\n\t"
|
||||||
|
"vleib %%v3,7,3\n\t"
|
||||||
|
"vleib %%v3,4,4\n\t"
|
||||||
|
"vleib %%v3,5,5\n\t"
|
||||||
|
"vleib %%v3,6,6\n\t"
|
||||||
|
"vleib %%v3,7,7\n\t"
|
||||||
|
"vleib %%v3,12,8\n\t"
|
||||||
|
"vleib %%v3,13,9\n\t"
|
||||||
|
"vleib %%v3,14,10\n\t"
|
||||||
|
"vleib %%v3,15,11\n\t"
|
||||||
|
"vleib %%v3,12,12\n\t"
|
||||||
|
"vleib %%v3,13,13\n\t"
|
||||||
|
"vleib %%v3,14,14\n\t"
|
||||||
|
"vleib %%v3,15,15\n\t"
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"srlg %%r0,%0,1 \n\t"
|
"srlg %[n],%[n],1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 1,1024(%%r1,%1) \n\t"
|
"pfd 1,1024(%%r1,%[ap0])\n\t"
|
||||||
"pfd 1,1024(%%r1,%2) \n\t"
|
"pfd 1,1024(%%r1,%[ap1])\n\t"
|
||||||
"pfd 1,1024(%%r1,%3) \n\t"
|
"pfd 1,1024(%%r1,%[ap2])\n\t"
|
||||||
"pfd 1,1024(%%r1,%4) \n\t"
|
"pfd 1,1024(%%r1,%[ap3])\n\t"
|
||||||
"pfd 1,1024(%%r1,%5) \n\t"
|
"pfd 1,1024(%%r1,%[x])\n\t"
|
||||||
|
"vl %%v0,0(%%r1,%[x])\n\t"
|
||||||
"vl %%v20,0(%%r1,%5) \n\t"
|
|
||||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||||
"vlef %%v21,4(%%r1,%5),0 \n\t"
|
"vlef %%v1,4(%%r1,%[x]),0\n\t"
|
||||||
"vlef %%v21,12(%%r1,%5),2 \n\t"
|
"vlef %%v1,12(%%r1,%[x]),2\n\t"
|
||||||
"vflcsb %%v21,%%v21 \n\t"
|
"vflcsb %%v1,%%v1\n\t"
|
||||||
"vlef %%v21,0(%%r1,%5),1 \n\t"
|
"vlef %%v1,0(%%r1,%[x]),1\n\t"
|
||||||
"vlef %%v21,8(%%r1,%5),3 \n\t"
|
"vlef %%v1,8(%%r1,%[x]),3\n\t"
|
||||||
#else
|
#else
|
||||||
"vlef %%v21,0(%%r1,%5),1 \n\t"
|
"vlef %%v1,0(%%r1,%[x]),1\n\t"
|
||||||
"vlef %%v21,8(%%r1,%5),3 \n\t"
|
"vlef %%v1,8(%%r1,%[x]),3\n\t"
|
||||||
"vflcsb %%v21,%%v21 \n\t"
|
"vflcsb %%v1,%%v1\n\t"
|
||||||
"vlef %%v21,4(%%r1,%5),0 \n\t"
|
"vlef %%v1,4(%%r1,%[x]),0\n\t"
|
||||||
"vlef %%v21,12(%%r1,%5),2 \n\t"
|
"vlef %%v1,12(%%r1,%[x]),2\n\t"
|
||||||
#endif
|
#endif
|
||||||
|
"vl %%v24,0(%%r1,%[ap0])\n\t"
|
||||||
"vlef %%v22,0(%%r1,%1),0 \n\t"
|
"vperm %%v25,%%v24,%%v24,%%v3\n\t"
|
||||||
"vlef %%v22,0(%%r1,%1),1 \n\t"
|
"vperm %%v24,%%v24,%%v24,%%v2\n\t"
|
||||||
"vlef %%v22,8(%%r1,%1),2 \n\t"
|
"vl %%v26,0(%%r1,%[ap1])\n\t"
|
||||||
"vlef %%v22,8(%%r1,%1),3 \n\t"
|
"vperm %%v27,%%v26,%%v26,%%v3\n\t"
|
||||||
"vlef %%v23,4(%%r1,%1),0 \n\t"
|
"vperm %%v26,%%v26,%%v26,%%v2\n\t"
|
||||||
"vlef %%v23,4(%%r1,%1),1 \n\t"
|
"vl %%v28,0(%%r1,%[ap2])\n\t"
|
||||||
"vlef %%v23,12(%%r1,%1),2 \n\t"
|
"vperm %%v29,%%v28,%%v28,%%v3\n\t"
|
||||||
"vlef %%v23,12(%%r1,%1),3 \n\t"
|
"vperm %%v28,%%v28,%%v28,%%v2\n\t"
|
||||||
"vlef %%v24,0(%%r1,%2),0 \n\t"
|
"vl %%v30,0(%%r1,%[ap3])\n\t"
|
||||||
"vlef %%v24,0(%%r1,%2),1 \n\t"
|
"vperm %%v31,%%v30,%%v30,%%v3\n\t"
|
||||||
"vlef %%v24,8(%%r1,%2),2 \n\t"
|
"vperm %%v30,%%v30,%%v30,%%v2\n\t"
|
||||||
"vlef %%v24,8(%%r1,%2),3 \n\t"
|
"vfmasb %%v16,%%v24,%%v0,%%v16\n\t"
|
||||||
"vlef %%v25,4(%%r1,%2),0 \n\t"
|
"vfmasb %%v20,%%v25,%%v1,%%v20\n\t"
|
||||||
"vlef %%v25,4(%%r1,%2),1 \n\t"
|
"vfmasb %%v17,%%v26,%%v0,%%v17\n\t"
|
||||||
"vlef %%v25,12(%%r1,%2),2 \n\t"
|
"vfmasb %%v21,%%v27,%%v1,%%v21\n\t"
|
||||||
"vlef %%v25,12(%%r1,%2),3 \n\t"
|
"vfmasb %%v18,%%v28,%%v0,%%v18\n\t"
|
||||||
|
"vfmasb %%v22,%%v29,%%v1,%%v22\n\t"
|
||||||
"vfmasb %%v16,%%v22,%%v20,%%v16 \n\t"
|
"vfmasb %%v19,%%v30,%%v0,%%v19\n\t"
|
||||||
"vfmasb %%v16,%%v23,%%v21,%%v16 \n\t"
|
"vfmasb %%v23,%%v31,%%v1,%%v23\n\t"
|
||||||
"vfmasb %%v17,%%v24,%%v20,%%v17 \n\t"
|
|
||||||
"vfmasb %%v17,%%v25,%%v21,%%v17 \n\t"
|
|
||||||
|
|
||||||
"vlef %%v26,0(%%r1,%3),0 \n\t"
|
|
||||||
"vlef %%v26,0(%%r1,%3),1 \n\t"
|
|
||||||
"vlef %%v26,8(%%r1,%3),2 \n\t"
|
|
||||||
"vlef %%v26,8(%%r1,%3),3 \n\t"
|
|
||||||
"vlef %%v27,4(%%r1,%3),0 \n\t"
|
|
||||||
"vlef %%v27,4(%%r1,%3),1 \n\t"
|
|
||||||
"vlef %%v27,12(%%r1,%3),2 \n\t"
|
|
||||||
"vlef %%v27,12(%%r1,%3),3 \n\t"
|
|
||||||
"vlef %%v28,0(%%r1,%4),0 \n\t"
|
|
||||||
"vlef %%v28,0(%%r1,%4),1 \n\t"
|
|
||||||
"vlef %%v28,8(%%r1,%4),2 \n\t"
|
|
||||||
"vlef %%v28,8(%%r1,%4),3 \n\t"
|
|
||||||
"vlef %%v29,4(%%r1,%4),0 \n\t"
|
|
||||||
"vlef %%v29,4(%%r1,%4),1 \n\t"
|
|
||||||
"vlef %%v29,12(%%r1,%4),2 \n\t"
|
|
||||||
"vlef %%v29,12(%%r1,%4),3 \n\t"
|
|
||||||
|
|
||||||
"vfmasb %%v18,%%v26,%%v20,%%v18 \n\t"
|
|
||||||
"vfmasb %%v18,%%v27,%%v21,%%v18 \n\t"
|
|
||||||
"vfmasb %%v19,%%v28,%%v20,%%v19 \n\t"
|
|
||||||
"vfmasb %%v19,%%v29,%%v21,%%v19 \n\t"
|
|
||||||
|
|
||||||
"agfi %%r1,16\n\t"
|
"agfi %%r1,16\n\t"
|
||||||
"brctg %%r0,0b \n\t"
|
"brctg %[n],0b\n\t"
|
||||||
|
"vfasb %%v16,%%v16,%%v20\n\t"
|
||||||
|
"vfasb %%v17,%%v17,%%v21\n\t"
|
||||||
|
"vfasb %%v18,%%v18,%%v22\n\t"
|
||||||
|
"vfasb %%v19,%%v19,%%v23\n\t"
|
||||||
"vrepg %%v20,%%v16,1\n\t"
|
"vrepg %%v20,%%v16,1\n\t"
|
||||||
"vrepg %%v21,%%v17,1\n\t"
|
"vrepg %%v21,%%v17,1\n\t"
|
||||||
"vrepg %%v22,%%v18,1\n\t"
|
"vrepg %%v22,%%v18,1\n\t"
|
||||||
|
|
@ -120,86 +137,115 @@ static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *
|
||||||
"verllg %%v18,%%v16,32\n\t"
|
"verllg %%v18,%%v16,32\n\t"
|
||||||
"verllg %%v19,%%v17,32\n\t"
|
"verllg %%v19,%%v17,32\n\t"
|
||||||
#if !defined(XCONJ)
|
#if !defined(XCONJ)
|
||||||
"vlrepf %%v20,0(%7) \n\t"
|
"vlrepf %%v20,0(%[alpha])\n\t"
|
||||||
"vlef %%v21,4(%7),0 \n\t"
|
"vlef %%v21,4(%[alpha]),0\n\t"
|
||||||
"vlef %%v21,4(%7),2 \n\t"
|
"vlef %%v21,4(%[alpha]),2\n\t"
|
||||||
"vflcsb %%v21,%%v21\n\t"
|
"vflcsb %%v21,%%v21\n\t"
|
||||||
"vlef %%v21,4(%7),1 \n\t"
|
"vlef %%v21,4(%[alpha]),1\n\t"
|
||||||
"vlef %%v21,4(%7),3 \n\t"
|
"vlef %%v21,4(%[alpha]),3\n\t"
|
||||||
#else
|
#else
|
||||||
"vlef %%v20,0(%7),1 \n\t"
|
"vlef %%v20,0(%[alpha]),1\n\t"
|
||||||
"vlef %%v20,0(%7),3 \n\t"
|
"vlef %%v20,0(%[alpha]),3\n\t"
|
||||||
"vflcsb %%v20,%%v20\n\t"
|
"vflcsb %%v20,%%v20\n\t"
|
||||||
"vlef %%v20,0(%7),0 \n\t"
|
"vlef %%v20,0(%[alpha]),0\n\t"
|
||||||
"vlef %%v20,0(%7),2 \n\t"
|
"vlef %%v20,0(%[alpha]),2\n\t"
|
||||||
"vlrepf %%v21,4(%7) \n\t"
|
"vlrepf %%v21,4(%[alpha])\n\t"
|
||||||
#endif
|
#endif
|
||||||
"vl %%v22,0(%6) \n\t"
|
"vl %%v22,0(%[y])\n\t"
|
||||||
"vl %%v23,16(%6) \n\t"
|
"vl %%v23,16(%[y])\n\t"
|
||||||
"vfmasb %%v22,%%v16,%%v20,%%v22\n\t"
|
"vfmasb %%v22,%%v16,%%v20,%%v22\n\t"
|
||||||
"vfmasb %%v22,%%v18,%%v21,%%v22\n\t"
|
"vfmasb %%v22,%%v18,%%v21,%%v22\n\t"
|
||||||
"vfmasb %%v23,%%v17,%%v20,%%v23\n\t"
|
"vfmasb %%v23,%%v17,%%v20,%%v23\n\t"
|
||||||
"vfmasb %%v23,%%v19,%%v21,%%v23\n\t"
|
"vfmasb %%v23,%%v19,%%v21,%%v23\n\t"
|
||||||
"vst %%v22,0(%6) \n\t"
|
"vst %%v22,0(%[y])\n\t"
|
||||||
"vst %%v23,16(%6) "
|
"vst %%v23,16(%[y])"
|
||||||
:
|
: "+m"(*(struct { FLOAT x[8]; } *) y),[n] "+&r"(n)
|
||||||
:"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZR"((const FLOAT (*)[n * 2])ap[2]),"ZR"((const FLOAT (*)[n * 2])ap[3]),"ZR"((const FLOAT (*)[n * 2])x),"ZQ"((FLOAT (*)[8])y),"ZQ"((const FLOAT (*)[2])alpha)
|
: [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0),
|
||||||
:"memory","cc","r0","r1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29"
|
"m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1),
|
||||||
);
|
"m"(*(const struct { FLOAT x[n * 2]; } *) ap2),[ap2] "a"(ap2),
|
||||||
|
"m"(*(const struct { FLOAT x[n * 2]; } *) ap3),[ap3] "a"(ap3),
|
||||||
|
"m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x),
|
||||||
|
"m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha)
|
||||||
|
: "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20",
|
||||||
|
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
|
||||||
|
"v31");
|
||||||
}
|
}
|
||||||
|
|
||||||
static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y,
|
||||||
{
|
FLOAT *alpha) {
|
||||||
__asm__ volatile (
|
register FLOAT *ap0 = ap[0];
|
||||||
"vzero %%v16 \n\t"
|
register FLOAT *ap1 = ap[1];
|
||||||
|
|
||||||
|
__asm__("vzero %%v16\n\t"
|
||||||
"vzero %%v17\n\t"
|
"vzero %%v17\n\t"
|
||||||
|
"vzero %%v18\n\t"
|
||||||
|
"vzero %%v19\n\t"
|
||||||
|
"vleib %%v2,0,0\n\t"
|
||||||
|
"vleib %%v2,1,1\n\t"
|
||||||
|
"vleib %%v2,2,2\n\t"
|
||||||
|
"vleib %%v2,3,3\n\t"
|
||||||
|
"vleib %%v2,0,4\n\t"
|
||||||
|
"vleib %%v2,1,5\n\t"
|
||||||
|
"vleib %%v2,2,6\n\t"
|
||||||
|
"vleib %%v2,3,7\n\t"
|
||||||
|
"vleib %%v2,8,8\n\t"
|
||||||
|
"vleib %%v2,9,9\n\t"
|
||||||
|
"vleib %%v2,10,10\n\t"
|
||||||
|
"vleib %%v2,11,11\n\t"
|
||||||
|
"vleib %%v2,8,12\n\t"
|
||||||
|
"vleib %%v2,9,13\n\t"
|
||||||
|
"vleib %%v2,10,14\n\t"
|
||||||
|
"vleib %%v2,11,15\n\t"
|
||||||
|
"vleib %%v3,4,0\n\t"
|
||||||
|
"vleib %%v3,5,1\n\t"
|
||||||
|
"vleib %%v3,6,2\n\t"
|
||||||
|
"vleib %%v3,7,3\n\t"
|
||||||
|
"vleib %%v3,4,4\n\t"
|
||||||
|
"vleib %%v3,5,5\n\t"
|
||||||
|
"vleib %%v3,6,6\n\t"
|
||||||
|
"vleib %%v3,7,7\n\t"
|
||||||
|
"vleib %%v3,12,8\n\t"
|
||||||
|
"vleib %%v3,13,9\n\t"
|
||||||
|
"vleib %%v3,14,10\n\t"
|
||||||
|
"vleib %%v3,15,11\n\t"
|
||||||
|
"vleib %%v3,12,12\n\t"
|
||||||
|
"vleib %%v3,13,13\n\t"
|
||||||
|
"vleib %%v3,14,14\n\t"
|
||||||
|
"vleib %%v3,15,15\n\t"
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"srlg %%r0,%0,1 \n\t"
|
"srlg %[n],%[n],1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 1,1024(%%r1,%1) \n\t"
|
"pfd 1,1024(%%r1,%[ap0])\n\t"
|
||||||
"pfd 1,1024(%%r1,%2) \n\t"
|
"pfd 1,1024(%%r1,%[ap1])\n\t"
|
||||||
"pfd 1,1024(%%r1,%3) \n\t"
|
"pfd 1,1024(%%r1,%[x])\n\t"
|
||||||
|
"vl %%v0,0(%%r1,%[x])\n\t"
|
||||||
"vl %%v18,0(%%r1,%3) \n\t"
|
|
||||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||||
"vlef %%v19,4(%%r1,%3),0 \n\t"
|
"vlef %%v1,4(%%r1,%[x]),0\n\t"
|
||||||
"vlef %%v19,12(%%r1,%3),2 \n\t"
|
"vlef %%v1,12(%%r1,%[x]),2\n\t"
|
||||||
"vflcsb %%v19,%%v19 \n\t"
|
"vflcsb %%v1,%%v1\n\t"
|
||||||
"vlef %%v19,0(%%r1,%3),1 \n\t"
|
"vlef %%v1,0(%%r1,%[x]),1\n\t"
|
||||||
"vlef %%v19,8(%%r1,%3),3 \n\t"
|
"vlef %%v1,8(%%r1,%[x]),3\n\t"
|
||||||
#else
|
#else
|
||||||
"vlef %%v19,0(%%r1,%3),1 \n\t"
|
"vlef %%v1,0(%%r1,%[x]),1\n\t"
|
||||||
"vlef %%v19,8(%%r1,%3),3 \n\t"
|
"vlef %%v1,8(%%r1,%[x]),3\n\t"
|
||||||
"vflcsb %%v19,%%v19 \n\t"
|
"vflcsb %%v1,%%v1\n\t"
|
||||||
"vlef %%v19,4(%%r1,%3),0 \n\t"
|
"vlef %%v1,4(%%r1,%[x]),0\n\t"
|
||||||
"vlef %%v19,12(%%r1,%3),2 \n\t"
|
"vlef %%v1,12(%%r1,%[x]),2\n\t"
|
||||||
#endif
|
#endif
|
||||||
|
"vl %%v20,0(%%r1,%[ap0])\n\t"
|
||||||
"vlef %%v20,0(%%r1,%1),0 \n\t"
|
"vperm %%v21,%%v20,%%v20,%%v3\n\t"
|
||||||
"vlef %%v20,0(%%r1,%1),1 \n\t"
|
"vperm %%v20,%%v20,%%v20,%%v2\n\t"
|
||||||
"vlef %%v20,8(%%r1,%1),2 \n\t"
|
"vl %%v22,0(%%r1,%[ap1])\n\t"
|
||||||
"vlef %%v20,8(%%r1,%1),3 \n\t"
|
"vperm %%v23,%%v22,%%v22,%%v3\n\t"
|
||||||
"vlef %%v21,4(%%r1,%1),0 \n\t"
|
"vperm %%v22,%%v22,%%v22,%%v2\n\t"
|
||||||
"vlef %%v21,4(%%r1,%1),1 \n\t"
|
"vfmasb %%v16,%%v20,%%v0,%%v16\n\t"
|
||||||
"vlef %%v21,12(%%r1,%1),2 \n\t"
|
"vfmasb %%v18,%%v21,%%v1,%%v18\n\t"
|
||||||
"vlef %%v21,12(%%r1,%1),3 \n\t"
|
"vfmasb %%v17,%%v22,%%v0,%%v17\n\t"
|
||||||
"vlef %%v22,0(%%r1,%2),0 \n\t"
|
"vfmasb %%v19,%%v23,%%v1,%%v19\n\t"
|
||||||
"vlef %%v22,0(%%r1,%2),1 \n\t"
|
|
||||||
"vlef %%v22,8(%%r1,%2),2 \n\t"
|
|
||||||
"vlef %%v22,8(%%r1,%2),3 \n\t"
|
|
||||||
"vlef %%v23,4(%%r1,%2),0 \n\t"
|
|
||||||
"vlef %%v23,4(%%r1,%2),1 \n\t"
|
|
||||||
"vlef %%v23,12(%%r1,%2),2 \n\t"
|
|
||||||
"vlef %%v23,12(%%r1,%2),3 \n\t"
|
|
||||||
|
|
||||||
"vfmasb %%v16,%%v20,%%v18,%%v16 \n\t"
|
|
||||||
"vfmasb %%v16,%%v21,%%v19,%%v16 \n\t"
|
|
||||||
"vfmasb %%v17,%%v22,%%v18,%%v17 \n\t"
|
|
||||||
"vfmasb %%v17,%%v23,%%v19,%%v17 \n\t"
|
|
||||||
|
|
||||||
"agfi %%r1,16\n\t"
|
"agfi %%r1,16\n\t"
|
||||||
"brctg %%r0,0b \n\t"
|
"brctg %[n],0b\n\t"
|
||||||
|
"vfasb %%v16,%%v16,%%v18\n\t"
|
||||||
|
"vfasb %%v17,%%v17,%%v19\n\t"
|
||||||
"vrepg %%v18,%%v16,1\n\t"
|
"vrepg %%v18,%%v16,1\n\t"
|
||||||
"vrepg %%v19,%%v17,1\n\t"
|
"vrepg %%v19,%%v17,1\n\t"
|
||||||
"vfasb %%v16,%%v16,%%v18\n\t"
|
"vfasb %%v16,%%v16,%%v18\n\t"
|
||||||
|
|
@ -207,99 +253,124 @@ static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *
|
||||||
"vmrhg %%v16,%%v16,%%v17\n\t"
|
"vmrhg %%v16,%%v16,%%v17\n\t"
|
||||||
"verllg %%v17,%%v16,32\n\t"
|
"verllg %%v17,%%v16,32\n\t"
|
||||||
#if !defined(XCONJ)
|
#if !defined(XCONJ)
|
||||||
"vlrepf %%v18,0(%5) \n\t"
|
"vlrepf %%v18,0(%[alpha])\n\t"
|
||||||
"vlef %%v19,4(%5),0 \n\t"
|
"vlef %%v19,4(%[alpha]),0\n\t"
|
||||||
"vlef %%v19,4(%5),2 \n\t"
|
"vlef %%v19,4(%[alpha]),2\n\t"
|
||||||
"vflcsb %%v19,%%v19\n\t"
|
"vflcsb %%v19,%%v19\n\t"
|
||||||
"vlef %%v19,4(%5),1 \n\t"
|
"vlef %%v19,4(%[alpha]),1\n\t"
|
||||||
"vlef %%v19,4(%5),3 \n\t"
|
"vlef %%v19,4(%[alpha]),3\n\t"
|
||||||
#else
|
#else
|
||||||
"vlef %%v18,0(%5),1 \n\t"
|
"vlef %%v18,0(%[alpha]),1\n\t"
|
||||||
"vlef %%v18,0(%5),3 \n\t"
|
"vlef %%v18,0(%[alpha]),3\n\t"
|
||||||
"vflcsb %%v18,%%v18\n\t"
|
"vflcsb %%v18,%%v18\n\t"
|
||||||
"vlef %%v18,0(%5),0 \n\t"
|
"vlef %%v18,0(%[alpha]),0\n\t"
|
||||||
"vlef %%v18,0(%5),2 \n\t"
|
"vlef %%v18,0(%[alpha]),2\n\t"
|
||||||
"vlrepf %%v19,4(%5) \n\t"
|
"vlrepf %%v19,4(%[alpha])\n\t"
|
||||||
#endif
|
#endif
|
||||||
"vl %%v20,0(%4) \n\t"
|
"vl %%v20,0(%[y])\n\t"
|
||||||
"vfmasb %%v20,%%v16,%%v18,%%v20\n\t"
|
"vfmasb %%v20,%%v16,%%v18,%%v20\n\t"
|
||||||
"vfmasb %%v20,%%v17,%%v19,%%v20\n\t"
|
"vfmasb %%v20,%%v17,%%v19,%%v20\n\t"
|
||||||
"vst %%v20,0(%4) "
|
"vst %%v20,0(%[y])"
|
||||||
:
|
: "+m"(*(struct { FLOAT x[4]; } *) y),[n] "+&r"(n)
|
||||||
:"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZR"((const FLOAT (*)[n * 2])x),"ZQ"((FLOAT (*)[4])y),"ZQ"((const FLOAT (*)[2])alpha)
|
: [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0),
|
||||||
:"memory","cc","r0","r1","v16","v17","v18","v19","v20","v21","v22","v23"
|
"m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1),
|
||||||
);
|
"m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x),
|
||||||
|
"m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha)
|
||||||
|
: "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20",
|
||||||
|
"v21", "v22", "v23");
|
||||||
}
|
}
|
||||||
|
|
||||||
static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y,
|
||||||
{
|
FLOAT *alpha) {
|
||||||
__asm__ volatile (
|
__asm__("vzero %%v16\n\t"
|
||||||
"vzero %%v16 \n\t"
|
"vzero %%v17\n\t"
|
||||||
|
"vleib %%v2,0,0\n\t"
|
||||||
|
"vleib %%v2,1,1\n\t"
|
||||||
|
"vleib %%v2,2,2\n\t"
|
||||||
|
"vleib %%v2,3,3\n\t"
|
||||||
|
"vleib %%v2,0,4\n\t"
|
||||||
|
"vleib %%v2,1,5\n\t"
|
||||||
|
"vleib %%v2,2,6\n\t"
|
||||||
|
"vleib %%v2,3,7\n\t"
|
||||||
|
"vleib %%v2,8,8\n\t"
|
||||||
|
"vleib %%v2,9,9\n\t"
|
||||||
|
"vleib %%v2,10,10\n\t"
|
||||||
|
"vleib %%v2,11,11\n\t"
|
||||||
|
"vleib %%v2,8,12\n\t"
|
||||||
|
"vleib %%v2,9,13\n\t"
|
||||||
|
"vleib %%v2,10,14\n\t"
|
||||||
|
"vleib %%v2,11,15\n\t"
|
||||||
|
"vleib %%v3,4,0\n\t"
|
||||||
|
"vleib %%v3,5,1\n\t"
|
||||||
|
"vleib %%v3,6,2\n\t"
|
||||||
|
"vleib %%v3,7,3\n\t"
|
||||||
|
"vleib %%v3,4,4\n\t"
|
||||||
|
"vleib %%v3,5,5\n\t"
|
||||||
|
"vleib %%v3,6,6\n\t"
|
||||||
|
"vleib %%v3,7,7\n\t"
|
||||||
|
"vleib %%v3,12,8\n\t"
|
||||||
|
"vleib %%v3,13,9\n\t"
|
||||||
|
"vleib %%v3,14,10\n\t"
|
||||||
|
"vleib %%v3,15,11\n\t"
|
||||||
|
"vleib %%v3,12,12\n\t"
|
||||||
|
"vleib %%v3,13,13\n\t"
|
||||||
|
"vleib %%v3,14,14\n\t"
|
||||||
|
"vleib %%v3,15,15\n\t"
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"srlg %%r0,%0,1 \n\t"
|
"srlg %[n],%[n],1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 1,1024(%%r1,%1) \n\t"
|
"pfd 1,1024(%%r1,%[ap])\n\t"
|
||||||
"pfd 1,1024(%%r1,%2) \n\t"
|
"pfd 1,1024(%%r1,%[x])\n\t"
|
||||||
|
"vl %%v0,0(%%r1,%[x])\n\t"
|
||||||
"vl %%v17,0(%%r1,%2) \n\t"
|
|
||||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||||
"vlef %%v18,4(%%r1,%2),0 \n\t"
|
"vlef %%v1,4(%%r1,%[x]),0\n\t"
|
||||||
"vlef %%v18,12(%%r1,%2),2 \n\t"
|
"vlef %%v1,12(%%r1,%[x]),2\n\t"
|
||||||
"vflcsb %%v18,%%v18 \n\t"
|
"vflcsb %%v1,%%v1\n\t"
|
||||||
"vlef %%v18,0(%%r1,%2),1 \n\t"
|
"vlef %%v1,0(%%r1,%[x]),1\n\t"
|
||||||
"vlef %%v18,8(%%r1,%2),3 \n\t"
|
"vlef %%v1,8(%%r1,%[x]),3\n\t"
|
||||||
#else
|
#else
|
||||||
"vlef %%v18,0(%%r1,%2),1 \n\t"
|
"vlef %%v1,0(%%r1,%[x]),1\n\t"
|
||||||
"vlef %%v18,8(%%r1,%2),3 \n\t"
|
"vlef %%v1,8(%%r1,%[x]),3\n\t"
|
||||||
"vflcsb %%v18,%%v18 \n\t"
|
"vflcsb %%v1,%%v1\n\t"
|
||||||
"vlef %%v18,4(%%r1,%2),0 \n\t"
|
"vlef %%v1,4(%%r1,%[x]),0\n\t"
|
||||||
"vlef %%v18,12(%%r1,%2),2 \n\t"
|
"vlef %%v1,12(%%r1,%[x]),2\n\t"
|
||||||
#endif
|
#endif
|
||||||
|
"vl %%v18,0(%%r1,%[ap])\n\t"
|
||||||
"vlef %%v19,0(%%r1,%1),0 \n\t"
|
"vperm %%v19,%%v18,%%v18,%%v3\n\t"
|
||||||
"vlef %%v19,0(%%r1,%1),1 \n\t"
|
"vperm %%v18,%%v18,%%v18,%%v2\n\t"
|
||||||
"vlef %%v19,8(%%r1,%1),2 \n\t"
|
"vfmasb %%v16,%%v18,%%v0,%%v16\n\t"
|
||||||
"vlef %%v19,8(%%r1,%1),3 \n\t"
|
"vfmasb %%v17,%%v19,%%v1,%%v17\n\t"
|
||||||
"vlef %%v20,4(%%r1,%1),0 \n\t"
|
|
||||||
"vlef %%v20,4(%%r1,%1),1 \n\t"
|
|
||||||
"vlef %%v20,12(%%r1,%1),2 \n\t"
|
|
||||||
"vlef %%v20,12(%%r1,%1),3 \n\t"
|
|
||||||
|
|
||||||
"vfmasb %%v16,%%v19,%%v17,%%v16 \n\t"
|
|
||||||
"vfmasb %%v16,%%v20,%%v18,%%v16 \n\t"
|
|
||||||
|
|
||||||
"agfi %%r1,16\n\t"
|
"agfi %%r1,16\n\t"
|
||||||
"brctg %%r0,0b \n\t"
|
"brctg %[n],0b\n\t"
|
||||||
|
"vfasb %%v16,%%v16,%%v17\n\t"
|
||||||
"vrepg %%v17,%%v16,1\n\t"
|
"vrepg %%v17,%%v16,1\n\t"
|
||||||
"vfasb %%v16,%%v16,%%v17\n\t"
|
"vfasb %%v16,%%v16,%%v17\n\t"
|
||||||
"verllg %%v17,%%v16,32\n\t"
|
"verllg %%v17,%%v16,32\n\t"
|
||||||
#if !defined(XCONJ)
|
#if !defined(XCONJ)
|
||||||
"vlrepf %%v18,0(%4) \n\t"
|
"vlrepf %%v18,0(%[alpha])\n\t"
|
||||||
"vlef %%v19,4(%4),0 \n\t"
|
"vlef %%v19,4(%[alpha]),0\n\t"
|
||||||
"vflcsb %%v19,%%v19\n\t"
|
"vflcsb %%v19,%%v19\n\t"
|
||||||
"vlef %%v19,4(%4),1 \n\t"
|
"vlef %%v19,4(%[alpha]),1\n\t"
|
||||||
#else
|
#else
|
||||||
"vlef %%v18,0(%4),1 \n\t"
|
"vlef %%v18,0(%[alpha]),1\n\t"
|
||||||
"vflcsb %%v18,%%v18\n\t"
|
"vflcsb %%v18,%%v18\n\t"
|
||||||
"vlef %%v18,0(%4),0 \n\t"
|
"vlef %%v18,0(%[alpha]),0\n\t"
|
||||||
"vlrepf %%v19,4(%4) \n\t"
|
"vlrepf %%v19,4(%[alpha])\n\t"
|
||||||
#endif
|
#endif
|
||||||
"vleg %%v20,0(%3),0 \n\t"
|
"vleg %%v0,0(%[y]),0\n\t"
|
||||||
"vfmasb %%v20,%%v16,%%v18,%%v20 \n\t"
|
"vfmasb %%v0,%%v16,%%v18,%%v0\n\t"
|
||||||
"vfmasb %%v20,%%v17,%%v19,%%v20 \n\t"
|
"vfmasb %%v0,%%v17,%%v19,%%v0\n\t"
|
||||||
"vsteg %%v20,0(%3),0 "
|
"vsteg %%v0,0(%[y]),0"
|
||||||
:
|
: "+m"(*(struct { FLOAT x[2]; } *) y),[n] "+&r"(n)
|
||||||
:"r"(n),"ZR"((const FLOAT (*)[n * 2])ap),"ZR"((const FLOAT (*)[n * 2])x),"ZQ"((FLOAT (*)[2])y),"ZQ"((const FLOAT (*)[2])alpha)
|
: [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap),[ap] "a"(ap),
|
||||||
:"memory","cc","r0","r1","v16","v17","v18","v19","v20","v21","v22","v23"
|
"m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x),
|
||||||
);
|
"m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha)
|
||||||
|
: "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
|
||||||
}
|
}
|
||||||
|
|
||||||
static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src)
|
static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) {
|
||||||
{
|
|
||||||
BLASLONG i;
|
BLASLONG i;
|
||||||
for ( i=0; i<n; i++ )
|
for (i = 0; i < n; i++) {
|
||||||
{
|
|
||||||
*dest = *src;
|
*dest = *src;
|
||||||
*(dest + 1) = *(src + 1);
|
*(dest + 1) = *(src + 1);
|
||||||
dest += 2;
|
dest += 2;
|
||||||
|
|
@ -307,8 +378,9 @@ static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
|
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
|
||||||
{
|
FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y,
|
||||||
|
BLASLONG inc_y, FLOAT *buffer) {
|
||||||
BLASLONG i;
|
BLASLONG i;
|
||||||
BLASLONG j;
|
BLASLONG j;
|
||||||
FLOAT *a_ptr;
|
FLOAT *a_ptr;
|
||||||
|
|
@ -324,8 +396,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
|
||||||
FLOAT ybuffer[8], *xbuffer;
|
FLOAT ybuffer[8], *xbuffer;
|
||||||
FLOAT alpha[2];
|
FLOAT alpha[2];
|
||||||
|
|
||||||
if ( m < 1 ) return(0);
|
if (m < 1)
|
||||||
if ( n < 1 ) return(0);
|
return (0);
|
||||||
|
if (n < 1)
|
||||||
|
return (0);
|
||||||
|
|
||||||
inc_x <<= 1;
|
inc_x <<= 1;
|
||||||
inc_y <<= 1;
|
inc_y <<= 1;
|
||||||
|
|
@ -346,13 +420,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
|
||||||
|
|
||||||
BLASLONG NB = NBMAX;
|
BLASLONG NB = NBMAX;
|
||||||
|
|
||||||
while ( NB == NBMAX )
|
while (NB == NBMAX) {
|
||||||
{
|
|
||||||
|
|
||||||
m1 -= NB;
|
m1 -= NB;
|
||||||
if ( m1 < 0)
|
if (m1 < 0) {
|
||||||
{
|
if (m2 == 0)
|
||||||
if ( m2 == 0 ) break;
|
break;
|
||||||
NB = m2;
|
NB = m2;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -368,11 +441,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
|
||||||
else
|
else
|
||||||
xbuffer = x_ptr;
|
xbuffer = x_ptr;
|
||||||
|
|
||||||
if ( inc_y == 2 )
|
if (inc_y == 2) {
|
||||||
{
|
|
||||||
|
|
||||||
for( i = 0; i < n1 ; i++)
|
for (i = 0; i < n1; i++) {
|
||||||
{
|
|
||||||
cgemv_kernel_4x4(NB, ap, xbuffer, y_ptr, alpha);
|
cgemv_kernel_4x4(NB, ap, xbuffer, y_ptr, alpha);
|
||||||
ap[0] += lda4;
|
ap[0] += lda4;
|
||||||
ap[1] += lda4;
|
ap[1] += lda4;
|
||||||
|
|
@ -383,28 +454,23 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( n2 & 2 )
|
if (n2 & 2) {
|
||||||
{
|
|
||||||
cgemv_kernel_4x2(NB, ap, xbuffer, y_ptr, alpha);
|
cgemv_kernel_4x2(NB, ap, xbuffer, y_ptr, alpha);
|
||||||
a_ptr += lda * 2;
|
a_ptr += lda * 2;
|
||||||
y_ptr += 4;
|
y_ptr += 4;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( n2 & 1 )
|
if (n2 & 1) {
|
||||||
{
|
|
||||||
cgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha);
|
cgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha);
|
||||||
/* a_ptr += lda;
|
/* a_ptr += lda;
|
||||||
y_ptr += 2; */
|
y_ptr += 2; */
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
|
|
||||||
for( i = 0; i < n1 ; i++)
|
for (i = 0; i < n1; i++) {
|
||||||
{
|
|
||||||
memset(ybuffer, 0, sizeof(ybuffer));
|
memset(ybuffer, 0, sizeof(ybuffer));
|
||||||
cgemv_kernel_4x4(NB, ap, xbuffer, ybuffer, alpha);
|
cgemv_kernel_4x4(NB, ap, xbuffer, ybuffer, alpha);
|
||||||
ap[0] += lda4;
|
ap[0] += lda4;
|
||||||
|
|
@ -428,8 +494,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
for( i = 0; i < n2 ; i++)
|
for (i = 0; i < n2; i++) {
|
||||||
{
|
|
||||||
memset(ybuffer, 0, sizeof(ybuffer));
|
memset(ybuffer, 0, sizeof(ybuffer));
|
||||||
cgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, alpha);
|
cgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, alpha);
|
||||||
a_ptr += lda;
|
a_ptr += lda;
|
||||||
|
|
@ -444,17 +509,15 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
|
||||||
x += NB * inc_x;
|
x += NB * inc_x;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (m3 == 0)
|
||||||
|
return (0);
|
||||||
if ( m3 == 0 ) return(0);
|
|
||||||
|
|
||||||
x_ptr = x;
|
x_ptr = x;
|
||||||
j = 0;
|
j = 0;
|
||||||
a_ptr = a;
|
a_ptr = a;
|
||||||
y_ptr = y;
|
y_ptr = y;
|
||||||
|
|
||||||
if ( m3 == 3 )
|
if (m3 == 3) {
|
||||||
{
|
|
||||||
|
|
||||||
FLOAT temp_r;
|
FLOAT temp_r;
|
||||||
FLOAT temp_i;
|
FLOAT temp_i;
|
||||||
|
|
@ -466,8 +529,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
|
||||||
x_ptr += inc_x;
|
x_ptr += inc_x;
|
||||||
FLOAT x4 = x_ptr[0];
|
FLOAT x4 = x_ptr[0];
|
||||||
FLOAT x5 = x_ptr[1];
|
FLOAT x5 = x_ptr[1];
|
||||||
while ( j < n)
|
while (j < n) {
|
||||||
{
|
|
||||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||||
temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
|
temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
|
||||||
temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
|
temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
|
||||||
|
|
@ -500,9 +562,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
|
||||||
return (0);
|
return (0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (m3 == 2) {
|
||||||
if ( m3 == 2 )
|
|
||||||
{
|
|
||||||
|
|
||||||
FLOAT temp_r;
|
FLOAT temp_r;
|
||||||
FLOAT temp_i;
|
FLOAT temp_i;
|
||||||
|
|
@ -516,8 +576,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
|
||||||
FLOAT ar = alpha[0];
|
FLOAT ar = alpha[0];
|
||||||
FLOAT ai = alpha[1];
|
FLOAT ai = alpha[1];
|
||||||
|
|
||||||
while ( j < ( n & -2 ))
|
while (j < (n & -2)) {
|
||||||
{
|
|
||||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||||
temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
|
temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
|
||||||
temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
|
temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
|
||||||
|
|
@ -560,9 +619,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
|
||||||
j += 2;
|
j += 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
while (j < n) {
|
||||||
while ( j < n)
|
|
||||||
{
|
|
||||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||||
temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
|
temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
|
||||||
temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
|
temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
|
||||||
|
|
@ -592,9 +649,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
|
||||||
return (0);
|
return (0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (m3 == 1) {
|
||||||
if ( m3 == 1 )
|
|
||||||
{
|
|
||||||
|
|
||||||
FLOAT temp_r;
|
FLOAT temp_r;
|
||||||
FLOAT temp_i;
|
FLOAT temp_i;
|
||||||
|
|
@ -605,8 +660,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
|
||||||
FLOAT ar = alpha[0];
|
FLOAT ar = alpha[0];
|
||||||
FLOAT ai = alpha[1];
|
FLOAT ai = alpha[1];
|
||||||
|
|
||||||
while ( j < ( n & -2 ))
|
while (j < (n & -2)) {
|
||||||
{
|
|
||||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||||
temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
|
temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
|
||||||
temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
|
temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
|
||||||
|
|
@ -641,8 +695,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
|
||||||
j += 2;
|
j += 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
while ( j < n)
|
while (j < n) {
|
||||||
{
|
|
||||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||||
temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
|
temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
|
||||||
temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
|
temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/***************************************************************************
|
/***************************************************************************
|
||||||
Copyright (c) 2013-2018, The OpenBLAS Project
|
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
modification, are permitted provided that the following conditions are
|
modification, are permitted provided that the following conditions are
|
||||||
|
|
@ -27,25 +27,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
static void crot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
|
static void crot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) {
|
||||||
{
|
__asm__("vlrepf %%v0,%[c]\n\t"
|
||||||
__asm__ (
|
"vlrepf %%v1,%[s]\n\t"
|
||||||
"vlrepf %%v0,%3 \n\t"
|
"srlg %[n],%[n],5\n\t"
|
||||||
"vlrepf %%v1,%4 \n\t"
|
|
||||||
"srlg %%r0,%0,5 \n\t"
|
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 2, 1024(%%r1,%1) \n\t"
|
"pfd 2, 1024(%%r1,%[x])\n\t"
|
||||||
"pfd 2, 1024(%%r1,%2) \n\t"
|
"pfd 2, 1024(%%r1,%[y])\n\t"
|
||||||
"vl %%v24, 0(%%r1,%1) \n\t"
|
"vl %%v24, 0(%%r1,%[x])\n\t"
|
||||||
"vl %%v25, 16(%%r1,%1) \n\t"
|
"vl %%v25, 16(%%r1,%[x])\n\t"
|
||||||
"vl %%v26, 32(%%r1,%1) \n\t"
|
"vl %%v26, 32(%%r1,%[x])\n\t"
|
||||||
"vl %%v27, 48(%%r1,%1) \n\t"
|
"vl %%v27, 48(%%r1,%[x])\n\t"
|
||||||
"vl %%v16, 0(%%r1,%2) \n\t"
|
"vl %%v16, 0(%%r1,%[y])\n\t"
|
||||||
"vl %%v17, 16(%%r1,%2) \n\t"
|
"vl %%v17, 16(%%r1,%[y])\n\t"
|
||||||
"vl %%v18, 32(%%r1,%2) \n\t"
|
"vl %%v18, 32(%%r1,%[y])\n\t"
|
||||||
"vl %%v19, 48(%%r1,%2) \n\t"
|
"vl %%v19, 48(%%r1,%[y])\n\t"
|
||||||
|
|
||||||
"vfmsb %%v28,%%v24,%%v0\n\t"
|
"vfmsb %%v28,%%v24,%%v0\n\t"
|
||||||
"vfmsb %%v29,%%v25,%%v0\n\t"
|
"vfmsb %%v29,%%v25,%%v0\n\t"
|
||||||
"vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
|
"vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
|
||||||
|
|
@ -63,25 +60,22 @@ static void crot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
|
||||||
"vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
|
"vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
|
||||||
"vfmasb %%v31,%%v19,%%v1,%%v31\n\t"
|
"vfmasb %%v31,%%v19,%%v1,%%v31\n\t"
|
||||||
"vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
|
"vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
|
||||||
|
"vst %%v28, 0(%%r1,%[x])\n\t"
|
||||||
"vst %%v28, 0(%%r1,%1) \n\t"
|
"vst %%v29, 16(%%r1,%[x])\n\t"
|
||||||
"vst %%v29, 16(%%r1,%1) \n\t"
|
"vst %%v30, 32(%%r1,%[x])\n\t"
|
||||||
"vst %%v30, 32(%%r1,%1) \n\t"
|
"vst %%v31, 48(%%r1,%[x])\n\t"
|
||||||
"vst %%v31, 48(%%r1,%1) \n\t"
|
"vst %%v20, 0(%%r1,%[y])\n\t"
|
||||||
"vst %%v20, 0(%%r1,%2) \n\t"
|
"vst %%v21, 16(%%r1,%[y])\n\t"
|
||||||
"vst %%v21, 16(%%r1,%2) \n\t"
|
"vst %%v22, 32(%%r1,%[y])\n\t"
|
||||||
"vst %%v22, 32(%%r1,%2) \n\t"
|
"vst %%v23, 48(%%r1,%[y])\n\t"
|
||||||
"vst %%v23, 48(%%r1,%2) \n\t"
|
"vl %%v24, 64(%%r1,%[x])\n\t"
|
||||||
|
"vl %%v25, 80(%%r1,%[x])\n\t"
|
||||||
"vl %%v24, 64(%%r1,%1) \n\t"
|
"vl %%v26, 96(%%r1,%[x])\n\t"
|
||||||
"vl %%v25, 80(%%r1,%1) \n\t"
|
"vl %%v27, 112(%%r1,%[x])\n\t"
|
||||||
"vl %%v26, 96(%%r1,%1) \n\t"
|
"vl %%v16, 64(%%r1,%[y])\n\t"
|
||||||
"vl %%v27, 112(%%r1,%1) \n\t"
|
"vl %%v17, 80(%%r1,%[y])\n\t"
|
||||||
"vl %%v16, 64(%%r1,%2) \n\t"
|
"vl %%v18, 96(%%r1,%[y])\n\t"
|
||||||
"vl %%v17, 80(%%r1,%2) \n\t"
|
"vl %%v19, 112(%%r1,%[y])\n\t"
|
||||||
"vl %%v18, 96(%%r1,%2) \n\t"
|
|
||||||
"vl %%v19, 112(%%r1,%2) \n\t"
|
|
||||||
|
|
||||||
"vfmsb %%v28,%%v24,%%v0\n\t"
|
"vfmsb %%v28,%%v24,%%v0\n\t"
|
||||||
"vfmsb %%v29,%%v25,%%v0\n\t"
|
"vfmsb %%v29,%%v25,%%v0\n\t"
|
||||||
"vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
|
"vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
|
||||||
|
|
@ -99,25 +93,22 @@ static void crot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
|
||||||
"vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
|
"vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
|
||||||
"vfmasb %%v31,%%v19,%%v1,%%v31\n\t"
|
"vfmasb %%v31,%%v19,%%v1,%%v31\n\t"
|
||||||
"vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
|
"vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
|
||||||
|
"vst %%v28, 64(%%r1,%[x])\n\t"
|
||||||
"vst %%v28, 64(%%r1,%1) \n\t"
|
"vst %%v29, 80(%%r1,%[x])\n\t"
|
||||||
"vst %%v29, 80(%%r1,%1) \n\t"
|
"vst %%v30, 96(%%r1,%[x])\n\t"
|
||||||
"vst %%v30, 96(%%r1,%1) \n\t"
|
"vst %%v31, 112(%%r1,%[x])\n\t"
|
||||||
"vst %%v31, 112(%%r1,%1) \n\t"
|
"vst %%v20, 64(%%r1,%[y])\n\t"
|
||||||
"vst %%v20, 64(%%r1,%2) \n\t"
|
"vst %%v21, 80(%%r1,%[y])\n\t"
|
||||||
"vst %%v21, 80(%%r1,%2) \n\t"
|
"vst %%v22, 96(%%r1,%[y])\n\t"
|
||||||
"vst %%v22, 96(%%r1,%2) \n\t"
|
"vst %%v23, 112(%%r1,%[y])\n\t"
|
||||||
"vst %%v23, 112(%%r1,%2) \n\t"
|
"vl %%v24, 128(%%r1,%[x])\n\t"
|
||||||
|
"vl %%v25, 144(%%r1,%[x])\n\t"
|
||||||
"vl %%v24, 128(%%r1,%1) \n\t"
|
"vl %%v26, 160(%%r1,%[x])\n\t"
|
||||||
"vl %%v25, 144(%%r1,%1) \n\t"
|
"vl %%v27, 176(%%r1,%[x])\n\t"
|
||||||
"vl %%v26, 160(%%r1,%1) \n\t"
|
"vl %%v16, 128(%%r1,%[y])\n\t"
|
||||||
"vl %%v27, 176(%%r1,%1) \n\t"
|
"vl %%v17, 144(%%r1,%[y])\n\t"
|
||||||
"vl %%v16, 128(%%r1,%2) \n\t"
|
"vl %%v18, 160(%%r1,%[y])\n\t"
|
||||||
"vl %%v17, 144(%%r1,%2) \n\t"
|
"vl %%v19, 176(%%r1,%[y])\n\t"
|
||||||
"vl %%v18, 160(%%r1,%2) \n\t"
|
|
||||||
"vl %%v19, 176(%%r1,%2) \n\t"
|
|
||||||
|
|
||||||
"vfmsb %%v28,%%v24,%%v0\n\t"
|
"vfmsb %%v28,%%v24,%%v0\n\t"
|
||||||
"vfmsb %%v29,%%v25,%%v0\n\t"
|
"vfmsb %%v29,%%v25,%%v0\n\t"
|
||||||
"vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
|
"vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
|
||||||
|
|
@ -135,25 +126,22 @@ static void crot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
|
||||||
"vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
|
"vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
|
||||||
"vfmasb %%v31,%%v19,%%v1,%%v31\n\t"
|
"vfmasb %%v31,%%v19,%%v1,%%v31\n\t"
|
||||||
"vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
|
"vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
|
||||||
|
"vst %%v28, 128(%%r1,%[x])\n\t"
|
||||||
"vst %%v28, 128(%%r1,%1) \n\t"
|
"vst %%v29, 144(%%r1,%[x])\n\t"
|
||||||
"vst %%v29, 144(%%r1,%1) \n\t"
|
"vst %%v30, 160(%%r1,%[x])\n\t"
|
||||||
"vst %%v30, 160(%%r1,%1) \n\t"
|
"vst %%v31, 176(%%r1,%[x])\n\t"
|
||||||
"vst %%v31, 176(%%r1,%1) \n\t"
|
"vst %%v20, 128(%%r1,%[y])\n\t"
|
||||||
"vst %%v20, 128(%%r1,%2) \n\t"
|
"vst %%v21, 144(%%r1,%[y])\n\t"
|
||||||
"vst %%v21, 144(%%r1,%2) \n\t"
|
"vst %%v22, 160(%%r1,%[y])\n\t"
|
||||||
"vst %%v22, 160(%%r1,%2) \n\t"
|
"vst %%v23, 176(%%r1,%[y])\n\t"
|
||||||
"vst %%v23, 176(%%r1,%2) \n\t"
|
"vl %%v24, 192(%%r1,%[x])\n\t"
|
||||||
|
"vl %%v25, 208(%%r1,%[x])\n\t"
|
||||||
"vl %%v24, 192(%%r1,%1) \n\t"
|
"vl %%v26, 224(%%r1,%[x])\n\t"
|
||||||
"vl %%v25, 208(%%r1,%1) \n\t"
|
"vl %%v27, 240(%%r1,%[x])\n\t"
|
||||||
"vl %%v26, 224(%%r1,%1) \n\t"
|
"vl %%v16, 192(%%r1,%[y])\n\t"
|
||||||
"vl %%v27, 240(%%r1,%1) \n\t"
|
"vl %%v17, 208(%%r1,%[y])\n\t"
|
||||||
"vl %%v16, 192(%%r1,%2) \n\t"
|
"vl %%v18, 224(%%r1,%[y])\n\t"
|
||||||
"vl %%v17, 208(%%r1,%2) \n\t"
|
"vl %%v19, 240(%%r1,%[y])\n\t"
|
||||||
"vl %%v18, 224(%%r1,%2) \n\t"
|
|
||||||
"vl %%v19, 240(%%r1,%2) \n\t"
|
|
||||||
|
|
||||||
"vfmsb %%v28,%%v24,%%v0\n\t"
|
"vfmsb %%v28,%%v24,%%v0\n\t"
|
||||||
"vfmsb %%v29,%%v25,%%v0\n\t"
|
"vfmsb %%v29,%%v25,%%v0\n\t"
|
||||||
"vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
|
"vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
|
||||||
|
|
@ -171,40 +159,39 @@ static void crot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
|
||||||
"vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
|
"vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
|
||||||
"vfmasb %%v31,%%v19,%%v1,%%v31\n\t"
|
"vfmasb %%v31,%%v19,%%v1,%%v31\n\t"
|
||||||
"vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
|
"vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
|
||||||
|
"vst %%v28, 192(%%r1,%[x])\n\t"
|
||||||
"vst %%v28, 192(%%r1,%1) \n\t"
|
"vst %%v29, 208(%%r1,%[x])\n\t"
|
||||||
"vst %%v29, 208(%%r1,%1) \n\t"
|
"vst %%v30, 224(%%r1,%[x])\n\t"
|
||||||
"vst %%v30, 224(%%r1,%1) \n\t"
|
"vst %%v31, 240(%%r1,%[x])\n\t"
|
||||||
"vst %%v31, 240(%%r1,%1) \n\t"
|
"vst %%v20, 192(%%r1,%[y])\n\t"
|
||||||
"vst %%v20, 192(%%r1,%2) \n\t"
|
"vst %%v21, 208(%%r1,%[y])\n\t"
|
||||||
"vst %%v21, 208(%%r1,%2) \n\t"
|
"vst %%v22, 224(%%r1,%[y])\n\t"
|
||||||
"vst %%v22, 224(%%r1,%2) \n\t"
|
"vst %%v23, 240(%%r1,%[y])\n\t"
|
||||||
"vst %%v23, 240(%%r1,%2) \n\t"
|
|
||||||
|
|
||||||
"agfi %%r1,256\n\t"
|
"agfi %%r1,256\n\t"
|
||||||
"brctg %%r0,0b "
|
"brctg %[n],0b"
|
||||||
:
|
: "+m"(*(struct { FLOAT x[n * 2]; } *) x),
|
||||||
:"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"m"(*c),"m"(*s)
|
"+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n)
|
||||||
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
: [x] "a"(x),[y] "a"(y),[c] "Q"(*c),[s] "Q"(*s)
|
||||||
);
|
: "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
|
||||||
|
"v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
|
||||||
|
"v31");
|
||||||
}
|
}
|
||||||
|
|
||||||
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
|
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
|
||||||
{
|
FLOAT c, FLOAT s) {
|
||||||
BLASLONG i = 0;
|
BLASLONG i = 0;
|
||||||
BLASLONG ix = 0, iy = 0;
|
BLASLONG ix = 0, iy = 0;
|
||||||
FLOAT temp[2];
|
FLOAT temp[2];
|
||||||
BLASLONG inc_x2;
|
BLASLONG inc_x2;
|
||||||
BLASLONG inc_y2;
|
BLASLONG inc_y2;
|
||||||
|
|
||||||
if ( n <= 0 ) return(0);
|
if (n <= 0)
|
||||||
|
return (0);
|
||||||
|
|
||||||
if ( (inc_x == 1) && (inc_y == 1) )
|
if ((inc_x == 1) && (inc_y == 1)) {
|
||||||
{
|
|
||||||
|
|
||||||
BLASLONG n1 = n & -32;
|
BLASLONG n1 = n & -32;
|
||||||
if ( n1 > 0 )
|
if (n1 > 0) {
|
||||||
{
|
|
||||||
FLOAT cosa, sina;
|
FLOAT cosa, sina;
|
||||||
cosa = c;
|
cosa = c;
|
||||||
sina = s;
|
sina = s;
|
||||||
|
|
@ -213,8 +200,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
|
||||||
ix = 2 * n1;
|
ix = 2 * n1;
|
||||||
}
|
}
|
||||||
|
|
||||||
while(i < n)
|
while (i < n) {
|
||||||
{
|
|
||||||
temp[0] = c * x[ix] + s * y[ix];
|
temp[0] = c * x[ix] + s * y[ix];
|
||||||
temp[1] = c * x[ix + 1] + s * y[ix + 1];
|
temp[1] = c * x[ix + 1] + s * y[ix + 1];
|
||||||
y[ix] = c * y[ix] - s * x[ix];
|
y[ix] = c * y[ix] - s * x[ix];
|
||||||
|
|
@ -227,14 +213,10 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
} else {
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
inc_x2 = 2 * inc_x;
|
inc_x2 = 2 * inc_x;
|
||||||
inc_y2 = 2 * inc_y;
|
inc_y2 = 2 * inc_y;
|
||||||
while(i < n)
|
while (i < n) {
|
||||||
{
|
|
||||||
temp[0] = c * x[ix] + s * y[iy];
|
temp[0] = c * x[ix] + s * y[iy];
|
||||||
temp[1] = c * x[ix + 1] + s * y[iy + 1];
|
temp[1] = c * x[ix + 1] + s * y[iy + 1];
|
||||||
y[iy] = c * y[iy] - s * x[ix];
|
y[iy] = c * y[iy] - s * x[ix];
|
||||||
|
|
@ -252,5 +234,3 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
|
||||||
return (0);
|
return (0);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/***************************************************************************
|
/***************************************************************************
|
||||||
Copyright (c) 2013 - 2017, The OpenBLAS Project
|
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
modification, are permitted provided that the following conditions are
|
modification, are permitted provided that the following conditions are
|
||||||
|
|
@ -27,28 +27,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
static void cscal_kernel_16(BLASLONG n, FLOAT *alpha, FLOAT *x)
|
static void cscal_kernel_16(BLASLONG n, FLOAT *alpha, FLOAT *x) {
|
||||||
{
|
__asm__("vlrepf %%v0,0(%[alpha])\n\t"
|
||||||
__asm__ volatile(
|
"vlef %%v1,4(%[alpha]),0\n\t"
|
||||||
"vlrepf %%v0,0(%1) \n\t"
|
"vlef %%v1,4(%[alpha]),2\n\t"
|
||||||
"vlef %%v1,4(%1),0 \n\t"
|
|
||||||
"vlef %%v1,4(%1),2 \n\t"
|
|
||||||
"vflcsb %%v1,%%v1\n\t"
|
"vflcsb %%v1,%%v1\n\t"
|
||||||
"vlef %%v1,4(%1),1 \n\t"
|
"vlef %%v1,4(%[alpha]),1\n\t"
|
||||||
"vlef %%v1,4(%1),3 \n\t"
|
"vlef %%v1,4(%[alpha]),3\n\t"
|
||||||
"srlg %%r0,%0,4 \n\t"
|
"srlg %[n],%[n],4\n\t"
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 2, 1024(%%r1,%2) \n\t"
|
"pfd 2, 1024(%%r1,%[x])\n\t"
|
||||||
|
"vl %%v16,0(%%r1,%[x])\n\t"
|
||||||
"vl %%v16,0(%%r1,%2) \n\t"
|
"vl %%v17,16(%%r1,%[x])\n\t"
|
||||||
"vl %%v17,16(%%r1,%2) \n\t"
|
"vl %%v18,32(%%r1,%[x])\n\t"
|
||||||
"vl %%v18,32(%%r1,%2) \n\t"
|
"vl %%v19,48(%%r1,%[x])\n\t"
|
||||||
"vl %%v19,48(%%r1,%2) \n\t"
|
"vl %%v20,64(%%r1,%[x])\n\t"
|
||||||
"vl %%v20,64(%%r1,%2) \n\t"
|
"vl %%v21,80(%%r1,%[x])\n\t"
|
||||||
"vl %%v21,80(%%r1,%2) \n\t"
|
"vl %%v22,96(%%r1,%[x])\n\t"
|
||||||
"vl %%v22,96(%%r1,%2) \n\t"
|
"vl %%v23,112(%%r1,%[x])\n\t"
|
||||||
"vl %%v23,112(%%r1,%2) \n\t"
|
|
||||||
"verllg %%v24,%%v16,32\n\t"
|
"verllg %%v24,%%v16,32\n\t"
|
||||||
"verllg %%v25,%%v17,32\n\t"
|
"verllg %%v25,%%v17,32\n\t"
|
||||||
"verllg %%v26,%%v18,32\n\t"
|
"verllg %%v26,%%v18,32\n\t"
|
||||||
|
|
@ -57,7 +54,6 @@ static void cscal_kernel_16(BLASLONG n, FLOAT *alpha, FLOAT *x)
|
||||||
"verllg %%v29,%%v21,32\n\t"
|
"verllg %%v29,%%v21,32\n\t"
|
||||||
"verllg %%v30,%%v22,32\n\t"
|
"verllg %%v30,%%v22,32\n\t"
|
||||||
"verllg %%v31,%%v23,32\n\t"
|
"verllg %%v31,%%v23,32\n\t"
|
||||||
|
|
||||||
"vfmsb %%v16,%%v16,%%v0\n\t"
|
"vfmsb %%v16,%%v16,%%v0\n\t"
|
||||||
"vfmsb %%v17,%%v17,%%v0\n\t"
|
"vfmsb %%v17,%%v17,%%v0\n\t"
|
||||||
"vfmsb %%v18,%%v18,%%v0\n\t"
|
"vfmsb %%v18,%%v18,%%v0\n\t"
|
||||||
|
|
@ -74,45 +70,42 @@ static void cscal_kernel_16(BLASLONG n, FLOAT *alpha, FLOAT *x)
|
||||||
"vfmasb %%v21,%%v29,%%v1,%%v21\n\t"
|
"vfmasb %%v21,%%v29,%%v1,%%v21\n\t"
|
||||||
"vfmasb %%v22,%%v30,%%v1,%%v22\n\t"
|
"vfmasb %%v22,%%v30,%%v1,%%v22\n\t"
|
||||||
"vfmasb %%v23,%%v31,%%v1,%%v23\n\t"
|
"vfmasb %%v23,%%v31,%%v1,%%v23\n\t"
|
||||||
|
"vst %%v16,0(%%r1,%[x])\n\t"
|
||||||
"vst %%v16,0(%%r1,%2) \n\t"
|
"vst %%v17,16(%%r1,%[x])\n\t"
|
||||||
"vst %%v17,16(%%r1,%2) \n\t"
|
"vst %%v18,32(%%r1,%[x])\n\t"
|
||||||
"vst %%v18,32(%%r1,%2) \n\t"
|
"vst %%v19,48(%%r1,%[x])\n\t"
|
||||||
"vst %%v19,48(%%r1,%2) \n\t"
|
"vst %%v20,64(%%r1,%[x])\n\t"
|
||||||
"vst %%v20,64(%%r1,%2) \n\t"
|
"vst %%v21,80(%%r1,%[x])\n\t"
|
||||||
"vst %%v21,80(%%r1,%2) \n\t"
|
"vst %%v22,96(%%r1,%[x])\n\t"
|
||||||
"vst %%v22,96(%%r1,%2) \n\t"
|
"vst %%v23,112(%%r1,%[x])\n\t"
|
||||||
"vst %%v23,112(%%r1,%2) \n\t"
|
|
||||||
|
|
||||||
"agfi %%r1,128\n\t"
|
"agfi %%r1,128\n\t"
|
||||||
"brctg %%r0,0b "
|
"brctg %[n],0b"
|
||||||
:
|
: "+m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n)
|
||||||
:"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x)
|
: [x] "a"(x), "m"(*(const struct { FLOAT x[2]; } *) alpha),
|
||||||
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
[alpha] "a"(alpha)
|
||||||
);
|
: "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
|
||||||
|
"v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
|
||||||
|
"v31");
|
||||||
}
|
}
|
||||||
|
|
||||||
static void cscal_kernel_16_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x)
|
static void cscal_kernel_16_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) {
|
||||||
{
|
__asm__("vlef %%v0,4(%[alpha]),0\n\t"
|
||||||
__asm__ volatile(
|
"vlef %%v0,4(%[alpha]),2\n\t"
|
||||||
"vlef %%v0,4(%1),0 \n\t"
|
|
||||||
"vlef %%v0,4(%1),2 \n\t"
|
|
||||||
"vflcsb %%v0,%%v0\n\t"
|
"vflcsb %%v0,%%v0\n\t"
|
||||||
"vlef %%v0,4(%1),1 \n\t"
|
"vlef %%v0,4(%[alpha]),1\n\t"
|
||||||
"vlef %%v0,4(%1),3 \n\t"
|
"vlef %%v0,4(%[alpha]),3\n\t"
|
||||||
"srlg %%r0,%0,4 \n\t"
|
"srlg %[n],%[n],4\n\t"
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 2, 1024(%%r1,%2) \n\t"
|
"pfd 2, 1024(%%r1,%[x])\n\t"
|
||||||
|
"vl %%v16,0(%%r1,%[x])\n\t"
|
||||||
"vl %%v16,0(%%r1,%2) \n\t"
|
"vl %%v17,16(%%r1,%[x])\n\t"
|
||||||
"vl %%v17,16(%%r1,%2) \n\t"
|
"vl %%v18,32(%%r1,%[x])\n\t"
|
||||||
"vl %%v18,32(%%r1,%2) \n\t"
|
"vl %%v19,48(%%r1,%[x])\n\t"
|
||||||
"vl %%v19,48(%%r1,%2) \n\t"
|
"vl %%v20,64(%%r1,%[x])\n\t"
|
||||||
"vl %%v20,64(%%r1,%2) \n\t"
|
"vl %%v21,80(%%r1,%[x])\n\t"
|
||||||
"vl %%v21,80(%%r1,%2) \n\t"
|
"vl %%v22,96(%%r1,%[x])\n\t"
|
||||||
"vl %%v22,96(%%r1,%2) \n\t"
|
"vl %%v23,112(%%r1,%[x])\n\t"
|
||||||
"vl %%v23,112(%%r1,%2) \n\t"
|
|
||||||
"verllg %%v16,%%v16,32\n\t"
|
"verllg %%v16,%%v16,32\n\t"
|
||||||
"verllg %%v17,%%v17,32\n\t"
|
"verllg %%v17,%%v17,32\n\t"
|
||||||
"verllg %%v18,%%v18,32\n\t"
|
"verllg %%v18,%%v18,32\n\t"
|
||||||
|
|
@ -121,7 +114,6 @@ static void cscal_kernel_16_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x)
|
||||||
"verllg %%v21,%%v21,32\n\t"
|
"verllg %%v21,%%v21,32\n\t"
|
||||||
"verllg %%v22,%%v22,32\n\t"
|
"verllg %%v22,%%v22,32\n\t"
|
||||||
"verllg %%v23,%%v23,32\n\t"
|
"verllg %%v23,%%v23,32\n\t"
|
||||||
|
|
||||||
"vfmsb %%v16,%%v16,%%v0\n\t"
|
"vfmsb %%v16,%%v16,%%v0\n\t"
|
||||||
"vfmsb %%v17,%%v17,%%v0\n\t"
|
"vfmsb %%v17,%%v17,%%v0\n\t"
|
||||||
"vfmsb %%v18,%%v18,%%v0\n\t"
|
"vfmsb %%v18,%%v18,%%v0\n\t"
|
||||||
|
|
@ -130,42 +122,37 @@ static void cscal_kernel_16_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x)
|
||||||
"vfmsb %%v21,%%v21,%%v0\n\t"
|
"vfmsb %%v21,%%v21,%%v0\n\t"
|
||||||
"vfmsb %%v22,%%v22,%%v0\n\t"
|
"vfmsb %%v22,%%v22,%%v0\n\t"
|
||||||
"vfmsb %%v23,%%v23,%%v0\n\t"
|
"vfmsb %%v23,%%v23,%%v0\n\t"
|
||||||
|
"vst %%v16,0(%%r1,%[x])\n\t"
|
||||||
"vst %%v16,0(%%r1,%2) \n\t"
|
"vst %%v17,16(%%r1,%[x])\n\t"
|
||||||
"vst %%v17,16(%%r1,%2) \n\t"
|
"vst %%v18,32(%%r1,%[x])\n\t"
|
||||||
"vst %%v18,32(%%r1,%2) \n\t"
|
"vst %%v19,48(%%r1,%[x])\n\t"
|
||||||
"vst %%v19,48(%%r1,%2) \n\t"
|
"vst %%v20,64(%%r1,%[x])\n\t"
|
||||||
"vst %%v20,64(%%r1,%2) \n\t"
|
"vst %%v21,80(%%r1,%[x])\n\t"
|
||||||
"vst %%v21,80(%%r1,%2) \n\t"
|
"vst %%v22,96(%%r1,%[x])\n\t"
|
||||||
"vst %%v22,96(%%r1,%2) \n\t"
|
"vst %%v23,112(%%r1,%[x])\n\t"
|
||||||
"vst %%v23,112(%%r1,%2) \n\t"
|
|
||||||
|
|
||||||
"agfi %%r1,128\n\t"
|
"agfi %%r1,128\n\t"
|
||||||
"brctg %%r0,0b "
|
"brctg %[n],0b"
|
||||||
:
|
: "+m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n)
|
||||||
:"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x)
|
: [x] "a"(x), "m"(*(const struct { FLOAT x[2]; } *) alpha),
|
||||||
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23"
|
[alpha] "a"(alpha)
|
||||||
);
|
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
|
||||||
|
"v23");
|
||||||
}
|
}
|
||||||
|
|
||||||
static void cscal_kernel_16_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x)
|
static void cscal_kernel_16_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) {
|
||||||
{
|
__asm__("vlrepf %%v0,0(%[alpha])\n\t"
|
||||||
__asm__ volatile(
|
"srlg %[n],%[n],4\n\t"
|
||||||
"vlrepf %%v0,0(%1) \n\t"
|
|
||||||
"srlg %%r0,%0,4 \n\t"
|
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 2, 1024(%%r1,%2) \n\t"
|
"pfd 2, 1024(%%r1,%[x])\n\t"
|
||||||
|
"vl %%v16,0(%%r1,%[x])\n\t"
|
||||||
"vl %%v16,0(%%r1,%2) \n\t"
|
"vl %%v17,16(%%r1,%[x])\n\t"
|
||||||
"vl %%v17,16(%%r1,%2) \n\t"
|
"vl %%v18,32(%%r1,%[x])\n\t"
|
||||||
"vl %%v18,32(%%r1,%2) \n\t"
|
"vl %%v19,48(%%r1,%[x])\n\t"
|
||||||
"vl %%v19,48(%%r1,%2) \n\t"
|
"vl %%v20,64(%%r1,%[x])\n\t"
|
||||||
"vl %%v20,64(%%r1,%2) \n\t"
|
"vl %%v21,80(%%r1,%[x])\n\t"
|
||||||
"vl %%v21,80(%%r1,%2) \n\t"
|
"vl %%v22,96(%%r1,%[x])\n\t"
|
||||||
"vl %%v22,96(%%r1,%2) \n\t"
|
"vl %%v23,112(%%r1,%[x])\n\t"
|
||||||
"vl %%v23,112(%%r1,%2) \n\t"
|
|
||||||
|
|
||||||
"vfmsb %%v16,%%v16,%%v0\n\t"
|
"vfmsb %%v16,%%v16,%%v0\n\t"
|
||||||
"vfmsb %%v17,%%v17,%%v0\n\t"
|
"vfmsb %%v17,%%v17,%%v0\n\t"
|
||||||
"vfmsb %%v18,%%v18,%%v0\n\t"
|
"vfmsb %%v18,%%v18,%%v0\n\t"
|
||||||
|
|
@ -174,55 +161,46 @@ static void cscal_kernel_16_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x)
|
||||||
"vfmsb %%v21,%%v21,%%v0\n\t"
|
"vfmsb %%v21,%%v21,%%v0\n\t"
|
||||||
"vfmsb %%v22,%%v22,%%v0\n\t"
|
"vfmsb %%v22,%%v22,%%v0\n\t"
|
||||||
"vfmsb %%v23,%%v23,%%v0\n\t"
|
"vfmsb %%v23,%%v23,%%v0\n\t"
|
||||||
|
"vst %%v16,0(%%r1,%[x])\n\t"
|
||||||
"vst %%v16,0(%%r1,%2) \n\t"
|
"vst %%v17,16(%%r1,%[x])\n\t"
|
||||||
"vst %%v17,16(%%r1,%2) \n\t"
|
"vst %%v18,32(%%r1,%[x])\n\t"
|
||||||
"vst %%v18,32(%%r1,%2) \n\t"
|
"vst %%v19,48(%%r1,%[x])\n\t"
|
||||||
"vst %%v19,48(%%r1,%2) \n\t"
|
"vst %%v20,64(%%r1,%[x])\n\t"
|
||||||
"vst %%v20,64(%%r1,%2) \n\t"
|
"vst %%v21,80(%%r1,%[x])\n\t"
|
||||||
"vst %%v21,80(%%r1,%2) \n\t"
|
"vst %%v22,96(%%r1,%[x])\n\t"
|
||||||
"vst %%v22,96(%%r1,%2) \n\t"
|
"vst %%v23,112(%%r1,%[x])\n\t"
|
||||||
"vst %%v23,112(%%r1,%2) \n\t"
|
|
||||||
|
|
||||||
"agfi %%r1,128\n\t"
|
"agfi %%r1,128\n\t"
|
||||||
"brctg %%r0,0b "
|
"brctg %[n],0b"
|
||||||
:
|
: "+m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n)
|
||||||
:"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x)
|
: [x] "a"(x), "m"(*(const struct { FLOAT x[2]; } *) alpha),
|
||||||
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23"
|
[alpha] "a"(alpha)
|
||||||
);
|
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
|
||||||
|
"v23");
|
||||||
}
|
}
|
||||||
|
|
||||||
static void cscal_kernel_16_zero(BLASLONG n, FLOAT *x)
|
static void cscal_kernel_16_zero(BLASLONG n, FLOAT *x) {
|
||||||
{
|
__asm__("vzero %%v0\n\t"
|
||||||
__asm__ volatile(
|
"srlg %[n],%[n],4\n\t"
|
||||||
"vzero %%v24 \n\t"
|
|
||||||
"vzero %%v25 \n\t"
|
|
||||||
"vzero %%v26 \n\t"
|
|
||||||
"vzero %%v27 \n\t"
|
|
||||||
"srlg %%r0,%0,4 \n\t"
|
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 2, 1024(%%r1,%1) \n\t"
|
"pfd 2, 1024(%%r1,%[x])\n\t"
|
||||||
|
"vst %%v0,0(%%r1,%[x])\n\t"
|
||||||
"vst %%v24,0(%%r1,%1) \n\t"
|
"vst %%v0,16(%%r1,%[x])\n\t"
|
||||||
"vst %%v25,16(%%r1,%1) \n\t"
|
"vst %%v0,32(%%r1,%[x])\n\t"
|
||||||
"vst %%v26,32(%%r1,%1) \n\t"
|
"vst %%v0,48(%%r1,%[x])\n\t"
|
||||||
"vst %%v27,48(%%r1,%1) \n\t"
|
"vst %%v0,64(%%r1,%[x])\n\t"
|
||||||
"vst %%v24,64(%%r1,%1) \n\t"
|
"vst %%v0,80(%%r1,%[x])\n\t"
|
||||||
"vst %%v25,80(%%r1,%1) \n\t"
|
"vst %%v0,96(%%r1,%[x])\n\t"
|
||||||
"vst %%v26,96(%%r1,%1) \n\t"
|
"vst %%v0,112(%%r1,%[x])\n\t"
|
||||||
"vst %%v27,112(%%r1,%1) \n\t"
|
|
||||||
|
|
||||||
"agfi %%r1,128\n\t"
|
"agfi %%r1,128\n\t"
|
||||||
"brctg %%r0,0b "
|
"brctg %[n],0b"
|
||||||
:
|
: "=m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n)
|
||||||
:"r"(n),"ZR"((FLOAT (*)[n * 2])x)
|
: [x] "a"(x)
|
||||||
:"memory","cc","r0","r1","v24","v25","v26","v27"
|
: "cc", "r1", "v0");
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static void cscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_x)
|
static void cscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x,
|
||||||
{
|
BLASLONG inc_x) {
|
||||||
BLASLONG i;
|
BLASLONG i;
|
||||||
BLASLONG inc_x2 = 2 * inc_x;
|
BLASLONG inc_x2 = 2 * inc_x;
|
||||||
BLASLONG inc_x3 = inc_x2 + inc_x;
|
BLASLONG inc_x3 = inc_x2 + inc_x;
|
||||||
|
|
@ -230,8 +208,7 @@ static void cscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_
|
||||||
FLOAT da_r = alpha[0];
|
FLOAT da_r = alpha[0];
|
||||||
FLOAT da_i = alpha[1];
|
FLOAT da_i = alpha[1];
|
||||||
|
|
||||||
for (i = 0; i < n; i += 4)
|
for (i = 0; i < n; i += 4) {
|
||||||
{
|
|
||||||
t0 = da_r * x[0] - da_i * x[1];
|
t0 = da_r * x[0] - da_i * x[1];
|
||||||
t1 = da_r * x[inc_x] - da_i * x[inc_x + 1];
|
t1 = da_r * x[inc_x] - da_i * x[inc_x + 1];
|
||||||
t2 = da_r * x[inc_x2] - da_i * x[inc_x2 + 1];
|
t2 = da_r * x[inc_x2] - da_i * x[inc_x2 + 1];
|
||||||
|
|
@ -251,7 +228,9 @@ static void cscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) {
|
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
|
||||||
|
FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
|
||||||
|
BLASLONG dummy2) {
|
||||||
BLASLONG i = 0, j = 0;
|
BLASLONG i = 0, j = 0;
|
||||||
FLOAT temp0;
|
FLOAT temp0;
|
||||||
FLOAT temp1;
|
FLOAT temp1;
|
||||||
|
|
@ -311,13 +290,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
|
|
||||||
|
|
||||||
if (da_i == 0.0) {
|
if (da_i == 0.0) {
|
||||||
BLASLONG n1 = n & -2;
|
BLASLONG n1 = n & -2;
|
||||||
|
|
||||||
|
|
@ -372,7 +348,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
|
||||||
return (0);
|
return (0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
BLASLONG n1 = n & -16;
|
BLASLONG n1 = n & -16;
|
||||||
if (n1 > 0) {
|
if (n1 > 0) {
|
||||||
|
|
||||||
|
|
@ -384,8 +359,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
|
||||||
cscal_kernel_16_zero(n1, x);
|
cscal_kernel_16_zero(n1, x);
|
||||||
else
|
else
|
||||||
cscal_kernel_16_zero_r(n1, alpha, x);
|
cscal_kernel_16_zero_r(n1, alpha, x);
|
||||||
else
|
else if (da_i == 0)
|
||||||
if (da_i == 0)
|
|
||||||
cscal_kernel_16_zero_i(n1, alpha, x);
|
cscal_kernel_16_zero_i(n1, alpha, x);
|
||||||
else
|
else
|
||||||
cscal_kernel_16(n1, alpha, x);
|
cscal_kernel_16(n1, alpha, x);
|
||||||
|
|
@ -394,7 +368,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
|
||||||
j = n1;
|
j = n1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
if (da_r == 0.0) {
|
if (da_r == 0.0) {
|
||||||
|
|
||||||
if (da_i == 0.0) {
|
if (da_i == 0.0) {
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/***************************************************************************
|
/***************************************************************************
|
||||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
modification, are permitted provided that the following conditions are
|
modification, are permitted provided that the following conditions are
|
||||||
|
|
@ -27,114 +27,108 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
static void cswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
|
static void cswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) {
|
||||||
{
|
__asm__("srlg %[n],%[n],5\n\t"
|
||||||
__asm__ volatile(
|
|
||||||
"srlg %%r0,%0,5 \n\t"
|
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 2, 1024(%%r1,%1) \n\t"
|
"pfd 2, 1024(%%r1,%[x])\n\t"
|
||||||
"pfd 2, 1024(%%r1,%2) \n\t"
|
"pfd 2, 1024(%%r1,%[y])\n\t"
|
||||||
|
"vl %%v16, 0(%%r1,%[x])\n\t"
|
||||||
"vl %%v16, 0(%%r1,%1) \n\t"
|
"vl %%v17, 16(%%r1,%[x])\n\t"
|
||||||
"vl %%v17, 16(%%r1,%1) \n\t"
|
"vl %%v18, 32(%%r1,%[x])\n\t"
|
||||||
"vl %%v18, 32(%%r1,%1) \n\t"
|
"vl %%v19, 48(%%r1,%[x])\n\t"
|
||||||
"vl %%v19, 48(%%r1,%1) \n\t"
|
"vl %%v20, 64(%%r1,%[x])\n\t"
|
||||||
"vl %%v20, 64(%%r1,%1) \n\t"
|
"vl %%v21, 80(%%r1,%[x])\n\t"
|
||||||
"vl %%v21, 80(%%r1,%1) \n\t"
|
"vl %%v22, 96(%%r1,%[x])\n\t"
|
||||||
"vl %%v22, 96(%%r1,%1) \n\t"
|
"vl %%v23, 112(%%r1,%[x])\n\t"
|
||||||
"vl %%v23, 112(%%r1,%1) \n\t"
|
"vl %%v24, 128(%%r1,%[x])\n\t"
|
||||||
"vl %%v24, 128(%%r1,%1) \n\t"
|
"vl %%v25, 144(%%r1,%[x])\n\t"
|
||||||
"vl %%v25, 144(%%r1,%1) \n\t"
|
"vl %%v26, 160(%%r1,%[x])\n\t"
|
||||||
"vl %%v26, 160(%%r1,%1) \n\t"
|
"vl %%v27, 176(%%r1,%[x])\n\t"
|
||||||
"vl %%v27, 176(%%r1,%1) \n\t"
|
"vl %%v28, 192(%%r1,%[x])\n\t"
|
||||||
"vl %%v28, 192(%%r1,%1) \n\t"
|
"vl %%v29, 208(%%r1,%[x])\n\t"
|
||||||
"vl %%v29, 208(%%r1,%1) \n\t"
|
"vl %%v30, 224(%%r1,%[x])\n\t"
|
||||||
"vl %%v30, 224(%%r1,%1) \n\t"
|
"vl %%v31, 240(%%r1,%[x])\n\t"
|
||||||
"vl %%v31, 240(%%r1,%1) \n\t"
|
"vl %%v0, 0(%%r1,%[y])\n\t"
|
||||||
|
"vl %%v1, 16(%%r1,%[y])\n\t"
|
||||||
"vl %%v0, 0(%%r1,%2) \n\t"
|
"vl %%v2, 32(%%r1,%[y])\n\t"
|
||||||
"vl %%v1, 16(%%r1,%2) \n\t"
|
"vl %%v3, 48(%%r1,%[y])\n\t"
|
||||||
"vl %%v2, 32(%%r1,%2) \n\t"
|
"vl %%v4, 64(%%r1,%[y])\n\t"
|
||||||
"vl %%v3, 48(%%r1,%2) \n\t"
|
"vl %%v5, 80(%%r1,%[y])\n\t"
|
||||||
"vl %%v4, 64(%%r1,%2) \n\t"
|
"vl %%v6, 96(%%r1,%[y])\n\t"
|
||||||
"vl %%v5, 80(%%r1,%2) \n\t"
|
"vl %%v7, 112(%%r1,%[y])\n\t"
|
||||||
"vl %%v6, 96(%%r1,%2) \n\t"
|
"vst %%v0, 0(%%r1,%[x])\n\t"
|
||||||
"vl %%v7, 112(%%r1,%2) \n\t"
|
"vst %%v1, 16(%%r1,%[x])\n\t"
|
||||||
"vst %%v0, 0(%%r1,%1) \n\t"
|
"vst %%v2, 32(%%r1,%[x])\n\t"
|
||||||
"vst %%v1, 16(%%r1,%1) \n\t"
|
"vst %%v3, 48(%%r1,%[x])\n\t"
|
||||||
"vst %%v2, 32(%%r1,%1) \n\t"
|
"vst %%v4, 64(%%r1,%[x])\n\t"
|
||||||
"vst %%v3, 48(%%r1,%1) \n\t"
|
"vst %%v5, 80(%%r1,%[x])\n\t"
|
||||||
"vst %%v4, 64(%%r1,%1) \n\t"
|
"vst %%v6, 96(%%r1,%[x])\n\t"
|
||||||
"vst %%v5, 80(%%r1,%1) \n\t"
|
"vst %%v7, 112(%%r1,%[x])\n\t"
|
||||||
"vst %%v6, 96(%%r1,%1) \n\t"
|
"vl %%v0, 128(%%r1,%[y])\n\t"
|
||||||
"vst %%v7, 112(%%r1,%1) \n\t"
|
"vl %%v1, 144(%%r1,%[y])\n\t"
|
||||||
|
"vl %%v2, 160(%%r1,%[y])\n\t"
|
||||||
"vl %%v0, 128(%%r1,%2) \n\t"
|
"vl %%v3, 176(%%r1,%[y])\n\t"
|
||||||
"vl %%v1, 144(%%r1,%2) \n\t"
|
"vl %%v4, 192(%%r1,%[y])\n\t"
|
||||||
"vl %%v2, 160(%%r1,%2) \n\t"
|
"vl %%v5, 208(%%r1,%[y])\n\t"
|
||||||
"vl %%v3, 176(%%r1,%2) \n\t"
|
"vl %%v6, 224(%%r1,%[y])\n\t"
|
||||||
"vl %%v4, 192(%%r1,%2) \n\t"
|
"vl %%v7, 240(%%r1,%[y])\n\t"
|
||||||
"vl %%v5, 208(%%r1,%2) \n\t"
|
"vst %%v0, 128(%%r1,%[x])\n\t"
|
||||||
"vl %%v6, 224(%%r1,%2) \n\t"
|
"vst %%v1, 144(%%r1,%[x])\n\t"
|
||||||
"vl %%v7, 240(%%r1,%2) \n\t"
|
"vst %%v2, 160(%%r1,%[x])\n\t"
|
||||||
"vst %%v0, 128(%%r1,%1) \n\t"
|
"vst %%v3, 176(%%r1,%[x])\n\t"
|
||||||
"vst %%v1, 144(%%r1,%1) \n\t"
|
"vst %%v4, 192(%%r1,%[x])\n\t"
|
||||||
"vst %%v2, 160(%%r1,%1) \n\t"
|
"vst %%v5, 208(%%r1,%[x])\n\t"
|
||||||
"vst %%v3, 176(%%r1,%1) \n\t"
|
"vst %%v6, 224(%%r1,%[x])\n\t"
|
||||||
"vst %%v4, 192(%%r1,%1) \n\t"
|
"vst %%v7, 240(%%r1,%[x])\n\t"
|
||||||
"vst %%v5, 208(%%r1,%1) \n\t"
|
"vst %%v16, 0(%%r1,%[y])\n\t"
|
||||||
"vst %%v6, 224(%%r1,%1) \n\t"
|
"vst %%v17, 16(%%r1,%[y])\n\t"
|
||||||
"vst %%v7, 240(%%r1,%1) \n\t"
|
"vst %%v18, 32(%%r1,%[y])\n\t"
|
||||||
|
"vst %%v19, 48(%%r1,%[y])\n\t"
|
||||||
"vst %%v16, 0(%%r1,%2) \n\t"
|
"vst %%v20, 64(%%r1,%[y])\n\t"
|
||||||
"vst %%v17, 16(%%r1,%2) \n\t"
|
"vst %%v21, 80(%%r1,%[y])\n\t"
|
||||||
"vst %%v18, 32(%%r1,%2) \n\t"
|
"vst %%v22, 96(%%r1,%[y])\n\t"
|
||||||
"vst %%v19, 48(%%r1,%2) \n\t"
|
"vst %%v23, 112(%%r1,%[y])\n\t"
|
||||||
"vst %%v20, 64(%%r1,%2) \n\t"
|
"vst %%v24, 128(%%r1,%[y])\n\t"
|
||||||
"vst %%v21, 80(%%r1,%2) \n\t"
|
"vst %%v25, 144(%%r1,%[y])\n\t"
|
||||||
"vst %%v22, 96(%%r1,%2) \n\t"
|
"vst %%v26, 160(%%r1,%[y])\n\t"
|
||||||
"vst %%v23, 112(%%r1,%2) \n\t"
|
"vst %%v27, 176(%%r1,%[y])\n\t"
|
||||||
"vst %%v24, 128(%%r1,%2) \n\t"
|
"vst %%v28, 192(%%r1,%[y])\n\t"
|
||||||
"vst %%v25, 144(%%r1,%2) \n\t"
|
"vst %%v29, 208(%%r1,%[y])\n\t"
|
||||||
"vst %%v26, 160(%%r1,%2) \n\t"
|
"vst %%v30, 224(%%r1,%[y])\n\t"
|
||||||
"vst %%v27, 176(%%r1,%2) \n\t"
|
"vst %%v31, 240(%%r1,%[y])\n\t"
|
||||||
"vst %%v28, 192(%%r1,%2) \n\t"
|
|
||||||
"vst %%v29, 208(%%r1,%2) \n\t"
|
|
||||||
"vst %%v30, 224(%%r1,%2) \n\t"
|
|
||||||
"vst %%v31, 240(%%r1,%2) \n\t"
|
|
||||||
|
|
||||||
"agfi %%r1,256\n\t"
|
"agfi %%r1,256\n\t"
|
||||||
"brctg %%r0,0b "
|
"brctg %[n],0b"
|
||||||
:
|
: "+m"(*(struct { FLOAT x[n * 2]; } *) x),
|
||||||
:"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y)
|
"+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n)
|
||||||
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
: [x] "a"(x),[y] "a"(y)
|
||||||
);
|
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
|
||||||
|
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
|
||||||
|
"v27", "v28", "v29", "v30", "v31");
|
||||||
}
|
}
|
||||||
|
|
||||||
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
|
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3,
|
||||||
{
|
FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
|
||||||
|
FLOAT *dummy, BLASLONG dummy2) {
|
||||||
BLASLONG i = 0;
|
BLASLONG i = 0;
|
||||||
BLASLONG ix = 0, iy = 0;
|
BLASLONG ix = 0, iy = 0;
|
||||||
FLOAT temp[2];
|
FLOAT temp[2];
|
||||||
BLASLONG inc_x2, inc_y2;
|
BLASLONG inc_x2, inc_y2;
|
||||||
|
|
||||||
if ( n <= 0 ) return(0);
|
if (n <= 0)
|
||||||
|
return (0);
|
||||||
|
|
||||||
if ( (inc_x == 1) && (inc_y == 1 ))
|
if ((inc_x == 1) && (inc_y == 1)) {
|
||||||
{
|
|
||||||
|
|
||||||
BLASLONG n1 = n & -32;
|
BLASLONG n1 = n & -32;
|
||||||
if ( n1 > 0 )
|
if (n1 > 0) {
|
||||||
{
|
|
||||||
cswap_kernel_32(n1, x, y);
|
cswap_kernel_32(n1, x, y);
|
||||||
i = n1;
|
i = n1;
|
||||||
ix = 2 * n1;
|
ix = 2 * n1;
|
||||||
iy = 2 * n1;
|
iy = 2 * n1;
|
||||||
}
|
}
|
||||||
|
|
||||||
while(i < n)
|
while (i < n) {
|
||||||
{
|
|
||||||
|
|
||||||
temp[0] = x[ix];
|
temp[0] = x[ix];
|
||||||
temp[1] = x[ix + 1];
|
temp[1] = x[ix + 1];
|
||||||
|
|
@ -147,19 +141,14 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm
|
||||||
iy += 2;
|
iy += 2;
|
||||||
i++;
|
i++;
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
} else {
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
|
|
||||||
inc_x2 = 2 * inc_x;
|
inc_x2 = 2 * inc_x;
|
||||||
inc_y2 = 2 * inc_y;
|
inc_y2 = 2 * inc_y;
|
||||||
|
|
||||||
while(i < n)
|
while (i < n) {
|
||||||
{
|
|
||||||
|
|
||||||
temp[0] = x[ix];
|
temp[0] = x[ix];
|
||||||
temp[1] = x[ix + 1];
|
temp[1] = x[ix + 1];
|
||||||
|
|
@ -177,7 +166,4 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm
|
||||||
}
|
}
|
||||||
return (0);
|
return (0);
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/***************************************************************************
|
/***************************************************************************
|
||||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
modification, are permitted provided that the following conditions are
|
modification, are permitted provided that the following conditions are
|
||||||
|
|
@ -28,40 +28,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
|
|
||||||
#if defined(DOUBLE)
|
|
||||||
#define ABS fabs
|
#define ABS fabs
|
||||||
#else
|
|
||||||
#define ABS fabsf
|
|
||||||
#endif
|
|
||||||
|
|
||||||
static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x)
|
static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) {
|
||||||
{
|
|
||||||
FLOAT amax;
|
FLOAT amax;
|
||||||
|
|
||||||
__asm__ volatile (
|
__asm__("vl %%v0,0(%[x])\n\t"
|
||||||
"vl %%v0,0(%2) \n\t"
|
"srlg %[n],%[n],5\n\t"
|
||||||
"srlg %%r0,%1,5 \n\t"
|
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 1, 1024(%%r1,%2) \n\t"
|
"pfd 1, 1024(%%r1,%[x])\n\t"
|
||||||
|
"vl %%v16,0(%%r1,%[x])\n\t"
|
||||||
"vl %%v16,0(%%r1,%2) \n\t"
|
"vl %%v17,16(%%r1,%[x])\n\t"
|
||||||
"vl %%v17,16(%%r1,%2) \n\t"
|
"vl %%v18,32(%%r1,%[x])\n\t"
|
||||||
"vl %%v18,32(%%r1,%2) \n\t"
|
"vl %%v19,48(%%r1,%[x])\n\t"
|
||||||
"vl %%v19,48(%%r1,%2) \n\t"
|
"vl %%v20,64(%%r1,%[x])\n\t"
|
||||||
"vl %%v20,64(%%r1,%2) \n\t"
|
"vl %%v21,80(%%r1,%[x])\n\t"
|
||||||
"vl %%v21,80(%%r1,%2) \n\t"
|
"vl %%v22,96(%%r1,%[x])\n\t"
|
||||||
"vl %%v22,96(%%r1,%2) \n\t"
|
"vl %%v23,112(%%r1,%[x])\n\t"
|
||||||
"vl %%v23,112(%%r1,%2) \n\t"
|
"vl %%v24,128(%%r1,%[x])\n\t"
|
||||||
"vl %%v24,128(%%r1,%2) \n\t"
|
"vl %%v25,144(%%r1,%[x])\n\t"
|
||||||
"vl %%v25,144(%%r1,%2) \n\t"
|
"vl %%v26,160(%%r1,%[x])\n\t"
|
||||||
"vl %%v26,160(%%r1,%2) \n\t"
|
"vl %%v27,176(%%r1,%[x])\n\t"
|
||||||
"vl %%v27,176(%%r1,%2) \n\t"
|
"vl %%v28,192(%%r1,%[x])\n\t"
|
||||||
"vl %%v28,192(%%r1,%2) \n\t"
|
"vl %%v29,208(%%r1,%[x])\n\t"
|
||||||
"vl %%v29,208(%%r1,%2) \n\t"
|
"vl %%v30,224(%%r1,%[x])\n\t"
|
||||||
"vl %%v30,224(%%r1,%2) \n\t"
|
"vl %%v31,240(%%r1,%[x])\n\t"
|
||||||
"vl %%v31,240(%%r1,%2) \n\t"
|
|
||||||
|
|
||||||
"vfmaxdb %%v16,%%v16,%%v24,8\n\t"
|
"vfmaxdb %%v16,%%v16,%%v24,8\n\t"
|
||||||
"vfmaxdb %%v17,%%v17,%%v25,8\n\t"
|
"vfmaxdb %%v17,%%v17,%%v25,8\n\t"
|
||||||
"vfmaxdb %%v18,%%v18,%%v26,8\n\t"
|
"vfmaxdb %%v18,%%v18,%%v26,8\n\t"
|
||||||
|
|
@ -70,29 +62,23 @@ static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x)
|
||||||
"vfmaxdb %%v21,%%v21,%%v29,8\n\t"
|
"vfmaxdb %%v21,%%v21,%%v29,8\n\t"
|
||||||
"vfmaxdb %%v22,%%v22,%%v30,8\n\t"
|
"vfmaxdb %%v22,%%v22,%%v30,8\n\t"
|
||||||
"vfmaxdb %%v23,%%v23,%%v31,8\n\t"
|
"vfmaxdb %%v23,%%v23,%%v31,8\n\t"
|
||||||
|
|
||||||
"vfmaxdb %%v16,%%v16,%%v20,8\n\t"
|
"vfmaxdb %%v16,%%v16,%%v20,8\n\t"
|
||||||
"vfmaxdb %%v17,%%v17,%%v21,8\n\t"
|
"vfmaxdb %%v17,%%v17,%%v21,8\n\t"
|
||||||
"vfmaxdb %%v18,%%v18,%%v22,8\n\t"
|
"vfmaxdb %%v18,%%v18,%%v22,8\n\t"
|
||||||
"vfmaxdb %%v19,%%v19,%%v23,8\n\t"
|
"vfmaxdb %%v19,%%v19,%%v23,8\n\t"
|
||||||
|
|
||||||
"vfmaxdb %%v16,%%v16,%%v18,8\n\t"
|
"vfmaxdb %%v16,%%v16,%%v18,8\n\t"
|
||||||
"vfmaxdb %%v17,%%v17,%%v19,8\n\t"
|
"vfmaxdb %%v17,%%v17,%%v19,8\n\t"
|
||||||
|
|
||||||
"vfmaxdb %%v16,%%v16,%%v17,8\n\t"
|
"vfmaxdb %%v16,%%v16,%%v17,8\n\t"
|
||||||
|
|
||||||
"vfmaxdb %%v0,%%v0,%%v16,8\n\t"
|
"vfmaxdb %%v0,%%v0,%%v16,8\n\t"
|
||||||
|
|
||||||
"agfi %%r1, 256\n\t"
|
"agfi %%r1, 256\n\t"
|
||||||
"brctg %%r0, 0b \n\t"
|
"brctg %[n], 0b\n\t"
|
||||||
|
|
||||||
"vrepg %%v16,%%v0,1\n\t"
|
"vrepg %%v16,%%v0,1\n\t"
|
||||||
"wfmaxdb %%v0,%%v0,%%v16,8\n\t"
|
"wfmaxdb %%v0,%%v0,%%v16,8\n\t"
|
||||||
"lpdr %0,%%f0 "
|
"lpdr %[amax],%%f0"
|
||||||
:"=f"(amax)
|
: [amax] "=f"(amax),[n] "+&r"(n)
|
||||||
:"r"(n),"ZR"((const FLOAT (*)[n])x)
|
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
|
||||||
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
|
||||||
);
|
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
|
||||||
|
|
||||||
return amax;
|
return amax;
|
||||||
}
|
}
|
||||||
|
|
@ -102,7 +88,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
BLASLONG j = 0;
|
BLASLONG j = 0;
|
||||||
FLOAT maxf = 0.0;
|
FLOAT maxf = 0.0;
|
||||||
|
|
||||||
if (n <= 0 || inc_x <= 0) return (maxf);
|
if (n <= 0 || inc_x <= 0)
|
||||||
|
return (maxf);
|
||||||
|
|
||||||
if (inc_x == 1) {
|
if (inc_x == 1) {
|
||||||
|
|
||||||
|
|
@ -112,9 +99,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
maxf = damax_kernel_32(n1, x);
|
maxf = damax_kernel_32(n1, x);
|
||||||
|
|
||||||
i = n1;
|
i = n1;
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
maxf = ABS(x[0]);
|
maxf = ABS(x[0]);
|
||||||
i++;
|
i++;
|
||||||
}
|
}
|
||||||
|
|
@ -153,7 +138,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
while (j < n) {
|
while (j < n) {
|
||||||
if (ABS(x[i]) > maxf) {
|
if (ABS(x[i]) > maxf) {
|
||||||
maxf = ABS(x[i]);
|
maxf = ABS(x[i]);
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/***************************************************************************
|
/***************************************************************************
|
||||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
modification, are permitted provided that the following conditions are
|
modification, are permitted provided that the following conditions are
|
||||||
|
|
@ -28,32 +28,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
|
|
||||||
#if defined(DOUBLE)
|
|
||||||
#define ABS fabs
|
#define ABS fabs
|
||||||
#else
|
|
||||||
#define ABS fabsf
|
|
||||||
#endif
|
|
||||||
|
|
||||||
static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x)
|
static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) {
|
||||||
{
|
|
||||||
FLOAT amax;
|
FLOAT amax;
|
||||||
|
|
||||||
__asm__ volatile (
|
__asm__("vl %%v0,0(%[x])\n\t"
|
||||||
"vl %%v0,0(%2) \n\t"
|
|
||||||
"vflpdb %%v0,%%v0\n\t"
|
"vflpdb %%v0,%%v0\n\t"
|
||||||
"srlg %%r0,%1,5 \n\t"
|
"srlg %[n],%[n],5\n\t"
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 1, 1024(%%r1,%2) \n\t"
|
"pfd 1, 1024(%%r1,%[x])\n\t"
|
||||||
|
"vl %%v16,0(%%r1,%[x])\n\t"
|
||||||
"vl %%v16,0(%%r1,%2) \n\t"
|
"vl %%v17,16(%%r1,%[x])\n\t"
|
||||||
"vl %%v17,16(%%r1,%2) \n\t"
|
"vl %%v18,32(%%r1,%[x])\n\t"
|
||||||
"vl %%v18,32(%%r1,%2) \n\t"
|
"vl %%v19,48(%%r1,%[x])\n\t"
|
||||||
"vl %%v19,48(%%r1,%2) \n\t"
|
"vl %%v20,64(%%r1,%[x])\n\t"
|
||||||
"vl %%v20,64(%%r1,%2) \n\t"
|
"vl %%v21,80(%%r1,%[x])\n\t"
|
||||||
"vl %%v21,80(%%r1,%2) \n\t"
|
"vl %%v22,96(%%r1,%[x])\n\t"
|
||||||
"vl %%v22,96(%%r1,%2) \n\t"
|
"vl %%v23,112(%%r1,%[x])\n\t"
|
||||||
"vl %%v23,112(%%r1,%2) \n\t"
|
|
||||||
"vflpdb %%v16, %%v16\n\t"
|
"vflpdb %%v16, %%v16\n\t"
|
||||||
"vflpdb %%v17, %%v17\n\t"
|
"vflpdb %%v17, %%v17\n\t"
|
||||||
"vflpdb %%v18, %%v18\n\t"
|
"vflpdb %%v18, %%v18\n\t"
|
||||||
|
|
@ -62,7 +55,6 @@ static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x)
|
||||||
"vflpdb %%v21, %%v21\n\t"
|
"vflpdb %%v21, %%v21\n\t"
|
||||||
"vflpdb %%v22, %%v22\n\t"
|
"vflpdb %%v22, %%v22\n\t"
|
||||||
"vflpdb %%v23, %%v23\n\t"
|
"vflpdb %%v23, %%v23\n\t"
|
||||||
|
|
||||||
"vfchdb %%v24,%%v16,%%v17\n\t"
|
"vfchdb %%v24,%%v16,%%v17\n\t"
|
||||||
"vfchdb %%v25,%%v18,%%v19\n\t"
|
"vfchdb %%v25,%%v18,%%v19\n\t"
|
||||||
"vfchdb %%v26,%%v20,%%v21\n\t"
|
"vfchdb %%v26,%%v20,%%v21\n\t"
|
||||||
|
|
@ -71,26 +63,22 @@ static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x)
|
||||||
"vsel %%v25,%%v18,%%v19,%%v25\n\t"
|
"vsel %%v25,%%v18,%%v19,%%v25\n\t"
|
||||||
"vsel %%v26,%%v20,%%v21,%%v26\n\t"
|
"vsel %%v26,%%v20,%%v21,%%v26\n\t"
|
||||||
"vsel %%v27,%%v22,%%v23,%%v27\n\t"
|
"vsel %%v27,%%v22,%%v23,%%v27\n\t"
|
||||||
|
|
||||||
"vfchdb %%v28,%%v24,%%v25\n\t"
|
"vfchdb %%v28,%%v24,%%v25\n\t"
|
||||||
"vfchdb %%v29,%%v26,%%v27\n\t"
|
"vfchdb %%v29,%%v26,%%v27\n\t"
|
||||||
"vsel %%v28,%%v24,%%v25,%%v28\n\t"
|
"vsel %%v28,%%v24,%%v25,%%v28\n\t"
|
||||||
"vsel %%v29,%%v26,%%v27,%%v29\n\t"
|
"vsel %%v29,%%v26,%%v27,%%v29\n\t"
|
||||||
|
|
||||||
"vfchdb %%v30,%%v28,%%v29\n\t"
|
"vfchdb %%v30,%%v28,%%v29\n\t"
|
||||||
"vsel %%v30,%%v28,%%v29,%%v30\n\t"
|
"vsel %%v30,%%v28,%%v29,%%v30\n\t"
|
||||||
|
|
||||||
"vfchdb %%v31,%%v30,%%v0\n\t"
|
"vfchdb %%v31,%%v30,%%v0\n\t"
|
||||||
"vsel %%v0,%%v30,%%v0,%%v31\n\t"
|
"vsel %%v0,%%v30,%%v0,%%v31\n\t"
|
||||||
|
"vl %%v16,128(%%r1,%[x])\n\t"
|
||||||
"vl %%v16,128(%%r1,%2) \n\t"
|
"vl %%v17,144(%%r1,%[x])\n\t"
|
||||||
"vl %%v17,144(%%r1,%2) \n\t"
|
"vl %%v18,160(%%r1,%[x])\n\t"
|
||||||
"vl %%v18,160(%%r1,%2) \n\t"
|
"vl %%v19,176(%%r1,%[x])\n\t"
|
||||||
"vl %%v19,176(%%r1,%2) \n\t"
|
"vl %%v20,192(%%r1,%[x])\n\t"
|
||||||
"vl %%v20,192(%%r1,%2) \n\t"
|
"vl %%v21,208(%%r1,%[x])\n\t"
|
||||||
"vl %%v21,208(%%r1,%2) \n\t"
|
"vl %%v22,224(%%r1,%[x])\n\t"
|
||||||
"vl %%v22,224(%%r1,%2) \n\t"
|
"vl %%v23,240(%%r1,%[x])\n\t"
|
||||||
"vl %%v23,240(%%r1,%2) \n\t"
|
|
||||||
"vflpdb %%v16, %%v16\n\t"
|
"vflpdb %%v16, %%v16\n\t"
|
||||||
"vflpdb %%v17, %%v17\n\t"
|
"vflpdb %%v17, %%v17\n\t"
|
||||||
"vflpdb %%v18, %%v18\n\t"
|
"vflpdb %%v18, %%v18\n\t"
|
||||||
|
|
@ -99,7 +87,6 @@ static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x)
|
||||||
"vflpdb %%v21, %%v21\n\t"
|
"vflpdb %%v21, %%v21\n\t"
|
||||||
"vflpdb %%v22, %%v22\n\t"
|
"vflpdb %%v22, %%v22\n\t"
|
||||||
"vflpdb %%v23, %%v23\n\t"
|
"vflpdb %%v23, %%v23\n\t"
|
||||||
|
|
||||||
"vfchdb %%v24,%%v16,%%v17\n\t"
|
"vfchdb %%v24,%%v16,%%v17\n\t"
|
||||||
"vfchdb %%v25,%%v18,%%v19\n\t"
|
"vfchdb %%v25,%%v18,%%v19\n\t"
|
||||||
"vfchdb %%v26,%%v20,%%v21\n\t"
|
"vfchdb %%v26,%%v20,%%v21\n\t"
|
||||||
|
|
@ -108,29 +95,24 @@ static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x)
|
||||||
"vsel %%v25,%%v18,%%v19,%%v25\n\t"
|
"vsel %%v25,%%v18,%%v19,%%v25\n\t"
|
||||||
"vsel %%v26,%%v20,%%v21,%%v26\n\t"
|
"vsel %%v26,%%v20,%%v21,%%v26\n\t"
|
||||||
"vsel %%v27,%%v22,%%v23,%%v27\n\t"
|
"vsel %%v27,%%v22,%%v23,%%v27\n\t"
|
||||||
|
|
||||||
"vfchdb %%v28,%%v24,%%v25\n\t"
|
"vfchdb %%v28,%%v24,%%v25\n\t"
|
||||||
"vfchdb %%v29,%%v26,%%v27\n\t"
|
"vfchdb %%v29,%%v26,%%v27\n\t"
|
||||||
"vsel %%v28,%%v24,%%v25,%%v28\n\t"
|
"vsel %%v28,%%v24,%%v25,%%v28\n\t"
|
||||||
"vsel %%v29,%%v26,%%v27,%%v29\n\t"
|
"vsel %%v29,%%v26,%%v27,%%v29\n\t"
|
||||||
|
|
||||||
"vfchdb %%v30,%%v28,%%v29\n\t"
|
"vfchdb %%v30,%%v28,%%v29\n\t"
|
||||||
"vsel %%v30,%%v28,%%v29,%%v30\n\t"
|
"vsel %%v30,%%v28,%%v29,%%v30\n\t"
|
||||||
|
|
||||||
"vfchdb %%v31,%%v30,%%v0\n\t"
|
"vfchdb %%v31,%%v30,%%v0\n\t"
|
||||||
"vsel %%v0,%%v30,%%v0,%%v31\n\t"
|
"vsel %%v0,%%v30,%%v0,%%v31\n\t"
|
||||||
|
|
||||||
"agfi %%r1, 256\n\t"
|
"agfi %%r1, 256\n\t"
|
||||||
"brctg %%r0, 0b \n\t"
|
"brctg %[n], 0b\n\t"
|
||||||
|
|
||||||
"vrepg %%v16,%%v0,1\n\t"
|
"vrepg %%v16,%%v0,1\n\t"
|
||||||
"wfchdb %%v17,%%v0,%%v16\n\t"
|
"wfchdb %%v17,%%v0,%%v16\n\t"
|
||||||
"vsel %%v0,%%v0,%%v16,%%v17\n\t"
|
"vsel %%v0,%%v0,%%v16,%%v17\n\t"
|
||||||
"ldr %0,%%f0 "
|
"ldr %[amax],%%f0"
|
||||||
:"=f"(amax)
|
: [amax] "=f"(amax),[n] "+&r"(n)
|
||||||
:"r"(n),"ZR"((const FLOAT (*)[n])x)
|
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
|
||||||
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
|
||||||
);
|
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
|
||||||
|
|
||||||
return amax;
|
return amax;
|
||||||
}
|
}
|
||||||
|
|
@ -140,7 +122,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
BLASLONG j = 0;
|
BLASLONG j = 0;
|
||||||
FLOAT maxf = 0.0;
|
FLOAT maxf = 0.0;
|
||||||
|
|
||||||
if (n <= 0 || inc_x <= 0) return (maxf);
|
if (n <= 0 || inc_x <= 0)
|
||||||
|
return (maxf);
|
||||||
|
|
||||||
if (inc_x == 1) {
|
if (inc_x == 1) {
|
||||||
|
|
||||||
|
|
@ -150,9 +133,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
maxf = damax_kernel_32(n1, x);
|
maxf = damax_kernel_32(n1, x);
|
||||||
|
|
||||||
i = n1;
|
i = n1;
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
maxf = ABS(x[0]);
|
maxf = ABS(x[0]);
|
||||||
i++;
|
i++;
|
||||||
}
|
}
|
||||||
|
|
@ -191,7 +172,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
while (j < n) {
|
while (j < n) {
|
||||||
if (ABS(x[i]) > maxf) {
|
if (ABS(x[i]) > maxf) {
|
||||||
maxf = ABS(x[i]);
|
maxf = ABS(x[i]);
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/***************************************************************************
|
/***************************************************************************
|
||||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
modification, are permitted provided that the following conditions are
|
modification, are permitted provided that the following conditions are
|
||||||
|
|
@ -28,40 +28,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
|
|
||||||
#if defined(DOUBLE)
|
|
||||||
#define ABS fabs
|
#define ABS fabs
|
||||||
#else
|
|
||||||
#define ABS fabsf
|
|
||||||
#endif
|
|
||||||
|
|
||||||
static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x)
|
static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) {
|
||||||
{
|
|
||||||
FLOAT amin;
|
FLOAT amin;
|
||||||
|
|
||||||
__asm__ volatile (
|
__asm__("vl %%v0,0(%[x])\n\t"
|
||||||
"vl %%v0,0(%2) \n\t"
|
"srlg %[n],%[n],5\n\t"
|
||||||
"srlg %%r0,%1,5 \n\t"
|
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 1, 1024(%%r1,%2) \n\t"
|
"pfd 1, 1024(%%r1,%[x])\n\t"
|
||||||
|
"vl %%v16,0(%%r1,%[x])\n\t"
|
||||||
"vl %%v16,0(%%r1,%2) \n\t"
|
"vl %%v17,16(%%r1,%[x])\n\t"
|
||||||
"vl %%v17,16(%%r1,%2) \n\t"
|
"vl %%v18,32(%%r1,%[x])\n\t"
|
||||||
"vl %%v18,32(%%r1,%2) \n\t"
|
"vl %%v19,48(%%r1,%[x])\n\t"
|
||||||
"vl %%v19,48(%%r1,%2) \n\t"
|
"vl %%v20,64(%%r1,%[x])\n\t"
|
||||||
"vl %%v20,64(%%r1,%2) \n\t"
|
"vl %%v21,80(%%r1,%[x])\n\t"
|
||||||
"vl %%v21,80(%%r1,%2) \n\t"
|
"vl %%v22,96(%%r1,%[x])\n\t"
|
||||||
"vl %%v22,96(%%r1,%2) \n\t"
|
"vl %%v23,112(%%r1,%[x])\n\t"
|
||||||
"vl %%v23,112(%%r1,%2) \n\t"
|
"vl %%v24,128(%%r1,%[x])\n\t"
|
||||||
"vl %%v24,128(%%r1,%2) \n\t"
|
"vl %%v25,144(%%r1,%[x])\n\t"
|
||||||
"vl %%v25,144(%%r1,%2) \n\t"
|
"vl %%v26,160(%%r1,%[x])\n\t"
|
||||||
"vl %%v26,160(%%r1,%2) \n\t"
|
"vl %%v27,176(%%r1,%[x])\n\t"
|
||||||
"vl %%v27,176(%%r1,%2) \n\t"
|
"vl %%v28,192(%%r1,%[x])\n\t"
|
||||||
"vl %%v28,192(%%r1,%2) \n\t"
|
"vl %%v29,208(%%r1,%[x])\n\t"
|
||||||
"vl %%v29,208(%%r1,%2) \n\t"
|
"vl %%v30,224(%%r1,%[x])\n\t"
|
||||||
"vl %%v30,224(%%r1,%2) \n\t"
|
"vl %%v31,240(%%r1,%[x])\n\t"
|
||||||
"vl %%v31,240(%%r1,%2) \n\t"
|
|
||||||
|
|
||||||
"vfmindb %%v16,%%v16,%%v24,8\n\t"
|
"vfmindb %%v16,%%v16,%%v24,8\n\t"
|
||||||
"vfmindb %%v17,%%v17,%%v25,8\n\t"
|
"vfmindb %%v17,%%v17,%%v25,8\n\t"
|
||||||
"vfmindb %%v18,%%v18,%%v26,8\n\t"
|
"vfmindb %%v18,%%v18,%%v26,8\n\t"
|
||||||
|
|
@ -70,29 +62,23 @@ static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x)
|
||||||
"vfmindb %%v21,%%v21,%%v29,8\n\t"
|
"vfmindb %%v21,%%v21,%%v29,8\n\t"
|
||||||
"vfmindb %%v22,%%v22,%%v30,8\n\t"
|
"vfmindb %%v22,%%v22,%%v30,8\n\t"
|
||||||
"vfmindb %%v23,%%v23,%%v31,8\n\t"
|
"vfmindb %%v23,%%v23,%%v31,8\n\t"
|
||||||
|
|
||||||
"vfmindb %%v16,%%v16,%%v20,8\n\t"
|
"vfmindb %%v16,%%v16,%%v20,8\n\t"
|
||||||
"vfmindb %%v17,%%v17,%%v21,8\n\t"
|
"vfmindb %%v17,%%v17,%%v21,8\n\t"
|
||||||
"vfmindb %%v18,%%v18,%%v22,8\n\t"
|
"vfmindb %%v18,%%v18,%%v22,8\n\t"
|
||||||
"vfmindb %%v19,%%v19,%%v23,8\n\t"
|
"vfmindb %%v19,%%v19,%%v23,8\n\t"
|
||||||
|
|
||||||
"vfmindb %%v16,%%v16,%%v18,8\n\t"
|
"vfmindb %%v16,%%v16,%%v18,8\n\t"
|
||||||
"vfmindb %%v17,%%v17,%%v19,8\n\t"
|
"vfmindb %%v17,%%v17,%%v19,8\n\t"
|
||||||
|
|
||||||
"vfmindb %%v16,%%v16,%%v17,8\n\t"
|
"vfmindb %%v16,%%v16,%%v17,8\n\t"
|
||||||
|
|
||||||
"vfmindb %%v0,%%v0,%%v16,8\n\t"
|
"vfmindb %%v0,%%v0,%%v16,8\n\t"
|
||||||
|
|
||||||
"agfi %%r1, 256\n\t"
|
"agfi %%r1, 256\n\t"
|
||||||
"brctg %%r0, 0b \n\t"
|
"brctg %[n], 0b\n\t"
|
||||||
|
|
||||||
"vrepg %%v16,%%v0,1\n\t"
|
"vrepg %%v16,%%v0,1\n\t"
|
||||||
"wfmindb %%v0,%%v0,%%v16,8\n\t"
|
"wfmindb %%v0,%%v0,%%v16,8\n\t"
|
||||||
"lpdr %0,%%f0 "
|
"lpdr %[amin],%%f0"
|
||||||
:"=f"(amin)
|
: [amin] "=f"(amin),[n] "+&r"(n)
|
||||||
:"r"(n),"ZR"((const FLOAT (*)[n])x)
|
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
|
||||||
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
|
||||||
);
|
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
|
||||||
|
|
||||||
return amin;
|
return amin;
|
||||||
}
|
}
|
||||||
|
|
@ -102,7 +88,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
BLASLONG j = 0;
|
BLASLONG j = 0;
|
||||||
FLOAT minf = 0.0;
|
FLOAT minf = 0.0;
|
||||||
|
|
||||||
if (n <= 0 || inc_x <= 0) return (minf);
|
if (n <= 0 || inc_x <= 0)
|
||||||
|
return (minf);
|
||||||
|
|
||||||
if (inc_x == 1) {
|
if (inc_x == 1) {
|
||||||
|
|
||||||
|
|
@ -112,9 +99,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
minf = damin_kernel_32(n1, x);
|
minf = damin_kernel_32(n1, x);
|
||||||
|
|
||||||
i = n1;
|
i = n1;
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
minf = ABS(x[0]);
|
minf = ABS(x[0]);
|
||||||
i++;
|
i++;
|
||||||
}
|
}
|
||||||
|
|
@ -153,7 +138,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
while (j < n) {
|
while (j < n) {
|
||||||
if (ABS(x[i]) < minf) {
|
if (ABS(x[i]) < minf) {
|
||||||
minf = ABS(x[i]);
|
minf = ABS(x[i]);
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/***************************************************************************
|
/***************************************************************************
|
||||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
modification, are permitted provided that the following conditions are
|
modification, are permitted provided that the following conditions are
|
||||||
|
|
@ -28,32 +28,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
|
|
||||||
#if defined(DOUBLE)
|
|
||||||
#define ABS fabs
|
#define ABS fabs
|
||||||
#else
|
|
||||||
#define ABS fabsf
|
|
||||||
#endif
|
|
||||||
|
|
||||||
static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x)
|
static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) {
|
||||||
{
|
|
||||||
FLOAT amin;
|
FLOAT amin;
|
||||||
|
|
||||||
__asm__ volatile (
|
__asm__("vl %%v0,0(%[x])\n\t"
|
||||||
"vl %%v0,0(%2) \n\t"
|
|
||||||
"vflpdb %%v0,%%v0\n\t"
|
"vflpdb %%v0,%%v0\n\t"
|
||||||
"srlg %%r0,%1,5 \n\t"
|
"srlg %[n],%[n],5\n\t"
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 1, 1024(%%r1,%2) \n\t"
|
"pfd 1, 1024(%%r1,%[x])\n\t"
|
||||||
|
"vl %%v16,0(%%r1,%[x])\n\t"
|
||||||
"vl %%v16,0(%%r1,%2) \n\t"
|
"vl %%v17,16(%%r1,%[x])\n\t"
|
||||||
"vl %%v17,16(%%r1,%2) \n\t"
|
"vl %%v18,32(%%r1,%[x])\n\t"
|
||||||
"vl %%v18,32(%%r1,%2) \n\t"
|
"vl %%v19,48(%%r1,%[x])\n\t"
|
||||||
"vl %%v19,48(%%r1,%2) \n\t"
|
"vl %%v20,64(%%r1,%[x])\n\t"
|
||||||
"vl %%v20,64(%%r1,%2) \n\t"
|
"vl %%v21,80(%%r1,%[x])\n\t"
|
||||||
"vl %%v21,80(%%r1,%2) \n\t"
|
"vl %%v22,96(%%r1,%[x])\n\t"
|
||||||
"vl %%v22,96(%%r1,%2) \n\t"
|
"vl %%v23,112(%%r1,%[x])\n\t"
|
||||||
"vl %%v23,112(%%r1,%2) \n\t"
|
|
||||||
"vflpdb %%v16, %%v16\n\t"
|
"vflpdb %%v16, %%v16\n\t"
|
||||||
"vflpdb %%v17, %%v17\n\t"
|
"vflpdb %%v17, %%v17\n\t"
|
||||||
"vflpdb %%v18, %%v18\n\t"
|
"vflpdb %%v18, %%v18\n\t"
|
||||||
|
|
@ -62,7 +55,6 @@ static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x)
|
||||||
"vflpdb %%v21, %%v21\n\t"
|
"vflpdb %%v21, %%v21\n\t"
|
||||||
"vflpdb %%v22, %%v22\n\t"
|
"vflpdb %%v22, %%v22\n\t"
|
||||||
"vflpdb %%v23, %%v23\n\t"
|
"vflpdb %%v23, %%v23\n\t"
|
||||||
|
|
||||||
"vfchdb %%v24,%%v17,%%v16\n\t"
|
"vfchdb %%v24,%%v17,%%v16\n\t"
|
||||||
"vfchdb %%v25,%%v19,%%v18\n\t"
|
"vfchdb %%v25,%%v19,%%v18\n\t"
|
||||||
"vfchdb %%v26,%%v21,%%v20\n\t"
|
"vfchdb %%v26,%%v21,%%v20\n\t"
|
||||||
|
|
@ -71,26 +63,22 @@ static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x)
|
||||||
"vsel %%v25,%%v18,%%v19,%%v25\n\t"
|
"vsel %%v25,%%v18,%%v19,%%v25\n\t"
|
||||||
"vsel %%v26,%%v20,%%v21,%%v26\n\t"
|
"vsel %%v26,%%v20,%%v21,%%v26\n\t"
|
||||||
"vsel %%v27,%%v22,%%v23,%%v27\n\t"
|
"vsel %%v27,%%v22,%%v23,%%v27\n\t"
|
||||||
|
|
||||||
"vfchdb %%v28,%%v25,%%v24\n\t"
|
"vfchdb %%v28,%%v25,%%v24\n\t"
|
||||||
"vfchdb %%v29,%%v27,%%v26\n\t"
|
"vfchdb %%v29,%%v27,%%v26\n\t"
|
||||||
"vsel %%v28,%%v24,%%v25,%%v28\n\t"
|
"vsel %%v28,%%v24,%%v25,%%v28\n\t"
|
||||||
"vsel %%v29,%%v26,%%v27,%%v29\n\t"
|
"vsel %%v29,%%v26,%%v27,%%v29\n\t"
|
||||||
|
|
||||||
"vfchdb %%v30,%%v29,%%v28\n\t"
|
"vfchdb %%v30,%%v29,%%v28\n\t"
|
||||||
"vsel %%v30,%%v28,%%v29,%%v30\n\t"
|
"vsel %%v30,%%v28,%%v29,%%v30\n\t"
|
||||||
|
|
||||||
"vfchdb %%v31,%%v0,%%v30\n\t"
|
"vfchdb %%v31,%%v0,%%v30\n\t"
|
||||||
"vsel %%v0,%%v30,%%v0,%%v31\n\t"
|
"vsel %%v0,%%v30,%%v0,%%v31\n\t"
|
||||||
|
"vl %%v16,128(%%r1,%[x])\n\t"
|
||||||
"vl %%v16,128(%%r1,%2) \n\t"
|
"vl %%v17,144(%%r1,%[x])\n\t"
|
||||||
"vl %%v17,144(%%r1,%2) \n\t"
|
"vl %%v18,160(%%r1,%[x])\n\t"
|
||||||
"vl %%v18,160(%%r1,%2) \n\t"
|
"vl %%v19,176(%%r1,%[x])\n\t"
|
||||||
"vl %%v19,176(%%r1,%2) \n\t"
|
"vl %%v20,192(%%r1,%[x])\n\t"
|
||||||
"vl %%v20,192(%%r1,%2) \n\t"
|
"vl %%v21,208(%%r1,%[x])\n\t"
|
||||||
"vl %%v21,208(%%r1,%2) \n\t"
|
"vl %%v22,224(%%r1,%[x])\n\t"
|
||||||
"vl %%v22,224(%%r1,%2) \n\t"
|
"vl %%v23,240(%%r1,%[x])\n\t"
|
||||||
"vl %%v23,240(%%r1,%2) \n\t"
|
|
||||||
"vflpdb %%v16, %%v16\n\t"
|
"vflpdb %%v16, %%v16\n\t"
|
||||||
"vflpdb %%v17, %%v17\n\t"
|
"vflpdb %%v17, %%v17\n\t"
|
||||||
"vflpdb %%v18, %%v18\n\t"
|
"vflpdb %%v18, %%v18\n\t"
|
||||||
|
|
@ -99,7 +87,6 @@ static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x)
|
||||||
"vflpdb %%v21, %%v21\n\t"
|
"vflpdb %%v21, %%v21\n\t"
|
||||||
"vflpdb %%v22, %%v22\n\t"
|
"vflpdb %%v22, %%v22\n\t"
|
||||||
"vflpdb %%v23, %%v23\n\t"
|
"vflpdb %%v23, %%v23\n\t"
|
||||||
|
|
||||||
"vfchdb %%v24,%%v17,%%v16\n\t"
|
"vfchdb %%v24,%%v17,%%v16\n\t"
|
||||||
"vfchdb %%v25,%%v19,%%v18\n\t"
|
"vfchdb %%v25,%%v19,%%v18\n\t"
|
||||||
"vfchdb %%v26,%%v21,%%v20\n\t"
|
"vfchdb %%v26,%%v21,%%v20\n\t"
|
||||||
|
|
@ -108,29 +95,24 @@ static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x)
|
||||||
"vsel %%v25,%%v18,%%v19,%%v25\n\t"
|
"vsel %%v25,%%v18,%%v19,%%v25\n\t"
|
||||||
"vsel %%v26,%%v20,%%v21,%%v26\n\t"
|
"vsel %%v26,%%v20,%%v21,%%v26\n\t"
|
||||||
"vsel %%v27,%%v22,%%v23,%%v27\n\t"
|
"vsel %%v27,%%v22,%%v23,%%v27\n\t"
|
||||||
|
|
||||||
"vfchdb %%v28,%%v25,%%v24\n\t"
|
"vfchdb %%v28,%%v25,%%v24\n\t"
|
||||||
"vfchdb %%v29,%%v27,%%v26\n\t"
|
"vfchdb %%v29,%%v27,%%v26\n\t"
|
||||||
"vsel %%v28,%%v24,%%v25,%%v28\n\t"
|
"vsel %%v28,%%v24,%%v25,%%v28\n\t"
|
||||||
"vsel %%v29,%%v26,%%v27,%%v29\n\t"
|
"vsel %%v29,%%v26,%%v27,%%v29\n\t"
|
||||||
|
|
||||||
"vfchdb %%v30,%%v29,%%v28\n\t"
|
"vfchdb %%v30,%%v29,%%v28\n\t"
|
||||||
"vsel %%v30,%%v28,%%v29,%%v30\n\t"
|
"vsel %%v30,%%v28,%%v29,%%v30\n\t"
|
||||||
|
|
||||||
"vfchdb %%v31,%%v0,%%v30\n\t"
|
"vfchdb %%v31,%%v0,%%v30\n\t"
|
||||||
"vsel %%v0,%%v30,%%v0,%%v31\n\t"
|
"vsel %%v0,%%v30,%%v0,%%v31\n\t"
|
||||||
|
|
||||||
"agfi %%r1, 256\n\t"
|
"agfi %%r1, 256\n\t"
|
||||||
"brctg %%r0, 0b \n\t"
|
"brctg %[n], 0b\n\t"
|
||||||
|
|
||||||
"vrepg %%v16,%%v0,1\n\t"
|
"vrepg %%v16,%%v0,1\n\t"
|
||||||
"wfchdb %%v17,%%v16,%%v0\n\t"
|
"wfchdb %%v17,%%v16,%%v0\n\t"
|
||||||
"vsel %%v0,%%v0,%%v16,%%v17\n\t"
|
"vsel %%v0,%%v0,%%v16,%%v17\n\t"
|
||||||
"ldr %0,%%f0 "
|
"ldr %[amin],%%f0"
|
||||||
:"=f"(amin)
|
: [amin] "=f"(amin),[n] "+&r"(n)
|
||||||
:"r"(n),"ZR"((const FLOAT (*)[n])x)
|
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
|
||||||
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
|
||||||
);
|
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
|
||||||
|
|
||||||
return amin;
|
return amin;
|
||||||
}
|
}
|
||||||
|
|
@ -140,7 +122,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
BLASLONG j = 0;
|
BLASLONG j = 0;
|
||||||
FLOAT minf = 0.0;
|
FLOAT minf = 0.0;
|
||||||
|
|
||||||
if (n <= 0 || inc_x <= 0) return (minf);
|
if (n <= 0 || inc_x <= 0)
|
||||||
|
return (minf);
|
||||||
|
|
||||||
if (inc_x == 1) {
|
if (inc_x == 1) {
|
||||||
|
|
||||||
|
|
@ -150,9 +133,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
minf = damin_kernel_32(n1, x);
|
minf = damin_kernel_32(n1, x);
|
||||||
|
|
||||||
i = n1;
|
i = n1;
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
minf = ABS(x[0]);
|
minf = ABS(x[0]);
|
||||||
i++;
|
i++;
|
||||||
}
|
}
|
||||||
|
|
@ -191,7 +172,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
while (j < n) {
|
while (j < n) {
|
||||||
if (ABS(x[i]) < minf) {
|
if (ABS(x[i]) < minf) {
|
||||||
minf = ABS(x[i]);
|
minf = ABS(x[i]);
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/***************************************************************************
|
/***************************************************************************
|
||||||
Copyright (c) 2013-2018, The OpenBLAS Project
|
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
modification, are permitted provided that the following conditions are
|
modification, are permitted provided that the following conditions are
|
||||||
|
|
@ -28,34 +28,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
|
|
||||||
#if defined(DOUBLE)
|
|
||||||
#define ABS fabs
|
#define ABS fabs
|
||||||
#else
|
|
||||||
#define ABS fabsf
|
|
||||||
#endif
|
|
||||||
|
|
||||||
static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x)
|
static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) {
|
||||||
{
|
|
||||||
FLOAT asum;
|
FLOAT asum;
|
||||||
|
|
||||||
__asm__ (
|
__asm__("vzero %%v24\n\t"
|
||||||
"vzero %%v0 \n\t"
|
"vzero %%v25\n\t"
|
||||||
"vzero %%v1 \n\t"
|
"vzero %%v26\n\t"
|
||||||
"vzero %%v2 \n\t"
|
"vzero %%v27\n\t"
|
||||||
"vzero %%v3 \n\t"
|
"vzero %%v28\n\t"
|
||||||
"srlg %%r0,%1,5 \n\t"
|
"vzero %%v29\n\t"
|
||||||
|
"vzero %%v30\n\t"
|
||||||
|
"vzero %%v31\n\t"
|
||||||
|
"srlg %[n],%[n],5\n\t"
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 1, 1024(%%r1,%2) \n\t"
|
"pfd 1, 1024(%%r1,%[x])\n\t"
|
||||||
"vl %%v16, 0(%%r1,%2) \n\t"
|
"vl %%v16, 0(%%r1,%[x])\n\t"
|
||||||
"vl %%v17, 16(%%r1,%2) \n\t"
|
"vl %%v17, 16(%%r1,%[x])\n\t"
|
||||||
"vl %%v18, 32(%%r1,%2) \n\t"
|
"vl %%v18, 32(%%r1,%[x])\n\t"
|
||||||
"vl %%v19, 48(%%r1,%2) \n\t"
|
"vl %%v19, 48(%%r1,%[x])\n\t"
|
||||||
"vl %%v20, 64(%%r1,%2) \n\t"
|
"vl %%v20, 64(%%r1,%[x])\n\t"
|
||||||
"vl %%v21, 80(%%r1,%2) \n\t"
|
"vl %%v21, 80(%%r1,%[x])\n\t"
|
||||||
"vl %%v22, 96(%%r1,%2) \n\t"
|
"vl %%v22, 96(%%r1,%[x])\n\t"
|
||||||
"vl %%v23, 112(%%r1,%2) \n\t"
|
"vl %%v23, 112(%%r1,%[x])\n\t"
|
||||||
|
|
||||||
"vflpdb %%v16, %%v16\n\t"
|
"vflpdb %%v16, %%v16\n\t"
|
||||||
"vflpdb %%v17, %%v17\n\t"
|
"vflpdb %%v17, %%v17\n\t"
|
||||||
"vflpdb %%v18, %%v18\n\t"
|
"vflpdb %%v18, %%v18\n\t"
|
||||||
|
|
@ -64,25 +61,22 @@ static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x)
|
||||||
"vflpdb %%v21, %%v21\n\t"
|
"vflpdb %%v21, %%v21\n\t"
|
||||||
"vflpdb %%v22, %%v22\n\t"
|
"vflpdb %%v22, %%v22\n\t"
|
||||||
"vflpdb %%v23, %%v23\n\t"
|
"vflpdb %%v23, %%v23\n\t"
|
||||||
|
"vfadb %%v24,%%v24,%%v16\n\t"
|
||||||
"vfadb %%v0,%%v0,%%v16 \n\t"
|
"vfadb %%v25,%%v25,%%v17\n\t"
|
||||||
"vfadb %%v1,%%v1,%%v17 \n\t"
|
"vfadb %%v26,%%v26,%%v18\n\t"
|
||||||
"vfadb %%v2,%%v2,%%v18 \n\t"
|
"vfadb %%v27,%%v27,%%v19\n\t"
|
||||||
"vfadb %%v3,%%v3,%%v19 \n\t"
|
"vfadb %%v28,%%v28,%%v20\n\t"
|
||||||
"vfadb %%v0,%%v0,%%v20 \n\t"
|
"vfadb %%v29,%%v29,%%v21\n\t"
|
||||||
"vfadb %%v1,%%v1,%%v21 \n\t"
|
"vfadb %%v30,%%v30,%%v22\n\t"
|
||||||
"vfadb %%v2,%%v2,%%v22 \n\t"
|
"vfadb %%v31,%%v31,%%v23\n\t"
|
||||||
"vfadb %%v3,%%v3,%%v23 \n\t"
|
"vl %%v16, 128(%%r1,%[x])\n\t"
|
||||||
|
"vl %%v17, 144(%%r1,%[x])\n\t"
|
||||||
"vl %%v16, 128(%%r1,%2) \n\t"
|
"vl %%v18, 160(%%r1,%[x])\n\t"
|
||||||
"vl %%v17, 144(%%r1,%2) \n\t"
|
"vl %%v19, 176(%%r1,%[x])\n\t"
|
||||||
"vl %%v18, 160(%%r1,%2) \n\t"
|
"vl %%v20, 192(%%r1,%[x])\n\t"
|
||||||
"vl %%v19, 176(%%r1,%2) \n\t"
|
"vl %%v21, 208(%%r1,%[x])\n\t"
|
||||||
"vl %%v20, 192(%%r1,%2) \n\t"
|
"vl %%v22, 224(%%r1,%[x])\n\t"
|
||||||
"vl %%v21, 208(%%r1,%2) \n\t"
|
"vl %%v23, 240(%%r1,%[x])\n\t"
|
||||||
"vl %%v22, 224(%%r1,%2) \n\t"
|
|
||||||
"vl %%v23, 240(%%r1,%2) \n\t"
|
|
||||||
|
|
||||||
"vflpdb %%v16, %%v16\n\t"
|
"vflpdb %%v16, %%v16\n\t"
|
||||||
"vflpdb %%v17, %%v17\n\t"
|
"vflpdb %%v17, %%v17\n\t"
|
||||||
"vflpdb %%v18, %%v18\n\t"
|
"vflpdb %%v18, %%v18\n\t"
|
||||||
|
|
@ -91,28 +85,30 @@ static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x)
|
||||||
"vflpdb %%v21, %%v21\n\t"
|
"vflpdb %%v21, %%v21\n\t"
|
||||||
"vflpdb %%v22, %%v22\n\t"
|
"vflpdb %%v22, %%v22\n\t"
|
||||||
"vflpdb %%v23, %%v23\n\t"
|
"vflpdb %%v23, %%v23\n\t"
|
||||||
|
"vfadb %%v24,%%v24,%%v16\n\t"
|
||||||
"vfadb %%v0,%%v0,%%v16 \n\t"
|
"vfadb %%v25,%%v25,%%v17\n\t"
|
||||||
"vfadb %%v1,%%v1,%%v17 \n\t"
|
"vfadb %%v26,%%v26,%%v18\n\t"
|
||||||
"vfadb %%v2,%%v2,%%v18 \n\t"
|
"vfadb %%v27,%%v27,%%v19\n\t"
|
||||||
"vfadb %%v3,%%v3,%%v19 \n\t"
|
"vfadb %%v28,%%v28,%%v20\n\t"
|
||||||
"vfadb %%v0,%%v0,%%v20 \n\t"
|
"vfadb %%v29,%%v29,%%v21\n\t"
|
||||||
"vfadb %%v1,%%v1,%%v21 \n\t"
|
"vfadb %%v30,%%v30,%%v22\n\t"
|
||||||
"vfadb %%v2,%%v2,%%v22 \n\t"
|
"vfadb %%v31,%%v31,%%v23\n\t"
|
||||||
"vfadb %%v3,%%v3,%%v23 \n\t"
|
|
||||||
|
|
||||||
"agfi %%r1,256\n\t"
|
"agfi %%r1,256\n\t"
|
||||||
"brctg %%r0,0b \n\t"
|
"brctg %[n],0b\n\t"
|
||||||
"vfadb %%v0,%%v0,%%v1 \n\t"
|
"vfadb %%v24,%%v24,%%v25\n\t"
|
||||||
"vfadb %%v0,%%v0,%%v2 \n\t"
|
"vfadb %%v24,%%v24,%%v26\n\t"
|
||||||
"vfadb %%v0,%%v0,%%v3 \n\t"
|
"vfadb %%v24,%%v24,%%v27\n\t"
|
||||||
"vrepg %%v1,%%v0,1 \n\t"
|
"vfadb %%v24,%%v24,%%v28\n\t"
|
||||||
"adbr %%f0,%%f1 \n\t"
|
"vfadb %%v24,%%v24,%%v29\n\t"
|
||||||
"ldr %0,%%f0 "
|
"vfadb %%v24,%%v24,%%v30\n\t"
|
||||||
:"=f"(asum)
|
"vfadb %%v24,%%v24,%%v31\n\t"
|
||||||
:"r"(n),"ZR"((const FLOAT (*)[n])x)
|
"vrepg %%v25,%%v24,1\n\t"
|
||||||
:"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23"
|
"vfadb %%v24,%%v24,%%v25\n\t"
|
||||||
);
|
"vsteg %%v24,%[asum],0"
|
||||||
|
: [asum] "=Q"(asum),[n] "+&r"(n)
|
||||||
|
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
|
||||||
|
: "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
|
||||||
|
"v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
|
||||||
|
|
||||||
return asum;
|
return asum;
|
||||||
}
|
}
|
||||||
|
|
@ -123,7 +119,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
FLOAT sumf = 0.0;
|
FLOAT sumf = 0.0;
|
||||||
BLASLONG n1;
|
BLASLONG n1;
|
||||||
|
|
||||||
if (n <= 0 || inc_x <= 0) return sumf;
|
if (n <= 0 || inc_x <= 0)
|
||||||
|
return sumf;
|
||||||
|
|
||||||
if (inc_x == 1) {
|
if (inc_x == 1) {
|
||||||
|
|
||||||
|
|
@ -164,9 +161,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
j++;
|
j++;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
return sumf;
|
return sumf;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/***************************************************************************
|
/***************************************************************************
|
||||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
modification, are permitted provided that the following conditions are
|
modification, are permitted provided that the following conditions are
|
||||||
|
|
@ -27,107 +27,96 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) {
|
||||||
{
|
__asm__("vlrepg %%v0,%[alpha]\n\t"
|
||||||
__asm__ volatile(
|
"srlg %[n],%[n],5\n\t"
|
||||||
"vlrepg %%v0,%3 \n\t"
|
|
||||||
"srlg %%r0,%0,5 \n\t"
|
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 1, 1024(%%r1,%1) \n\t"
|
"pfd 1, 1024(%%r1,%[x])\n\t"
|
||||||
"pfd 2, 1024(%%r1,%2) \n\t"
|
"pfd 2, 1024(%%r1,%[y])\n\t"
|
||||||
|
"vl %%v16,0(%%r1,%[x])\n\t"
|
||||||
"vl %%v16,0(%%r1,%1) \n\t"
|
"vl %%v17,16(%%r1,%[x])\n\t"
|
||||||
"vl %%v17,16(%%r1,%1) \n\t"
|
"vl %%v18,32(%%r1,%[x])\n\t"
|
||||||
"vl %%v18,32(%%r1,%1) \n\t"
|
"vl %%v19,48(%%r1,%[x])\n\t"
|
||||||
"vl %%v19,48(%%r1,%1) \n\t"
|
"vl %%v20,0(%%r1,%[y])\n\t"
|
||||||
"vl %%v20,0(%%r1,%2) \n\t"
|
"vl %%v21,16(%%r1,%[y])\n\t"
|
||||||
"vl %%v21,16(%%r1,%2) \n\t"
|
"vl %%v22,32(%%r1,%[y])\n\t"
|
||||||
"vl %%v22,32(%%r1,%2) \n\t"
|
"vl %%v23,48(%%r1,%[y])\n\t"
|
||||||
"vl %%v23,48(%%r1,%2) \n\t"
|
"vl %%v24,64(%%r1,%[x])\n\t"
|
||||||
|
"vl %%v25,80(%%r1,%[x])\n\t"
|
||||||
|
"vl %%v26,96(%%r1,%[x])\n\t"
|
||||||
|
"vl %%v27,112(%%r1,%[x])\n\t"
|
||||||
|
"vl %%v28,64(%%r1,%[y])\n\t"
|
||||||
|
"vl %%v29,80(%%r1,%[y])\n\t"
|
||||||
|
"vl %%v30,96(%%r1,%[y])\n\t"
|
||||||
|
"vl %%v31,112(%%r1,%[y])\n\t"
|
||||||
"vfmadb %%v16,%%v0,%%v16,%%v20\n\t"
|
"vfmadb %%v16,%%v0,%%v16,%%v20\n\t"
|
||||||
"vfmadb %%v17,%%v0,%%v17,%%v21\n\t"
|
"vfmadb %%v17,%%v0,%%v17,%%v21\n\t"
|
||||||
"vfmadb %%v18,%%v0,%%v18,%%v22\n\t"
|
"vfmadb %%v18,%%v0,%%v18,%%v22\n\t"
|
||||||
"vfmadb %%v19,%%v0,%%v19,%%v23\n\t"
|
"vfmadb %%v19,%%v0,%%v19,%%v23\n\t"
|
||||||
|
"vfmadb %%v24,%%v0,%%v24,%%v28\n\t"
|
||||||
"vl %%v24,64(%%r1,%1) \n\t"
|
"vfmadb %%v25,%%v0,%%v25,%%v29\n\t"
|
||||||
"vl %%v25,80(%%r1,%1) \n\t"
|
"vfmadb %%v26,%%v0,%%v26,%%v30\n\t"
|
||||||
"vl %%v26,96(%%r1,%1) \n\t"
|
"vfmadb %%v27,%%v0,%%v27,%%v31\n\t"
|
||||||
"vl %%v27,112(%%r1,%1) \n\t"
|
"vst %%v16,0(%%r1,%[y])\n\t"
|
||||||
"vl %%v28,64(%%r1,%2) \n\t"
|
"vst %%v17,16(%%r1,%[y])\n\t"
|
||||||
"vl %%v29,80(%%r1,%2) \n\t"
|
"vst %%v18,32(%%r1,%[y])\n\t"
|
||||||
"vl %%v30,96(%%r1,%2) \n\t"
|
"vst %%v19,48(%%r1,%[y])\n\t"
|
||||||
"vl %%v31,112(%%r1,%2) \n\t"
|
"vst %%v24,64(%%r1,%[y])\n\t"
|
||||||
|
"vst %%v25,80(%%r1,%[y])\n\t"
|
||||||
"vfmadb %%v20,%%v0,%%v24,%%v28 \n\t"
|
"vst %%v26,96(%%r1,%[y])\n\t"
|
||||||
"vfmadb %%v21,%%v0,%%v25,%%v29 \n\t"
|
"vst %%v27,112(%%r1,%[y])\n\t"
|
||||||
"vfmadb %%v22,%%v0,%%v26,%%v30 \n\t"
|
"vl %%v16,128(%%r1,%[x])\n\t"
|
||||||
"vfmadb %%v23,%%v0,%%v27,%%v31 \n\t"
|
"vl %%v17,144(%%r1,%[x])\n\t"
|
||||||
|
"vl %%v18,160(%%r1,%[x])\n\t"
|
||||||
"vst %%v16,0(%%r1,%2) \n\t"
|
"vl %%v19,176(%%r1,%[x])\n\t"
|
||||||
"vst %%v17,16(%%r1,%2) \n\t"
|
"vl %%v20,128(%%r1,%[y])\n\t"
|
||||||
"vst %%v18,32(%%r1,%2) \n\t"
|
"vl %%v21,144(%%r1,%[y])\n\t"
|
||||||
"vst %%v19,48(%%r1,%2) \n\t"
|
"vl %%v22,160(%%r1,%[y])\n\t"
|
||||||
"vst %%v20,64(%%r1,%2) \n\t"
|
"vl %%v23,176(%%r1,%[y])\n\t"
|
||||||
"vst %%v21,80(%%r1,%2) \n\t"
|
"vl %%v24,192(%%r1,%[x])\n\t"
|
||||||
"vst %%v22,96(%%r1,%2) \n\t"
|
"vl %%v25,208(%%r1,%[x])\n\t"
|
||||||
"vst %%v23,112(%%r1,%2) \n\t"
|
"vl %%v26,224(%%r1,%[x])\n\t"
|
||||||
|
"vl %%v27,240(%%r1,%[x])\n\t"
|
||||||
"vl %%v16,128(%%r1,%1) \n\t"
|
"vl %%v28,192(%%r1,%[y])\n\t"
|
||||||
"vl %%v17,144(%%r1,%1) \n\t"
|
"vl %%v29,208(%%r1,%[y])\n\t"
|
||||||
"vl %%v18,160(%%r1,%1) \n\t"
|
"vl %%v30,224(%%r1,%[y])\n\t"
|
||||||
"vl %%v19,176(%%r1,%1) \n\t"
|
"vl %%v31,240(%%r1,%[y])\n\t"
|
||||||
"vl %%v20,128(%%r1,%2) \n\t"
|
|
||||||
"vl %%v21,144(%%r1,%2) \n\t"
|
|
||||||
"vl %%v22,160(%%r1,%2) \n\t"
|
|
||||||
"vl %%v23,176(%%r1,%2) \n\t"
|
|
||||||
|
|
||||||
"vfmadb %%v16,%%v0,%%v16,%%v20\n\t"
|
"vfmadb %%v16,%%v0,%%v16,%%v20\n\t"
|
||||||
"vfmadb %%v17,%%v0,%%v17,%%v21\n\t"
|
"vfmadb %%v17,%%v0,%%v17,%%v21\n\t"
|
||||||
"vfmadb %%v18,%%v0,%%v18,%%v22\n\t"
|
"vfmadb %%v18,%%v0,%%v18,%%v22\n\t"
|
||||||
"vfmadb %%v19,%%v0,%%v19,%%v23\n\t"
|
"vfmadb %%v19,%%v0,%%v19,%%v23\n\t"
|
||||||
|
"vfmadb %%v24,%%v0,%%v24,%%v28\n\t"
|
||||||
"vl %%v24,192(%%r1,%1) \n\t"
|
"vfmadb %%v25,%%v0,%%v25,%%v29\n\t"
|
||||||
"vl %%v25,208(%%r1,%1) \n\t"
|
"vfmadb %%v26,%%v0,%%v26,%%v30\n\t"
|
||||||
"vl %%v26,224(%%r1,%1) \n\t"
|
"vfmadb %%v27,%%v0,%%v27,%%v31\n\t"
|
||||||
"vl %%v27,240(%%r1,%1) \n\t"
|
"vst %%v16,128(%%r1,%[y])\n\t"
|
||||||
"vl %%v28,192(%%r1,%2) \n\t"
|
"vst %%v17,144(%%r1,%[y])\n\t"
|
||||||
"vl %%v29,208(%%r1,%2) \n\t"
|
"vst %%v18,160(%%r1,%[y])\n\t"
|
||||||
"vl %%v30,224(%%r1,%2) \n\t"
|
"vst %%v19,176(%%r1,%[y])\n\t"
|
||||||
"vl %%v31,240(%%r1,%2) \n\t"
|
"vst %%v24,192(%%r1,%[y])\n\t"
|
||||||
|
"vst %%v25,208(%%r1,%[y])\n\t"
|
||||||
"vfmadb %%v20,%%v0,%%v24,%%v28 \n\t"
|
"vst %%v26,224(%%r1,%[y])\n\t"
|
||||||
"vfmadb %%v21,%%v0,%%v25,%%v29 \n\t"
|
"vst %%v27,240(%%r1,%[y])\n\t"
|
||||||
"vfmadb %%v22,%%v0,%%v26,%%v30 \n\t"
|
|
||||||
"vfmadb %%v23,%%v0,%%v27,%%v31 \n\t"
|
|
||||||
|
|
||||||
"vst %%v16,128(%%r1,%2) \n\t"
|
|
||||||
"vst %%v17,144(%%r1,%2) \n\t"
|
|
||||||
"vst %%v18,160(%%r1,%2) \n\t"
|
|
||||||
"vst %%v19,176(%%r1,%2) \n\t"
|
|
||||||
"vst %%v20,192(%%r1,%2) \n\t"
|
|
||||||
"vst %%v21,208(%%r1,%2) \n\t"
|
|
||||||
"vst %%v22,224(%%r1,%2) \n\t"
|
|
||||||
"vst %%v23,240(%%r1,%2) \n\t"
|
|
||||||
|
|
||||||
"agfi %%r1,256\n\t"
|
"agfi %%r1,256\n\t"
|
||||||
"brctg %%r0,0b "
|
"brctg %[n],0b"
|
||||||
:
|
: "+m"(*(struct { FLOAT x[n]; } *) y),[n] "+&r"(n)
|
||||||
:"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y),"m"(*alpha)
|
: [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),
|
||||||
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
[alpha] "Q"(*alpha)
|
||||||
);
|
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
|
||||||
|
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
|
||||||
}
|
}
|
||||||
|
|
||||||
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
|
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x,
|
||||||
{
|
BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
|
||||||
|
BLASLONG dummy2) {
|
||||||
BLASLONG i = 0;
|
BLASLONG i = 0;
|
||||||
BLASLONG ix = 0, iy = 0;
|
BLASLONG ix = 0, iy = 0;
|
||||||
|
|
||||||
if ( n <= 0 ) return 0 ;
|
if (n <= 0)
|
||||||
|
return 0;
|
||||||
|
|
||||||
if ( (inc_x == 1) && (inc_y == 1) )
|
if ((inc_x == 1) && (inc_y == 1)) {
|
||||||
{
|
|
||||||
|
|
||||||
BLASLONG n1 = n & -32;
|
BLASLONG n1 = n & -32;
|
||||||
|
|
||||||
|
|
@ -135,8 +124,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
|
||||||
daxpy_kernel_32(n1, x, y, &da);
|
daxpy_kernel_32(n1, x, y, &da);
|
||||||
|
|
||||||
i = n1;
|
i = n1;
|
||||||
while(i < n)
|
while (i < n) {
|
||||||
{
|
|
||||||
|
|
||||||
y[i] += da * x[i];
|
y[i] += da * x[i];
|
||||||
i++;
|
i++;
|
||||||
|
|
@ -144,13 +132,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
|
||||||
}
|
}
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
BLASLONG n1 = n & -4;
|
BLASLONG n1 = n & -4;
|
||||||
|
|
||||||
while(i < n1)
|
while (i < n1) {
|
||||||
{
|
|
||||||
|
|
||||||
FLOAT m1 = da * x[ix];
|
FLOAT m1 = da * x[ix];
|
||||||
FLOAT m2 = da * x[ix + inc_x];
|
FLOAT m2 = da * x[ix + inc_x];
|
||||||
|
|
@ -168,8 +154,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
while(i < n)
|
while (i < n) {
|
||||||
{
|
|
||||||
|
|
||||||
y[iy] += da * x[ix];
|
y[iy] += da * x[ix];
|
||||||
ix += inc_x;
|
ix += inc_x;
|
||||||
|
|
@ -180,5 +165,3 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/***************************************************************************
|
/***************************************************************************
|
||||||
Copyright (c) 2013-2018, The OpenBLAS Project
|
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
modification, are permitted provided that the following conditions are
|
modification, are permitted provided that the following conditions are
|
||||||
|
|
@ -27,30 +27,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
static void dcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
|
static void dcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) {
|
||||||
{
|
__asm__("srlg %[n],%[n],5\n\t"
|
||||||
__asm__ volatile (
|
|
||||||
"lgr %%r1,%1 \n\t"
|
|
||||||
"lgr %%r2,%2 \n\t"
|
|
||||||
"srlg %%r0,%0,5 \n\t"
|
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 1, 1024(%%r1) \n\t"
|
"pfd 1, 1024(%[x])\n\t"
|
||||||
"pfd 2, 1024(%%r2) \n\t"
|
"pfd 2, 1024(%[y])\n\t"
|
||||||
"mvc 0(256,%%r2),0(%%r1) \n\t"
|
"mvc 0(256,%[y]),0(%[x])\n\t"
|
||||||
"agfi %%r1,256 \n\t"
|
"la %[x],256(%[x])\n\t"
|
||||||
"agfi %%r2,256 \n\t"
|
"la %[y],256(%[y])\n\t"
|
||||||
"brctg %%r0,0b "
|
"brctg %[n],0b"
|
||||||
:
|
: "=m"(*(struct { FLOAT x[n]; } *) y),[x] "+&a"(x),[y] "+&a"(y),[n] "+&r"(n)
|
||||||
:"r"(n),"a"((const FLOAT (*)[n])x),"a"((FLOAT (*)[n])y)
|
: "m"(*(const struct { FLOAT x[n]; } *) x)
|
||||||
:"memory","cc","r0","r1","r2"
|
: "cc");
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
|
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
|
||||||
BLASLONG i = 0;
|
BLASLONG i = 0;
|
||||||
BLASLONG ix = 0, iy = 0;
|
BLASLONG ix = 0, iy = 0;
|
||||||
|
|
||||||
if (n <= 0) return 0;
|
if (n <= 0)
|
||||||
|
return 0;
|
||||||
|
|
||||||
if ((inc_x == 1) && (inc_y == 1)) {
|
if ((inc_x == 1) && (inc_y == 1)) {
|
||||||
|
|
||||||
|
|
@ -66,7 +62,6 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
|
|
||||||
while (i < n) {
|
while (i < n) {
|
||||||
|
|
@ -81,5 +76,4 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
|
||||||
}
|
}
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/***************************************************************************
|
/***************************************************************************
|
||||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
modification, are permitted provided that the following conditions are
|
modification, are permitted provided that the following conditions are
|
||||||
|
|
@ -27,68 +27,78 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
static FLOAT ddot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
|
static FLOAT ddot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) {
|
||||||
{
|
|
||||||
FLOAT dot;
|
FLOAT dot;
|
||||||
|
|
||||||
__asm__ volatile (
|
__asm__("vzero %%v0\n\t"
|
||||||
"vzero %%v0 \n\t"
|
"vzero %%v1\n\t"
|
||||||
"srlg %%r0,%1,4 \n\t"
|
"vzero %%v2\n\t"
|
||||||
|
"vzero %%v3\n\t"
|
||||||
|
"vzero %%v4\n\t"
|
||||||
|
"vzero %%v5\n\t"
|
||||||
|
"vzero %%v6\n\t"
|
||||||
|
"vzero %%v7\n\t"
|
||||||
|
"srlg %[n],%[n],4\n\t"
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 1,1024(%%r1,%2) \n\t"
|
"pfd 1,1024(%%r1,%[x])\n\t"
|
||||||
"pfd 1,1024(%%r1,%3) \n\t"
|
"pfd 1,1024(%%r1,%[y])\n\t"
|
||||||
|
"vl %%v16,0(%%r1,%[x])\n\t"
|
||||||
"vl %%v16,0(%%r1,%2) \n\t"
|
"vl %%v17,16(%%r1,%[x])\n\t"
|
||||||
"vl %%v17,16(%%r1,%2) \n\t"
|
"vl %%v18,32(%%r1,%[x])\n\t"
|
||||||
"vl %%v18,32(%%r1,%2) \n\t"
|
"vl %%v19,48(%%r1,%[x])\n\t"
|
||||||
"vl %%v19,48(%%r1,%2) \n\t"
|
"vl %%v20,64(%%r1,%[x])\n\t"
|
||||||
"vl %%v20,64(%%r1,%2) \n\t"
|
"vl %%v21,80(%%r1,%[x])\n\t"
|
||||||
"vl %%v21,80(%%r1,%2) \n\t"
|
"vl %%v22,96(%%r1,%[x])\n\t"
|
||||||
"vl %%v22,96(%%r1,%2) \n\t"
|
"vl %%v23,112(%%r1,%[x])\n\t"
|
||||||
"vl %%v23,112(%%r1,%2) \n\t"
|
"vl %%v24,0(%%r1,%[y])\n\t"
|
||||||
|
"vl %%v25,16(%%r1,%[y])\n\t"
|
||||||
"vl %%v24,0(%%r1,%3) \n\t"
|
"vl %%v26,32(%%r1,%[y])\n\t"
|
||||||
|
"vl %%v27,48(%%r1,%[y])\n\t"
|
||||||
|
"vl %%v28,64(%%r1,%[y])\n\t"
|
||||||
|
"vl %%v29,80(%%r1,%[y])\n\t"
|
||||||
|
"vl %%v30,96(%%r1,%[y])\n\t"
|
||||||
|
"vl %%v31,112(%%r1,%[y])\n\t"
|
||||||
"vfmadb %%v0,%%v16,%%v24,%%v0\n\t"
|
"vfmadb %%v0,%%v16,%%v24,%%v0\n\t"
|
||||||
"vl %%v25,16(%%r1,%3) \n\t"
|
"vfmadb %%v1,%%v17,%%v25,%%v1\n\t"
|
||||||
"vfmadb %%v0,%%v17,%%v25,%%v0 \n\t"
|
"vfmadb %%v2,%%v18,%%v26,%%v2\n\t"
|
||||||
"vl %%v26,32(%%r1,%3) \n\t"
|
"vfmadb %%v3,%%v19,%%v27,%%v3\n\t"
|
||||||
"vfmadb %%v0,%%v18,%%v26,%%v0 \n\t"
|
"vfmadb %%v4,%%v20,%%v28,%%v4\n\t"
|
||||||
"vl %%v27,48(%%r1,%3) \n\t"
|
"vfmadb %%v5,%%v21,%%v29,%%v5\n\t"
|
||||||
"vfmadb %%v0,%%v19,%%v27,%%v0 \n\t"
|
"vfmadb %%v6,%%v22,%%v30,%%v6\n\t"
|
||||||
"vl %%v28,64(%%r1,%3) \n\t"
|
"vfmadb %%v7,%%v23,%%v31,%%v7\n\t"
|
||||||
"vfmadb %%v0,%%v20,%%v28,%%v0 \n\t"
|
|
||||||
"vl %%v29,80(%%r1,%3) \n\t"
|
|
||||||
"vfmadb %%v0,%%v21,%%v29,%%v0 \n\t"
|
|
||||||
"vl %%v30,96(%%r1,%3) \n\t"
|
|
||||||
"vfmadb %%v0,%%v22,%%v30,%%v0 \n\t"
|
|
||||||
"vl %%v31,112(%%r1,%3) \n\t"
|
|
||||||
"vfmadb %%v0,%%v23,%%v31,%%v0 \n\t"
|
|
||||||
|
|
||||||
"agfi %%r1,128\n\t"
|
"agfi %%r1,128\n\t"
|
||||||
"brctg %%r0,0b \n\t"
|
"brctg %[n],0b\n\t"
|
||||||
|
"vfadb %%v0,%%v0,%%v1\n\t"
|
||||||
|
"vfadb %%v0,%%v0,%%v2\n\t"
|
||||||
|
"vfadb %%v0,%%v0,%%v3\n\t"
|
||||||
|
"vfadb %%v0,%%v0,%%v4\n\t"
|
||||||
|
"vfadb %%v0,%%v0,%%v5\n\t"
|
||||||
|
"vfadb %%v0,%%v0,%%v6\n\t"
|
||||||
|
"vfadb %%v0,%%v0,%%v7\n\t"
|
||||||
"vrepg %%v1,%%v0,1\n\t"
|
"vrepg %%v1,%%v0,1\n\t"
|
||||||
"adbr %%f0,%%f1\n\t"
|
"adbr %%f0,%%f1\n\t"
|
||||||
"ldr %0,%%f0 "
|
"ldr %[dot],%%f0"
|
||||||
:"=f"(dot)
|
: [dot] "=f"(dot),[n] "+&r"(n)
|
||||||
:"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((const FLOAT (*)[n])y)
|
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),
|
||||||
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
"m"(*(const struct { FLOAT x[n]; } *) y),[y] "a"(y)
|
||||||
);
|
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
|
||||||
|
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
|
||||||
|
"v27", "v28", "v29", "v30", "v31");
|
||||||
|
|
||||||
return dot;
|
return dot;
|
||||||
}
|
}
|
||||||
|
|
||||||
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
|
||||||
{
|
|
||||||
BLASLONG i = 0;
|
BLASLONG i = 0;
|
||||||
BLASLONG ix = 0, iy = 0;
|
BLASLONG ix = 0, iy = 0;
|
||||||
|
|
||||||
FLOAT dot = 0.0;
|
FLOAT dot = 0.0;
|
||||||
|
|
||||||
if ( n <= 0 ) return(dot);
|
if (n <= 0)
|
||||||
|
return (dot);
|
||||||
|
|
||||||
if ( (inc_x == 1) && (inc_y == 1) )
|
if ((inc_x == 1) && (inc_y == 1)) {
|
||||||
{
|
|
||||||
|
|
||||||
BLASLONG n1 = n & -16;
|
BLASLONG n1 = n & -16;
|
||||||
|
|
||||||
|
|
@ -96,8 +106,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
||||||
dot = ddot_kernel_16(n1, x, y);
|
dot = ddot_kernel_16(n1, x, y);
|
||||||
|
|
||||||
i = n1;
|
i = n1;
|
||||||
while(i < n)
|
while (i < n) {
|
||||||
{
|
|
||||||
|
|
||||||
dot += y[i] * x[i];
|
dot += y[i] * x[i];
|
||||||
i++;
|
i++;
|
||||||
|
|
@ -105,7 +114,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
||||||
}
|
}
|
||||||
return (dot);
|
return (dot);
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
FLOAT temp1 = 0.0;
|
FLOAT temp1 = 0.0;
|
||||||
|
|
@ -113,8 +121,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
||||||
|
|
||||||
BLASLONG n1 = n & -4;
|
BLASLONG n1 = n & -4;
|
||||||
|
|
||||||
while(i < n1)
|
while (i < n1) {
|
||||||
{
|
|
||||||
|
|
||||||
FLOAT m1 = y[iy] * x[ix];
|
FLOAT m1 = y[iy] * x[ix];
|
||||||
FLOAT m2 = y[iy + inc_y] * x[ix + inc_x];
|
FLOAT m2 = y[iy + inc_y] * x[ix + inc_x];
|
||||||
|
|
@ -132,8 +139,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
while(i < n)
|
while (i < n) {
|
||||||
{
|
|
||||||
|
|
||||||
temp1 += y[iy] * x[ix];
|
temp1 += y[iy] * x[ix];
|
||||||
ix += inc_x;
|
ix += inc_x;
|
||||||
|
|
@ -145,5 +151,3 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
||||||
return (dot);
|
return (dot);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/***************************************************************************
|
/***************************************************************************
|
||||||
Copyright (c) 2017, The OpenBLAS Project
|
Copyright (c) 2019, The OpenBLAS Project
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
modification, are permitted provided that the following conditions are
|
modification, are permitted provided that the following conditions are
|
||||||
|
|
@ -29,387 +29,349 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#define NBMAX 2048
|
#define NBMAX 2048
|
||||||
|
|
||||||
static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
|
static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y,
|
||||||
{
|
FLOAT *alpha) {
|
||||||
__asm__ volatile (
|
register FLOAT *ap0 = ap[0];
|
||||||
"vlrepg %%v0,0(%5) \n\t"
|
register FLOAT *ap1 = ap[1];
|
||||||
"vlrepg %%v1,8(%5) \n\t"
|
register FLOAT *ap2 = ap[2];
|
||||||
"vlrepg %%v2,16(%5) \n\t"
|
register FLOAT *ap3 = ap[3];
|
||||||
"vlrepg %%v3,24(%5) \n\t"
|
|
||||||
"vlrepg %%v4,%7 \n\t"
|
__asm__("vlrepg %%v0,0(%[x])\n\t"
|
||||||
|
"vlrepg %%v1,8(%[x])\n\t"
|
||||||
|
"vlrepg %%v2,16(%[x])\n\t"
|
||||||
|
"vlrepg %%v3,24(%[x])\n\t"
|
||||||
|
"vlrepg %%v4,%[alpha]\n\t"
|
||||||
"vfmdb %%v0,%%v0,%%v4\n\t"
|
"vfmdb %%v0,%%v0,%%v4\n\t"
|
||||||
"vfmdb %%v1,%%v1,%%v4\n\t"
|
"vfmdb %%v1,%%v1,%%v4\n\t"
|
||||||
"vfmdb %%v2,%%v2,%%v4\n\t"
|
"vfmdb %%v2,%%v2,%%v4\n\t"
|
||||||
"vfmdb %%v3,%%v3,%%v4\n\t"
|
"vfmdb %%v3,%%v3,%%v4\n\t"
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
|
|
||||||
"lghi %%r0,-16\n\t"
|
"lghi %%r0,-16\n\t"
|
||||||
"ngr %%r0,%0 \n\t"
|
"ngr %%r0,%[n]\n\t"
|
||||||
"ltgr %%r0,%%r0\n\t"
|
"ltgr %%r0,%%r0\n\t"
|
||||||
"jz 1f\n\t"
|
"jz 1f\n\t"
|
||||||
|
|
||||||
"srlg %%r0,%%r0,4\n\t"
|
"srlg %%r0,%%r0,4\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 1,1024(%%r1,%1) \n\t"
|
"pfd 1,1024(%%r1,%[ap0])\n\t"
|
||||||
"pfd 1,1024(%%r1,%2) \n\t"
|
"pfd 1,1024(%%r1,%[ap1])\n\t"
|
||||||
"pfd 1,1024(%%r1,%3) \n\t"
|
"pfd 1,1024(%%r1,%[ap2])\n\t"
|
||||||
"pfd 1,1024(%%r1,%4) \n\t"
|
"pfd 1,1024(%%r1,%[ap3])\n\t"
|
||||||
"pfd 2,1024(%%r1,%6) \n\t"
|
"pfd 2,1024(%%r1,%[y])\n\t"
|
||||||
|
"vl %%v16,0(%%r1,%[ap0])\n\t"
|
||||||
"vl %%v16,0(%%r1,%1) \n\t"
|
"vl %%v17,0(%%r1,%[ap1])\n\t"
|
||||||
"vl %%v17,0(%%r1,%2) \n\t"
|
"vl %%v18,0(%%r1,%[ap2])\n\t"
|
||||||
"vl %%v18,0(%%r1,%3) \n\t"
|
"vl %%v19,0(%%r1,%[ap3])\n\t"
|
||||||
"vl %%v19,0(%%r1,%4) \n\t"
|
"vl %%v20,16(%%r1,%[ap0])\n\t"
|
||||||
"vl %%v20,16(%%r1,%1) \n\t"
|
"vl %%v21,16(%%r1,%[ap1])\n\t"
|
||||||
"vl %%v21,16(%%r1,%2) \n\t"
|
"vl %%v22,16(%%r1,%[ap2])\n\t"
|
||||||
"vl %%v22,16(%%r1,%3) \n\t"
|
"vl %%v23,16(%%r1,%[ap3])\n\t"
|
||||||
"vl %%v23,16(%%r1,%4) \n\t"
|
"vl %%v24,32(%%r1,%[ap0])\n\t"
|
||||||
"vl %%v24,32(%%r1,%1) \n\t"
|
"vl %%v25,32(%%r1,%[ap1])\n\t"
|
||||||
"vl %%v25,32(%%r1,%2) \n\t"
|
"vl %%v26,32(%%r1,%[ap2])\n\t"
|
||||||
"vl %%v26,32(%%r1,%3) \n\t"
|
"vl %%v27,32(%%r1,%[ap3])\n\t"
|
||||||
"vl %%v27,32(%%r1,%4) \n\t"
|
"vl %%v28,48(%%r1,%[ap0])\n\t"
|
||||||
"vl %%v28,48(%%r1,%1) \n\t"
|
"vl %%v29,48(%%r1,%[ap1])\n\t"
|
||||||
"vl %%v29,48(%%r1,%2) \n\t"
|
"vl %%v30,48(%%r1,%[ap2])\n\t"
|
||||||
"vl %%v30,48(%%r1,%3) \n\t"
|
"vl %%v31,48(%%r1,%[ap3])\n\t"
|
||||||
"vl %%v31,48(%%r1,%4) \n\t"
|
"vl %%v4,0(%%r1,%[y])\n\t"
|
||||||
|
"vl %%v5,16(%%r1,%[y])\n\t"
|
||||||
"vl %%v4,0(%%r1,%6) \n\t"
|
"vl %%v6,32(%%r1,%[y])\n\t"
|
||||||
|
"vl %%v7,48(%%r1,%[y])\n\t"
|
||||||
"vfmadb %%v4,%%v16,%%v0,%%v4\n\t"
|
"vfmadb %%v4,%%v16,%%v0,%%v4\n\t"
|
||||||
|
"vfmadb %%v5,%%v20,%%v0,%%v5\n\t"
|
||||||
|
"vfmadb %%v6,%%v24,%%v0,%%v6\n\t"
|
||||||
|
"vfmadb %%v7,%%v28,%%v0,%%v7\n\t"
|
||||||
"vfmadb %%v4,%%v17,%%v1,%%v4\n\t"
|
"vfmadb %%v4,%%v17,%%v1,%%v4\n\t"
|
||||||
|
"vfmadb %%v5,%%v21,%%v1,%%v5\n\t"
|
||||||
|
"vfmadb %%v6,%%v25,%%v1,%%v6\n\t"
|
||||||
|
"vfmadb %%v7,%%v29,%%v1,%%v7\n\t"
|
||||||
"vfmadb %%v4,%%v18,%%v2,%%v4\n\t"
|
"vfmadb %%v4,%%v18,%%v2,%%v4\n\t"
|
||||||
|
"vfmadb %%v5,%%v22,%%v2,%%v5\n\t"
|
||||||
|
"vfmadb %%v6,%%v26,%%v2,%%v6\n\t"
|
||||||
|
"vfmadb %%v7,%%v30,%%v2,%%v7\n\t"
|
||||||
"vfmadb %%v4,%%v19,%%v3,%%v4\n\t"
|
"vfmadb %%v4,%%v19,%%v3,%%v4\n\t"
|
||||||
"vst %%v4,0(%%r1,%6) \n\t"
|
"vfmadb %%v5,%%v23,%%v3,%%v5\n\t"
|
||||||
|
"vfmadb %%v6,%%v27,%%v3,%%v6\n\t"
|
||||||
"vl %%v4,16(%%r1,%6) \n\t"
|
"vfmadb %%v7,%%v31,%%v3,%%v7\n\t"
|
||||||
"vfmadb %%v4,%%v20,%%v0,%%v4 \n\t"
|
"vst %%v4,0(%%r1,%[y])\n\t"
|
||||||
"vfmadb %%v4,%%v21,%%v1,%%v4 \n\t"
|
"vst %%v5,16(%%r1,%[y])\n\t"
|
||||||
"vfmadb %%v4,%%v22,%%v2,%%v4 \n\t"
|
"vst %%v6,32(%%r1,%[y])\n\t"
|
||||||
"vfmadb %%v4,%%v23,%%v3,%%v4 \n\t"
|
"vst %%v7,48(%%r1,%[y])\n\t"
|
||||||
"vst %%v4,16(%%r1,%6) \n\t"
|
"vl %%v16,64(%%r1,%[ap0])\n\t"
|
||||||
|
"vl %%v17,64(%%r1,%[ap1])\n\t"
|
||||||
"vl %%v4,32(%%r1,%6) \n\t"
|
"vl %%v18,64(%%r1,%[ap2])\n\t"
|
||||||
"vfmadb %%v4,%%v24,%%v0,%%v4 \n\t"
|
"vl %%v19,64(%%r1,%[ap3])\n\t"
|
||||||
"vfmadb %%v4,%%v25,%%v1,%%v4 \n\t"
|
"vl %%v20,80(%%r1,%[ap0])\n\t"
|
||||||
"vfmadb %%v4,%%v26,%%v2,%%v4 \n\t"
|
"vl %%v21,80(%%r1,%[ap1])\n\t"
|
||||||
"vfmadb %%v4,%%v27,%%v3,%%v4 \n\t"
|
"vl %%v22,80(%%r1,%[ap2])\n\t"
|
||||||
"vst %%v4,32(%%r1,%6) \n\t"
|
"vl %%v23,80(%%r1,%[ap3])\n\t"
|
||||||
|
"vl %%v24,96(%%r1,%[ap0])\n\t"
|
||||||
"vl %%v4,48(%%r1,%6) \n\t"
|
"vl %%v25,96(%%r1,%[ap1])\n\t"
|
||||||
"vfmadb %%v4,%%v28,%%v0,%%v4 \n\t"
|
"vl %%v26,96(%%r1,%[ap2])\n\t"
|
||||||
"vfmadb %%v4,%%v29,%%v1,%%v4 \n\t"
|
"vl %%v27,96(%%r1,%[ap3])\n\t"
|
||||||
"vfmadb %%v4,%%v30,%%v2,%%v4 \n\t"
|
"vl %%v28,112(%%r1,%[ap0])\n\t"
|
||||||
"vfmadb %%v4,%%v31,%%v3,%%v4 \n\t"
|
"vl %%v29,112(%%r1,%[ap1])\n\t"
|
||||||
"vst %%v4,48(%%r1,%6) \n\t"
|
"vl %%v30,112(%%r1,%[ap2])\n\t"
|
||||||
|
"vl %%v31,112(%%r1,%[ap3])\n\t"
|
||||||
"vl %%v16,64(%%r1,%1) \n\t"
|
"vl %%v4,64(%%r1,%[y])\n\t"
|
||||||
"vl %%v17,64(%%r1,%2) \n\t"
|
"vl %%v5,80(%%r1,%[y])\n\t"
|
||||||
"vl %%v18,64(%%r1,%3) \n\t"
|
"vl %%v6,96(%%r1,%[y])\n\t"
|
||||||
"vl %%v19,64(%%r1,%4) \n\t"
|
"vl %%v7,112(%%r1,%[y])\n\t"
|
||||||
"vl %%v20,80(%%r1,%1) \n\t"
|
|
||||||
"vl %%v21,80(%%r1,%2) \n\t"
|
|
||||||
"vl %%v22,80(%%r1,%3) \n\t"
|
|
||||||
"vl %%v23,80(%%r1,%4) \n\t"
|
|
||||||
"vl %%v24,96(%%r1,%1) \n\t"
|
|
||||||
"vl %%v25,96(%%r1,%2) \n\t"
|
|
||||||
"vl %%v26,96(%%r1,%3) \n\t"
|
|
||||||
"vl %%v27,96(%%r1,%4) \n\t"
|
|
||||||
"vl %%v28,112(%%r1,%1) \n\t"
|
|
||||||
"vl %%v29,112(%%r1,%2) \n\t"
|
|
||||||
"vl %%v30,112(%%r1,%3) \n\t"
|
|
||||||
"vl %%v31,112(%%r1,%4) \n\t"
|
|
||||||
|
|
||||||
"vl %%v4,64(%%r1,%6) \n\t"
|
|
||||||
"vfmadb %%v4,%%v16,%%v0,%%v4\n\t"
|
"vfmadb %%v4,%%v16,%%v0,%%v4\n\t"
|
||||||
|
"vfmadb %%v5,%%v20,%%v0,%%v5\n\t"
|
||||||
|
"vfmadb %%v6,%%v24,%%v0,%%v6\n\t"
|
||||||
|
"vfmadb %%v7,%%v28,%%v0,%%v7\n\t"
|
||||||
"vfmadb %%v4,%%v17,%%v1,%%v4\n\t"
|
"vfmadb %%v4,%%v17,%%v1,%%v4\n\t"
|
||||||
|
"vfmadb %%v5,%%v21,%%v1,%%v5\n\t"
|
||||||
|
"vfmadb %%v6,%%v25,%%v1,%%v6\n\t"
|
||||||
|
"vfmadb %%v7,%%v29,%%v1,%%v7\n\t"
|
||||||
"vfmadb %%v4,%%v18,%%v2,%%v4\n\t"
|
"vfmadb %%v4,%%v18,%%v2,%%v4\n\t"
|
||||||
|
"vfmadb %%v5,%%v22,%%v2,%%v5\n\t"
|
||||||
|
"vfmadb %%v6,%%v26,%%v2,%%v6\n\t"
|
||||||
|
"vfmadb %%v7,%%v30,%%v2,%%v7\n\t"
|
||||||
"vfmadb %%v4,%%v19,%%v3,%%v4\n\t"
|
"vfmadb %%v4,%%v19,%%v3,%%v4\n\t"
|
||||||
"vst %%v4,64(%%r1,%6) \n\t"
|
"vfmadb %%v5,%%v23,%%v3,%%v5\n\t"
|
||||||
|
"vfmadb %%v6,%%v27,%%v3,%%v6\n\t"
|
||||||
"vl %%v4,80(%%r1,%6) \n\t"
|
"vfmadb %%v7,%%v31,%%v3,%%v7\n\t"
|
||||||
"vfmadb %%v4,%%v20,%%v0,%%v4 \n\t"
|
"vst %%v4,64(%%r1,%[y])\n\t"
|
||||||
"vfmadb %%v4,%%v21,%%v1,%%v4 \n\t"
|
"vst %%v5,80(%%r1,%[y])\n\t"
|
||||||
"vfmadb %%v4,%%v22,%%v2,%%v4 \n\t"
|
"vst %%v6,96(%%r1,%[y])\n\t"
|
||||||
"vfmadb %%v4,%%v23,%%v3,%%v4 \n\t"
|
"vst %%v7,112(%%r1,%[y])\n\t"
|
||||||
"vst %%v4,80(%%r1,%6) \n\t"
|
|
||||||
|
|
||||||
"vl %%v4,96(%%r1,%6) \n\t"
|
|
||||||
"vfmadb %%v4,%%v24,%%v0,%%v4 \n\t"
|
|
||||||
"vfmadb %%v4,%%v25,%%v1,%%v4 \n\t"
|
|
||||||
"vfmadb %%v4,%%v26,%%v2,%%v4 \n\t"
|
|
||||||
"vfmadb %%v4,%%v27,%%v3,%%v4 \n\t"
|
|
||||||
"vst %%v4,96(%%r1,%6) \n\t"
|
|
||||||
|
|
||||||
"vl %%v4,112(%%r1,%6) \n\t"
|
|
||||||
"vfmadb %%v4,%%v28,%%v0,%%v4 \n\t"
|
|
||||||
"vfmadb %%v4,%%v29,%%v1,%%v4 \n\t"
|
|
||||||
"vfmadb %%v4,%%v30,%%v2,%%v4 \n\t"
|
|
||||||
"vfmadb %%v4,%%v31,%%v3,%%v4 \n\t"
|
|
||||||
"vst %%v4,112(%%r1,%6) \n\t"
|
|
||||||
|
|
||||||
"agfi %%r1,128\n\t"
|
"agfi %%r1,128\n\t"
|
||||||
"brctg %%r0,0b\n\t"
|
"brctg %%r0,0b\n\t"
|
||||||
|
|
||||||
"1:\n\t"
|
"1:\n\t"
|
||||||
"lghi %%r0,12\n\t"
|
"lghi %%r0,12\n\t"
|
||||||
"ngr %%r0,%0 \n\t"
|
"ngr %%r0,%[n]\n\t"
|
||||||
"ltgr %%r0,%%r0\n\t"
|
"ltgr %%r0,%%r0\n\t"
|
||||||
"jz 3f\n\t"
|
"jz 3f\n\t"
|
||||||
|
|
||||||
"srlg %%r0,%%r0,2\n\t"
|
"srlg %%r0,%%r0,2\n\t"
|
||||||
"2:\n\t"
|
"2:\n\t"
|
||||||
"vl %%v16,0(%%r1,%1) \n\t"
|
"vl %%v16,0(%%r1,%[ap0])\n\t"
|
||||||
"vl %%v17,0(%%r1,%2) \n\t"
|
"vl %%v17,0(%%r1,%[ap1])\n\t"
|
||||||
"vl %%v18,0(%%r1,%3) \n\t"
|
"vl %%v18,0(%%r1,%[ap2])\n\t"
|
||||||
"vl %%v19,0(%%r1,%4) \n\t"
|
"vl %%v19,0(%%r1,%[ap3])\n\t"
|
||||||
"vl %%v20,16(%%r1,%1) \n\t"
|
"vl %%v20,16(%%r1,%[ap0])\n\t"
|
||||||
"vl %%v21,16(%%r1,%2) \n\t"
|
"vl %%v21,16(%%r1,%[ap1])\n\t"
|
||||||
"vl %%v22,16(%%r1,%3) \n\t"
|
"vl %%v22,16(%%r1,%[ap2])\n\t"
|
||||||
"vl %%v23,16(%%r1,%4) \n\t"
|
"vl %%v23,16(%%r1,%[ap3])\n\t"
|
||||||
|
"vl %%v4,0(%%r1,%[y])\n\t"
|
||||||
"vl %%v4,0(%%r1,%6) \n\t"
|
"vl %%v5,16(%%r1,%[y])\n\t"
|
||||||
"vfmadb %%v4,%%v16,%%v0,%%v4\n\t"
|
"vfmadb %%v4,%%v16,%%v0,%%v4\n\t"
|
||||||
|
"vfmadb %%v5,%%v20,%%v0,%%v5\n\t"
|
||||||
"vfmadb %%v4,%%v17,%%v1,%%v4\n\t"
|
"vfmadb %%v4,%%v17,%%v1,%%v4\n\t"
|
||||||
|
"vfmadb %%v5,%%v21,%%v1,%%v5\n\t"
|
||||||
"vfmadb %%v4,%%v18,%%v2,%%v4\n\t"
|
"vfmadb %%v4,%%v18,%%v2,%%v4\n\t"
|
||||||
|
"vfmadb %%v5,%%v22,%%v2,%%v5\n\t"
|
||||||
"vfmadb %%v4,%%v19,%%v3,%%v4\n\t"
|
"vfmadb %%v4,%%v19,%%v3,%%v4\n\t"
|
||||||
"vst %%v4,0(%%r1,%6) \n\t"
|
"vfmadb %%v5,%%v23,%%v3,%%v5\n\t"
|
||||||
|
"vst %%v4,0(%%r1,%[y])\n\t"
|
||||||
"vl %%v4,16(%%r1,%6) \n\t"
|
"vst %%v5,16(%%r1,%[y])\n\t"
|
||||||
"vfmadb %%v4,%%v20,%%v0,%%v4 \n\t"
|
|
||||||
"vfmadb %%v4,%%v21,%%v1,%%v4 \n\t"
|
|
||||||
"vfmadb %%v4,%%v22,%%v2,%%v4 \n\t"
|
|
||||||
"vfmadb %%v4,%%v23,%%v3,%%v4 \n\t"
|
|
||||||
"vst %%v4,16(%%r1,%6) \n\t"
|
|
||||||
|
|
||||||
"agfi %%r1,32\n\t"
|
"agfi %%r1,32\n\t"
|
||||||
"brctg %%r0,2b\n\t"
|
"brctg %%r0,2b\n\t"
|
||||||
|
|
||||||
"3:\n\t"
|
"3:\n\t"
|
||||||
"nop"
|
"nop"
|
||||||
:
|
: "+m"(*(struct { FLOAT x[n]; } *) y)
|
||||||
:"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])ap[2]),"ZR"((const FLOAT (*)[n])ap[3]),"ZQ"((const FLOAT (*)[4])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha)
|
: [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0),
|
||||||
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
"m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1),
|
||||||
);
|
"m"(*(const struct { FLOAT x[n]; } *) ap2),[ap2] "a"(ap2),
|
||||||
|
"m"(*(const struct { FLOAT x[n]; } *) ap3),[ap3] "a"(ap3),
|
||||||
|
"m"(*(const struct { FLOAT x[4]; } *) x),[x] "a"(x),[alpha] "Q"(*alpha),
|
||||||
|
[n] "r"(n)
|
||||||
|
: "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
|
||||||
|
"v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
|
||||||
|
"v26", "v27", "v28", "v29", "v30", "v31");
|
||||||
}
|
}
|
||||||
|
|
||||||
static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
|
static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y,
|
||||||
{
|
FLOAT *alpha) {
|
||||||
__asm__ volatile (
|
register FLOAT *ap0 = ap[0];
|
||||||
"vlrepg %%v0,0(%3) \n\t"
|
register FLOAT *ap1 = ap[1];
|
||||||
"vlrepg %%v1,8(%3) \n\t"
|
|
||||||
"vlrepg %%v2,%5 \n\t"
|
__asm__("vlrepg %%v0,0(%[x])\n\t"
|
||||||
|
"vlrepg %%v1,8(%[x])\n\t"
|
||||||
|
"vlrepg %%v2,%[alpha]\n\t"
|
||||||
"vfmdb %%v0,%%v0,%%v2\n\t"
|
"vfmdb %%v0,%%v0,%%v2\n\t"
|
||||||
"vfmdb %%v1,%%v1,%%v2\n\t"
|
"vfmdb %%v1,%%v1,%%v2\n\t"
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
|
|
||||||
"lghi %%r0,-16\n\t"
|
"lghi %%r0,-16\n\t"
|
||||||
"ngr %%r0,%0 \n\t"
|
"ngr %%r0,%[n]\n\t"
|
||||||
"ltgr %%r0,%%r0\n\t"
|
"ltgr %%r0,%%r0\n\t"
|
||||||
"jz 1f\n\t"
|
"jz 1f\n\t"
|
||||||
|
|
||||||
"srlg %%r0,%%r0,4\n\t"
|
"srlg %%r0,%%r0,4\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 1,1024(%%r1,%1) \n\t"
|
"pfd 1,1024(%%r1,%[ap0])\n\t"
|
||||||
"pfd 1,1024(%%r1,%2) \n\t"
|
"pfd 1,1024(%%r1,%[ap1])\n\t"
|
||||||
"pfd 2,1024(%%r1,%4) \n\t"
|
"pfd 2,1024(%%r1,%[y])\n\t"
|
||||||
|
"vl %%v16,0(%%r1,%[ap0])\n\t"
|
||||||
"vl %%v16,0(%%r1,%1) \n\t"
|
"vl %%v17,0(%%r1,%[ap1])\n\t"
|
||||||
"vl %%v17,0(%%r1,%2) \n\t"
|
"vl %%v18,16(%%r1,%[ap0])\n\t"
|
||||||
"vl %%v18,16(%%r1,%1) \n\t"
|
"vl %%v19,16(%%r1,%[ap1])\n\t"
|
||||||
"vl %%v19,16(%%r1,%2) \n\t"
|
"vl %%v20,32(%%r1,%[ap0])\n\t"
|
||||||
"vl %%v20,32(%%r1,%1) \n\t"
|
"vl %%v21,32(%%r1,%[ap1])\n\t"
|
||||||
"vl %%v21,32(%%r1,%2) \n\t"
|
"vl %%v22,48(%%r1,%[ap0])\n\t"
|
||||||
"vl %%v22,48(%%r1,%1) \n\t"
|
"vl %%v23,48(%%r1,%[ap1])\n\t"
|
||||||
"vl %%v23,48(%%r1,%2) \n\t"
|
"vl %%v24,64(%%r1,%[ap0])\n\t"
|
||||||
"vl %%v24,64(%%r1,%1) \n\t"
|
"vl %%v25,64(%%r1,%[ap1])\n\t"
|
||||||
"vl %%v25,64(%%r1,%2) \n\t"
|
"vl %%v26,80(%%r1,%[ap0])\n\t"
|
||||||
"vl %%v26,80(%%r1,%1) \n\t"
|
"vl %%v27,80(%%r1,%[ap1])\n\t"
|
||||||
"vl %%v27,80(%%r1,%2) \n\t"
|
"vl %%v28,96(%%r1,%[ap0])\n\t"
|
||||||
"vl %%v28,96(%%r1,%1) \n\t"
|
"vl %%v29,96(%%r1,%[ap1])\n\t"
|
||||||
"vl %%v29,96(%%r1,%2) \n\t"
|
"vl %%v30,112(%%r1,%[ap0])\n\t"
|
||||||
"vl %%v30,112(%%r1,%1) \n\t"
|
"vl %%v31,112(%%r1,%[ap1])\n\t"
|
||||||
"vl %%v31,112(%%r1,%2) \n\t"
|
"vl %%v2,0(%%r1,%[y])\n\t"
|
||||||
|
"vl %%v3,16(%%r1,%[y])\n\t"
|
||||||
"vl %%v2,0(%%r1,%4) \n\t"
|
"vl %%v4,32(%%r1,%[y])\n\t"
|
||||||
|
"vl %%v5,48(%%r1,%[y])\n\t"
|
||||||
|
"vl %%v6,64(%%r1,%[y])\n\t"
|
||||||
|
"vl %%v7,80(%%r1,%[y])\n\t"
|
||||||
|
"vl %%v8,96(%%r1,%[y])\n\t"
|
||||||
|
"vl %%v9,112(%%r1,%[y])\n\t"
|
||||||
"vfmadb %%v2,%%v16,%%v0,%%v2\n\t"
|
"vfmadb %%v2,%%v16,%%v0,%%v2\n\t"
|
||||||
|
"vfmadb %%v3,%%v18,%%v0,%%v3\n\t"
|
||||||
|
"vfmadb %%v4,%%v20,%%v0,%%v4\n\t"
|
||||||
|
"vfmadb %%v5,%%v22,%%v0,%%v5\n\t"
|
||||||
|
"vfmadb %%v6,%%v24,%%v0,%%v6\n\t"
|
||||||
|
"vfmadb %%v7,%%v26,%%v0,%%v7\n\t"
|
||||||
|
"vfmadb %%v8,%%v28,%%v0,%%v8\n\t"
|
||||||
|
"vfmadb %%v9,%%v30,%%v0,%%v9\n\t"
|
||||||
"vfmadb %%v2,%%v17,%%v1,%%v2\n\t"
|
"vfmadb %%v2,%%v17,%%v1,%%v2\n\t"
|
||||||
"vst %%v2,0(%%r1,%4) \n\t"
|
"vfmadb %%v3,%%v19,%%v1,%%v3\n\t"
|
||||||
|
"vfmadb %%v4,%%v21,%%v1,%%v4\n\t"
|
||||||
"vl %%v2,16(%%r1,%4) \n\t"
|
"vfmadb %%v5,%%v23,%%v1,%%v5\n\t"
|
||||||
"vfmadb %%v2,%%v18,%%v0,%%v2 \n\t"
|
"vfmadb %%v6,%%v25,%%v1,%%v6\n\t"
|
||||||
"vfmadb %%v2,%%v19,%%v1,%%v2 \n\t"
|
"vfmadb %%v7,%%v27,%%v1,%%v7\n\t"
|
||||||
"vst %%v2,16(%%r1,%4) \n\t"
|
"vfmadb %%v8,%%v29,%%v1,%%v8\n\t"
|
||||||
|
"vfmadb %%v9,%%v31,%%v1,%%v9\n\t"
|
||||||
"vl %%v2,32(%%r1,%4) \n\t"
|
"vst %%v2,0(%%r1,%[y])\n\t"
|
||||||
"vfmadb %%v2,%%v20,%%v0,%%v2 \n\t"
|
"vst %%v3,16(%%r1,%[y])\n\t"
|
||||||
"vfmadb %%v2,%%v21,%%v1,%%v2 \n\t"
|
"vst %%v4,32(%%r1,%[y])\n\t"
|
||||||
"vst %%v2,32(%%r1,%4) \n\t"
|
"vst %%v5,48(%%r1,%[y])\n\t"
|
||||||
|
"vst %%v6,64(%%r1,%[y])\n\t"
|
||||||
"vl %%v2,48(%%r1,%4) \n\t"
|
"vst %%v7,80(%%r1,%[y])\n\t"
|
||||||
"vfmadb %%v2,%%v22,%%v0,%%v2 \n\t"
|
"vst %%v8,96(%%r1,%[y])\n\t"
|
||||||
"vfmadb %%v2,%%v23,%%v1,%%v2 \n\t"
|
"vst %%v9,112(%%r1,%[y])\n\t"
|
||||||
"vst %%v2,48(%%r1,%4) \n\t"
|
|
||||||
|
|
||||||
"vl %%v2,64(%%r1,%4) \n\t"
|
|
||||||
"vfmadb %%v2,%%v24,%%v0,%%v2 \n\t"
|
|
||||||
"vfmadb %%v2,%%v25,%%v1,%%v2 \n\t"
|
|
||||||
"vst %%v2,64(%%r1,%4) \n\t"
|
|
||||||
|
|
||||||
"vl %%v2,80(%%r1,%4) \n\t"
|
|
||||||
"vfmadb %%v2,%%v26,%%v0,%%v2 \n\t"
|
|
||||||
"vfmadb %%v2,%%v27,%%v1,%%v2 \n\t"
|
|
||||||
"vst %%v2,80(%%r1,%4) \n\t"
|
|
||||||
|
|
||||||
"vl %%v2,96(%%r1,%4) \n\t"
|
|
||||||
"vfmadb %%v2,%%v28,%%v0,%%v2 \n\t"
|
|
||||||
"vfmadb %%v2,%%v29,%%v1,%%v2 \n\t"
|
|
||||||
"vst %%v2,96(%%r1,%4) \n\t"
|
|
||||||
|
|
||||||
"vl %%v2,112(%%r1,%4) \n\t"
|
|
||||||
"vfmadb %%v2,%%v30,%%v0,%%v2 \n\t"
|
|
||||||
"vfmadb %%v2,%%v31,%%v1,%%v2 \n\t"
|
|
||||||
"vst %%v2,112(%%r1,%4) \n\t"
|
|
||||||
|
|
||||||
"agfi %%r1,128\n\t"
|
"agfi %%r1,128\n\t"
|
||||||
"brctg %%r0,0b\n\t"
|
"brctg %%r0,0b\n\t"
|
||||||
|
|
||||||
"1:\n\t"
|
"1:\n\t"
|
||||||
"lghi %%r0,12\n\t"
|
"lghi %%r0,12\n\t"
|
||||||
"ngr %%r0,%0 \n\t"
|
"ngr %%r0,%[n]\n\t"
|
||||||
"ltgr %%r0,%%r0\n\t"
|
"ltgr %%r0,%%r0\n\t"
|
||||||
"jz 3f\n\t"
|
"jz 3f\n\t"
|
||||||
|
|
||||||
"srlg %%r0,%%r0,2\n\t"
|
"srlg %%r0,%%r0,2\n\t"
|
||||||
"2:\n\t"
|
"2:\n\t"
|
||||||
"vl %%v16,0(%%r1,%1) \n\t"
|
"vl %%v16,0(%%r1,%[ap0])\n\t"
|
||||||
"vl %%v17,0(%%r1,%2) \n\t"
|
"vl %%v17,0(%%r1,%[ap1])\n\t"
|
||||||
"vl %%v18,16(%%r1,%1) \n\t"
|
"vl %%v18,16(%%r1,%[ap0])\n\t"
|
||||||
"vl %%v19,16(%%r1,%2) \n\t"
|
"vl %%v19,16(%%r1,%[ap1])\n\t"
|
||||||
|
"vl %%v2,0(%%r1,%[y])\n\t"
|
||||||
"vl %%v2,0(%%r1,%4) \n\t"
|
"vl %%v3,16(%%r1,%[y])\n\t"
|
||||||
"vfmadb %%v2,%%v16,%%v0,%%v2\n\t"
|
"vfmadb %%v2,%%v16,%%v0,%%v2\n\t"
|
||||||
|
"vfmadb %%v3,%%v18,%%v0,%%v3\n\t"
|
||||||
"vfmadb %%v2,%%v17,%%v1,%%v2\n\t"
|
"vfmadb %%v2,%%v17,%%v1,%%v2\n\t"
|
||||||
"vst %%v2,0(%%r1,%4) \n\t"
|
"vfmadb %%v3,%%v19,%%v1,%%v3\n\t"
|
||||||
|
"vst %%v2,0(%%r1,%[y])\n\t"
|
||||||
"vl %%v2,16(%%r1,%4) \n\t"
|
"vst %%v3,16(%%r1,%[y])\n\t"
|
||||||
"vfmadb %%v2,%%v18,%%v0,%%v2 \n\t"
|
|
||||||
"vfmadb %%v2,%%v19,%%v1,%%v2 \n\t"
|
|
||||||
"vst %%v2,16(%%r1,%4) \n\t"
|
|
||||||
|
|
||||||
"agfi %%r1,32\n\t"
|
"agfi %%r1,32\n\t"
|
||||||
"brctg %%r0,2b\n\t"
|
"brctg %%r0,2b\n\t"
|
||||||
|
|
||||||
"3:\n\t"
|
"3:\n\t"
|
||||||
"nop"
|
"nop"
|
||||||
:
|
: "+m"(*(struct { FLOAT x[n]; } *) y)
|
||||||
:"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZQ"((const FLOAT (*)[2])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha)
|
: [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0),
|
||||||
:"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
"m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1),
|
||||||
);
|
"m"(*(const struct { FLOAT x[2]; } *) x),[x] "a"(x),[alpha] "Q"(*alpha),
|
||||||
|
[n] "r"(n)
|
||||||
|
: "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
|
||||||
|
"v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
|
||||||
|
"v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
|
||||||
}
|
}
|
||||||
|
|
||||||
static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *xo, FLOAT *y, FLOAT *alpha)
|
static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y,
|
||||||
{
|
FLOAT *alpha) {
|
||||||
__asm__ volatile (
|
__asm__("vlrepg %%v0,0(%[x])\n\t"
|
||||||
"vlrepg %%v0,0(%2) \n\t"
|
"vlrepg %%v16,%[alpha]\n\t"
|
||||||
"vlrepg %%v1,%4 \n\t"
|
"vfmdb %%v0,%%v0,%%v16\n\t"
|
||||||
"vfmdb %%v0,%%v0,%%v1 \n\t"
|
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
|
|
||||||
"lghi %%r0,-16\n\t"
|
"lghi %%r0,-16\n\t"
|
||||||
"ngr %%r0,%0 \n\t"
|
"ngr %%r0,%[n]\n\t"
|
||||||
"ltgr %%r0,%%r0\n\t"
|
"ltgr %%r0,%%r0\n\t"
|
||||||
"jz 1f\n\t"
|
"jz 1f\n\t"
|
||||||
|
|
||||||
"srlg %%r0,%%r0,4\n\t"
|
"srlg %%r0,%%r0,4\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 1,1024(%%r1,%1) \n\t"
|
"pfd 1,1024(%%r1,%[a0])\n\t"
|
||||||
"pfd 2,1024(%%r1,%3) \n\t"
|
"pfd 2,1024(%%r1,%[y])\n\t"
|
||||||
|
"vl %%v16,0(%%r1,%[a0])\n\t"
|
||||||
"vl %%v16,0(%%r1,%1) \n\t"
|
"vl %%v17,16(%%r1,%[a0])\n\t"
|
||||||
"vl %%v17,16(%%r1,%1) \n\t"
|
"vl %%v18,32(%%r1,%[a0])\n\t"
|
||||||
"vl %%v18,32(%%r1,%1) \n\t"
|
"vl %%v19,48(%%r1,%[a0])\n\t"
|
||||||
"vl %%v19,48(%%r1,%1) \n\t"
|
"vl %%v20,64(%%r1,%[a0])\n\t"
|
||||||
"vl %%v20,64(%%r1,%1) \n\t"
|
"vl %%v21,80(%%r1,%[a0])\n\t"
|
||||||
"vl %%v21,80(%%r1,%1) \n\t"
|
"vl %%v22,96(%%r1,%[a0])\n\t"
|
||||||
"vl %%v22,96(%%r1,%1) \n\t"
|
"vl %%v23,112(%%r1,%[a0])\n\t"
|
||||||
"vl %%v23,112(%%r1,%1) \n\t"
|
"vl %%v24,0(%%r1,%[y])\n\t"
|
||||||
|
"vl %%v25,16(%%r1,%[y])\n\t"
|
||||||
"vl %%v1,0(%%r1,%3) \n\t"
|
"vl %%v26,32(%%r1,%[y])\n\t"
|
||||||
"vfmadb %%v1,%%v16,%%v0,%%v1 \n\t"
|
"vl %%v27,48(%%r1,%[y])\n\t"
|
||||||
"vst %%v1,0(%%r1,%3) \n\t"
|
"vl %%v28,64(%%r1,%[y])\n\t"
|
||||||
|
"vl %%v29,80(%%r1,%[y])\n\t"
|
||||||
"vl %%v1,16(%%r1,%3) \n\t"
|
"vl %%v30,96(%%r1,%[y])\n\t"
|
||||||
"vfmadb %%v1,%%v17,%%v0,%%v1 \n\t"
|
"vl %%v31,112(%%r1,%[y])\n\t"
|
||||||
"vst %%v1,16(%%r1,%3) \n\t"
|
"vfmadb %%v24,%%v16,%%v0,%%v24\n\t"
|
||||||
|
"vfmadb %%v25,%%v17,%%v0,%%v25\n\t"
|
||||||
"vl %%v1,32(%%r1,%3) \n\t"
|
"vfmadb %%v26,%%v18,%%v0,%%v26\n\t"
|
||||||
"vfmadb %%v1,%%v18,%%v0,%%v1 \n\t"
|
"vfmadb %%v27,%%v19,%%v0,%%v27\n\t"
|
||||||
"vst %%v1,32(%%r1,%3) \n\t"
|
"vfmadb %%v28,%%v20,%%v0,%%v28\n\t"
|
||||||
|
"vfmadb %%v29,%%v21,%%v0,%%v29\n\t"
|
||||||
"vl %%v1,48(%%r1,%3) \n\t"
|
"vfmadb %%v30,%%v22,%%v0,%%v30\n\t"
|
||||||
"vfmadb %%v1,%%v19,%%v0,%%v1 \n\t"
|
"vfmadb %%v31,%%v23,%%v0,%%v31\n\t"
|
||||||
"vst %%v1,48(%%r1,%3) \n\t"
|
"vst %%v24,0(%%r1,%[y])\n\t"
|
||||||
|
"vst %%v25,16(%%r1,%[y])\n\t"
|
||||||
"vl %%v1,64(%%r1,%3) \n\t"
|
"vst %%v26,32(%%r1,%[y])\n\t"
|
||||||
"vfmadb %%v1,%%v20,%%v0,%%v1 \n\t"
|
"vst %%v27,48(%%r1,%[y])\n\t"
|
||||||
"vst %%v1,64(%%r1,%3) \n\t"
|
"vst %%v28,64(%%r1,%[y])\n\t"
|
||||||
|
"vst %%v29,80(%%r1,%[y])\n\t"
|
||||||
"vl %%v1,80(%%r1,%3) \n\t"
|
"vst %%v30,96(%%r1,%[y])\n\t"
|
||||||
"vfmadb %%v1,%%v21,%%v0,%%v1 \n\t"
|
"vst %%v31,112(%%r1,%[y])\n\t"
|
||||||
"vst %%v1,80(%%r1,%3) \n\t"
|
|
||||||
|
|
||||||
"vl %%v1,96(%%r1,%3) \n\t"
|
|
||||||
"vfmadb %%v1,%%v22,%%v0,%%v1 \n\t"
|
|
||||||
"vst %%v1,96(%%r1,%3) \n\t"
|
|
||||||
|
|
||||||
"vl %%v1,112(%%r1,%3) \n\t"
|
|
||||||
"vfmadb %%v1,%%v23,%%v0,%%v1 \n\t"
|
|
||||||
"vst %%v1,112(%%r1,%3) \n\t"
|
|
||||||
|
|
||||||
"agfi %%r1,128\n\t"
|
"agfi %%r1,128\n\t"
|
||||||
"brctg %%r0,0b\n\t"
|
"brctg %%r0,0b\n\t"
|
||||||
|
|
||||||
"1:\n\t"
|
"1:\n\t"
|
||||||
"lghi %%r0,12\n\t"
|
"lghi %%r0,12\n\t"
|
||||||
"ngr %%r0,%0 \n\t"
|
"ngr %%r0,%[n]\n\t"
|
||||||
"ltgr %%r0,%%r0\n\t"
|
"ltgr %%r0,%%r0\n\t"
|
||||||
"jz 3f\n\t"
|
"jz 3f\n\t"
|
||||||
|
|
||||||
"srlg %%r0,%%r0,2\n\t"
|
"srlg %%r0,%%r0,2\n\t"
|
||||||
"2:\n\t"
|
"2:\n\t"
|
||||||
"vl %%v16,0(%%r1,%1) \n\t"
|
"vl %%v16,0(%%r1,%[a0])\n\t"
|
||||||
"vl %%v17,16(%%r1,%1) \n\t"
|
"vl %%v17,16(%%r1,%[a0])\n\t"
|
||||||
|
"vl %%v18,0(%%r1,%[y])\n\t"
|
||||||
"vl %%v1,0(%%r1,%3) \n\t"
|
"vl %%v19,16(%%r1,%[y])\n\t"
|
||||||
"vfmadb %%v1,%%v16,%%v0,%%v1 \n\t"
|
"vfmadb %%v18,%%v16,%%v0,%%v18\n\t"
|
||||||
"vst %%v1,0(%%r1,%3) \n\t"
|
"vfmadb %%v19,%%v17,%%v0,%%v19\n\t"
|
||||||
|
"vst %%v18,0(%%r1,%[y])\n\t"
|
||||||
"vl %%v1,16(%%r1,%3) \n\t"
|
"vst %%v19,16(%%r1,%[y])\n\t"
|
||||||
"vfmadb %%v1,%%v17,%%v0,%%v1 \n\t"
|
|
||||||
"vst %%v1,16(%%r1,%3) \n\t"
|
|
||||||
|
|
||||||
"agfi %%r1,32\n\t"
|
"agfi %%r1,32\n\t"
|
||||||
"brctg %%r0,2b\n\t"
|
"brctg %%r0,2b\n\t"
|
||||||
|
|
||||||
"3:\n\t"
|
"3:\n\t"
|
||||||
"nop"
|
"nop"
|
||||||
:
|
: "+m"(*(struct { FLOAT x[n]; } *) y)
|
||||||
:"r"(n),"ZR"((const FLOAT (*)[n])a0),"ZQ"((const FLOAT (*)[1])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha)
|
: [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) a0),[a0] "a"(a0),
|
||||||
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
"m"(*(const FLOAT (*)[1]) x),[x] "a"(x),[alpha] "Q"(*alpha),
|
||||||
);
|
[n] "r"(n)
|
||||||
|
: "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21",
|
||||||
|
"v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
|
||||||
|
"v31");
|
||||||
}
|
}
|
||||||
|
|
||||||
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
|
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) {
|
||||||
{
|
|
||||||
BLASLONG i;
|
BLASLONG i;
|
||||||
for (i = 0; i < n; i++)
|
for (i = 0; i < n; i++) {
|
||||||
{
|
|
||||||
*dest += src[i];
|
*dest += src[i];
|
||||||
dest += inc_dest;
|
dest += inc_dest;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
|
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a,
|
||||||
{
|
BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
|
||||||
|
FLOAT *buffer) {
|
||||||
BLASLONG i;
|
BLASLONG i;
|
||||||
FLOAT *a_ptr;
|
FLOAT *a_ptr;
|
||||||
FLOAT *x_ptr;
|
FLOAT *x_ptr;
|
||||||
|
|
@ -423,8 +385,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
|
||||||
BLASLONG lda4 = lda << 2;
|
BLASLONG lda4 = lda << 2;
|
||||||
FLOAT xbuffer[8], *ybuffer;
|
FLOAT xbuffer[8], *ybuffer;
|
||||||
|
|
||||||
if ( m < 1 ) return(0);
|
if (m < 1)
|
||||||
if ( n < 1 ) return(0);
|
return (0);
|
||||||
|
if (n < 1)
|
||||||
|
return (0);
|
||||||
|
|
||||||
ybuffer = buffer;
|
ybuffer = buffer;
|
||||||
|
|
||||||
|
|
@ -439,13 +403,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
|
||||||
|
|
||||||
BLASLONG NB = NBMAX;
|
BLASLONG NB = NBMAX;
|
||||||
|
|
||||||
while ( NB == NBMAX )
|
while (NB == NBMAX) {
|
||||||
{
|
|
||||||
|
|
||||||
m1 -= NB;
|
m1 -= NB;
|
||||||
if ( m1 < 0)
|
if (m1 < 0) {
|
||||||
{
|
if (m2 == 0)
|
||||||
if ( m2 == 0 ) break;
|
break;
|
||||||
NB = m2;
|
NB = m2;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -462,12 +425,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
|
||||||
else
|
else
|
||||||
ybuffer = y_ptr;
|
ybuffer = y_ptr;
|
||||||
|
|
||||||
if ( inc_x == 1 )
|
if (inc_x == 1) {
|
||||||
{
|
|
||||||
|
|
||||||
|
for (i = 0; i < n1; i++) {
|
||||||
for( i = 0; i < n1 ; i++)
|
|
||||||
{
|
|
||||||
dgemv_kernel_4x4(NB, ap, x_ptr, ybuffer, &alpha);
|
dgemv_kernel_4x4(NB, ap, x_ptr, ybuffer, &alpha);
|
||||||
ap[0] += lda4;
|
ap[0] += lda4;
|
||||||
ap[1] += lda4;
|
ap[1] += lda4;
|
||||||
|
|
@ -477,29 +437,22 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
|
||||||
x_ptr += 4;
|
x_ptr += 4;
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( n2 & 2 )
|
if (n2 & 2) {
|
||||||
{
|
|
||||||
dgemv_kernel_4x2(NB, ap, x_ptr, ybuffer, &alpha);
|
dgemv_kernel_4x2(NB, ap, x_ptr, ybuffer, &alpha);
|
||||||
a_ptr += lda * 2;
|
a_ptr += lda * 2;
|
||||||
x_ptr += 2;
|
x_ptr += 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (n2 & 1) {
|
||||||
if ( n2 & 1 )
|
|
||||||
{
|
|
||||||
dgemv_kernel_4x1(NB, a_ptr, x_ptr, ybuffer, &alpha);
|
dgemv_kernel_4x1(NB, a_ptr, x_ptr, ybuffer, &alpha);
|
||||||
/* a_ptr += lda;
|
/* a_ptr += lda;
|
||||||
x_ptr += 1; */
|
x_ptr += 1; */
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
} else {
|
||||||
|
|
||||||
}
|
for (i = 0; i < n1; i++) {
|
||||||
else
|
|
||||||
{
|
|
||||||
|
|
||||||
for( i = 0; i < n1 ; i++)
|
|
||||||
{
|
|
||||||
xbuffer[0] = x_ptr[0];
|
xbuffer[0] = x_ptr[0];
|
||||||
x_ptr += inc_x;
|
x_ptr += inc_x;
|
||||||
xbuffer[1] = x_ptr[0];
|
xbuffer[1] = x_ptr[0];
|
||||||
|
|
@ -516,8 +469,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
|
||||||
a_ptr += lda4;
|
a_ptr += lda4;
|
||||||
}
|
}
|
||||||
|
|
||||||
for( i = 0; i < n2 ; i++)
|
for (i = 0; i < n2; i++) {
|
||||||
{
|
|
||||||
xbuffer[0] = x_ptr[0];
|
xbuffer[0] = x_ptr[0];
|
||||||
x_ptr += inc_x;
|
x_ptr += inc_x;
|
||||||
dgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, &alpha);
|
dgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, &alpha);
|
||||||
|
|
@ -528,30 +480,26 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
|
||||||
}
|
}
|
||||||
|
|
||||||
a += NB;
|
a += NB;
|
||||||
if ( inc_y != 1 )
|
if (inc_y != 1) {
|
||||||
{
|
|
||||||
add_y(NB, ybuffer, y_ptr, inc_y);
|
add_y(NB, ybuffer, y_ptr, inc_y);
|
||||||
y_ptr += NB * inc_y;
|
y_ptr += NB * inc_y;
|
||||||
}
|
} else
|
||||||
else
|
|
||||||
y_ptr += NB;
|
y_ptr += NB;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( m3 == 0 ) return(0);
|
if (m3 == 0)
|
||||||
|
return (0);
|
||||||
|
|
||||||
if ( m3 == 3 )
|
if (m3 == 3) {
|
||||||
{
|
|
||||||
a_ptr = a;
|
a_ptr = a;
|
||||||
x_ptr = x;
|
x_ptr = x;
|
||||||
FLOAT temp0 = 0.0;
|
FLOAT temp0 = 0.0;
|
||||||
FLOAT temp1 = 0.0;
|
FLOAT temp1 = 0.0;
|
||||||
FLOAT temp2 = 0.0;
|
FLOAT temp2 = 0.0;
|
||||||
if ( lda == 3 && inc_x ==1 )
|
if (lda == 3 && inc_x == 1) {
|
||||||
{
|
|
||||||
|
|
||||||
for( i = 0; i < ( n & -4 ); i+=4 )
|
for (i = 0; i < (n & -4); i += 4) {
|
||||||
{
|
|
||||||
|
|
||||||
temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1];
|
temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1];
|
||||||
temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1];
|
temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1];
|
||||||
|
|
@ -565,8 +513,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
|
||||||
x_ptr += 4;
|
x_ptr += 4;
|
||||||
}
|
}
|
||||||
|
|
||||||
for( ; i < n; i++ )
|
for (; i < n; i++) {
|
||||||
{
|
|
||||||
temp0 += a_ptr[0] * x_ptr[0];
|
temp0 += a_ptr[0] * x_ptr[0];
|
||||||
temp1 += a_ptr[1] * x_ptr[0];
|
temp1 += a_ptr[1] * x_ptr[0];
|
||||||
temp2 += a_ptr[2] * x_ptr[0];
|
temp2 += a_ptr[2] * x_ptr[0];
|
||||||
|
|
@ -574,19 +521,15 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
|
||||||
x_ptr++;
|
x_ptr++;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
|
|
||||||
for( i = 0; i < n; i++ )
|
for (i = 0; i < n; i++) {
|
||||||
{
|
|
||||||
temp0 += a_ptr[0] * x_ptr[0];
|
temp0 += a_ptr[0] * x_ptr[0];
|
||||||
temp1 += a_ptr[1] * x_ptr[0];
|
temp1 += a_ptr[1] * x_ptr[0];
|
||||||
temp2 += a_ptr[2] * x_ptr[0];
|
temp2 += a_ptr[2] * x_ptr[0];
|
||||||
a_ptr += lda;
|
a_ptr += lda;
|
||||||
x_ptr += inc_x;
|
x_ptr += inc_x;
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
@ -598,18 +541,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
|
||||||
return (0);
|
return (0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (m3 == 2) {
|
||||||
if ( m3 == 2 )
|
|
||||||
{
|
|
||||||
a_ptr = a;
|
a_ptr = a;
|
||||||
x_ptr = x;
|
x_ptr = x;
|
||||||
FLOAT temp0 = 0.0;
|
FLOAT temp0 = 0.0;
|
||||||
FLOAT temp1 = 0.0;
|
FLOAT temp1 = 0.0;
|
||||||
if ( lda == 2 && inc_x ==1 )
|
if (lda == 2 && inc_x == 1) {
|
||||||
{
|
|
||||||
|
|
||||||
for( i = 0; i < (n & -4) ; i+=4 )
|
for (i = 0; i < (n & -4); i += 4) {
|
||||||
{
|
|
||||||
temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1];
|
temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1];
|
||||||
temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1];
|
temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1];
|
||||||
temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3];
|
temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3];
|
||||||
|
|
@ -619,27 +558,21 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
for (; i < n; i++) {
|
||||||
for( ; i < n; i++ )
|
|
||||||
{
|
|
||||||
temp0 += a_ptr[0] * x_ptr[0];
|
temp0 += a_ptr[0] * x_ptr[0];
|
||||||
temp1 += a_ptr[1] * x_ptr[0];
|
temp1 += a_ptr[1] * x_ptr[0];
|
||||||
a_ptr += 2;
|
a_ptr += 2;
|
||||||
x_ptr++;
|
x_ptr++;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
|
|
||||||
for( i = 0; i < n; i++ )
|
for (i = 0; i < n; i++) {
|
||||||
{
|
|
||||||
temp0 += a_ptr[0] * x_ptr[0];
|
temp0 += a_ptr[0] * x_ptr[0];
|
||||||
temp1 += a_ptr[1] * x_ptr[0];
|
temp1 += a_ptr[1] * x_ptr[0];
|
||||||
a_ptr += lda;
|
a_ptr += lda;
|
||||||
x_ptr += inc_x;
|
x_ptr += inc_x;
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
@ -649,31 +582,27 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
|
||||||
return (0);
|
return (0);
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( m3 == 1 )
|
if (m3 == 1) {
|
||||||
{
|
|
||||||
a_ptr = a;
|
a_ptr = a;
|
||||||
x_ptr = x;
|
x_ptr = x;
|
||||||
FLOAT temp = 0.0;
|
FLOAT temp = 0.0;
|
||||||
if ( lda == 1 && inc_x ==1 )
|
if (lda == 1 && inc_x == 1) {
|
||||||
{
|
|
||||||
|
|
||||||
for( i = 0; i < (n & -4); i+=4 )
|
for (i = 0; i < (n & -4); i += 4) {
|
||||||
{
|
temp +=
|
||||||
temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3];
|
a_ptr[i] * x_ptr[i] + a_ptr[i + 1] * x_ptr[i + 1] + a_ptr[i +
|
||||||
|
2] *
|
||||||
|
x_ptr[i + 2] + a_ptr[i + 3] * x_ptr[i + 3];
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
for( ; i < n; i++ )
|
for (; i < n; i++) {
|
||||||
{
|
|
||||||
temp += a_ptr[i] * x_ptr[i];
|
temp += a_ptr[i] * x_ptr[i];
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
|
|
||||||
for( i = 0; i < n; i++ )
|
for (i = 0; i < n; i++) {
|
||||||
{
|
|
||||||
temp += a_ptr[0] * x_ptr[0];
|
temp += a_ptr[0] * x_ptr[0];
|
||||||
a_ptr += lda;
|
a_ptr += lda;
|
||||||
x_ptr += inc_x;
|
x_ptr += inc_x;
|
||||||
|
|
@ -684,8 +613,5 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
|
||||||
return (0);
|
return (0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
return (0);
|
return (0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
File diff suppressed because it is too large
Load Diff
|
|
@ -1,5 +1,5 @@
|
||||||
/***************************************************************************
|
/***************************************************************************
|
||||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
modification, are permitted provided that the following conditions are
|
modification, are permitted provided that the following conditions are
|
||||||
|
|
@ -27,34 +27,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x)
|
static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) {
|
||||||
{
|
|
||||||
FLOAT max;
|
FLOAT max;
|
||||||
|
|
||||||
__asm__ volatile (
|
__asm__("vl %%v0,0(%[x])\n\t"
|
||||||
"vl %%v0,0(%2) \n\t"
|
"srlg %[n],%[n],5\n\t"
|
||||||
"srlg %%r0,%1,5 \n\t"
|
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 1, 1024(%%r1,%2) \n\t"
|
"pfd 1, 1024(%%r1,%[x])\n\t"
|
||||||
|
"vl %%v16,0(%%r1,%[x])\n\t"
|
||||||
"vl %%v16,0(%%r1,%2) \n\t"
|
"vl %%v17,16(%%r1,%[x])\n\t"
|
||||||
"vl %%v17,16(%%r1,%2) \n\t"
|
"vl %%v18,32(%%r1,%[x])\n\t"
|
||||||
"vl %%v18,32(%%r1,%2) \n\t"
|
"vl %%v19,48(%%r1,%[x])\n\t"
|
||||||
"vl %%v19,48(%%r1,%2) \n\t"
|
"vl %%v20,64(%%r1,%[x])\n\t"
|
||||||
"vl %%v20,64(%%r1,%2) \n\t"
|
"vl %%v21,80(%%r1,%[x])\n\t"
|
||||||
"vl %%v21,80(%%r1,%2) \n\t"
|
"vl %%v22,96(%%r1,%[x])\n\t"
|
||||||
"vl %%v22,96(%%r1,%2) \n\t"
|
"vl %%v23,112(%%r1,%[x])\n\t"
|
||||||
"vl %%v23,112(%%r1,%2) \n\t"
|
"vl %%v24,128(%%r1,%[x])\n\t"
|
||||||
"vl %%v24,128(%%r1,%2) \n\t"
|
"vl %%v25,144(%%r1,%[x])\n\t"
|
||||||
"vl %%v25,144(%%r1,%2) \n\t"
|
"vl %%v26,160(%%r1,%[x])\n\t"
|
||||||
"vl %%v26,160(%%r1,%2) \n\t"
|
"vl %%v27,176(%%r1,%[x])\n\t"
|
||||||
"vl %%v27,176(%%r1,%2) \n\t"
|
"vl %%v28,192(%%r1,%[x])\n\t"
|
||||||
"vl %%v28,192(%%r1,%2) \n\t"
|
"vl %%v29,208(%%r1,%[x])\n\t"
|
||||||
"vl %%v29,208(%%r1,%2) \n\t"
|
"vl %%v30,224(%%r1,%[x])\n\t"
|
||||||
"vl %%v30,224(%%r1,%2) \n\t"
|
"vl %%v31,240(%%r1,%[x])\n\t"
|
||||||
"vl %%v31,240(%%r1,%2) \n\t"
|
|
||||||
|
|
||||||
"vfmaxdb %%v16,%%v16,%%v24,0\n\t"
|
"vfmaxdb %%v16,%%v16,%%v24,0\n\t"
|
||||||
"vfmaxdb %%v17,%%v17,%%v25,0\n\t"
|
"vfmaxdb %%v17,%%v17,%%v25,0\n\t"
|
||||||
"vfmaxdb %%v18,%%v18,%%v26,0\n\t"
|
"vfmaxdb %%v18,%%v18,%%v26,0\n\t"
|
||||||
|
|
@ -63,29 +59,23 @@ static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x)
|
||||||
"vfmaxdb %%v21,%%v21,%%v29,0\n\t"
|
"vfmaxdb %%v21,%%v21,%%v29,0\n\t"
|
||||||
"vfmaxdb %%v22,%%v22,%%v30,0\n\t"
|
"vfmaxdb %%v22,%%v22,%%v30,0\n\t"
|
||||||
"vfmaxdb %%v23,%%v23,%%v31,0\n\t"
|
"vfmaxdb %%v23,%%v23,%%v31,0\n\t"
|
||||||
|
|
||||||
"vfmaxdb %%v16,%%v16,%%v20,0\n\t"
|
"vfmaxdb %%v16,%%v16,%%v20,0\n\t"
|
||||||
"vfmaxdb %%v17,%%v17,%%v21,0\n\t"
|
"vfmaxdb %%v17,%%v17,%%v21,0\n\t"
|
||||||
"vfmaxdb %%v18,%%v18,%%v22,0\n\t"
|
"vfmaxdb %%v18,%%v18,%%v22,0\n\t"
|
||||||
"vfmaxdb %%v19,%%v19,%%v23,0\n\t"
|
"vfmaxdb %%v19,%%v19,%%v23,0\n\t"
|
||||||
|
|
||||||
"vfmaxdb %%v16,%%v16,%%v18,0\n\t"
|
"vfmaxdb %%v16,%%v16,%%v18,0\n\t"
|
||||||
"vfmaxdb %%v17,%%v17,%%v19,0\n\t"
|
"vfmaxdb %%v17,%%v17,%%v19,0\n\t"
|
||||||
|
|
||||||
"vfmaxdb %%v16,%%v16,%%v17,0\n\t"
|
"vfmaxdb %%v16,%%v16,%%v17,0\n\t"
|
||||||
|
|
||||||
"vfmaxdb %%v0,%%v0,%%v16,0\n\t"
|
"vfmaxdb %%v0,%%v0,%%v16,0\n\t"
|
||||||
|
|
||||||
"agfi %%r1, 256\n\t"
|
"agfi %%r1, 256\n\t"
|
||||||
"brctg %%r0, 0b \n\t"
|
"brctg %[n], 0b\n\t"
|
||||||
|
|
||||||
"vrepg %%v16,%%v0,1\n\t"
|
"vrepg %%v16,%%v0,1\n\t"
|
||||||
"wfmaxdb %%v0,%%v0,%%v16,0\n\t"
|
"wfmaxdb %%v0,%%v0,%%v16,0\n\t"
|
||||||
"ldr %0,%%f0 "
|
"ldr %[max],%%f0"
|
||||||
:"=f"(max)
|
: [max] "=f"(max),[n] "+&r"(n)
|
||||||
:"r"(n),"ZR"((const FLOAT (*)[n])x)
|
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
|
||||||
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
|
||||||
);
|
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
|
||||||
|
|
||||||
return max;
|
return max;
|
||||||
}
|
}
|
||||||
|
|
@ -95,7 +85,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
BLASLONG j = 0;
|
BLASLONG j = 0;
|
||||||
FLOAT maxf = 0.0;
|
FLOAT maxf = 0.0;
|
||||||
|
|
||||||
if (n <= 0 || inc_x <= 0) return (maxf);
|
if (n <= 0 || inc_x <= 0)
|
||||||
|
return (maxf);
|
||||||
|
|
||||||
if (inc_x == 1) {
|
if (inc_x == 1) {
|
||||||
|
|
||||||
|
|
@ -105,9 +96,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
maxf = dmax_kernel_32(n1, x);
|
maxf = dmax_kernel_32(n1, x);
|
||||||
|
|
||||||
i = n1;
|
i = n1;
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
maxf = x[0];
|
maxf = x[0];
|
||||||
i++;
|
i++;
|
||||||
}
|
}
|
||||||
|
|
@ -146,7 +135,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
while (j < n) {
|
while (j < n) {
|
||||||
if (x[i] > maxf) {
|
if (x[i] > maxf) {
|
||||||
maxf = x[i];
|
maxf = x[i];
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/***************************************************************************
|
/***************************************************************************
|
||||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
modification, are permitted provided that the following conditions are
|
modification, are permitted provided that the following conditions are
|
||||||
|
|
@ -27,26 +27,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x)
|
static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) {
|
||||||
{
|
|
||||||
FLOAT max;
|
FLOAT max;
|
||||||
|
|
||||||
__asm__ volatile (
|
__asm__("vl %%v0,0(%[x])\n\t"
|
||||||
"vl %%v0,0(%2) \n\t"
|
"srlg %[n],%[n],5\n\t"
|
||||||
"srlg %%r0,%1,5 \n\t"
|
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 1, 1024(%%r1,%2) \n\t"
|
"pfd 1, 1024(%%r1,%[x])\n\t"
|
||||||
|
"vl %%v16,0(%%r1,%[x])\n\t"
|
||||||
"vl %%v16,0(%%r1,%2) \n\t"
|
"vl %%v17,16(%%r1,%[x])\n\t"
|
||||||
"vl %%v17,16(%%r1,%2) \n\t"
|
"vl %%v18,32(%%r1,%[x])\n\t"
|
||||||
"vl %%v18,32(%%r1,%2) \n\t"
|
"vl %%v19,48(%%r1,%[x])\n\t"
|
||||||
"vl %%v19,48(%%r1,%2) \n\t"
|
"vl %%v20,64(%%r1,%[x])\n\t"
|
||||||
"vl %%v20,64(%%r1,%2) \n\t"
|
"vl %%v21,80(%%r1,%[x])\n\t"
|
||||||
"vl %%v21,80(%%r1,%2) \n\t"
|
"vl %%v22,96(%%r1,%[x])\n\t"
|
||||||
"vl %%v22,96(%%r1,%2) \n\t"
|
"vl %%v23,112(%%r1,%[x])\n\t"
|
||||||
"vl %%v23,112(%%r1,%2) \n\t"
|
|
||||||
|
|
||||||
"vfchdb %%v24,%%v16,%%v17\n\t"
|
"vfchdb %%v24,%%v16,%%v17\n\t"
|
||||||
"vfchdb %%v25,%%v18,%%v19\n\t"
|
"vfchdb %%v25,%%v18,%%v19\n\t"
|
||||||
"vfchdb %%v26,%%v20,%%v21\n\t"
|
"vfchdb %%v26,%%v20,%%v21\n\t"
|
||||||
|
|
@ -55,27 +51,22 @@ static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x)
|
||||||
"vsel %%v25,%%v18,%%v19,%%v25\n\t"
|
"vsel %%v25,%%v18,%%v19,%%v25\n\t"
|
||||||
"vsel %%v26,%%v20,%%v21,%%v26\n\t"
|
"vsel %%v26,%%v20,%%v21,%%v26\n\t"
|
||||||
"vsel %%v27,%%v22,%%v23,%%v27\n\t"
|
"vsel %%v27,%%v22,%%v23,%%v27\n\t"
|
||||||
|
|
||||||
"vfchdb %%v28,%%v24,%%v25\n\t"
|
"vfchdb %%v28,%%v24,%%v25\n\t"
|
||||||
"vfchdb %%v29,%%v26,%%v27\n\t"
|
"vfchdb %%v29,%%v26,%%v27\n\t"
|
||||||
"vsel %%v28,%%v24,%%v25,%%v28\n\t"
|
"vsel %%v28,%%v24,%%v25,%%v28\n\t"
|
||||||
"vsel %%v29,%%v26,%%v27,%%v29\n\t"
|
"vsel %%v29,%%v26,%%v27,%%v29\n\t"
|
||||||
|
|
||||||
"vfchdb %%v30,%%v28,%%v29\n\t"
|
"vfchdb %%v30,%%v28,%%v29\n\t"
|
||||||
"vsel %%v30,%%v28,%%v29,%%v30\n\t"
|
"vsel %%v30,%%v28,%%v29,%%v30\n\t"
|
||||||
|
|
||||||
"vfchdb %%v31,%%v30,%%v0\n\t"
|
"vfchdb %%v31,%%v30,%%v0\n\t"
|
||||||
"vsel %%v0,%%v30,%%v0,%%v31\n\t"
|
"vsel %%v0,%%v30,%%v0,%%v31\n\t"
|
||||||
|
"vl %%v16,128(%%r1,%[x])\n\t"
|
||||||
"vl %%v16,128(%%r1,%2) \n\t"
|
"vl %%v17,144(%%r1,%[x])\n\t"
|
||||||
"vl %%v17,144(%%r1,%2) \n\t"
|
"vl %%v18,160(%%r1,%[x])\n\t"
|
||||||
"vl %%v18,160(%%r1,%2) \n\t"
|
"vl %%v19,176(%%r1,%[x])\n\t"
|
||||||
"vl %%v19,176(%%r1,%2) \n\t"
|
"vl %%v20,192(%%r1,%[x])\n\t"
|
||||||
"vl %%v20,192(%%r1,%2) \n\t"
|
"vl %%v21,208(%%r1,%[x])\n\t"
|
||||||
"vl %%v21,208(%%r1,%2) \n\t"
|
"vl %%v22,224(%%r1,%[x])\n\t"
|
||||||
"vl %%v22,224(%%r1,%2) \n\t"
|
"vl %%v23,240(%%r1,%[x])\n\t"
|
||||||
"vl %%v23,240(%%r1,%2) \n\t"
|
|
||||||
|
|
||||||
"vfchdb %%v24,%%v16,%%v17\n\t"
|
"vfchdb %%v24,%%v16,%%v17\n\t"
|
||||||
"vfchdb %%v25,%%v18,%%v19\n\t"
|
"vfchdb %%v25,%%v18,%%v19\n\t"
|
||||||
"vfchdb %%v26,%%v20,%%v21\n\t"
|
"vfchdb %%v26,%%v20,%%v21\n\t"
|
||||||
|
|
@ -84,29 +75,24 @@ static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x)
|
||||||
"vsel %%v25,%%v18,%%v19,%%v25\n\t"
|
"vsel %%v25,%%v18,%%v19,%%v25\n\t"
|
||||||
"vsel %%v26,%%v20,%%v21,%%v26\n\t"
|
"vsel %%v26,%%v20,%%v21,%%v26\n\t"
|
||||||
"vsel %%v27,%%v22,%%v23,%%v27\n\t"
|
"vsel %%v27,%%v22,%%v23,%%v27\n\t"
|
||||||
|
|
||||||
"vfchdb %%v28,%%v24,%%v25\n\t"
|
"vfchdb %%v28,%%v24,%%v25\n\t"
|
||||||
"vfchdb %%v29,%%v26,%%v27\n\t"
|
"vfchdb %%v29,%%v26,%%v27\n\t"
|
||||||
"vsel %%v28,%%v24,%%v25,%%v28\n\t"
|
"vsel %%v28,%%v24,%%v25,%%v28\n\t"
|
||||||
"vsel %%v29,%%v26,%%v27,%%v29\n\t"
|
"vsel %%v29,%%v26,%%v27,%%v29\n\t"
|
||||||
|
|
||||||
"vfchdb %%v30,%%v28,%%v29\n\t"
|
"vfchdb %%v30,%%v28,%%v29\n\t"
|
||||||
"vsel %%v30,%%v28,%%v29,%%v30\n\t"
|
"vsel %%v30,%%v28,%%v29,%%v30\n\t"
|
||||||
|
|
||||||
"vfchdb %%v31,%%v30,%%v0\n\t"
|
"vfchdb %%v31,%%v30,%%v0\n\t"
|
||||||
"vsel %%v0,%%v30,%%v0,%%v31\n\t"
|
"vsel %%v0,%%v30,%%v0,%%v31\n\t"
|
||||||
|
|
||||||
"agfi %%r1, 256\n\t"
|
"agfi %%r1, 256\n\t"
|
||||||
"brctg %%r0, 0b \n\t"
|
"brctg %[n], 0b\n\t"
|
||||||
|
|
||||||
"vrepg %%v16,%%v0,1\n\t"
|
"vrepg %%v16,%%v0,1\n\t"
|
||||||
"wfchdb %%v17,%%v0,%%v16\n\t"
|
"wfchdb %%v17,%%v0,%%v16\n\t"
|
||||||
"vsel %%v0,%%v0,%%v16,%%v17\n\t"
|
"vsel %%v0,%%v0,%%v16,%%v17\n\t"
|
||||||
"ldr %0,%%f0 "
|
"ldr %[max],%%f0"
|
||||||
:"=f"(max)
|
: [max] "=f"(max),[n] "+&r"(n)
|
||||||
:"r"(n),"ZR"((const FLOAT (*)[n])x)
|
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
|
||||||
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
|
||||||
);
|
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
|
||||||
|
|
||||||
return max;
|
return max;
|
||||||
}
|
}
|
||||||
|
|
@ -116,7 +102,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
BLASLONG j = 0;
|
BLASLONG j = 0;
|
||||||
FLOAT maxf = 0.0;
|
FLOAT maxf = 0.0;
|
||||||
|
|
||||||
if (n <= 0 || inc_x <= 0) return (maxf);
|
if (n <= 0 || inc_x <= 0)
|
||||||
|
return (maxf);
|
||||||
|
|
||||||
if (inc_x == 1) {
|
if (inc_x == 1) {
|
||||||
|
|
||||||
|
|
@ -126,9 +113,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
maxf = dmax_kernel_32(n1, x);
|
maxf = dmax_kernel_32(n1, x);
|
||||||
|
|
||||||
i = n1;
|
i = n1;
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
maxf = x[0];
|
maxf = x[0];
|
||||||
i++;
|
i++;
|
||||||
}
|
}
|
||||||
|
|
@ -167,7 +152,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
while (j < n) {
|
while (j < n) {
|
||||||
if (x[i] > maxf) {
|
if (x[i] > maxf) {
|
||||||
maxf = x[i];
|
maxf = x[i];
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/***************************************************************************
|
/***************************************************************************
|
||||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
modification, are permitted provided that the following conditions are
|
modification, are permitted provided that the following conditions are
|
||||||
|
|
@ -27,34 +27,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x)
|
static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) {
|
||||||
{
|
|
||||||
FLOAT min;
|
FLOAT min;
|
||||||
|
|
||||||
__asm__ volatile (
|
__asm__("vl %%v0,0(%[x])\n\t"
|
||||||
"vl %%v0,0(%2) \n\t"
|
"srlg %[n],%[n],5\n\t"
|
||||||
"srlg %%r0,%1,5 \n\t"
|
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 1, 1024(%%r1,%2) \n\t"
|
"pfd 1, 1024(%%r1,%[x])\n\t"
|
||||||
|
"vl %%v16,0(%%r1,%[x])\n\t"
|
||||||
"vl %%v16,0(%%r1,%2) \n\t"
|
"vl %%v17,16(%%r1,%[x])\n\t"
|
||||||
"vl %%v17,16(%%r1,%2) \n\t"
|
"vl %%v18,32(%%r1,%[x])\n\t"
|
||||||
"vl %%v18,32(%%r1,%2) \n\t"
|
"vl %%v19,48(%%r1,%[x])\n\t"
|
||||||
"vl %%v19,48(%%r1,%2) \n\t"
|
"vl %%v20,64(%%r1,%[x])\n\t"
|
||||||
"vl %%v20,64(%%r1,%2) \n\t"
|
"vl %%v21,80(%%r1,%[x])\n\t"
|
||||||
"vl %%v21,80(%%r1,%2) \n\t"
|
"vl %%v22,96(%%r1,%[x])\n\t"
|
||||||
"vl %%v22,96(%%r1,%2) \n\t"
|
"vl %%v23,112(%%r1,%[x])\n\t"
|
||||||
"vl %%v23,112(%%r1,%2) \n\t"
|
"vl %%v24,128(%%r1,%[x])\n\t"
|
||||||
"vl %%v24,128(%%r1,%2) \n\t"
|
"vl %%v25,144(%%r1,%[x])\n\t"
|
||||||
"vl %%v25,144(%%r1,%2) \n\t"
|
"vl %%v26,160(%%r1,%[x])\n\t"
|
||||||
"vl %%v26,160(%%r1,%2) \n\t"
|
"vl %%v27,176(%%r1,%[x])\n\t"
|
||||||
"vl %%v27,176(%%r1,%2) \n\t"
|
"vl %%v28,192(%%r1,%[x])\n\t"
|
||||||
"vl %%v28,192(%%r1,%2) \n\t"
|
"vl %%v29,208(%%r1,%[x])\n\t"
|
||||||
"vl %%v29,208(%%r1,%2) \n\t"
|
"vl %%v30,224(%%r1,%[x])\n\t"
|
||||||
"vl %%v30,224(%%r1,%2) \n\t"
|
"vl %%v31,240(%%r1,%[x])\n\t"
|
||||||
"vl %%v31,240(%%r1,%2) \n\t"
|
|
||||||
|
|
||||||
"vfmindb %%v16,%%v16,%%v24,0\n\t"
|
"vfmindb %%v16,%%v16,%%v24,0\n\t"
|
||||||
"vfmindb %%v17,%%v17,%%v25,0\n\t"
|
"vfmindb %%v17,%%v17,%%v25,0\n\t"
|
||||||
"vfmindb %%v18,%%v18,%%v26,0\n\t"
|
"vfmindb %%v18,%%v18,%%v26,0\n\t"
|
||||||
|
|
@ -63,29 +59,23 @@ static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x)
|
||||||
"vfmindb %%v21,%%v21,%%v29,0\n\t"
|
"vfmindb %%v21,%%v21,%%v29,0\n\t"
|
||||||
"vfmindb %%v22,%%v22,%%v30,0\n\t"
|
"vfmindb %%v22,%%v22,%%v30,0\n\t"
|
||||||
"vfmindb %%v23,%%v23,%%v31,0\n\t"
|
"vfmindb %%v23,%%v23,%%v31,0\n\t"
|
||||||
|
|
||||||
"vfmindb %%v16,%%v16,%%v20,0\n\t"
|
"vfmindb %%v16,%%v16,%%v20,0\n\t"
|
||||||
"vfmindb %%v17,%%v17,%%v21,0\n\t"
|
"vfmindb %%v17,%%v17,%%v21,0\n\t"
|
||||||
"vfmindb %%v18,%%v18,%%v22,0\n\t"
|
"vfmindb %%v18,%%v18,%%v22,0\n\t"
|
||||||
"vfmindb %%v19,%%v19,%%v23,0\n\t"
|
"vfmindb %%v19,%%v19,%%v23,0\n\t"
|
||||||
|
|
||||||
"vfmindb %%v16,%%v16,%%v18,0\n\t"
|
"vfmindb %%v16,%%v16,%%v18,0\n\t"
|
||||||
"vfmindb %%v17,%%v17,%%v19,0\n\t"
|
"vfmindb %%v17,%%v17,%%v19,0\n\t"
|
||||||
|
|
||||||
"vfmindb %%v16,%%v16,%%v17,0\n\t"
|
"vfmindb %%v16,%%v16,%%v17,0\n\t"
|
||||||
|
|
||||||
"vfmindb %%v0,%%v0,%%v16,0\n\t"
|
"vfmindb %%v0,%%v0,%%v16,0\n\t"
|
||||||
|
|
||||||
"agfi %%r1, 256\n\t"
|
"agfi %%r1, 256\n\t"
|
||||||
"brctg %%r0, 0b \n\t"
|
"brctg %[n], 0b\n\t"
|
||||||
|
|
||||||
"vrepg %%v16,%%v0,1\n\t"
|
"vrepg %%v16,%%v0,1\n\t"
|
||||||
"wfmindb %%v0,%%v0,%%v16,0\n\t"
|
"wfmindb %%v0,%%v0,%%v16,0\n\t"
|
||||||
"ldr %0,%%f0 "
|
"ldr %[min],%%f0"
|
||||||
:"=f"(min)
|
: [min] "=f"(min),[n] "+&r"(n)
|
||||||
:"r"(n),"ZR"((const FLOAT (*)[n])x)
|
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
|
||||||
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
|
||||||
);
|
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
|
||||||
|
|
||||||
return min;
|
return min;
|
||||||
}
|
}
|
||||||
|
|
@ -95,7 +85,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
BLASLONG j = 0;
|
BLASLONG j = 0;
|
||||||
FLOAT minf = 0.0;
|
FLOAT minf = 0.0;
|
||||||
|
|
||||||
if (n <= 0 || inc_x <= 0) return (minf);
|
if (n <= 0 || inc_x <= 0)
|
||||||
|
return (minf);
|
||||||
|
|
||||||
if (inc_x == 1) {
|
if (inc_x == 1) {
|
||||||
|
|
||||||
|
|
@ -105,9 +96,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
minf = dmin_kernel_32(n1, x);
|
minf = dmin_kernel_32(n1, x);
|
||||||
|
|
||||||
i = n1;
|
i = n1;
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
minf = x[0];
|
minf = x[0];
|
||||||
i++;
|
i++;
|
||||||
}
|
}
|
||||||
|
|
@ -146,7 +135,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
while (j < n) {
|
while (j < n) {
|
||||||
if (x[i] < minf) {
|
if (x[i] < minf) {
|
||||||
minf = x[i];
|
minf = x[i];
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/***************************************************************************
|
/***************************************************************************
|
||||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
modification, are permitted provided that the following conditions are
|
modification, are permitted provided that the following conditions are
|
||||||
|
|
@ -27,26 +27,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x)
|
static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) {
|
||||||
{
|
|
||||||
FLOAT min;
|
FLOAT min;
|
||||||
|
|
||||||
__asm__ volatile (
|
__asm__("vl %%v0,0(%[x])\n\t"
|
||||||
"vl %%v0,0(%2) \n\t"
|
"srlg %[n],%[n],5\n\t"
|
||||||
"srlg %%r0,%1,5 \n\t"
|
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 1, 1024(%%r1,%2) \n\t"
|
"pfd 1, 1024(%%r1,%[x])\n\t"
|
||||||
|
"vl %%v16,0(%%r1,%[x])\n\t"
|
||||||
"vl %%v16,0(%%r1,%2) \n\t"
|
"vl %%v17,16(%%r1,%[x])\n\t"
|
||||||
"vl %%v17,16(%%r1,%2) \n\t"
|
"vl %%v18,32(%%r1,%[x])\n\t"
|
||||||
"vl %%v18,32(%%r1,%2) \n\t"
|
"vl %%v19,48(%%r1,%[x])\n\t"
|
||||||
"vl %%v19,48(%%r1,%2) \n\t"
|
"vl %%v20,64(%%r1,%[x])\n\t"
|
||||||
"vl %%v20,64(%%r1,%2) \n\t"
|
"vl %%v21,80(%%r1,%[x])\n\t"
|
||||||
"vl %%v21,80(%%r1,%2) \n\t"
|
"vl %%v22,96(%%r1,%[x])\n\t"
|
||||||
"vl %%v22,96(%%r1,%2) \n\t"
|
"vl %%v23,112(%%r1,%[x])\n\t"
|
||||||
"vl %%v23,112(%%r1,%2) \n\t"
|
|
||||||
|
|
||||||
"vfchdb %%v24,%%v17,%%v16\n\t"
|
"vfchdb %%v24,%%v17,%%v16\n\t"
|
||||||
"vfchdb %%v25,%%v19,%%v18\n\t"
|
"vfchdb %%v25,%%v19,%%v18\n\t"
|
||||||
"vfchdb %%v26,%%v21,%%v20\n\t"
|
"vfchdb %%v26,%%v21,%%v20\n\t"
|
||||||
|
|
@ -55,27 +51,22 @@ static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x)
|
||||||
"vsel %%v25,%%v18,%%v19,%%v25\n\t"
|
"vsel %%v25,%%v18,%%v19,%%v25\n\t"
|
||||||
"vsel %%v26,%%v20,%%v21,%%v26\n\t"
|
"vsel %%v26,%%v20,%%v21,%%v26\n\t"
|
||||||
"vsel %%v27,%%v22,%%v23,%%v27\n\t"
|
"vsel %%v27,%%v22,%%v23,%%v27\n\t"
|
||||||
|
|
||||||
"vfchdb %%v28,%%v25,%%v24\n\t"
|
"vfchdb %%v28,%%v25,%%v24\n\t"
|
||||||
"vfchdb %%v29,%%v27,%%v26\n\t"
|
"vfchdb %%v29,%%v27,%%v26\n\t"
|
||||||
"vsel %%v28,%%v24,%%v25,%%v28\n\t"
|
"vsel %%v28,%%v24,%%v25,%%v28\n\t"
|
||||||
"vsel %%v29,%%v26,%%v27,%%v29\n\t"
|
"vsel %%v29,%%v26,%%v27,%%v29\n\t"
|
||||||
|
|
||||||
"vfchdb %%v30,%%v29,%%v28\n\t"
|
"vfchdb %%v30,%%v29,%%v28\n\t"
|
||||||
"vsel %%v30,%%v28,%%v29,%%v30\n\t"
|
"vsel %%v30,%%v28,%%v29,%%v30\n\t"
|
||||||
|
|
||||||
"vfchdb %%v31,%%v0,%%v30\n\t"
|
"vfchdb %%v31,%%v0,%%v30\n\t"
|
||||||
"vsel %%v0,%%v30,%%v0,%%v31\n\t"
|
"vsel %%v0,%%v30,%%v0,%%v31\n\t"
|
||||||
|
"vl %%v16,128(%%r1,%[x])\n\t"
|
||||||
"vl %%v16,128(%%r1,%2) \n\t"
|
"vl %%v17,144(%%r1,%[x])\n\t"
|
||||||
"vl %%v17,144(%%r1,%2) \n\t"
|
"vl %%v18,160(%%r1,%[x])\n\t"
|
||||||
"vl %%v18,160(%%r1,%2) \n\t"
|
"vl %%v19,176(%%r1,%[x])\n\t"
|
||||||
"vl %%v19,176(%%r1,%2) \n\t"
|
"vl %%v20,192(%%r1,%[x])\n\t"
|
||||||
"vl %%v20,192(%%r1,%2) \n\t"
|
"vl %%v21,208(%%r1,%[x])\n\t"
|
||||||
"vl %%v21,208(%%r1,%2) \n\t"
|
"vl %%v22,224(%%r1,%[x])\n\t"
|
||||||
"vl %%v22,224(%%r1,%2) \n\t"
|
"vl %%v23,240(%%r1,%[x])\n\t"
|
||||||
"vl %%v23,240(%%r1,%2) \n\t"
|
|
||||||
|
|
||||||
"vfchdb %%v24,%%v17,%%v16\n\t"
|
"vfchdb %%v24,%%v17,%%v16\n\t"
|
||||||
"vfchdb %%v25,%%v19,%%v18\n\t"
|
"vfchdb %%v25,%%v19,%%v18\n\t"
|
||||||
"vfchdb %%v26,%%v21,%%v20\n\t"
|
"vfchdb %%v26,%%v21,%%v20\n\t"
|
||||||
|
|
@ -84,29 +75,24 @@ static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x)
|
||||||
"vsel %%v25,%%v18,%%v19,%%v25\n\t"
|
"vsel %%v25,%%v18,%%v19,%%v25\n\t"
|
||||||
"vsel %%v26,%%v20,%%v21,%%v26\n\t"
|
"vsel %%v26,%%v20,%%v21,%%v26\n\t"
|
||||||
"vsel %%v27,%%v22,%%v23,%%v27\n\t"
|
"vsel %%v27,%%v22,%%v23,%%v27\n\t"
|
||||||
|
|
||||||
"vfchdb %%v28,%%v25,%%v24\n\t"
|
"vfchdb %%v28,%%v25,%%v24\n\t"
|
||||||
"vfchdb %%v29,%%v27,%%v26\n\t"
|
"vfchdb %%v29,%%v27,%%v26\n\t"
|
||||||
"vsel %%v28,%%v24,%%v25,%%v28\n\t"
|
"vsel %%v28,%%v24,%%v25,%%v28\n\t"
|
||||||
"vsel %%v29,%%v26,%%v27,%%v29\n\t"
|
"vsel %%v29,%%v26,%%v27,%%v29\n\t"
|
||||||
|
|
||||||
"vfchdb %%v30,%%v29,%%v28\n\t"
|
"vfchdb %%v30,%%v29,%%v28\n\t"
|
||||||
"vsel %%v30,%%v28,%%v29,%%v30\n\t"
|
"vsel %%v30,%%v28,%%v29,%%v30\n\t"
|
||||||
|
|
||||||
"vfchdb %%v31,%%v0,%%v30\n\t"
|
"vfchdb %%v31,%%v0,%%v30\n\t"
|
||||||
"vsel %%v0,%%v30,%%v0,%%v31\n\t"
|
"vsel %%v0,%%v30,%%v0,%%v31\n\t"
|
||||||
|
|
||||||
"agfi %%r1, 256\n\t"
|
"agfi %%r1, 256\n\t"
|
||||||
"brctg %%r0, 0b \n\t"
|
"brctg %[n], 0b\n\t"
|
||||||
|
|
||||||
"vrepg %%v16,%%v0,1\n\t"
|
"vrepg %%v16,%%v0,1\n\t"
|
||||||
"wfchdb %%v17,%%v16,%%v0\n\t"
|
"wfchdb %%v17,%%v16,%%v0\n\t"
|
||||||
"vsel %%v0,%%v0,%%v16,%%v17\n\t"
|
"vsel %%v0,%%v0,%%v16,%%v17\n\t"
|
||||||
"ldr %0,%%f0 "
|
"ldr %[min],%%f0"
|
||||||
:"=f"(min)
|
: [min] "=f"(min),[n] "+&r"(n)
|
||||||
:"r"(n),"ZR"((const FLOAT (*)[n])x)
|
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
|
||||||
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
|
||||||
);
|
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
|
||||||
|
|
||||||
return min;
|
return min;
|
||||||
}
|
}
|
||||||
|
|
@ -116,7 +102,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
BLASLONG j = 0;
|
BLASLONG j = 0;
|
||||||
FLOAT minf = 0.0;
|
FLOAT minf = 0.0;
|
||||||
|
|
||||||
if (n <= 0 || inc_x <= 0) return (minf);
|
if (n <= 0 || inc_x <= 0)
|
||||||
|
return (minf);
|
||||||
|
|
||||||
if (inc_x == 1) {
|
if (inc_x == 1) {
|
||||||
|
|
||||||
|
|
@ -126,9 +113,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
minf = dmin_kernel_32(n1, x);
|
minf = dmin_kernel_32(n1, x);
|
||||||
|
|
||||||
i = n1;
|
i = n1;
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
minf = x[0];
|
minf = x[0];
|
||||||
i++;
|
i++;
|
||||||
}
|
}
|
||||||
|
|
@ -167,7 +152,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
while (j < n) {
|
while (j < n) {
|
||||||
if (x[i] < minf) {
|
if (x[i] < minf) {
|
||||||
minf = x[i];
|
minf = x[i];
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/***************************************************************************
|
/***************************************************************************
|
||||||
Copyright (c) 2013-2018, The OpenBLAS Project
|
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
modification, are permitted provided that the following conditions are
|
modification, are permitted provided that the following conditions are
|
||||||
|
|
@ -27,25 +27,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
|
static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) {
|
||||||
{
|
__asm__("vlrepg %%v0,%[c]\n\t"
|
||||||
__asm__ (
|
"vlrepg %%v1,%[s]\n\t"
|
||||||
"vlrepg %%v0,%3 \n\t"
|
"srlg %[n],%[n],5\n\t"
|
||||||
"vlrepg %%v1,%4 \n\t"
|
|
||||||
"srlg %%r0,%0,5 \n\t"
|
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 2, 1024(%%r1,%1) \n\t"
|
"pfd 2, 1024(%%r1,%[x])\n\t"
|
||||||
"pfd 2, 1024(%%r1,%2) \n\t"
|
"pfd 2, 1024(%%r1,%[y])\n\t"
|
||||||
"vl %%v24, 0(%%r1,%1) \n\t"
|
"vl %%v24, 0(%%r1,%[x])\n\t"
|
||||||
"vl %%v25, 16(%%r1,%1) \n\t"
|
"vl %%v25, 16(%%r1,%[x])\n\t"
|
||||||
"vl %%v26, 32(%%r1,%1) \n\t"
|
"vl %%v26, 32(%%r1,%[x])\n\t"
|
||||||
"vl %%v27, 48(%%r1,%1) \n\t"
|
"vl %%v27, 48(%%r1,%[x])\n\t"
|
||||||
"vl %%v16, 0(%%r1,%2) \n\t"
|
"vl %%v16, 0(%%r1,%[y])\n\t"
|
||||||
"vl %%v17, 16(%%r1,%2) \n\t"
|
"vl %%v17, 16(%%r1,%[y])\n\t"
|
||||||
"vl %%v18, 32(%%r1,%2) \n\t"
|
"vl %%v18, 32(%%r1,%[y])\n\t"
|
||||||
"vl %%v19, 48(%%r1,%2) \n\t"
|
"vl %%v19, 48(%%r1,%[y])\n\t"
|
||||||
|
|
||||||
"vfmdb %%v28,%%v24,%%v0\n\t"
|
"vfmdb %%v28,%%v24,%%v0\n\t"
|
||||||
"vfmdb %%v29,%%v25,%%v0\n\t"
|
"vfmdb %%v29,%%v25,%%v0\n\t"
|
||||||
"vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
|
"vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
|
||||||
|
|
@ -63,25 +60,22 @@ static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
|
||||||
"vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
|
"vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
|
||||||
"vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
|
"vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
|
||||||
"vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
|
"vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
|
||||||
|
"vst %%v28, 0(%%r1,%[x])\n\t"
|
||||||
"vst %%v28, 0(%%r1,%1) \n\t"
|
"vst %%v29, 16(%%r1,%[x])\n\t"
|
||||||
"vst %%v29, 16(%%r1,%1) \n\t"
|
"vst %%v30, 32(%%r1,%[x])\n\t"
|
||||||
"vst %%v30, 32(%%r1,%1) \n\t"
|
"vst %%v31, 48(%%r1,%[x])\n\t"
|
||||||
"vst %%v31, 48(%%r1,%1) \n\t"
|
"vst %%v20, 0(%%r1,%[y])\n\t"
|
||||||
"vst %%v20, 0(%%r1,%2) \n\t"
|
"vst %%v21, 16(%%r1,%[y])\n\t"
|
||||||
"vst %%v21, 16(%%r1,%2) \n\t"
|
"vst %%v22, 32(%%r1,%[y])\n\t"
|
||||||
"vst %%v22, 32(%%r1,%2) \n\t"
|
"vst %%v23, 48(%%r1,%[y])\n\t"
|
||||||
"vst %%v23, 48(%%r1,%2) \n\t"
|
"vl %%v24, 64(%%r1,%[x])\n\t"
|
||||||
|
"vl %%v25, 80(%%r1,%[x])\n\t"
|
||||||
"vl %%v24, 64(%%r1,%1) \n\t"
|
"vl %%v26, 96(%%r1,%[x])\n\t"
|
||||||
"vl %%v25, 80(%%r1,%1) \n\t"
|
"vl %%v27, 112(%%r1,%[x])\n\t"
|
||||||
"vl %%v26, 96(%%r1,%1) \n\t"
|
"vl %%v16, 64(%%r1,%[y])\n\t"
|
||||||
"vl %%v27, 112(%%r1,%1) \n\t"
|
"vl %%v17, 80(%%r1,%[y])\n\t"
|
||||||
"vl %%v16, 64(%%r1,%2) \n\t"
|
"vl %%v18, 96(%%r1,%[y])\n\t"
|
||||||
"vl %%v17, 80(%%r1,%2) \n\t"
|
"vl %%v19, 112(%%r1,%[y])\n\t"
|
||||||
"vl %%v18, 96(%%r1,%2) \n\t"
|
|
||||||
"vl %%v19, 112(%%r1,%2) \n\t"
|
|
||||||
|
|
||||||
"vfmdb %%v28,%%v24,%%v0\n\t"
|
"vfmdb %%v28,%%v24,%%v0\n\t"
|
||||||
"vfmdb %%v29,%%v25,%%v0\n\t"
|
"vfmdb %%v29,%%v25,%%v0\n\t"
|
||||||
"vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
|
"vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
|
||||||
|
|
@ -99,25 +93,22 @@ static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
|
||||||
"vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
|
"vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
|
||||||
"vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
|
"vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
|
||||||
"vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
|
"vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
|
||||||
|
"vst %%v28, 64(%%r1,%[x])\n\t"
|
||||||
"vst %%v28, 64(%%r1,%1) \n\t"
|
"vst %%v29, 80(%%r1,%[x])\n\t"
|
||||||
"vst %%v29, 80(%%r1,%1) \n\t"
|
"vst %%v30, 96(%%r1,%[x])\n\t"
|
||||||
"vst %%v30, 96(%%r1,%1) \n\t"
|
"vst %%v31, 112(%%r1,%[x])\n\t"
|
||||||
"vst %%v31, 112(%%r1,%1) \n\t"
|
"vst %%v20, 64(%%r1,%[y])\n\t"
|
||||||
"vst %%v20, 64(%%r1,%2) \n\t"
|
"vst %%v21, 80(%%r1,%[y])\n\t"
|
||||||
"vst %%v21, 80(%%r1,%2) \n\t"
|
"vst %%v22, 96(%%r1,%[y])\n\t"
|
||||||
"vst %%v22, 96(%%r1,%2) \n\t"
|
"vst %%v23, 112(%%r1,%[y])\n\t"
|
||||||
"vst %%v23, 112(%%r1,%2) \n\t"
|
"vl %%v24, 128(%%r1,%[x])\n\t"
|
||||||
|
"vl %%v25, 144(%%r1,%[x])\n\t"
|
||||||
"vl %%v24, 128(%%r1,%1) \n\t"
|
"vl %%v26, 160(%%r1,%[x])\n\t"
|
||||||
"vl %%v25, 144(%%r1,%1) \n\t"
|
"vl %%v27, 176(%%r1,%[x])\n\t"
|
||||||
"vl %%v26, 160(%%r1,%1) \n\t"
|
"vl %%v16, 128(%%r1,%[y])\n\t"
|
||||||
"vl %%v27, 176(%%r1,%1) \n\t"
|
"vl %%v17, 144(%%r1,%[y])\n\t"
|
||||||
"vl %%v16, 128(%%r1,%2) \n\t"
|
"vl %%v18, 160(%%r1,%[y])\n\t"
|
||||||
"vl %%v17, 144(%%r1,%2) \n\t"
|
"vl %%v19, 176(%%r1,%[y])\n\t"
|
||||||
"vl %%v18, 160(%%r1,%2) \n\t"
|
|
||||||
"vl %%v19, 176(%%r1,%2) \n\t"
|
|
||||||
|
|
||||||
"vfmdb %%v28,%%v24,%%v0\n\t"
|
"vfmdb %%v28,%%v24,%%v0\n\t"
|
||||||
"vfmdb %%v29,%%v25,%%v0\n\t"
|
"vfmdb %%v29,%%v25,%%v0\n\t"
|
||||||
"vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
|
"vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
|
||||||
|
|
@ -135,25 +126,22 @@ static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
|
||||||
"vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
|
"vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
|
||||||
"vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
|
"vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
|
||||||
"vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
|
"vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
|
||||||
|
"vst %%v28, 128(%%r1,%[x])\n\t"
|
||||||
"vst %%v28, 128(%%r1,%1) \n\t"
|
"vst %%v29, 144(%%r1,%[x])\n\t"
|
||||||
"vst %%v29, 144(%%r1,%1) \n\t"
|
"vst %%v30, 160(%%r1,%[x])\n\t"
|
||||||
"vst %%v30, 160(%%r1,%1) \n\t"
|
"vst %%v31, 176(%%r1,%[x])\n\t"
|
||||||
"vst %%v31, 176(%%r1,%1) \n\t"
|
"vst %%v20, 128(%%r1,%[y])\n\t"
|
||||||
"vst %%v20, 128(%%r1,%2) \n\t"
|
"vst %%v21, 144(%%r1,%[y])\n\t"
|
||||||
"vst %%v21, 144(%%r1,%2) \n\t"
|
"vst %%v22, 160(%%r1,%[y])\n\t"
|
||||||
"vst %%v22, 160(%%r1,%2) \n\t"
|
"vst %%v23, 176(%%r1,%[y])\n\t"
|
||||||
"vst %%v23, 176(%%r1,%2) \n\t"
|
"vl %%v24, 192(%%r1,%[x])\n\t"
|
||||||
|
"vl %%v25, 208(%%r1,%[x])\n\t"
|
||||||
"vl %%v24, 192(%%r1,%1) \n\t"
|
"vl %%v26, 224(%%r1,%[x])\n\t"
|
||||||
"vl %%v25, 208(%%r1,%1) \n\t"
|
"vl %%v27, 240(%%r1,%[x])\n\t"
|
||||||
"vl %%v26, 224(%%r1,%1) \n\t"
|
"vl %%v16, 192(%%r1,%[y])\n\t"
|
||||||
"vl %%v27, 240(%%r1,%1) \n\t"
|
"vl %%v17, 208(%%r1,%[y])\n\t"
|
||||||
"vl %%v16, 192(%%r1,%2) \n\t"
|
"vl %%v18, 224(%%r1,%[y])\n\t"
|
||||||
"vl %%v17, 208(%%r1,%2) \n\t"
|
"vl %%v19, 240(%%r1,%[y])\n\t"
|
||||||
"vl %%v18, 224(%%r1,%2) \n\t"
|
|
||||||
"vl %%v19, 240(%%r1,%2) \n\t"
|
|
||||||
|
|
||||||
"vfmdb %%v28,%%v24,%%v0\n\t"
|
"vfmdb %%v28,%%v24,%%v0\n\t"
|
||||||
"vfmdb %%v29,%%v25,%%v0\n\t"
|
"vfmdb %%v29,%%v25,%%v0\n\t"
|
||||||
"vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
|
"vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
|
||||||
|
|
@ -171,39 +159,38 @@ static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
|
||||||
"vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
|
"vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
|
||||||
"vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
|
"vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
|
||||||
"vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
|
"vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
|
||||||
|
"vst %%v28, 192(%%r1,%[x])\n\t"
|
||||||
"vst %%v28, 192(%%r1,%1) \n\t"
|
"vst %%v29, 208(%%r1,%[x])\n\t"
|
||||||
"vst %%v29, 208(%%r1,%1) \n\t"
|
"vst %%v30, 224(%%r1,%[x])\n\t"
|
||||||
"vst %%v30, 224(%%r1,%1) \n\t"
|
"vst %%v31, 240(%%r1,%[x])\n\t"
|
||||||
"vst %%v31, 240(%%r1,%1) \n\t"
|
"vst %%v20, 192(%%r1,%[y])\n\t"
|
||||||
"vst %%v20, 192(%%r1,%2) \n\t"
|
"vst %%v21, 208(%%r1,%[y])\n\t"
|
||||||
"vst %%v21, 208(%%r1,%2) \n\t"
|
"vst %%v22, 224(%%r1,%[y])\n\t"
|
||||||
"vst %%v22, 224(%%r1,%2) \n\t"
|
"vst %%v23, 240(%%r1,%[y])\n\t"
|
||||||
"vst %%v23, 240(%%r1,%2) \n\t"
|
|
||||||
|
|
||||||
"agfi %%r1,256\n\t"
|
"agfi %%r1,256\n\t"
|
||||||
"brctg %%r0,0b "
|
"brctg %[n],0b"
|
||||||
:
|
: "+m"(*(struct { FLOAT x[n]; } *) x), "+m"(*(struct { FLOAT x[n]; } *) y),
|
||||||
:"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y),"m"(*c),"m"(*s)
|
[n] "+&r"(n)
|
||||||
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
: [x] "a"(x),[y] "a"(y),[c] "Q"(*c),[s] "Q"(*s)
|
||||||
);
|
: "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
|
||||||
|
"v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
|
||||||
|
"v31");
|
||||||
}
|
}
|
||||||
|
|
||||||
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
|
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
|
||||||
{
|
FLOAT c, FLOAT s) {
|
||||||
BLASLONG i = 0;
|
BLASLONG i = 0;
|
||||||
BLASLONG ix = 0, iy = 0;
|
BLASLONG ix = 0, iy = 0;
|
||||||
|
|
||||||
FLOAT temp;
|
FLOAT temp;
|
||||||
|
|
||||||
if ( n <= 0 ) return(0);
|
if (n <= 0)
|
||||||
|
return (0);
|
||||||
|
|
||||||
if ( (inc_x == 1) && (inc_y == 1) )
|
if ((inc_x == 1) && (inc_y == 1)) {
|
||||||
{
|
|
||||||
|
|
||||||
BLASLONG n1 = n & -32;
|
BLASLONG n1 = n & -32;
|
||||||
if ( n1 > 0 )
|
if (n1 > 0) {
|
||||||
{
|
|
||||||
FLOAT cosa, sina;
|
FLOAT cosa, sina;
|
||||||
cosa = c;
|
cosa = c;
|
||||||
sina = s;
|
sina = s;
|
||||||
|
|
@ -211,8 +198,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
|
||||||
i = n1;
|
i = n1;
|
||||||
}
|
}
|
||||||
|
|
||||||
while(i < n)
|
while (i < n) {
|
||||||
{
|
|
||||||
temp = c * x[i] + s * y[i];
|
temp = c * x[i] + s * y[i];
|
||||||
y[i] = c * y[i] - s * x[i];
|
y[i] = c * y[i] - s * x[i];
|
||||||
x[i] = temp;
|
x[i] = temp;
|
||||||
|
|
@ -221,13 +207,9 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
} else {
|
||||||
|
|
||||||
}
|
while (i < n) {
|
||||||
else
|
|
||||||
{
|
|
||||||
|
|
||||||
while(i < n)
|
|
||||||
{
|
|
||||||
temp = c * x[ix] + s * y[iy];
|
temp = c * x[ix] + s * y[iy];
|
||||||
y[iy] = c * y[iy] - s * x[ix];
|
y[iy] = c * y[iy] - s * x[ix];
|
||||||
x[ix] = temp;
|
x[ix] = temp;
|
||||||
|
|
@ -242,5 +224,3 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
|
||||||
return (0);
|
return (0);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/***************************************************************************
|
/***************************************************************************
|
||||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
modification, are permitted provided that the following conditions are
|
modification, are permitted provided that the following conditions are
|
||||||
|
|
@ -27,128 +27,106 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
static void dscal_kernel_16(BLASLONG n, FLOAT da, FLOAT *x)
|
static void dscal_kernel_16(BLASLONG n, FLOAT da, FLOAT *x) {
|
||||||
{
|
__asm__("vlrepg %%v0,%[da]\n\t"
|
||||||
__asm__ volatile (
|
"srlg %[n],%[n],4\n\t"
|
||||||
"vlrepg %%v0,%1 \n\t"
|
|
||||||
"srlg %%r0,%0,4 \n\t"
|
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 2, 1024(%%r1,%2) \n\t"
|
"pfd 2, 1024(%%r1,%[x])\n\t"
|
||||||
"vl %%v24, 0(%%r1,%2) \n\t"
|
"vl %%v24,0(%%r1,%[x])\n\t"
|
||||||
"vfmdb %%v24,%%v24,%%v0\n\t"
|
"vfmdb %%v24,%%v24,%%v0\n\t"
|
||||||
"vst %%v24, 0(%%r1,%2) \n\t"
|
"vst %%v24,0(%%r1,%[x])\n\t"
|
||||||
"vl %%v25, 16(%%r1,%2) \n\t"
|
"vl %%v25,16(%%r1,%[x])\n\t"
|
||||||
"vfmdb %%v25,%%v25,%%v0\n\t"
|
"vfmdb %%v25,%%v25,%%v0\n\t"
|
||||||
"vst %%v25, 16(%%r1,%2) \n\t"
|
"vst %%v25,16(%%r1,%[x])\n\t"
|
||||||
"vl %%v26, 32(%%r1,%2) \n\t"
|
"vl %%v26,32(%%r1,%[x])\n\t"
|
||||||
"vfmdb %%v26,%%v26,%%v0\n\t"
|
"vfmdb %%v26,%%v26,%%v0\n\t"
|
||||||
"vst %%v26, 32(%%r1,%2) \n\t"
|
"vst %%v26,32(%%r1,%[x])\n\t"
|
||||||
"vl %%v27, 48(%%r1,%2) \n\t"
|
"vl %%v27,48(%%r1,%[x])\n\t"
|
||||||
"vfmdb %%v27,%%v27,%%v0\n\t"
|
"vfmdb %%v27,%%v27,%%v0\n\t"
|
||||||
"vst %%v27, 48(%%r1,%2) \n\t"
|
"vst %%v27,48(%%r1,%[x])\n\t"
|
||||||
"vl %%v24, 64(%%r1,%2) \n\t"
|
"vl %%v28,64(%%r1,%[x])\n\t"
|
||||||
"vfmdb %%v24,%%v24,%%v0 \n\t"
|
"vfmdb %%v28,%%v28,%%v0\n\t"
|
||||||
"vst %%v24, 64(%%r1,%2) \n\t"
|
"vst %%v28,64(%%r1,%[x])\n\t"
|
||||||
"vl %%v25, 80(%%r1,%2) \n\t"
|
"vl %%v29,80(%%r1,%[x])\n\t"
|
||||||
"vfmdb %%v25,%%v25,%%v0 \n\t"
|
"vfmdb %%v29,%%v29,%%v0\n\t"
|
||||||
"vst %%v25, 80(%%r1,%2) \n\t"
|
"vst %%v29,80(%%r1,%[x])\n\t"
|
||||||
"vl %%v26, 96(%%r1,%2) \n\t"
|
"vl %%v30,96(%%r1,%[x])\n\t"
|
||||||
"vfmdb %%v26,%%v26,%%v0 \n\t"
|
"vfmdb %%v30,%%v30,%%v0\n\t"
|
||||||
"vst %%v26, 96(%%r1,%2) \n\t"
|
"vst %%v30,96(%%r1,%[x])\n\t"
|
||||||
"vl %%v27, 112(%%r1,%2) \n\t"
|
"vl %%v31,112(%%r1,%[x])\n\t"
|
||||||
"vfmdb %%v27,%%v27,%%v0 \n\t"
|
"vfmdb %%v31,%%v31,%%v0\n\t"
|
||||||
"vst %%v27, 112(%%r1,%2) \n\t"
|
"vst %%v31,112(%%r1,%[x])\n\t"
|
||||||
"agfi %%r1,128\n\t"
|
"agfi %%r1,128\n\t"
|
||||||
"brctg %%r0,0b "
|
"brctg %[n],0b"
|
||||||
:
|
: "+m"(*(struct { FLOAT x[n]; } *) x),[n] "+&r"(n)
|
||||||
:"r"(n),"m"(da),"ZR"((FLOAT (*)[n])x)
|
: [x] "a"(x),[da] "Q"(da)
|
||||||
:"memory","cc","r0","r1","v0","v24","v25","v26","v27"
|
: "cc", "r1", "v0", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
|
||||||
);
|
"v31");
|
||||||
}
|
}
|
||||||
|
|
||||||
static void dscal_kernel_16_zero(BLASLONG n, FLOAT *x)
|
static void dscal_kernel_16_zero(BLASLONG n, FLOAT *x) {
|
||||||
{
|
__asm__("vzero %%v0\n\t"
|
||||||
__asm__ volatile(
|
"srlg %[n],%[n],4\n\t"
|
||||||
"vzero %%v24 \n\t"
|
|
||||||
"vzero %%v25 \n\t"
|
|
||||||
"vzero %%v26 \n\t"
|
|
||||||
"vzero %%v27 \n\t"
|
|
||||||
"srlg %%r0,%0,4 \n\t"
|
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 2, 1024(%%r1,%1) \n\t"
|
"pfd 2, 1024(%%r1,%[x])\n\t"
|
||||||
|
"vst %%v0,0(%%r1,%[x])\n\t"
|
||||||
"vst %%v24,0(%%r1,%1) \n\t"
|
"vst %%v0,16(%%r1,%[x])\n\t"
|
||||||
"vst %%v25,16(%%r1,%1) \n\t"
|
"vst %%v0,32(%%r1,%[x])\n\t"
|
||||||
"vst %%v26,32(%%r1,%1) \n\t"
|
"vst %%v0,48(%%r1,%[x])\n\t"
|
||||||
"vst %%v27,48(%%r1,%1) \n\t"
|
"vst %%v0,64(%%r1,%[x])\n\t"
|
||||||
"vst %%v24,64(%%r1,%1) \n\t"
|
"vst %%v0,80(%%r1,%[x])\n\t"
|
||||||
"vst %%v25,80(%%r1,%1) \n\t"
|
"vst %%v0,96(%%r1,%[x])\n\t"
|
||||||
"vst %%v26,96(%%r1,%1) \n\t"
|
"vst %%v0,112(%%r1,%[x])\n\t"
|
||||||
"vst %%v27,112(%%r1,%1) \n\t"
|
|
||||||
|
|
||||||
"agfi %%r1,128\n\t"
|
"agfi %%r1,128\n\t"
|
||||||
"brctg %%r0,0b "
|
"brctg %[n],0b"
|
||||||
:
|
: "=m"(*(struct { FLOAT x[n]; } *) x),[n] "+&r"(n)
|
||||||
:"r"(n),"ZR"((FLOAT (*)[n])x)
|
: [x] "a"(x)
|
||||||
:"memory","cc","r0","r1","v24","v25","v26","v27"
|
: "cc", "r1", "v0");
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
|
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x,
|
||||||
{
|
BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
|
||||||
|
BLASLONG dummy2) {
|
||||||
BLASLONG i = 0, j = 0;
|
BLASLONG i = 0, j = 0;
|
||||||
if (n <= 0 || inc_x <= 0)
|
if (n <= 0 || inc_x <= 0)
|
||||||
return (0);
|
return (0);
|
||||||
|
|
||||||
|
if (inc_x == 1) {
|
||||||
|
|
||||||
if ( inc_x == 1 )
|
if (da == 0.0) {
|
||||||
{
|
|
||||||
|
|
||||||
if ( da == 0.0 )
|
|
||||||
{
|
|
||||||
|
|
||||||
BLASLONG n1 = n & -16;
|
BLASLONG n1 = n & -16;
|
||||||
if ( n1 > 0 )
|
if (n1 > 0) {
|
||||||
{
|
|
||||||
|
|
||||||
dscal_kernel_16_zero(n1, x);
|
dscal_kernel_16_zero(n1, x);
|
||||||
j = n1;
|
j = n1;
|
||||||
}
|
}
|
||||||
|
|
||||||
while(j < n)
|
while (j < n) {
|
||||||
{
|
|
||||||
|
|
||||||
x[j] = 0.0;
|
x[j] = 0.0;
|
||||||
j++;
|
j++;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
|
|
||||||
BLASLONG n1 = n & -16;
|
BLASLONG n1 = n & -16;
|
||||||
if ( n1 > 0 )
|
if (n1 > 0) {
|
||||||
{
|
|
||||||
dscal_kernel_16(n1, da, x);
|
dscal_kernel_16(n1, da, x);
|
||||||
j = n1;
|
j = n1;
|
||||||
}
|
}
|
||||||
while(j < n)
|
while (j < n) {
|
||||||
{
|
|
||||||
|
|
||||||
x[j] = da * x[j];
|
x[j] = da * x[j];
|
||||||
j++;
|
j++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
} else {
|
||||||
|
|
||||||
}
|
if (da == 0.0) {
|
||||||
else
|
|
||||||
{
|
|
||||||
|
|
||||||
if ( da == 0.0 )
|
|
||||||
{
|
|
||||||
|
|
||||||
BLASLONG n1 = n & -4;
|
BLASLONG n1 = n & -4;
|
||||||
|
|
||||||
|
|
@ -163,17 +141,14 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
|
||||||
j += 4;
|
j += 4;
|
||||||
|
|
||||||
}
|
}
|
||||||
while(j < n)
|
while (j < n) {
|
||||||
{
|
|
||||||
|
|
||||||
x[i] = 0.0;
|
x[i] = 0.0;
|
||||||
i += inc_x;
|
i += inc_x;
|
||||||
j++;
|
j++;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
BLASLONG n1 = n & -4;
|
BLASLONG n1 = n & -4;
|
||||||
|
|
||||||
while (j < n1) {
|
while (j < n1) {
|
||||||
|
|
@ -188,8 +163,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
while(j < n)
|
while (j < n) {
|
||||||
{
|
|
||||||
|
|
||||||
x[i] = da * x[i];
|
x[i] = da * x[i];
|
||||||
i += inc_x;
|
i += inc_x;
|
||||||
|
|
@ -201,5 +175,3 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/***************************************************************************
|
/***************************************************************************
|
||||||
Copyright (c) 2013-2018,The OpenBLAS Project
|
Copyright (c) 2013-2019,The OpenBLAS Project
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
Redistribution and use in source and binary forms,with or without
|
Redistribution and use in source and binary forms,with or without
|
||||||
modification,are permitted provided that the following conditions are
|
modification,are permitted provided that the following conditions are
|
||||||
|
|
@ -27,35 +27,38 @@ USE OF THIS SOFTWARE,EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
static double dsdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
|
static double dsdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) {
|
||||||
{
|
|
||||||
double dot;
|
double dot;
|
||||||
|
|
||||||
__asm__ volatile (
|
__asm__("vzero %%v0\n\t"
|
||||||
"vzero %%v0 \n\t"
|
"vzero %%v1\n\t"
|
||||||
"srlg %%r0,%1,4 \n\t"
|
"vzero %%v2\n\t"
|
||||||
|
"vzero %%v3\n\t"
|
||||||
|
"vzero %%v4\n\t"
|
||||||
|
"vzero %%v5\n\t"
|
||||||
|
"vzero %%v6\n\t"
|
||||||
|
"vzero %%v7\n\t"
|
||||||
|
"srlg %[n],%[n],4\n\t"
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 1,1024(%%r1,%2) \n\t"
|
"pfd 1,1024(%%r1,%[x])\n\t"
|
||||||
"pfd 1,1024(%%r1,%3) \n\t"
|
"pfd 1,1024(%%r1,%[y])\n\t"
|
||||||
|
"vlef %%v16,0(%%r1,%[x]),0\n\t"
|
||||||
"vlef %%v16,0(%%r1,%2),0 \n\t"
|
"vlef %%v16,4(%%r1,%[x]),2\n\t"
|
||||||
"vlef %%v16,4(%%r1,%2),2 \n\t"
|
"vlef %%v17,8(%%r1,%[x]),0\n\t"
|
||||||
"vlef %%v17,8(%%r1,%2),0 \n\t"
|
"vlef %%v17,12(%%r1,%[x]),2\n\t"
|
||||||
"vlef %%v17,12(%%r1,%2),2 \n\t"
|
"vlef %%v18,16(%%r1,%[x]),0\n\t"
|
||||||
"vlef %%v18,16(%%r1,%2),0 \n\t"
|
"vlef %%v18,20(%%r1,%[x]),2\n\t"
|
||||||
"vlef %%v18,20(%%r1,%2),2 \n\t"
|
"vlef %%v19,24(%%r1,%[x]),0\n\t"
|
||||||
"vlef %%v19,24(%%r1,%2),0 \n\t"
|
"vlef %%v19,28(%%r1,%[x]),2\n\t"
|
||||||
"vlef %%v19,28(%%r1,%2),2 \n\t"
|
"vlef %%v20,32(%%r1,%[x]),0\n\t"
|
||||||
"vlef %%v20,32(%%r1,%2),0 \n\t"
|
"vlef %%v20,36(%%r1,%[x]),2\n\t"
|
||||||
"vlef %%v20,36(%%r1,%2),2 \n\t"
|
"vlef %%v21,40(%%r1,%[x]),0\n\t"
|
||||||
"vlef %%v21,40(%%r1,%2),0 \n\t"
|
"vlef %%v21,44(%%r1,%[x]),2\n\t"
|
||||||
"vlef %%v21,44(%%r1,%2),2 \n\t"
|
"vlef %%v22,48(%%r1,%[x]),0\n\t"
|
||||||
"vlef %%v22,48(%%r1,%2),0 \n\t"
|
"vlef %%v22,52(%%r1,%[x]),2\n\t"
|
||||||
"vlef %%v22,52(%%r1,%2),2 \n\t"
|
"vlef %%v23,56(%%r1,%[x]),0\n\t"
|
||||||
"vlef %%v23,56(%%r1,%2),0 \n\t"
|
"vlef %%v23,60(%%r1,%[x]),2\n\t"
|
||||||
"vlef %%v23,60(%%r1,%2),2 \n\t"
|
|
||||||
|
|
||||||
"vflls %%v16,%%v16\n\t"
|
"vflls %%v16,%%v16\n\t"
|
||||||
"vflls %%v17,%%v17\n\t"
|
"vflls %%v17,%%v17\n\t"
|
||||||
"vflls %%v18,%%v18\n\t"
|
"vflls %%v18,%%v18\n\t"
|
||||||
|
|
@ -64,64 +67,70 @@ static double dsdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
|
||||||
"vflls %%v21,%%v21\n\t"
|
"vflls %%v21,%%v21\n\t"
|
||||||
"vflls %%v22,%%v22\n\t"
|
"vflls %%v22,%%v22\n\t"
|
||||||
"vflls %%v23,%%v23\n\t"
|
"vflls %%v23,%%v23\n\t"
|
||||||
|
"vlef %%v24,0(%%r1,%[y]),0\n\t"
|
||||||
"vlef %%v24,0(%%r1,%3),0 \n\t"
|
"vlef %%v24,4(%%r1,%[y]),2\n\t"
|
||||||
"vlef %%v24,4(%%r1,%3),2 \n\t"
|
|
||||||
"vflls %%v24,%%v24\n\t"
|
"vflls %%v24,%%v24\n\t"
|
||||||
"vfmadb %%v0,%%v16,%%v24,%%v0\n\t"
|
"vfmadb %%v0,%%v16,%%v24,%%v0\n\t"
|
||||||
"vlef %%v25,8(%%r1,%3),0 \n\t"
|
"vlef %%v25,8(%%r1,%[y]),0\n\t"
|
||||||
"vlef %%v25,12(%%r1,%3),2 \n\t"
|
"vlef %%v25,12(%%r1,%[y]),2\n\t"
|
||||||
"vflls %%v25,%%v25\n\t"
|
"vflls %%v25,%%v25\n\t"
|
||||||
"vfmadb %%v0,%%v17,%%v25,%%v0 \n\t"
|
"vfmadb %%v1,%%v17,%%v25,%%v1\n\t"
|
||||||
"vlef %%v26,16(%%r1,%3),0 \n\t"
|
"vlef %%v26,16(%%r1,%[y]),0\n\t"
|
||||||
"vlef %%v26,20(%%r1,%3),2 \n\t"
|
"vlef %%v26,20(%%r1,%[y]),2\n\t"
|
||||||
"vflls %%v26,%%v26\n\t"
|
"vflls %%v26,%%v26\n\t"
|
||||||
"vfmadb %%v0,%%v18,%%v26,%%v0 \n\t"
|
"vfmadb %%v2,%%v18,%%v26,%%v2\n\t"
|
||||||
"vlef %%v27,24(%%r1,%3),0 \n\t"
|
"vlef %%v27,24(%%r1,%[y]),0\n\t"
|
||||||
"vlef %%v27,28(%%r1,%3),2 \n\t"
|
"vlef %%v27,28(%%r1,%[y]),2\n\t"
|
||||||
"vflls %%v27,%%v27\n\t"
|
"vflls %%v27,%%v27\n\t"
|
||||||
"vfmadb %%v0,%%v19,%%v27,%%v0 \n\t"
|
"vfmadb %%v3,%%v19,%%v27,%%v3\n\t"
|
||||||
"vlef %%v28,32(%%r1,%3),0 \n\t"
|
"vlef %%v28,32(%%r1,%[y]),0\n\t"
|
||||||
"vlef %%v28,36(%%r1,%3),2 \n\t"
|
"vlef %%v28,36(%%r1,%[y]),2\n\t"
|
||||||
"vflls %%v28,%%v28\n\t"
|
"vflls %%v28,%%v28\n\t"
|
||||||
"vfmadb %%v0,%%v20,%%v28,%%v0 \n\t"
|
"vfmadb %%v4,%%v20,%%v28,%%v4\n\t"
|
||||||
"vlef %%v29,40(%%r1,%3),0 \n\t"
|
"vlef %%v29,40(%%r1,%[y]),0\n\t"
|
||||||
"vlef %%v29,44(%%r1,%3),2 \n\t"
|
"vlef %%v29,44(%%r1,%[y]),2\n\t"
|
||||||
"vflls %%v29,%%v29\n\t"
|
"vflls %%v29,%%v29\n\t"
|
||||||
"vfmadb %%v0,%%v21,%%v29,%%v0 \n\t"
|
"vfmadb %%v5,%%v21,%%v29,%%v5\n\t"
|
||||||
"vlef %%v30,48(%%r1,%3),0 \n\t"
|
"vlef %%v30,48(%%r1,%[y]),0\n\t"
|
||||||
"vlef %%v30,52(%%r1,%3),2 \n\t"
|
"vlef %%v30,52(%%r1,%[y]),2\n\t"
|
||||||
"vflls %%v30,%%v30\n\t"
|
"vflls %%v30,%%v30\n\t"
|
||||||
"vfmadb %%v0,%%v22,%%v30,%%v0 \n\t"
|
"vfmadb %%v6,%%v22,%%v30,%%v6\n\t"
|
||||||
"vlef %%v31,56(%%r1,%3),0 \n\t"
|
"vlef %%v31,56(%%r1,%[y]),0\n\t"
|
||||||
"vlef %%v31,60(%%r1,%3),2 \n\t"
|
"vlef %%v31,60(%%r1,%[y]),2\n\t"
|
||||||
"vflls %%v31,%%v31\n\t"
|
"vflls %%v31,%%v31\n\t"
|
||||||
"vfmadb %%v0,%%v23,%%v31,%%v0 \n\t"
|
"vfmadb %%v7,%%v23,%%v31,%%v7\n\t"
|
||||||
|
|
||||||
"agfi %%r1,64\n\t"
|
"agfi %%r1,64\n\t"
|
||||||
"brctg %%r0,0b \n\t"
|
"brctg %[n],0b\n\t"
|
||||||
|
"vfadb %%v0,%%v0,%%v1\n\t"
|
||||||
|
"vfadb %%v0,%%v0,%%v2\n\t"
|
||||||
|
"vfadb %%v0,%%v0,%%v3\n\t"
|
||||||
|
"vfadb %%v0,%%v0,%%v4\n\t"
|
||||||
|
"vfadb %%v0,%%v0,%%v5\n\t"
|
||||||
|
"vfadb %%v0,%%v0,%%v6\n\t"
|
||||||
|
"vfadb %%v0,%%v0,%%v7\n\t"
|
||||||
"vrepg %%v1,%%v0,1\n\t"
|
"vrepg %%v1,%%v0,1\n\t"
|
||||||
"adbr %%f0,%%f1\n\t"
|
"adbr %%f0,%%f1\n\t"
|
||||||
"ldr %0,%%f0 "
|
"ldr %[dot],%%f0"
|
||||||
:"=f"(dot)
|
: [dot] "=f"(dot),[n] "+&r"(n)
|
||||||
:"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((const FLOAT (*)[n])y)
|
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),
|
||||||
:"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
"m"(*(const struct { FLOAT x[n]; } *) y),[y] "a"(y)
|
||||||
);
|
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
|
||||||
|
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
|
||||||
|
"v27", "v28", "v29", "v30", "v31");
|
||||||
|
|
||||||
return dot;
|
return dot;
|
||||||
}
|
}
|
||||||
|
|
||||||
double CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y)
|
double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
|
||||||
{
|
|
||||||
BLASLONG i = 0;
|
BLASLONG i = 0;
|
||||||
BLASLONG ix = 0, iy = 0;
|
BLASLONG ix = 0, iy = 0;
|
||||||
|
|
||||||
double dot = 0.0;
|
double dot = 0.0;
|
||||||
|
|
||||||
if ( n <= 0 ) return(dot);
|
if (n <= 0)
|
||||||
|
return (dot);
|
||||||
|
|
||||||
if ( (inc_x == 1) && (inc_y == 1) )
|
if ((inc_x == 1) && (inc_y == 1)) {
|
||||||
{
|
|
||||||
|
|
||||||
BLASLONG n1 = n & -16;
|
BLASLONG n1 = n & -16;
|
||||||
|
|
||||||
|
|
@ -129,8 +138,7 @@ double CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y)
|
||||||
dot = dsdot_kernel_16(n1, x, y);
|
dot = dsdot_kernel_16(n1, x, y);
|
||||||
|
|
||||||
i = n1;
|
i = n1;
|
||||||
while(i < n)
|
while (i < n) {
|
||||||
{
|
|
||||||
|
|
||||||
dot += (double) y[i] * (double) x[i];
|
dot += (double) y[i] * (double) x[i];
|
||||||
i++;
|
i++;
|
||||||
|
|
@ -138,13 +146,11 @@ double CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y)
|
||||||
}
|
}
|
||||||
return (dot);
|
return (dot);
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
BLASLONG n1 = n & -2;
|
BLASLONG n1 = n & -2;
|
||||||
|
|
||||||
while(i < n1)
|
while (i < n1) {
|
||||||
{
|
|
||||||
|
|
||||||
dot += (double) y[iy] * (double) x[ix];
|
dot += (double) y[iy] * (double) x[ix];
|
||||||
dot += (double) y[iy + inc_y] * (double) x[ix + inc_x];
|
dot += (double) y[iy + inc_y] * (double) x[ix + inc_x];
|
||||||
|
|
@ -154,8 +160,7 @@ double CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y)
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
while(i < n)
|
while (i < n) {
|
||||||
{
|
|
||||||
|
|
||||||
dot += (double) y[iy] * (double) x[ix];
|
dot += (double) y[iy] * (double) x[ix];
|
||||||
ix += inc_x;
|
ix += inc_x;
|
||||||
|
|
@ -166,5 +171,3 @@ double CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y)
|
||||||
return (dot);
|
return (dot);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/***************************************************************************
|
/***************************************************************************
|
||||||
Copyright (c) 2013-2018, The OpenBLAS Project
|
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
modification, are permitted provided that the following conditions are
|
modification, are permitted provided that the following conditions are
|
||||||
|
|
@ -27,111 +27,105 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
|
static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) {
|
||||||
{
|
__asm__("srlg %[n],%[n],5\n\t"
|
||||||
__asm__ volatile(
|
|
||||||
"srlg %%r0,%0,5 \n\t"
|
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 2, 1024(%%r1,%1) \n\t"
|
"pfd 2, 1024(%%r1,%[x])\n\t"
|
||||||
"pfd 2, 1024(%%r1,%2) \n\t"
|
"pfd 2, 1024(%%r1,%[y])\n\t"
|
||||||
|
"vl %%v16, 0(%%r1,%[x])\n\t"
|
||||||
"vl %%v16, 0(%%r1,%1) \n\t"
|
"vl %%v17, 16(%%r1,%[x])\n\t"
|
||||||
"vl %%v17, 16(%%r1,%1) \n\t"
|
"vl %%v18, 32(%%r1,%[x])\n\t"
|
||||||
"vl %%v18, 32(%%r1,%1) \n\t"
|
"vl %%v19, 48(%%r1,%[x])\n\t"
|
||||||
"vl %%v19, 48(%%r1,%1) \n\t"
|
"vl %%v20, 64(%%r1,%[x])\n\t"
|
||||||
"vl %%v20, 64(%%r1,%1) \n\t"
|
"vl %%v21, 80(%%r1,%[x])\n\t"
|
||||||
"vl %%v21, 80(%%r1,%1) \n\t"
|
"vl %%v22, 96(%%r1,%[x])\n\t"
|
||||||
"vl %%v22, 96(%%r1,%1) \n\t"
|
"vl %%v23, 112(%%r1,%[x])\n\t"
|
||||||
"vl %%v23, 112(%%r1,%1) \n\t"
|
"vl %%v24, 128(%%r1,%[x])\n\t"
|
||||||
"vl %%v24, 128(%%r1,%1) \n\t"
|
"vl %%v25, 144(%%r1,%[x])\n\t"
|
||||||
"vl %%v25, 144(%%r1,%1) \n\t"
|
"vl %%v26, 160(%%r1,%[x])\n\t"
|
||||||
"vl %%v26, 160(%%r1,%1) \n\t"
|
"vl %%v27, 176(%%r1,%[x])\n\t"
|
||||||
"vl %%v27, 176(%%r1,%1) \n\t"
|
"vl %%v28, 192(%%r1,%[x])\n\t"
|
||||||
"vl %%v28, 192(%%r1,%1) \n\t"
|
"vl %%v29, 208(%%r1,%[x])\n\t"
|
||||||
"vl %%v29, 208(%%r1,%1) \n\t"
|
"vl %%v30, 224(%%r1,%[x])\n\t"
|
||||||
"vl %%v30, 224(%%r1,%1) \n\t"
|
"vl %%v31, 240(%%r1,%[x])\n\t"
|
||||||
"vl %%v31, 240(%%r1,%1) \n\t"
|
"vl %%v0, 0(%%r1,%[y])\n\t"
|
||||||
|
"vl %%v1, 16(%%r1,%[y])\n\t"
|
||||||
"vl %%v0, 0(%%r1,%2) \n\t"
|
"vl %%v2, 32(%%r1,%[y])\n\t"
|
||||||
"vl %%v1, 16(%%r1,%2) \n\t"
|
"vl %%v3, 48(%%r1,%[y])\n\t"
|
||||||
"vl %%v2, 32(%%r1,%2) \n\t"
|
"vl %%v4, 64(%%r1,%[y])\n\t"
|
||||||
"vl %%v3, 48(%%r1,%2) \n\t"
|
"vl %%v5, 80(%%r1,%[y])\n\t"
|
||||||
"vl %%v4, 64(%%r1,%2) \n\t"
|
"vl %%v6, 96(%%r1,%[y])\n\t"
|
||||||
"vl %%v5, 80(%%r1,%2) \n\t"
|
"vl %%v7, 112(%%r1,%[y])\n\t"
|
||||||
"vl %%v6, 96(%%r1,%2) \n\t"
|
"vst %%v0, 0(%%r1,%[x])\n\t"
|
||||||
"vl %%v7, 112(%%r1,%2) \n\t"
|
"vst %%v1, 16(%%r1,%[x])\n\t"
|
||||||
"vst %%v0, 0(%%r1,%1) \n\t"
|
"vst %%v2, 32(%%r1,%[x])\n\t"
|
||||||
"vst %%v1, 16(%%r1,%1) \n\t"
|
"vst %%v3, 48(%%r1,%[x])\n\t"
|
||||||
"vst %%v2, 32(%%r1,%1) \n\t"
|
"vst %%v4, 64(%%r1,%[x])\n\t"
|
||||||
"vst %%v3, 48(%%r1,%1) \n\t"
|
"vst %%v5, 80(%%r1,%[x])\n\t"
|
||||||
"vst %%v4, 64(%%r1,%1) \n\t"
|
"vst %%v6, 96(%%r1,%[x])\n\t"
|
||||||
"vst %%v5, 80(%%r1,%1) \n\t"
|
"vst %%v7, 112(%%r1,%[x])\n\t"
|
||||||
"vst %%v6, 96(%%r1,%1) \n\t"
|
"vl %%v0, 128(%%r1,%[y])\n\t"
|
||||||
"vst %%v7, 112(%%r1,%1) \n\t"
|
"vl %%v1, 144(%%r1,%[y])\n\t"
|
||||||
|
"vl %%v2, 160(%%r1,%[y])\n\t"
|
||||||
"vl %%v0, 128(%%r1,%2) \n\t"
|
"vl %%v3, 176(%%r1,%[y])\n\t"
|
||||||
"vl %%v1, 144(%%r1,%2) \n\t"
|
"vl %%v4, 192(%%r1,%[y])\n\t"
|
||||||
"vl %%v2, 160(%%r1,%2) \n\t"
|
"vl %%v5, 208(%%r1,%[y])\n\t"
|
||||||
"vl %%v3, 176(%%r1,%2) \n\t"
|
"vl %%v6, 224(%%r1,%[y])\n\t"
|
||||||
"vl %%v4, 192(%%r1,%2) \n\t"
|
"vl %%v7, 240(%%r1,%[y])\n\t"
|
||||||
"vl %%v5, 208(%%r1,%2) \n\t"
|
"vst %%v0, 128(%%r1,%[x])\n\t"
|
||||||
"vl %%v6, 224(%%r1,%2) \n\t"
|
"vst %%v1, 144(%%r1,%[x])\n\t"
|
||||||
"vl %%v7, 240(%%r1,%2) \n\t"
|
"vst %%v2, 160(%%r1,%[x])\n\t"
|
||||||
"vst %%v0, 128(%%r1,%1) \n\t"
|
"vst %%v3, 176(%%r1,%[x])\n\t"
|
||||||
"vst %%v1, 144(%%r1,%1) \n\t"
|
"vst %%v4, 192(%%r1,%[x])\n\t"
|
||||||
"vst %%v2, 160(%%r1,%1) \n\t"
|
"vst %%v5, 208(%%r1,%[x])\n\t"
|
||||||
"vst %%v3, 176(%%r1,%1) \n\t"
|
"vst %%v6, 224(%%r1,%[x])\n\t"
|
||||||
"vst %%v4, 192(%%r1,%1) \n\t"
|
"vst %%v7, 240(%%r1,%[x])\n\t"
|
||||||
"vst %%v5, 208(%%r1,%1) \n\t"
|
"vst %%v16, 0(%%r1,%[y])\n\t"
|
||||||
"vst %%v6, 224(%%r1,%1) \n\t"
|
"vst %%v17, 16(%%r1,%[y])\n\t"
|
||||||
"vst %%v7, 240(%%r1,%1) \n\t"
|
"vst %%v18, 32(%%r1,%[y])\n\t"
|
||||||
|
"vst %%v19, 48(%%r1,%[y])\n\t"
|
||||||
"vst %%v16, 0(%%r1,%2) \n\t"
|
"vst %%v20, 64(%%r1,%[y])\n\t"
|
||||||
"vst %%v17, 16(%%r1,%2) \n\t"
|
"vst %%v21, 80(%%r1,%[y])\n\t"
|
||||||
"vst %%v18, 32(%%r1,%2) \n\t"
|
"vst %%v22, 96(%%r1,%[y])\n\t"
|
||||||
"vst %%v19, 48(%%r1,%2) \n\t"
|
"vst %%v23, 112(%%r1,%[y])\n\t"
|
||||||
"vst %%v20, 64(%%r1,%2) \n\t"
|
"vst %%v24, 128(%%r1,%[y])\n\t"
|
||||||
"vst %%v21, 80(%%r1,%2) \n\t"
|
"vst %%v25, 144(%%r1,%[y])\n\t"
|
||||||
"vst %%v22, 96(%%r1,%2) \n\t"
|
"vst %%v26, 160(%%r1,%[y])\n\t"
|
||||||
"vst %%v23, 112(%%r1,%2) \n\t"
|
"vst %%v27, 176(%%r1,%[y])\n\t"
|
||||||
"vst %%v24, 128(%%r1,%2) \n\t"
|
"vst %%v28, 192(%%r1,%[y])\n\t"
|
||||||
"vst %%v25, 144(%%r1,%2) \n\t"
|
"vst %%v29, 208(%%r1,%[y])\n\t"
|
||||||
"vst %%v26, 160(%%r1,%2) \n\t"
|
"vst %%v30, 224(%%r1,%[y])\n\t"
|
||||||
"vst %%v27, 176(%%r1,%2) \n\t"
|
"vst %%v31, 240(%%r1,%[y])\n\t"
|
||||||
"vst %%v28, 192(%%r1,%2) \n\t"
|
|
||||||
"vst %%v29, 208(%%r1,%2) \n\t"
|
|
||||||
"vst %%v30, 224(%%r1,%2) \n\t"
|
|
||||||
"vst %%v31, 240(%%r1,%2) \n\t"
|
|
||||||
|
|
||||||
"agfi %%r1,256\n\t"
|
"agfi %%r1,256\n\t"
|
||||||
"brctg %%r0,0b "
|
"brctg %[n],0b"
|
||||||
:
|
: "+m"(*(struct { FLOAT x[n]; } *) x), "+m"(*(struct { FLOAT x[n]; } *) y),
|
||||||
:"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y)
|
[n] "+&r"(n)
|
||||||
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
: [x] "a"(x),[y] "a"(y)
|
||||||
);
|
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
|
||||||
|
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
|
||||||
|
"v27", "v28", "v29", "v30", "v31");
|
||||||
}
|
}
|
||||||
|
|
||||||
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
|
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,
|
||||||
{
|
BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
|
||||||
|
BLASLONG dummy2) {
|
||||||
BLASLONG i = 0;
|
BLASLONG i = 0;
|
||||||
BLASLONG ix = 0, iy = 0;
|
BLASLONG ix = 0, iy = 0;
|
||||||
FLOAT temp;
|
FLOAT temp;
|
||||||
|
|
||||||
if ( n <= 0 ) return(0);
|
if (n <= 0)
|
||||||
|
return (0);
|
||||||
|
|
||||||
if ( (inc_x == 1) && (inc_y == 1 ))
|
if ((inc_x == 1) && (inc_y == 1)) {
|
||||||
{
|
|
||||||
|
|
||||||
BLASLONG n1 = n & -32;
|
BLASLONG n1 = n & -32;
|
||||||
if ( n1 > 0 )
|
if (n1 > 0) {
|
||||||
{
|
|
||||||
dswap_kernel_32(n1, x, y);
|
dswap_kernel_32(n1, x, y);
|
||||||
i = n1;
|
i = n1;
|
||||||
}
|
}
|
||||||
|
|
||||||
while(i < n)
|
while (i < n) {
|
||||||
{
|
|
||||||
temp = y[i];
|
temp = y[i];
|
||||||
y[i] = x[i];
|
y[i] = x[i];
|
||||||
x[i] = temp;
|
x[i] = temp;
|
||||||
|
|
@ -139,13 +133,9 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
} else {
|
||||||
|
|
||||||
}
|
while (i < n) {
|
||||||
else
|
|
||||||
{
|
|
||||||
|
|
||||||
while(i < n)
|
|
||||||
{
|
|
||||||
temp = y[iy];
|
temp = y[iy];
|
||||||
y[iy] = x[ix];
|
y[iy] = x[ix];
|
||||||
x[ix] = temp;
|
x[ix] = temp;
|
||||||
|
|
@ -158,5 +148,4 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,
|
||||||
}
|
}
|
||||||
return (0);
|
return (0);
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/***************************************************************************
|
/***************************************************************************
|
||||||
Copyright (c) 2017, The OpenBLAS Project
|
Copyright (c) 2019, The OpenBLAS Project
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
modification, are permitted provided that the following conditions are
|
modification, are permitted provided that the following conditions are
|
||||||
|
|
@ -28,26 +28,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
|
|
||||||
#if defined(DOUBLE)
|
#define CABS1(x,i) (fabsf(x[i]) + fabsf(x[i + 1]))
|
||||||
#define ABS fabs
|
|
||||||
#else
|
|
||||||
#define ABS fabsf
|
|
||||||
#endif
|
|
||||||
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))
|
|
||||||
|
|
||||||
static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax)
|
static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) {
|
||||||
{
|
|
||||||
BLASLONG iamax;
|
BLASLONG iamax;
|
||||||
|
|
||||||
__asm__ volatile (
|
__asm__("vlef %%v0,0(%[x]),0\n\t"
|
||||||
"vlef %%v0,0(%3),0 \n\t"
|
"vlef %%v1,4(%[x]),0\n\t"
|
||||||
"vlef %%v1,4(%3),0 \n\t"
|
"vlef %%v0,8(%[x]),1\n\t"
|
||||||
"vlef %%v0,8(%3),1 \n\t"
|
"vlef %%v1,12(%[x]),1\n\t"
|
||||||
"vlef %%v1,12(%3),1 \n\t"
|
"vlef %%v0,16(%[x]),2\n\t"
|
||||||
"vlef %%v0,16(%3),2 \n\t"
|
"vlef %%v1,20(%[x]),2\n\t"
|
||||||
"vlef %%v1,20(%3),2 \n\t"
|
"vlef %%v0,24(%[x]),3\n\t"
|
||||||
"vlef %%v0,24(%3),3 \n\t"
|
"vlef %%v1,28(%[x]),3\n\t"
|
||||||
"vlef %%v1,28(%3),3 \n\t"
|
|
||||||
"vflpsb %%v0,%%v0\n\t"
|
"vflpsb %%v0,%%v0\n\t"
|
||||||
"vflpsb %%v1,%%v1\n\t"
|
"vflpsb %%v1,%%v1\n\t"
|
||||||
"vfasb %%v0,%%v0,%%v1\n\t"
|
"vfasb %%v0,%%v0,%%v1\n\t"
|
||||||
|
|
@ -89,31 +82,26 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax)
|
||||||
"vleif %%v27,13,1\n\t"
|
"vleif %%v27,13,1\n\t"
|
||||||
"vleif %%v27,14,2\n\t"
|
"vleif %%v27,14,2\n\t"
|
||||||
"vleif %%v27,15,3\n\t"
|
"vleif %%v27,15,3\n\t"
|
||||||
"srlg %%r0,%2,5 \n\t"
|
"srlg %[n],%[n],5\n\t"
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 1, 1024(%%r1,%3) \n\t"
|
"pfd 1, 1024(%%r1,%[x])\n\t"
|
||||||
|
"vl %%v16,0(%%r1,%[x])\n\t"
|
||||||
"vl %%v16,0(%%r1,%3) \n\t"
|
"vl %%v28,16(%%r1,%[x])\n\t"
|
||||||
"vl %%v28,16(%%r1,%3) \n\t"
|
|
||||||
"vpkg %%v17,%%v16,%%v28\n\t"
|
"vpkg %%v17,%%v16,%%v28\n\t"
|
||||||
"vperm %%v16,%%v16,%%v28,%%v9\n\t"
|
"vperm %%v16,%%v16,%%v28,%%v9\n\t"
|
||||||
|
"vl %%v18,32(%%r1,%[x])\n\t"
|
||||||
"vl %%v18,32(%%r1,%3) \n\t"
|
"vl %%v29,48(%%r1,%[x])\n\t"
|
||||||
"vl %%v29,48(%%r1,%3) \n\t"
|
|
||||||
"vpkg %%v19,%%v18,%%v29\n\t"
|
"vpkg %%v19,%%v18,%%v29\n\t"
|
||||||
"vperm %%v18,%%v18,%%v29,%%v9\n\t"
|
"vperm %%v18,%%v18,%%v29,%%v9\n\t"
|
||||||
|
"vl %%v20,64(%%r1,%[x])\n\t"
|
||||||
"vl %%v20,64(%%r1,%3) \n\t"
|
"vl %%v30,80(%%r1,%[x])\n\t"
|
||||||
"vl %%v30,80(%%r1,%3) \n\t"
|
|
||||||
"vpkg %%v21,%%v20,%%v30\n\t"
|
"vpkg %%v21,%%v20,%%v30\n\t"
|
||||||
"vperm %%v20,%%v20,%%v30,%%v9\n\t"
|
"vperm %%v20,%%v20,%%v30,%%v9\n\t"
|
||||||
|
"vl %%v22,96(%%r1,%[x])\n\t"
|
||||||
"vl %%v22,96(%%r1,%3) \n\t"
|
"vl %%v31,112(%%r1,%[x])\n\t"
|
||||||
"vl %%v31,112(%%r1,%3) \n\t"
|
|
||||||
"vpkg %%v23,%%v22,%%v31\n\t"
|
"vpkg %%v23,%%v22,%%v31\n\t"
|
||||||
"vperm %%v22,%%v22,%%v31,%%v9\n\t"
|
"vperm %%v22,%%v22,%%v31,%%v9\n\t"
|
||||||
|
|
||||||
"vflpsb %%v16, %%v16\n\t"
|
"vflpsb %%v16, %%v16\n\t"
|
||||||
"vflpsb %%v17, %%v17\n\t"
|
"vflpsb %%v17, %%v17\n\t"
|
||||||
"vflpsb %%v18, %%v18\n\t"
|
"vflpsb %%v18, %%v18\n\t"
|
||||||
|
|
@ -126,14 +114,12 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax)
|
||||||
"vfasb %%v17,%%v18,%%v19\n\t"
|
"vfasb %%v17,%%v18,%%v19\n\t"
|
||||||
"vfasb %%v18,%%v20,%%v21\n\t"
|
"vfasb %%v18,%%v20,%%v21\n\t"
|
||||||
"vfasb %%v19,%%v22,%%v23\n\t"
|
"vfasb %%v19,%%v22,%%v23\n\t"
|
||||||
|
|
||||||
"vfchesb %%v5,%%v16,%%v17\n\t"
|
"vfchesb %%v5,%%v16,%%v17\n\t"
|
||||||
"vfchesb %%v6,%%v18,%%v19\n\t"
|
"vfchesb %%v6,%%v18,%%v19\n\t"
|
||||||
"vsel %%v16,%%v16,%%v17,%%v5\n\t"
|
"vsel %%v16,%%v16,%%v17,%%v5\n\t"
|
||||||
"vsel %%v5,%%v24,%%v25,%%v5\n\t"
|
"vsel %%v5,%%v24,%%v25,%%v5\n\t"
|
||||||
"vsel %%v17,%%v18,%%v19,%%v6\n\t"
|
"vsel %%v17,%%v18,%%v19,%%v6\n\t"
|
||||||
"vsel %%v6,%%v26,%%v27,%%v6\n\t"
|
"vsel %%v6,%%v26,%%v27,%%v6\n\t"
|
||||||
|
|
||||||
"vfchesb %%v18,%%v16,%%v17\n\t"
|
"vfchesb %%v18,%%v16,%%v17\n\t"
|
||||||
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
|
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
|
||||||
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
|
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
|
||||||
|
|
@ -141,7 +127,6 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax)
|
||||||
"vesrlg %%v5,%%v5,32\n\t"
|
"vesrlg %%v5,%%v5,32\n\t"
|
||||||
"vag %%v5,%%v5,%%v4\n\t"
|
"vag %%v5,%%v5,%%v4\n\t"
|
||||||
"vag %%v6,%%v6,%%v4\n\t"
|
"vag %%v6,%%v6,%%v4\n\t"
|
||||||
|
|
||||||
"vfchesb %%v7,%%v0,%%v16\n\t"
|
"vfchesb %%v7,%%v0,%%v16\n\t"
|
||||||
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
|
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
|
||||||
"vsegf %%v8,%%v7\n\t"
|
"vsegf %%v8,%%v7\n\t"
|
||||||
|
|
@ -150,27 +135,22 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax)
|
||||||
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
|
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
|
||||||
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
|
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
|
||||||
"vag %%v4,%%v4,%%v3\n\t"
|
"vag %%v4,%%v4,%%v3\n\t"
|
||||||
|
"vl %%v16,128(%%r1,%[x])\n\t"
|
||||||
"vl %%v16,128(%%r1,%3) \n\t"
|
"vl %%v28,144(%%r1,%[x])\n\t"
|
||||||
"vl %%v28,144(%%r1,%3) \n\t"
|
|
||||||
"vpkg %%v17,%%v16,%%v28\n\t"
|
"vpkg %%v17,%%v16,%%v28\n\t"
|
||||||
"vperm %%v16,%%v16,%%v28,%%v9\n\t"
|
"vperm %%v16,%%v16,%%v28,%%v9\n\t"
|
||||||
|
"vl %%v18,160(%%r1,%[x])\n\t"
|
||||||
"vl %%v18,160(%%r1,%3) \n\t"
|
"vl %%v29,176(%%r1,%[x])\n\t"
|
||||||
"vl %%v29,176(%%r1,%3) \n\t"
|
|
||||||
"vpkg %%v19,%%v18,%%v29\n\t"
|
"vpkg %%v19,%%v18,%%v29\n\t"
|
||||||
"vperm %%v18,%%v18,%%v29,%%v9\n\t"
|
"vperm %%v18,%%v18,%%v29,%%v9\n\t"
|
||||||
|
"vl %%v20,192(%%r1,%[x])\n\t"
|
||||||
"vl %%v20,192(%%r1,%3) \n\t"
|
"vl %%v30,208(%%r1,%[x])\n\t"
|
||||||
"vl %%v30,208(%%r1,%3) \n\t"
|
|
||||||
"vpkg %%v21,%%v20,%%v30\n\t"
|
"vpkg %%v21,%%v20,%%v30\n\t"
|
||||||
"vperm %%v20,%%v20,%%v30,%%v9\n\t"
|
"vperm %%v20,%%v20,%%v30,%%v9\n\t"
|
||||||
|
"vl %%v22,224(%%r1,%[x])\n\t"
|
||||||
"vl %%v22,224(%%r1,%3) \n\t"
|
"vl %%v31,240(%%r1,%[x])\n\t"
|
||||||
"vl %%v31,240(%%r1,%3) \n\t"
|
|
||||||
"vpkg %%v23,%%v22,%%v31\n\t"
|
"vpkg %%v23,%%v22,%%v31\n\t"
|
||||||
"vperm %%v22,%%v22,%%v31,%%v9\n\t"
|
"vperm %%v22,%%v22,%%v31,%%v9\n\t"
|
||||||
|
|
||||||
"vflpsb %%v16, %%v16\n\t"
|
"vflpsb %%v16, %%v16\n\t"
|
||||||
"vflpsb %%v17, %%v17\n\t"
|
"vflpsb %%v17, %%v17\n\t"
|
||||||
"vflpsb %%v18, %%v18\n\t"
|
"vflpsb %%v18, %%v18\n\t"
|
||||||
|
|
@ -183,14 +163,12 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax)
|
||||||
"vfasb %%v17,%%v18,%%v19\n\t"
|
"vfasb %%v17,%%v18,%%v19\n\t"
|
||||||
"vfasb %%v18,%%v20,%%v21\n\t"
|
"vfasb %%v18,%%v20,%%v21\n\t"
|
||||||
"vfasb %%v19,%%v22,%%v23\n\t"
|
"vfasb %%v19,%%v22,%%v23\n\t"
|
||||||
|
|
||||||
"vfchesb %%v5,%%v16,%%v17\n\t"
|
"vfchesb %%v5,%%v16,%%v17\n\t"
|
||||||
"vfchesb %%v6,%%v18,%%v19\n\t"
|
"vfchesb %%v6,%%v18,%%v19\n\t"
|
||||||
"vsel %%v16,%%v16,%%v17,%%v5\n\t"
|
"vsel %%v16,%%v16,%%v17,%%v5\n\t"
|
||||||
"vsel %%v5,%%v24,%%v25,%%v5\n\t"
|
"vsel %%v5,%%v24,%%v25,%%v5\n\t"
|
||||||
"vsel %%v17,%%v18,%%v19,%%v6\n\t"
|
"vsel %%v17,%%v18,%%v19,%%v6\n\t"
|
||||||
"vsel %%v6,%%v26,%%v27,%%v6\n\t"
|
"vsel %%v6,%%v26,%%v27,%%v6\n\t"
|
||||||
|
|
||||||
"vfchesb %%v18,%%v16,%%v17\n\t"
|
"vfchesb %%v18,%%v16,%%v17\n\t"
|
||||||
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
|
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
|
||||||
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
|
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
|
||||||
|
|
@ -198,7 +176,6 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax)
|
||||||
"vesrlg %%v5,%%v5,32\n\t"
|
"vesrlg %%v5,%%v5,32\n\t"
|
||||||
"vag %%v5,%%v5,%%v4\n\t"
|
"vag %%v5,%%v5,%%v4\n\t"
|
||||||
"vag %%v6,%%v6,%%v4\n\t"
|
"vag %%v6,%%v6,%%v4\n\t"
|
||||||
|
|
||||||
"vfchesb %%v7,%%v0,%%v16\n\t"
|
"vfchesb %%v7,%%v0,%%v16\n\t"
|
||||||
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
|
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
|
||||||
"vsegf %%v8,%%v7\n\t"
|
"vsegf %%v8,%%v7\n\t"
|
||||||
|
|
@ -207,10 +184,8 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax)
|
||||||
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
|
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
|
||||||
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
|
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
|
||||||
"vag %%v4,%%v4,%%v3\n\t"
|
"vag %%v4,%%v4,%%v3\n\t"
|
||||||
|
|
||||||
"agfi %%r1, 256\n\t"
|
"agfi %%r1, 256\n\t"
|
||||||
"brctg %%r0, 0b \n\t"
|
"brctg %[n], 0b\n\t"
|
||||||
|
|
||||||
"veslg %%v3,%%v0,32\n\t"
|
"veslg %%v3,%%v0,32\n\t"
|
||||||
"vfchsb %%v4,%%v0,%%v3\n\t"
|
"vfchsb %%v4,%%v0,%%v3\n\t"
|
||||||
"vchlg %%v5,%%v2,%%v1\n\t"
|
"vchlg %%v5,%%v2,%%v1\n\t"
|
||||||
|
|
@ -221,14 +196,13 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax)
|
||||||
"vesrlg %%v4,%%v4,32\n\t"
|
"vesrlg %%v4,%%v4,32\n\t"
|
||||||
"vsegf %%v4,%%v4\n\t"
|
"vsegf %%v4,%%v4\n\t"
|
||||||
"vsel %%v1,%%v1,%%v2,%%v4\n\t"
|
"vsel %%v1,%%v1,%%v2,%%v4\n\t"
|
||||||
|
|
||||||
"vrepf %%v2,%%v0,2\n\t"
|
"vrepf %%v2,%%v0,2\n\t"
|
||||||
"vrepg %%v3,%%v1,1\n\t"
|
"vrepg %%v3,%%v1,1\n\t"
|
||||||
"wfcsb %%v2,%%v0\n\t"
|
"wfcsb %%v2,%%v0\n\t"
|
||||||
"jne 1f\n\t"
|
"jne 1f\n\t"
|
||||||
"vstef %%v0,%1,0 \n\t"
|
"vstef %%v0,%[amax],0\n\t"
|
||||||
"vmnlg %%v0,%%v1,%%v3\n\t"
|
"vmnlg %%v0,%%v1,%%v3\n\t"
|
||||||
"vlgvg %0,%%v0,0 \n\t"
|
"vlgvg %[iamax],%%v0,0\n\t"
|
||||||
"j 2f\n\t"
|
"j 2f\n\t"
|
||||||
"1:\n\t"
|
"1:\n\t"
|
||||||
"wfchsb %%v4,%%v2,%%v0\n\t"
|
"wfchsb %%v4,%%v2,%%v0\n\t"
|
||||||
|
|
@ -236,27 +210,28 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax)
|
||||||
"vsegf %%v4,%%v4\n\t"
|
"vsegf %%v4,%%v4\n\t"
|
||||||
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
|
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
|
||||||
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
|
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
|
||||||
"ste %%f0,%1 \n\t"
|
"ste %%f0,%[amax]\n\t"
|
||||||
"vlgvg %0,%%v1,0 \n\t"
|
"vlgvg %[iamax],%%v1,0\n\t"
|
||||||
"2:\n\t"
|
"2:\n\t"
|
||||||
"nop"
|
"nop"
|
||||||
:"=r"(iamax),"=m"(*amax)
|
: [iamax] "=r"(iamax),[amax] "=Q"(*amax),[n] "+&r"(n)
|
||||||
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x)
|
: "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
|
||||||
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8",
|
||||||
);
|
"v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
|
||||||
|
"v25", "v26", "v27", "v28", "v29", "v30", "v31");
|
||||||
|
|
||||||
return iamax;
|
return iamax;
|
||||||
}
|
}
|
||||||
|
|
||||||
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
{
|
|
||||||
BLASLONG i = 0;
|
BLASLONG i = 0;
|
||||||
BLASLONG ix = 0;
|
BLASLONG ix = 0;
|
||||||
FLOAT maxf = 0;
|
FLOAT maxf = 0;
|
||||||
BLASLONG max = 0;
|
BLASLONG max = 0;
|
||||||
BLASLONG inc_x2;
|
BLASLONG inc_x2;
|
||||||
|
|
||||||
if (n <= 0 || inc_x <= 0) return(max);
|
if (n <= 0 || inc_x <= 0)
|
||||||
|
return (max);
|
||||||
|
|
||||||
if (inc_x == 1) {
|
if (inc_x == 1) {
|
||||||
|
|
||||||
|
|
@ -266,18 +241,14 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||||
max = icamax_kernel_32(n1, x, &maxf);
|
max = icamax_kernel_32(n1, x, &maxf);
|
||||||
ix = n1 * 2;
|
ix = n1 * 2;
|
||||||
i = n1;
|
i = n1;
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
maxf = CABS1(x, 0);
|
maxf = CABS1(x, 0);
|
||||||
ix += 2;
|
ix += 2;
|
||||||
i++;
|
i++;
|
||||||
}
|
}
|
||||||
|
|
||||||
while(i < n)
|
while (i < n) {
|
||||||
{
|
if (CABS1(x, ix) > maxf) {
|
||||||
if( CABS1(x,ix) > maxf )
|
|
||||||
{
|
|
||||||
max = i;
|
max = i;
|
||||||
maxf = CABS1(x, ix);
|
maxf = CABS1(x, ix);
|
||||||
}
|
}
|
||||||
|
|
@ -291,13 +262,35 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||||
max = 0;
|
max = 0;
|
||||||
maxf = CABS1(x, 0);
|
maxf = CABS1(x, 0);
|
||||||
inc_x2 = 2 * inc_x;
|
inc_x2 = 2 * inc_x;
|
||||||
ix += inc_x2;
|
|
||||||
i++;
|
|
||||||
|
|
||||||
while(i < n)
|
BLASLONG n1 = n & -4;
|
||||||
{
|
while (i < n1) {
|
||||||
if( CABS1(x,ix) > maxf )
|
|
||||||
{
|
if (CABS1(x, ix) > maxf) {
|
||||||
|
max = i;
|
||||||
|
maxf = CABS1(x, ix);
|
||||||
|
}
|
||||||
|
if (CABS1(x, ix + inc_x2) > maxf) {
|
||||||
|
max = i + 1;
|
||||||
|
maxf = CABS1(x, ix + inc_x2);
|
||||||
|
}
|
||||||
|
if (CABS1(x, ix + 2 * inc_x2) > maxf) {
|
||||||
|
max = i + 2;
|
||||||
|
maxf = CABS1(x, ix + 2 * inc_x2);
|
||||||
|
}
|
||||||
|
if (CABS1(x, ix + 3 * inc_x2) > maxf) {
|
||||||
|
max = i + 3;
|
||||||
|
maxf = CABS1(x, ix + 3 * inc_x2);
|
||||||
|
}
|
||||||
|
|
||||||
|
ix += inc_x2 * 4;
|
||||||
|
|
||||||
|
i += 4;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
while (i < n) {
|
||||||
|
if (CABS1(x, ix) > maxf) {
|
||||||
max = i;
|
max = i;
|
||||||
maxf = CABS1(x, ix);
|
maxf = CABS1(x, ix);
|
||||||
}
|
}
|
||||||
|
|
@ -307,5 +300,3 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||||
return (max + 1);
|
return (max + 1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/***************************************************************************
|
/***************************************************************************
|
||||||
Copyright (c) 2017, The OpenBLAS Project
|
Copyright (c) 2019, The OpenBLAS Project
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
modification, are permitted provided that the following conditions are
|
modification, are permitted provided that the following conditions are
|
||||||
|
|
@ -28,26 +28,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
|
|
||||||
#if defined(DOUBLE)
|
#define CABS1(x,i) (fabsf(x[i]) + fabsf(x[i + 1]))
|
||||||
#define ABS fabs
|
|
||||||
#else
|
|
||||||
#define ABS fabsf
|
|
||||||
#endif
|
|
||||||
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))
|
|
||||||
|
|
||||||
static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin)
|
static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) {
|
||||||
{
|
|
||||||
BLASLONG iamin;
|
BLASLONG iamin;
|
||||||
|
|
||||||
__asm__ volatile (
|
__asm__("vlef %%v0,0(%[x]),0\n\t"
|
||||||
"vlef %%v0,0(%3),0 \n\t"
|
"vlef %%v1,4(%[x]),0\n\t"
|
||||||
"vlef %%v1,4(%3),0 \n\t"
|
"vlef %%v0,8(%[x]),1\n\t"
|
||||||
"vlef %%v0,8(%3),1 \n\t"
|
"vlef %%v1,12(%[x]),1\n\t"
|
||||||
"vlef %%v1,12(%3),1 \n\t"
|
"vlef %%v0,16(%[x]),2\n\t"
|
||||||
"vlef %%v0,16(%3),2 \n\t"
|
"vlef %%v1,20(%[x]),2\n\t"
|
||||||
"vlef %%v1,20(%3),2 \n\t"
|
"vlef %%v0,24(%[x]),3\n\t"
|
||||||
"vlef %%v0,24(%3),3 \n\t"
|
"vlef %%v1,28(%[x]),3\n\t"
|
||||||
"vlef %%v1,28(%3),3 \n\t"
|
|
||||||
"vflpsb %%v0,%%v0\n\t"
|
"vflpsb %%v0,%%v0\n\t"
|
||||||
"vflpsb %%v1,%%v1\n\t"
|
"vflpsb %%v1,%%v1\n\t"
|
||||||
"vfasb %%v0,%%v0,%%v1\n\t"
|
"vfasb %%v0,%%v0,%%v1\n\t"
|
||||||
|
|
@ -89,31 +82,26 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin)
|
||||||
"vleif %%v27,13,1\n\t"
|
"vleif %%v27,13,1\n\t"
|
||||||
"vleif %%v27,14,2\n\t"
|
"vleif %%v27,14,2\n\t"
|
||||||
"vleif %%v27,15,3\n\t"
|
"vleif %%v27,15,3\n\t"
|
||||||
"srlg %%r0,%2,5 \n\t"
|
"srlg %[n],%[n],5\n\t"
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 1, 1024(%%r1,%3) \n\t"
|
"pfd 1, 1024(%%r1,%[x])\n\t"
|
||||||
|
"vl %%v16,0(%%r1,%[x])\n\t"
|
||||||
"vl %%v16,0(%%r1,%3) \n\t"
|
"vl %%v28,16(%%r1,%[x])\n\t"
|
||||||
"vl %%v28,16(%%r1,%3) \n\t"
|
|
||||||
"vpkg %%v17,%%v16,%%v28\n\t"
|
"vpkg %%v17,%%v16,%%v28\n\t"
|
||||||
"vperm %%v16,%%v16,%%v28,%%v9\n\t"
|
"vperm %%v16,%%v16,%%v28,%%v9\n\t"
|
||||||
|
"vl %%v18,32(%%r1,%[x])\n\t"
|
||||||
"vl %%v18,32(%%r1,%3) \n\t"
|
"vl %%v29,48(%%r1,%[x])\n\t"
|
||||||
"vl %%v29,48(%%r1,%3) \n\t"
|
|
||||||
"vpkg %%v19,%%v18,%%v29\n\t"
|
"vpkg %%v19,%%v18,%%v29\n\t"
|
||||||
"vperm %%v18,%%v18,%%v29,%%v9\n\t"
|
"vperm %%v18,%%v18,%%v29,%%v9\n\t"
|
||||||
|
"vl %%v20,64(%%r1,%[x])\n\t"
|
||||||
"vl %%v20,64(%%r1,%3) \n\t"
|
"vl %%v30,80(%%r1,%[x])\n\t"
|
||||||
"vl %%v30,80(%%r1,%3) \n\t"
|
|
||||||
"vpkg %%v21,%%v20,%%v30\n\t"
|
"vpkg %%v21,%%v20,%%v30\n\t"
|
||||||
"vperm %%v20,%%v20,%%v30,%%v9\n\t"
|
"vperm %%v20,%%v20,%%v30,%%v9\n\t"
|
||||||
|
"vl %%v22,96(%%r1,%[x])\n\t"
|
||||||
"vl %%v22,96(%%r1,%3) \n\t"
|
"vl %%v31,112(%%r1,%[x])\n\t"
|
||||||
"vl %%v31,112(%%r1,%3) \n\t"
|
|
||||||
"vpkg %%v23,%%v22,%%v31\n\t"
|
"vpkg %%v23,%%v22,%%v31\n\t"
|
||||||
"vperm %%v22,%%v22,%%v31,%%v9\n\t"
|
"vperm %%v22,%%v22,%%v31,%%v9\n\t"
|
||||||
|
|
||||||
"vflpsb %%v16, %%v16\n\t"
|
"vflpsb %%v16, %%v16\n\t"
|
||||||
"vflpsb %%v17, %%v17\n\t"
|
"vflpsb %%v17, %%v17\n\t"
|
||||||
"vflpsb %%v18, %%v18\n\t"
|
"vflpsb %%v18, %%v18\n\t"
|
||||||
|
|
@ -126,14 +114,12 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin)
|
||||||
"vfasb %%v17,%%v18,%%v19\n\t"
|
"vfasb %%v17,%%v18,%%v19\n\t"
|
||||||
"vfasb %%v18,%%v20,%%v21\n\t"
|
"vfasb %%v18,%%v20,%%v21\n\t"
|
||||||
"vfasb %%v19,%%v22,%%v23\n\t"
|
"vfasb %%v19,%%v22,%%v23\n\t"
|
||||||
|
|
||||||
"vfchesb %%v5,%%v17,%%v16\n\t"
|
"vfchesb %%v5,%%v17,%%v16\n\t"
|
||||||
"vfchesb %%v6,%%v19,%%v18\n\t"
|
"vfchesb %%v6,%%v19,%%v18\n\t"
|
||||||
"vsel %%v16,%%v16,%%v17,%%v5\n\t"
|
"vsel %%v16,%%v16,%%v17,%%v5\n\t"
|
||||||
"vsel %%v5,%%v24,%%v25,%%v5\n\t"
|
"vsel %%v5,%%v24,%%v25,%%v5\n\t"
|
||||||
"vsel %%v17,%%v18,%%v19,%%v6\n\t"
|
"vsel %%v17,%%v18,%%v19,%%v6\n\t"
|
||||||
"vsel %%v6,%%v26,%%v27,%%v6\n\t"
|
"vsel %%v6,%%v26,%%v27,%%v6\n\t"
|
||||||
|
|
||||||
"vfchesb %%v18,%%v17,%%v16\n\t"
|
"vfchesb %%v18,%%v17,%%v16\n\t"
|
||||||
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
|
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
|
||||||
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
|
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
|
||||||
|
|
@ -141,7 +127,6 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin)
|
||||||
"vesrlg %%v5,%%v5,32\n\t"
|
"vesrlg %%v5,%%v5,32\n\t"
|
||||||
"vag %%v5,%%v5,%%v4\n\t"
|
"vag %%v5,%%v5,%%v4\n\t"
|
||||||
"vag %%v6,%%v6,%%v4\n\t"
|
"vag %%v6,%%v6,%%v4\n\t"
|
||||||
|
|
||||||
"vfchesb %%v7,%%v16,%%v0\n\t"
|
"vfchesb %%v7,%%v16,%%v0\n\t"
|
||||||
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
|
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
|
||||||
"vsegf %%v8,%%v7\n\t"
|
"vsegf %%v8,%%v7\n\t"
|
||||||
|
|
@ -150,27 +135,22 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin)
|
||||||
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
|
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
|
||||||
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
|
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
|
||||||
"vag %%v4,%%v4,%%v3\n\t"
|
"vag %%v4,%%v4,%%v3\n\t"
|
||||||
|
"vl %%v16,128(%%r1,%[x])\n\t"
|
||||||
"vl %%v16,128(%%r1,%3) \n\t"
|
"vl %%v28,144(%%r1,%[x])\n\t"
|
||||||
"vl %%v28,144(%%r1,%3) \n\t"
|
|
||||||
"vpkg %%v17,%%v16,%%v28\n\t"
|
"vpkg %%v17,%%v16,%%v28\n\t"
|
||||||
"vperm %%v16,%%v16,%%v28,%%v9\n\t"
|
"vperm %%v16,%%v16,%%v28,%%v9\n\t"
|
||||||
|
"vl %%v18,160(%%r1,%[x])\n\t"
|
||||||
"vl %%v18,160(%%r1,%3) \n\t"
|
"vl %%v29,176(%%r1,%[x])\n\t"
|
||||||
"vl %%v29,176(%%r1,%3) \n\t"
|
|
||||||
"vpkg %%v19,%%v18,%%v29\n\t"
|
"vpkg %%v19,%%v18,%%v29\n\t"
|
||||||
"vperm %%v18,%%v18,%%v29,%%v9\n\t"
|
"vperm %%v18,%%v18,%%v29,%%v9\n\t"
|
||||||
|
"vl %%v20,192(%%r1,%[x])\n\t"
|
||||||
"vl %%v20,192(%%r1,%3) \n\t"
|
"vl %%v30,208(%%r1,%[x])\n\t"
|
||||||
"vl %%v30,208(%%r1,%3) \n\t"
|
|
||||||
"vpkg %%v21,%%v20,%%v30\n\t"
|
"vpkg %%v21,%%v20,%%v30\n\t"
|
||||||
"vperm %%v20,%%v20,%%v30,%%v9\n\t"
|
"vperm %%v20,%%v20,%%v30,%%v9\n\t"
|
||||||
|
"vl %%v22,224(%%r1,%[x])\n\t"
|
||||||
"vl %%v22,224(%%r1,%3) \n\t"
|
"vl %%v31,240(%%r1,%[x])\n\t"
|
||||||
"vl %%v31,240(%%r1,%3) \n\t"
|
|
||||||
"vpkg %%v23,%%v22,%%v31\n\t"
|
"vpkg %%v23,%%v22,%%v31\n\t"
|
||||||
"vperm %%v22,%%v22,%%v31,%%v9\n\t"
|
"vperm %%v22,%%v22,%%v31,%%v9\n\t"
|
||||||
|
|
||||||
"vflpsb %%v16, %%v16\n\t"
|
"vflpsb %%v16, %%v16\n\t"
|
||||||
"vflpsb %%v17, %%v17\n\t"
|
"vflpsb %%v17, %%v17\n\t"
|
||||||
"vflpsb %%v18, %%v18\n\t"
|
"vflpsb %%v18, %%v18\n\t"
|
||||||
|
|
@ -183,14 +163,12 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin)
|
||||||
"vfasb %%v17,%%v18,%%v19\n\t"
|
"vfasb %%v17,%%v18,%%v19\n\t"
|
||||||
"vfasb %%v18,%%v20,%%v21\n\t"
|
"vfasb %%v18,%%v20,%%v21\n\t"
|
||||||
"vfasb %%v19,%%v22,%%v23\n\t"
|
"vfasb %%v19,%%v22,%%v23\n\t"
|
||||||
|
|
||||||
"vfchesb %%v5,%%v17,%%v16\n\t"
|
"vfchesb %%v5,%%v17,%%v16\n\t"
|
||||||
"vfchesb %%v6,%%v19,%%v18\n\t"
|
"vfchesb %%v6,%%v19,%%v18\n\t"
|
||||||
"vsel %%v16,%%v16,%%v17,%%v5\n\t"
|
"vsel %%v16,%%v16,%%v17,%%v5\n\t"
|
||||||
"vsel %%v5,%%v24,%%v25,%%v5\n\t"
|
"vsel %%v5,%%v24,%%v25,%%v5\n\t"
|
||||||
"vsel %%v17,%%v18,%%v19,%%v6\n\t"
|
"vsel %%v17,%%v18,%%v19,%%v6\n\t"
|
||||||
"vsel %%v6,%%v26,%%v27,%%v6\n\t"
|
"vsel %%v6,%%v26,%%v27,%%v6\n\t"
|
||||||
|
|
||||||
"vfchesb %%v18,%%v17,%%v16\n\t"
|
"vfchesb %%v18,%%v17,%%v16\n\t"
|
||||||
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
|
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
|
||||||
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
|
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
|
||||||
|
|
@ -198,7 +176,6 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin)
|
||||||
"vesrlg %%v5,%%v5,32\n\t"
|
"vesrlg %%v5,%%v5,32\n\t"
|
||||||
"vag %%v5,%%v5,%%v4\n\t"
|
"vag %%v5,%%v5,%%v4\n\t"
|
||||||
"vag %%v6,%%v6,%%v4\n\t"
|
"vag %%v6,%%v6,%%v4\n\t"
|
||||||
|
|
||||||
"vfchesb %%v7,%%v16,%%v0\n\t"
|
"vfchesb %%v7,%%v16,%%v0\n\t"
|
||||||
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
|
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
|
||||||
"vsegf %%v8,%%v7\n\t"
|
"vsegf %%v8,%%v7\n\t"
|
||||||
|
|
@ -207,10 +184,8 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin)
|
||||||
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
|
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
|
||||||
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
|
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
|
||||||
"vag %%v4,%%v4,%%v3\n\t"
|
"vag %%v4,%%v4,%%v3\n\t"
|
||||||
|
|
||||||
"agfi %%r1, 256\n\t"
|
"agfi %%r1, 256\n\t"
|
||||||
"brctg %%r0, 0b \n\t"
|
"brctg %[n], 0b\n\t"
|
||||||
|
|
||||||
"veslg %%v3,%%v0,32\n\t"
|
"veslg %%v3,%%v0,32\n\t"
|
||||||
"vfchsb %%v4,%%v3,%%v0\n\t"
|
"vfchsb %%v4,%%v3,%%v0\n\t"
|
||||||
"vchlg %%v5,%%v2,%%v1\n\t"
|
"vchlg %%v5,%%v2,%%v1\n\t"
|
||||||
|
|
@ -221,14 +196,13 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin)
|
||||||
"vesrlg %%v4,%%v4,32\n\t"
|
"vesrlg %%v4,%%v4,32\n\t"
|
||||||
"vsegf %%v4,%%v4\n\t"
|
"vsegf %%v4,%%v4\n\t"
|
||||||
"vsel %%v1,%%v1,%%v2,%%v4\n\t"
|
"vsel %%v1,%%v1,%%v2,%%v4\n\t"
|
||||||
|
|
||||||
"vrepf %%v2,%%v0,2\n\t"
|
"vrepf %%v2,%%v0,2\n\t"
|
||||||
"vrepg %%v3,%%v1,1\n\t"
|
"vrepg %%v3,%%v1,1\n\t"
|
||||||
"wfcsb %%v2,%%v0\n\t"
|
"wfcsb %%v2,%%v0\n\t"
|
||||||
"jne 1f\n\t"
|
"jne 1f\n\t"
|
||||||
"vstef %%v0,%1,0 \n\t"
|
"vstef %%v0,%[amin],0\n\t"
|
||||||
"vmnlg %%v0,%%v1,%%v3\n\t"
|
"vmnlg %%v0,%%v1,%%v3\n\t"
|
||||||
"vlgvg %0,%%v0,0 \n\t"
|
"vlgvg %[iamin],%%v0,0\n\t"
|
||||||
"j 2f\n\t"
|
"j 2f\n\t"
|
||||||
"1:\n\t"
|
"1:\n\t"
|
||||||
"wfchsb %%v4,%%v0,%%v2\n\t"
|
"wfchsb %%v4,%%v0,%%v2\n\t"
|
||||||
|
|
@ -236,27 +210,28 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin)
|
||||||
"vsegf %%v4,%%v4\n\t"
|
"vsegf %%v4,%%v4\n\t"
|
||||||
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
|
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
|
||||||
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
|
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
|
||||||
"ste %%f0,%1 \n\t"
|
"ste %%f0,%[amin]\n\t"
|
||||||
"vlgvg %0,%%v1,0 \n\t"
|
"vlgvg %[iamin],%%v1,0\n\t"
|
||||||
"2:\n\t"
|
"2:\n\t"
|
||||||
"nop"
|
"nop"
|
||||||
:"=r"(iamin),"=m"(*amin)
|
: [iamin] "=r"(iamin),[amin] "=Q"(*amin),[n] "+&r"(n)
|
||||||
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x)
|
: "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
|
||||||
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8",
|
||||||
);
|
"v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
|
||||||
|
"v25", "v26", "v27", "v28", "v29", "v30", "v31");
|
||||||
|
|
||||||
return iamin;
|
return iamin;
|
||||||
}
|
}
|
||||||
|
|
||||||
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
{
|
|
||||||
BLASLONG i = 0;
|
BLASLONG i = 0;
|
||||||
BLASLONG ix = 0;
|
BLASLONG ix = 0;
|
||||||
FLOAT minf = 0;
|
FLOAT minf = 0;
|
||||||
BLASLONG min = 0;
|
BLASLONG min = 0;
|
||||||
BLASLONG inc_x2;
|
BLASLONG inc_x2;
|
||||||
|
|
||||||
if (n <= 0 || inc_x <= 0) return(min);
|
if (n <= 0 || inc_x <= 0)
|
||||||
|
return (min);
|
||||||
|
|
||||||
if (inc_x == 1) {
|
if (inc_x == 1) {
|
||||||
|
|
||||||
|
|
@ -266,18 +241,14 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||||
min = icamin_kernel_32(n1, x, &minf);
|
min = icamin_kernel_32(n1, x, &minf);
|
||||||
ix = n1 * 2;
|
ix = n1 * 2;
|
||||||
i = n1;
|
i = n1;
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
minf = CABS1(x, 0);
|
minf = CABS1(x, 0);
|
||||||
ix += 2;
|
ix += 2;
|
||||||
i++;
|
i++;
|
||||||
}
|
}
|
||||||
|
|
||||||
while(i < n)
|
while (i < n) {
|
||||||
{
|
if (CABS1(x, ix) < minf) {
|
||||||
if( CABS1(x,ix) < minf )
|
|
||||||
{
|
|
||||||
min = i;
|
min = i;
|
||||||
minf = CABS1(x, ix);
|
minf = CABS1(x, ix);
|
||||||
}
|
}
|
||||||
|
|
@ -291,13 +262,35 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||||
min = 0;
|
min = 0;
|
||||||
minf = CABS1(x, 0);
|
minf = CABS1(x, 0);
|
||||||
inc_x2 = 2 * inc_x;
|
inc_x2 = 2 * inc_x;
|
||||||
ix += inc_x2;
|
|
||||||
i++;
|
|
||||||
|
|
||||||
while(i < n)
|
BLASLONG n1 = n & -4;
|
||||||
{
|
while (i < n1) {
|
||||||
if( CABS1(x,ix) < minf )
|
|
||||||
{
|
if (CABS1(x, ix) < minf) {
|
||||||
|
min = i;
|
||||||
|
minf = CABS1(x, ix);
|
||||||
|
}
|
||||||
|
if (CABS1(x, ix + inc_x2) < minf) {
|
||||||
|
min = i + 1;
|
||||||
|
minf = CABS1(x, ix + inc_x2);
|
||||||
|
}
|
||||||
|
if (CABS1(x, ix + 2 * inc_x2) < minf) {
|
||||||
|
min = i + 2;
|
||||||
|
minf = CABS1(x, ix + 2 * inc_x2);
|
||||||
|
}
|
||||||
|
if (CABS1(x, ix + 3 * inc_x2) < minf) {
|
||||||
|
min = i + 3;
|
||||||
|
minf = CABS1(x, ix + 3 * inc_x2);
|
||||||
|
}
|
||||||
|
|
||||||
|
ix += inc_x2 * 4;
|
||||||
|
|
||||||
|
i += 4;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
while (i < n) {
|
||||||
|
if (CABS1(x, ix) < minf) {
|
||||||
min = i;
|
min = i;
|
||||||
minf = CABS1(x, ix);
|
minf = CABS1(x, ix);
|
||||||
}
|
}
|
||||||
|
|
@ -307,5 +300,3 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||||
return (min + 1);
|
return (min + 1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/***************************************************************************
|
/***************************************************************************
|
||||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
modification, are permitted provided that the following conditions are
|
modification, are permitted provided that the following conditions are
|
||||||
|
|
@ -28,18 +28,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
|
|
||||||
#if defined(DOUBLE)
|
|
||||||
#define ABS fabs
|
#define ABS fabs
|
||||||
#else
|
|
||||||
#define ABS fabsf
|
|
||||||
#endif
|
|
||||||
|
|
||||||
static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax)
|
static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) {
|
||||||
{
|
|
||||||
BLASLONG iamax;
|
BLASLONG iamax;
|
||||||
|
|
||||||
__asm__ volatile (
|
__asm__("vl %%v0,0(%[x])\n\t"
|
||||||
"vl %%v0,0(%3) \n\t"
|
|
||||||
"vflpdb %%v0,%%v0\n\t"
|
"vflpdb %%v0,%%v0\n\t"
|
||||||
"vleig %%v1,0,0\n\t"
|
"vleig %%v1,0,0\n\t"
|
||||||
"vleig %%v1,1,1\n\t"
|
"vleig %%v1,1,1\n\t"
|
||||||
|
|
@ -61,19 +55,18 @@ static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax)
|
||||||
"vleig %%v30,13,1\n\t"
|
"vleig %%v30,13,1\n\t"
|
||||||
"vleig %%v31,14,0\n\t"
|
"vleig %%v31,14,0\n\t"
|
||||||
"vleig %%v31,15,1\n\t"
|
"vleig %%v31,15,1\n\t"
|
||||||
"srlg %%r0,%2,5 \n\t"
|
"srlg %[n],%[n],5\n\t"
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 1, 1024(%%r1,%3) \n\t"
|
"pfd 1, 1024(%%r1,%[x])\n\t"
|
||||||
|
"vl %%v16,0(%%r1,%[x])\n\t"
|
||||||
"vl %%v16,0(%%r1,%3) \n\t"
|
"vl %%v17,16(%%r1,%[x])\n\t"
|
||||||
"vl %%v17,16(%%r1,%3) \n\t"
|
"vl %%v18,32(%%r1,%[x])\n\t"
|
||||||
"vl %%v18,32(%%r1,%3) \n\t"
|
"vl %%v19,48(%%r1,%[x])\n\t"
|
||||||
"vl %%v19,48(%%r1,%3) \n\t"
|
"vl %%v20,64(%%r1,%[x])\n\t"
|
||||||
"vl %%v20,64(%%r1,%3) \n\t"
|
"vl %%v21,80(%%r1,%[x])\n\t"
|
||||||
"vl %%v21,80(%%r1,%3) \n\t"
|
"vl %%v22,96(%%r1,%[x])\n\t"
|
||||||
"vl %%v22,96(%%r1,%3) \n\t"
|
"vl %%v23,112(%%r1,%[x])\n\t"
|
||||||
"vl %%v23,112(%%r1,%3) \n\t"
|
|
||||||
"vflpdb %%v16, %%v16\n\t"
|
"vflpdb %%v16, %%v16\n\t"
|
||||||
"vflpdb %%v17, %%v17\n\t"
|
"vflpdb %%v17, %%v17\n\t"
|
||||||
"vflpdb %%v18, %%v18\n\t"
|
"vflpdb %%v18, %%v18\n\t"
|
||||||
|
|
@ -82,7 +75,6 @@ static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax)
|
||||||
"vflpdb %%v21, %%v21\n\t"
|
"vflpdb %%v21, %%v21\n\t"
|
||||||
"vflpdb %%v22, %%v22\n\t"
|
"vflpdb %%v22, %%v22\n\t"
|
||||||
"vflpdb %%v23, %%v23\n\t"
|
"vflpdb %%v23, %%v23\n\t"
|
||||||
|
|
||||||
"vfchedb %%v4,%%v16,%%v17\n\t"
|
"vfchedb %%v4,%%v16,%%v17\n\t"
|
||||||
"vfchedb %%v5,%%v18,%%v19\n\t"
|
"vfchedb %%v5,%%v18,%%v19\n\t"
|
||||||
"vfchedb %%v6,%%v20,%%v21\n\t"
|
"vfchedb %%v6,%%v20,%%v21\n\t"
|
||||||
|
|
@ -95,32 +87,28 @@ static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax)
|
||||||
"vsel %%v6,%%v28,%%v29,%%v6\n\t"
|
"vsel %%v6,%%v28,%%v29,%%v6\n\t"
|
||||||
"vsel %%v19,%%v22,%%v23,%%v7\n\t"
|
"vsel %%v19,%%v22,%%v23,%%v7\n\t"
|
||||||
"vsel %%v7,%%v30,%%v31,%%v7\n\t"
|
"vsel %%v7,%%v30,%%v31,%%v7\n\t"
|
||||||
|
|
||||||
"vfchedb %%v20,%%v16,%%v17\n\t"
|
"vfchedb %%v20,%%v16,%%v17\n\t"
|
||||||
"vfchedb %%v21,%%v18,%%v19\n\t"
|
"vfchedb %%v21,%%v18,%%v19\n\t"
|
||||||
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
|
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
|
||||||
"vsel %%v4,%%v4,%%v5,%%v20\n\t"
|
"vsel %%v4,%%v4,%%v5,%%v20\n\t"
|
||||||
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
|
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
|
||||||
"vsel %%v5,%%v6,%%v7,%%v21\n\t"
|
"vsel %%v5,%%v6,%%v7,%%v21\n\t"
|
||||||
|
|
||||||
"vfchedb %%v18,%%v16,%%v17\n\t"
|
"vfchedb %%v18,%%v16,%%v17\n\t"
|
||||||
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
|
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
|
||||||
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
|
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
|
||||||
"vag %%v4,%%v4,%%v3\n\t"
|
"vag %%v4,%%v4,%%v3\n\t"
|
||||||
|
|
||||||
"vfchedb %%v5,%%v0,%%v16\n\t"
|
"vfchedb %%v5,%%v0,%%v16\n\t"
|
||||||
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
|
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
|
||||||
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
|
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
|
||||||
"vag %%v3,%%v3,%%v2\n\t"
|
"vag %%v3,%%v3,%%v2\n\t"
|
||||||
|
"vl %%v16,128(%%r1,%[x])\n\t"
|
||||||
"vl %%v16,128(%%r1,%3) \n\t"
|
"vl %%v17,144(%%r1,%[x])\n\t"
|
||||||
"vl %%v17,144(%%r1,%3) \n\t"
|
"vl %%v18,160(%%r1,%[x])\n\t"
|
||||||
"vl %%v18,160(%%r1,%3) \n\t"
|
"vl %%v19,176(%%r1,%[x])\n\t"
|
||||||
"vl %%v19,176(%%r1,%3) \n\t"
|
"vl %%v20,192(%%r1,%[x])\n\t"
|
||||||
"vl %%v20,192(%%r1,%3) \n\t"
|
"vl %%v21,208(%%r1,%[x])\n\t"
|
||||||
"vl %%v21,208(%%r1,%3) \n\t"
|
"vl %%v22,224(%%r1,%[x])\n\t"
|
||||||
"vl %%v22,224(%%r1,%3) \n\t"
|
"vl %%v23,240(%%r1,%[x])\n\t"
|
||||||
"vl %%v23,240(%%r1,%3) \n\t"
|
|
||||||
"vflpdb %%v16, %%v16\n\t"
|
"vflpdb %%v16, %%v16\n\t"
|
||||||
"vflpdb %%v17, %%v17\n\t"
|
"vflpdb %%v17, %%v17\n\t"
|
||||||
"vflpdb %%v18, %%v18\n\t"
|
"vflpdb %%v18, %%v18\n\t"
|
||||||
|
|
@ -129,7 +117,6 @@ static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax)
|
||||||
"vflpdb %%v21, %%v21\n\t"
|
"vflpdb %%v21, %%v21\n\t"
|
||||||
"vflpdb %%v22, %%v22\n\t"
|
"vflpdb %%v22, %%v22\n\t"
|
||||||
"vflpdb %%v23, %%v23\n\t"
|
"vflpdb %%v23, %%v23\n\t"
|
||||||
|
|
||||||
"vfchedb %%v4,%%v16,%%v17\n\t"
|
"vfchedb %%v4,%%v16,%%v17\n\t"
|
||||||
"vfchedb %%v5,%%v18,%%v19\n\t"
|
"vfchedb %%v5,%%v18,%%v19\n\t"
|
||||||
"vfchedb %%v6,%%v20,%%v21\n\t"
|
"vfchedb %%v6,%%v20,%%v21\n\t"
|
||||||
|
|
@ -142,47 +129,43 @@ static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax)
|
||||||
"vsel %%v6,%%v28,%%v29,%%v6\n\t"
|
"vsel %%v6,%%v28,%%v29,%%v6\n\t"
|
||||||
"vsel %%v19,%%v22,%%v23,%%v7\n\t"
|
"vsel %%v19,%%v22,%%v23,%%v7\n\t"
|
||||||
"vsel %%v7,%%v30,%%v31,%%v7\n\t"
|
"vsel %%v7,%%v30,%%v31,%%v7\n\t"
|
||||||
|
|
||||||
"vfchedb %%v20,%%v16,%%v17\n\t"
|
"vfchedb %%v20,%%v16,%%v17\n\t"
|
||||||
"vfchedb %%v21,%%v18,%%v19\n\t"
|
"vfchedb %%v21,%%v18,%%v19\n\t"
|
||||||
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
|
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
|
||||||
"vsel %%v4,%%v4,%%v5,%%v20\n\t"
|
"vsel %%v4,%%v4,%%v5,%%v20\n\t"
|
||||||
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
|
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
|
||||||
"vsel %%v5,%%v6,%%v7,%%v21\n\t"
|
"vsel %%v5,%%v6,%%v7,%%v21\n\t"
|
||||||
|
|
||||||
"vfchedb %%v18,%%v16,%%v17\n\t"
|
"vfchedb %%v18,%%v16,%%v17\n\t"
|
||||||
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
|
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
|
||||||
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
|
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
|
||||||
"vag %%v4,%%v4,%%v3\n\t"
|
"vag %%v4,%%v4,%%v3\n\t"
|
||||||
|
|
||||||
"vfchedb %%v5,%%v0,%%v16\n\t"
|
"vfchedb %%v5,%%v0,%%v16\n\t"
|
||||||
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
|
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
|
||||||
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
|
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
|
||||||
"vag %%v3,%%v3,%%v2\n\t"
|
"vag %%v3,%%v3,%%v2\n\t"
|
||||||
|
|
||||||
"agfi %%r1, 256\n\t"
|
"agfi %%r1, 256\n\t"
|
||||||
"brctg %%r0, 0b \n\t"
|
"brctg %[n], 0b\n\t"
|
||||||
|
|
||||||
"vrepg %%v2,%%v0,1\n\t"
|
"vrepg %%v2,%%v0,1\n\t"
|
||||||
"vrepg %%v3,%%v1,1\n\t"
|
"vrepg %%v3,%%v1,1\n\t"
|
||||||
"wfcdb %%v2,%%v0\n\t"
|
"wfcdb %%v2,%%v0\n\t"
|
||||||
"jne 1f\n\t"
|
"jne 1f\n\t"
|
||||||
"vsteg %%v0,%1,0 \n\t"
|
"vsteg %%v0,%[amax],0\n\t"
|
||||||
"vmnlg %%v0,%%v1,%%v3\n\t"
|
"vmnlg %%v0,%%v1,%%v3\n\t"
|
||||||
"vlgvg %0,%%v0,0 \n\t"
|
"vlgvg %[iamax],%%v0,0\n\t"
|
||||||
"j 2f\n\t"
|
"j 2f\n\t"
|
||||||
"1:\n\t"
|
"1:\n\t"
|
||||||
"wfchdb %%v4,%%v2,%%v0\n\t"
|
"wfchdb %%v4,%%v2,%%v0\n\t"
|
||||||
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
|
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
|
||||||
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
|
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
|
||||||
"std %%f0,%1 \n\t"
|
"std %%f0,%[amax]\n\t"
|
||||||
"vlgvg %0,%%v1,0 \n\t"
|
"vlgvg %[iamax],%%v1,0\n\t"
|
||||||
"2:\n\t"
|
"2:\n\t"
|
||||||
"nop"
|
"nop"
|
||||||
:"=r"(iamax),"=m"(*amax)
|
: [iamax] "=r"(iamax),[amax] "=Q"(*amax),[n] "+&r"(n)
|
||||||
:"r"(n),"ZR"((const FLOAT (*)[n])x)
|
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
|
||||||
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
|
||||||
);
|
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
|
||||||
|
"v27", "v28", "v29", "v30", "v31");
|
||||||
|
|
||||||
return iamax;
|
return iamax;
|
||||||
}
|
}
|
||||||
|
|
@ -193,7 +176,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
FLOAT maxf = 0.0;
|
FLOAT maxf = 0.0;
|
||||||
BLASLONG max = 0;
|
BLASLONG max = 0;
|
||||||
|
|
||||||
if (n <= 0 || inc_x <= 0) return (max);
|
if (n <= 0 || inc_x <= 0)
|
||||||
|
return (max);
|
||||||
|
|
||||||
if (inc_x == 1) {
|
if (inc_x == 1) {
|
||||||
|
|
||||||
|
|
@ -203,9 +187,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
max = idamax_kernel_32(n1, x, &maxf);
|
max = idamax_kernel_32(n1, x, &maxf);
|
||||||
|
|
||||||
i = n1;
|
i = n1;
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
maxf = ABS(x[0]);
|
maxf = ABS(x[0]);
|
||||||
i++;
|
i++;
|
||||||
}
|
}
|
||||||
|
|
@ -250,7 +232,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
while (j < n) {
|
while (j < n) {
|
||||||
if (ABS(x[i]) > maxf) {
|
if (ABS(x[i]) > maxf) {
|
||||||
max = j;
|
max = j;
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/***************************************************************************
|
/***************************************************************************
|
||||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
modification, are permitted provided that the following conditions are
|
modification, are permitted provided that the following conditions are
|
||||||
|
|
@ -28,18 +28,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
|
|
||||||
#if defined(DOUBLE)
|
|
||||||
#define ABS fabs
|
#define ABS fabs
|
||||||
#else
|
|
||||||
#define ABS fabsf
|
|
||||||
#endif
|
|
||||||
|
|
||||||
static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin)
|
static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) {
|
||||||
{
|
|
||||||
BLASLONG iamin;
|
BLASLONG iamin;
|
||||||
|
|
||||||
__asm__ volatile (
|
__asm__("vl %%v0,0(%[x])\n\t"
|
||||||
"vl %%v0,0(%3) \n\t"
|
|
||||||
"vflpdb %%v0,%%v0\n\t"
|
"vflpdb %%v0,%%v0\n\t"
|
||||||
"vleig %%v1,0,0\n\t"
|
"vleig %%v1,0,0\n\t"
|
||||||
"vleig %%v1,1,1\n\t"
|
"vleig %%v1,1,1\n\t"
|
||||||
|
|
@ -61,19 +55,18 @@ static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin)
|
||||||
"vleig %%v30,13,1\n\t"
|
"vleig %%v30,13,1\n\t"
|
||||||
"vleig %%v31,14,0\n\t"
|
"vleig %%v31,14,0\n\t"
|
||||||
"vleig %%v31,15,1\n\t"
|
"vleig %%v31,15,1\n\t"
|
||||||
"srlg %%r0,%2,5 \n\t"
|
"srlg %[n],%[n],5\n\t"
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 1, 1024(%%r1,%3) \n\t"
|
"pfd 1, 1024(%%r1,%[x])\n\t"
|
||||||
|
"vl %%v16,0(%%r1,%[x])\n\t"
|
||||||
"vl %%v16,0(%%r1,%3) \n\t"
|
"vl %%v17,16(%%r1,%[x])\n\t"
|
||||||
"vl %%v17,16(%%r1,%3) \n\t"
|
"vl %%v18,32(%%r1,%[x])\n\t"
|
||||||
"vl %%v18,32(%%r1,%3) \n\t"
|
"vl %%v19,48(%%r1,%[x])\n\t"
|
||||||
"vl %%v19,48(%%r1,%3) \n\t"
|
"vl %%v20,64(%%r1,%[x])\n\t"
|
||||||
"vl %%v20,64(%%r1,%3) \n\t"
|
"vl %%v21,80(%%r1,%[x])\n\t"
|
||||||
"vl %%v21,80(%%r1,%3) \n\t"
|
"vl %%v22,96(%%r1,%[x])\n\t"
|
||||||
"vl %%v22,96(%%r1,%3) \n\t"
|
"vl %%v23,112(%%r1,%[x])\n\t"
|
||||||
"vl %%v23,112(%%r1,%3) \n\t"
|
|
||||||
"vflpdb %%v16, %%v16\n\t"
|
"vflpdb %%v16, %%v16\n\t"
|
||||||
"vflpdb %%v17, %%v17\n\t"
|
"vflpdb %%v17, %%v17\n\t"
|
||||||
"vflpdb %%v18, %%v18\n\t"
|
"vflpdb %%v18, %%v18\n\t"
|
||||||
|
|
@ -82,7 +75,6 @@ static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin)
|
||||||
"vflpdb %%v21, %%v21\n\t"
|
"vflpdb %%v21, %%v21\n\t"
|
||||||
"vflpdb %%v22, %%v22\n\t"
|
"vflpdb %%v22, %%v22\n\t"
|
||||||
"vflpdb %%v23, %%v23\n\t"
|
"vflpdb %%v23, %%v23\n\t"
|
||||||
|
|
||||||
"vfchedb %%v4,%%v17,%%v16\n\t"
|
"vfchedb %%v4,%%v17,%%v16\n\t"
|
||||||
"vfchedb %%v5,%%v19,%%v18\n\t"
|
"vfchedb %%v5,%%v19,%%v18\n\t"
|
||||||
"vfchedb %%v6,%%v21,%%v20\n\t"
|
"vfchedb %%v6,%%v21,%%v20\n\t"
|
||||||
|
|
@ -95,32 +87,28 @@ static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin)
|
||||||
"vsel %%v6,%%v28,%%v29,%%v6\n\t"
|
"vsel %%v6,%%v28,%%v29,%%v6\n\t"
|
||||||
"vsel %%v19,%%v22,%%v23,%%v7\n\t"
|
"vsel %%v19,%%v22,%%v23,%%v7\n\t"
|
||||||
"vsel %%v7,%%v30,%%v31,%%v7\n\t"
|
"vsel %%v7,%%v30,%%v31,%%v7\n\t"
|
||||||
|
|
||||||
"vfchedb %%v20,%%v17,%%v16\n\t"
|
"vfchedb %%v20,%%v17,%%v16\n\t"
|
||||||
"vfchedb %%v21,%%v19,%%v18\n\t"
|
"vfchedb %%v21,%%v19,%%v18\n\t"
|
||||||
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
|
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
|
||||||
"vsel %%v4,%%v4,%%v5,%%v20\n\t"
|
"vsel %%v4,%%v4,%%v5,%%v20\n\t"
|
||||||
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
|
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
|
||||||
"vsel %%v5,%%v6,%%v7,%%v21\n\t"
|
"vsel %%v5,%%v6,%%v7,%%v21\n\t"
|
||||||
|
|
||||||
"vfchedb %%v18,%%v17,%%v16\n\t"
|
"vfchedb %%v18,%%v17,%%v16\n\t"
|
||||||
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
|
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
|
||||||
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
|
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
|
||||||
"vag %%v4,%%v4,%%v3\n\t"
|
"vag %%v4,%%v4,%%v3\n\t"
|
||||||
|
|
||||||
"vfchedb %%v5,%%v16,%%v0\n\t"
|
"vfchedb %%v5,%%v16,%%v0\n\t"
|
||||||
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
|
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
|
||||||
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
|
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
|
||||||
"vag %%v3,%%v3,%%v2\n\t"
|
"vag %%v3,%%v3,%%v2\n\t"
|
||||||
|
"vl %%v16,128(%%r1,%[x])\n\t"
|
||||||
"vl %%v16,128(%%r1,%3) \n\t"
|
"vl %%v17,144(%%r1,%[x])\n\t"
|
||||||
"vl %%v17,144(%%r1,%3) \n\t"
|
"vl %%v18,160(%%r1,%[x])\n\t"
|
||||||
"vl %%v18,160(%%r1,%3) \n\t"
|
"vl %%v19,176(%%r1,%[x])\n\t"
|
||||||
"vl %%v19,176(%%r1,%3) \n\t"
|
"vl %%v20,192(%%r1,%[x])\n\t"
|
||||||
"vl %%v20,192(%%r1,%3) \n\t"
|
"vl %%v21,208(%%r1,%[x])\n\t"
|
||||||
"vl %%v21,208(%%r1,%3) \n\t"
|
"vl %%v22,224(%%r1,%[x])\n\t"
|
||||||
"vl %%v22,224(%%r1,%3) \n\t"
|
"vl %%v23,240(%%r1,%[x])\n\t"
|
||||||
"vl %%v23,240(%%r1,%3) \n\t"
|
|
||||||
"vflpdb %%v16, %%v16\n\t"
|
"vflpdb %%v16, %%v16\n\t"
|
||||||
"vflpdb %%v17, %%v17\n\t"
|
"vflpdb %%v17, %%v17\n\t"
|
||||||
"vflpdb %%v18, %%v18\n\t"
|
"vflpdb %%v18, %%v18\n\t"
|
||||||
|
|
@ -129,7 +117,6 @@ static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin)
|
||||||
"vflpdb %%v21, %%v21\n\t"
|
"vflpdb %%v21, %%v21\n\t"
|
||||||
"vflpdb %%v22, %%v22\n\t"
|
"vflpdb %%v22, %%v22\n\t"
|
||||||
"vflpdb %%v23, %%v23\n\t"
|
"vflpdb %%v23, %%v23\n\t"
|
||||||
|
|
||||||
"vfchedb %%v4,%%v17,%%v16\n\t"
|
"vfchedb %%v4,%%v17,%%v16\n\t"
|
||||||
"vfchedb %%v5,%%v19,%%v18\n\t"
|
"vfchedb %%v5,%%v19,%%v18\n\t"
|
||||||
"vfchedb %%v6,%%v21,%%v20\n\t"
|
"vfchedb %%v6,%%v21,%%v20\n\t"
|
||||||
|
|
@ -142,47 +129,43 @@ static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin)
|
||||||
"vsel %%v6,%%v28,%%v29,%%v6\n\t"
|
"vsel %%v6,%%v28,%%v29,%%v6\n\t"
|
||||||
"vsel %%v19,%%v22,%%v23,%%v7\n\t"
|
"vsel %%v19,%%v22,%%v23,%%v7\n\t"
|
||||||
"vsel %%v7,%%v30,%%v31,%%v7\n\t"
|
"vsel %%v7,%%v30,%%v31,%%v7\n\t"
|
||||||
|
|
||||||
"vfchedb %%v20,%%v17,%%v16\n\t"
|
"vfchedb %%v20,%%v17,%%v16\n\t"
|
||||||
"vfchedb %%v21,%%v19,%%v18\n\t"
|
"vfchedb %%v21,%%v19,%%v18\n\t"
|
||||||
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
|
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
|
||||||
"vsel %%v4,%%v4,%%v5,%%v20\n\t"
|
"vsel %%v4,%%v4,%%v5,%%v20\n\t"
|
||||||
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
|
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
|
||||||
"vsel %%v5,%%v6,%%v7,%%v21\n\t"
|
"vsel %%v5,%%v6,%%v7,%%v21\n\t"
|
||||||
|
|
||||||
"vfchedb %%v18,%%v17,%%v16\n\t"
|
"vfchedb %%v18,%%v17,%%v16\n\t"
|
||||||
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
|
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
|
||||||
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
|
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
|
||||||
"vag %%v4,%%v4,%%v3\n\t"
|
"vag %%v4,%%v4,%%v3\n\t"
|
||||||
|
|
||||||
"vfchedb %%v5,%%v16,%%v0\n\t"
|
"vfchedb %%v5,%%v16,%%v0\n\t"
|
||||||
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
|
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
|
||||||
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
|
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
|
||||||
"vag %%v3,%%v3,%%v2\n\t"
|
"vag %%v3,%%v3,%%v2\n\t"
|
||||||
|
|
||||||
"agfi %%r1, 256\n\t"
|
"agfi %%r1, 256\n\t"
|
||||||
"brctg %%r0, 0b \n\t"
|
"brctg %[n], 0b\n\t"
|
||||||
|
|
||||||
"vrepg %%v2,%%v0,1\n\t"
|
"vrepg %%v2,%%v0,1\n\t"
|
||||||
"vrepg %%v3,%%v1,1\n\t"
|
"vrepg %%v3,%%v1,1\n\t"
|
||||||
"wfcdb %%v2,%%v0\n\t"
|
"wfcdb %%v2,%%v0\n\t"
|
||||||
"jne 1f\n\t"
|
"jne 1f\n\t"
|
||||||
"vsteg %%v0,%1,0 \n\t"
|
"vsteg %%v0,%[amin],0\n\t"
|
||||||
"vmnlg %%v0,%%v1,%%v3\n\t"
|
"vmnlg %%v0,%%v1,%%v3\n\t"
|
||||||
"vlgvg %0,%%v0,0 \n\t"
|
"vlgvg %[iamin],%%v0,0\n\t"
|
||||||
"j 2f\n\t"
|
"j 2f\n\t"
|
||||||
"1:\n\t"
|
"1:\n\t"
|
||||||
"wfchdb %%v4,%%v0,%%v2\n\t"
|
"wfchdb %%v4,%%v0,%%v2\n\t"
|
||||||
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
|
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
|
||||||
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
|
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
|
||||||
"std %%f0,%1 \n\t"
|
"std %%f0,%[amin]\n\t"
|
||||||
"vlgvg %0,%%v1,0 \n\t"
|
"vlgvg %[iamin],%%v1,0\n\t"
|
||||||
"2:\n\t"
|
"2:\n\t"
|
||||||
"nop"
|
"nop"
|
||||||
:"=r"(iamin),"=m"(*amin)
|
: [iamin] "=r"(iamin),[amin] "=Q"(*amin),[n] "+&r"(n)
|
||||||
:"r"(n),"ZR"((const FLOAT (*)[n])x)
|
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
|
||||||
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
|
||||||
);
|
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
|
||||||
|
"v27", "v28", "v29", "v30", "v31");
|
||||||
|
|
||||||
return iamin;
|
return iamin;
|
||||||
}
|
}
|
||||||
|
|
@ -193,7 +176,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
FLOAT minf = 0.0;
|
FLOAT minf = 0.0;
|
||||||
BLASLONG min = 0;
|
BLASLONG min = 0;
|
||||||
|
|
||||||
if (n <= 0 || inc_x <= 0) return (min);
|
if (n <= 0 || inc_x <= 0)
|
||||||
|
return (min);
|
||||||
|
|
||||||
if (inc_x == 1) {
|
if (inc_x == 1) {
|
||||||
|
|
||||||
|
|
@ -203,9 +187,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
min = idamin_kernel_32(n1, x, &minf);
|
min = idamin_kernel_32(n1, x, &minf);
|
||||||
|
|
||||||
i = n1;
|
i = n1;
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
minf = ABS(x[0]);
|
minf = ABS(x[0]);
|
||||||
i++;
|
i++;
|
||||||
}
|
}
|
||||||
|
|
@ -250,7 +232,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
while (j < n) {
|
while (j < n) {
|
||||||
if (ABS(x[i]) < minf) {
|
if (ABS(x[i]) < minf) {
|
||||||
min = j;
|
min = j;
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/***************************************************************************
|
/***************************************************************************
|
||||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
modification, are permitted provided that the following conditions are
|
modification, are permitted provided that the following conditions are
|
||||||
|
|
@ -27,12 +27,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max)
|
static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) {
|
||||||
{
|
|
||||||
BLASLONG imax;
|
BLASLONG imax;
|
||||||
|
|
||||||
__asm__ volatile (
|
__asm__("vl %%v0,0(%[x])\n\t"
|
||||||
"vl %%v0,0(%3) \n\t"
|
|
||||||
"vleig %%v1,0,0\n\t"
|
"vleig %%v1,0,0\n\t"
|
||||||
"vleig %%v1,1,1\n\t"
|
"vleig %%v1,1,1\n\t"
|
||||||
"vrepig %%v2,16\n\t"
|
"vrepig %%v2,16\n\t"
|
||||||
|
|
@ -53,20 +51,18 @@ static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max)
|
||||||
"vleig %%v30,13,1\n\t"
|
"vleig %%v30,13,1\n\t"
|
||||||
"vleig %%v31,14,0\n\t"
|
"vleig %%v31,14,0\n\t"
|
||||||
"vleig %%v31,15,1\n\t"
|
"vleig %%v31,15,1\n\t"
|
||||||
"srlg %%r0,%2,5 \n\t"
|
"srlg %[n],%[n],5\n\t"
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 1, 1024(%%r1,%3) \n\t"
|
"pfd 1, 1024(%%r1,%[x])\n\t"
|
||||||
|
"vl %%v16,0(%%r1,%[x])\n\t"
|
||||||
"vl %%v16,0(%%r1,%3) \n\t"
|
"vl %%v17,16(%%r1,%[x])\n\t"
|
||||||
"vl %%v17,16(%%r1,%3) \n\t"
|
"vl %%v18,32(%%r1,%[x])\n\t"
|
||||||
"vl %%v18,32(%%r1,%3) \n\t"
|
"vl %%v19,48(%%r1,%[x])\n\t"
|
||||||
"vl %%v19,48(%%r1,%3) \n\t"
|
"vl %%v20,64(%%r1,%[x])\n\t"
|
||||||
"vl %%v20,64(%%r1,%3) \n\t"
|
"vl %%v21,80(%%r1,%[x])\n\t"
|
||||||
"vl %%v21,80(%%r1,%3) \n\t"
|
"vl %%v22,96(%%r1,%[x])\n\t"
|
||||||
"vl %%v22,96(%%r1,%3) \n\t"
|
"vl %%v23,112(%%r1,%[x])\n\t"
|
||||||
"vl %%v23,112(%%r1,%3) \n\t"
|
|
||||||
|
|
||||||
"vfchedb %%v4,%%v16,%%v17\n\t"
|
"vfchedb %%v4,%%v16,%%v17\n\t"
|
||||||
"vfchedb %%v5,%%v18,%%v19\n\t"
|
"vfchedb %%v5,%%v18,%%v19\n\t"
|
||||||
"vfchedb %%v6,%%v20,%%v21\n\t"
|
"vfchedb %%v6,%%v20,%%v21\n\t"
|
||||||
|
|
@ -79,33 +75,28 @@ static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max)
|
||||||
"vsel %%v6,%%v28,%%v29,%%v6\n\t"
|
"vsel %%v6,%%v28,%%v29,%%v6\n\t"
|
||||||
"vsel %%v19,%%v22,%%v23,%%v7\n\t"
|
"vsel %%v19,%%v22,%%v23,%%v7\n\t"
|
||||||
"vsel %%v7,%%v30,%%v31,%%v7\n\t"
|
"vsel %%v7,%%v30,%%v31,%%v7\n\t"
|
||||||
|
|
||||||
"vfchedb %%v20,%%v16,%%v17\n\t"
|
"vfchedb %%v20,%%v16,%%v17\n\t"
|
||||||
"vfchedb %%v21,%%v18,%%v19\n\t"
|
"vfchedb %%v21,%%v18,%%v19\n\t"
|
||||||
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
|
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
|
||||||
"vsel %%v4,%%v4,%%v5,%%v20\n\t"
|
"vsel %%v4,%%v4,%%v5,%%v20\n\t"
|
||||||
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
|
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
|
||||||
"vsel %%v5,%%v6,%%v7,%%v21\n\t"
|
"vsel %%v5,%%v6,%%v7,%%v21\n\t"
|
||||||
|
|
||||||
"vfchedb %%v18,%%v16,%%v17\n\t"
|
"vfchedb %%v18,%%v16,%%v17\n\t"
|
||||||
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
|
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
|
||||||
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
|
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
|
||||||
"vag %%v4,%%v4,%%v3\n\t"
|
"vag %%v4,%%v4,%%v3\n\t"
|
||||||
|
|
||||||
"vfchedb %%v5,%%v0,%%v16\n\t"
|
"vfchedb %%v5,%%v0,%%v16\n\t"
|
||||||
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
|
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
|
||||||
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
|
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
|
||||||
"vag %%v3,%%v3,%%v2\n\t"
|
"vag %%v3,%%v3,%%v2\n\t"
|
||||||
|
"vl %%v16,128(%%r1,%[x])\n\t"
|
||||||
"vl %%v16,128(%%r1,%3) \n\t"
|
"vl %%v17,144(%%r1,%[x])\n\t"
|
||||||
"vl %%v17,144(%%r1,%3) \n\t"
|
"vl %%v18,160(%%r1,%[x])\n\t"
|
||||||
"vl %%v18,160(%%r1,%3) \n\t"
|
"vl %%v19,176(%%r1,%[x])\n\t"
|
||||||
"vl %%v19,176(%%r1,%3) \n\t"
|
"vl %%v20,192(%%r1,%[x])\n\t"
|
||||||
"vl %%v20,192(%%r1,%3) \n\t"
|
"vl %%v21,208(%%r1,%[x])\n\t"
|
||||||
"vl %%v21,208(%%r1,%3) \n\t"
|
"vl %%v22,224(%%r1,%[x])\n\t"
|
||||||
"vl %%v22,224(%%r1,%3) \n\t"
|
"vl %%v23,240(%%r1,%[x])\n\t"
|
||||||
"vl %%v23,240(%%r1,%3) \n\t"
|
|
||||||
|
|
||||||
"vfchedb %%v4,%%v16,%%v17\n\t"
|
"vfchedb %%v4,%%v16,%%v17\n\t"
|
||||||
"vfchedb %%v5,%%v18,%%v19\n\t"
|
"vfchedb %%v5,%%v18,%%v19\n\t"
|
||||||
"vfchedb %%v6,%%v20,%%v21\n\t"
|
"vfchedb %%v6,%%v20,%%v21\n\t"
|
||||||
|
|
@ -118,47 +109,43 @@ static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max)
|
||||||
"vsel %%v6,%%v28,%%v29,%%v6\n\t"
|
"vsel %%v6,%%v28,%%v29,%%v6\n\t"
|
||||||
"vsel %%v19,%%v22,%%v23,%%v7\n\t"
|
"vsel %%v19,%%v22,%%v23,%%v7\n\t"
|
||||||
"vsel %%v7,%%v30,%%v31,%%v7\n\t"
|
"vsel %%v7,%%v30,%%v31,%%v7\n\t"
|
||||||
|
|
||||||
"vfchedb %%v20,%%v16,%%v17\n\t"
|
"vfchedb %%v20,%%v16,%%v17\n\t"
|
||||||
"vfchedb %%v21,%%v18,%%v19\n\t"
|
"vfchedb %%v21,%%v18,%%v19\n\t"
|
||||||
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
|
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
|
||||||
"vsel %%v4,%%v4,%%v5,%%v20\n\t"
|
"vsel %%v4,%%v4,%%v5,%%v20\n\t"
|
||||||
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
|
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
|
||||||
"vsel %%v5,%%v6,%%v7,%%v21\n\t"
|
"vsel %%v5,%%v6,%%v7,%%v21\n\t"
|
||||||
|
|
||||||
"vfchedb %%v18,%%v16,%%v17\n\t"
|
"vfchedb %%v18,%%v16,%%v17\n\t"
|
||||||
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
|
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
|
||||||
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
|
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
|
||||||
"vag %%v4,%%v4,%%v3\n\t"
|
"vag %%v4,%%v4,%%v3\n\t"
|
||||||
|
|
||||||
"vfchedb %%v5,%%v0,%%v16\n\t"
|
"vfchedb %%v5,%%v0,%%v16\n\t"
|
||||||
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
|
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
|
||||||
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
|
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
|
||||||
"vag %%v3,%%v3,%%v2\n\t"
|
"vag %%v3,%%v3,%%v2\n\t"
|
||||||
|
|
||||||
"agfi %%r1, 256\n\t"
|
"agfi %%r1, 256\n\t"
|
||||||
"brctg %%r0, 0b \n\t"
|
"brctg %[n], 0b\n\t"
|
||||||
|
|
||||||
"vrepg %%v2,%%v0,1\n\t"
|
"vrepg %%v2,%%v0,1\n\t"
|
||||||
"vrepg %%v3,%%v1,1\n\t"
|
"vrepg %%v3,%%v1,1\n\t"
|
||||||
"wfcdb %%v2,%%v0\n\t"
|
"wfcdb %%v2,%%v0\n\t"
|
||||||
"jne 1f\n\t"
|
"jne 1f\n\t"
|
||||||
"vsteg %%v0,%1,0 \n\t"
|
"vsteg %%v0,%[max],0\n\t"
|
||||||
"vmnlg %%v0,%%v1,%%v3\n\t"
|
"vmnlg %%v0,%%v1,%%v3\n\t"
|
||||||
"vlgvg %0,%%v0,0 \n\t"
|
"vlgvg %[imax],%%v0,0\n\t"
|
||||||
"j 2f\n\t"
|
"j 2f\n\t"
|
||||||
"1:\n\t"
|
"1:\n\t"
|
||||||
"wfchdb %%v4,%%v2,%%v0\n\t"
|
"wfchdb %%v4,%%v2,%%v0\n\t"
|
||||||
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
|
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
|
||||||
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
|
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
|
||||||
"std %%f0,%1 \n\t"
|
"std %%f0,%[max]\n\t"
|
||||||
"vlgvg %0,%%v1,0 \n\t"
|
"vlgvg %[imax],%%v1,0\n\t"
|
||||||
"2:\n\t"
|
"2:\n\t"
|
||||||
"nop"
|
"nop"
|
||||||
:"=r"(imax),"=m"(*max)
|
: [imax] "=r"(imax),[max] "=Q"(*max),[n] "+&r"(n)
|
||||||
:"r"(n),"ZR"((const FLOAT (*)[n])x)
|
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
|
||||||
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
|
||||||
);
|
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
|
||||||
|
"v27", "v28", "v29", "v30", "v31");
|
||||||
|
|
||||||
return imax;
|
return imax;
|
||||||
}
|
}
|
||||||
|
|
@ -169,7 +156,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
FLOAT maxf = 0.0;
|
FLOAT maxf = 0.0;
|
||||||
BLASLONG max = 0;
|
BLASLONG max = 0;
|
||||||
|
|
||||||
if (n <= 0 || inc_x <= 0) return (max);
|
if (n <= 0 || inc_x <= 0)
|
||||||
|
return (max);
|
||||||
|
|
||||||
if (inc_x == 1) {
|
if (inc_x == 1) {
|
||||||
|
|
||||||
|
|
@ -179,9 +167,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
max = idmax_kernel_32(n1, x, &maxf);
|
max = idmax_kernel_32(n1, x, &maxf);
|
||||||
|
|
||||||
i = n1;
|
i = n1;
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
maxf = x[0];
|
maxf = x[0];
|
||||||
i++;
|
i++;
|
||||||
}
|
}
|
||||||
|
|
@ -226,7 +212,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
while (j < n) {
|
while (j < n) {
|
||||||
if (x[i] > maxf) {
|
if (x[i] > maxf) {
|
||||||
max = j;
|
max = j;
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/***************************************************************************
|
/***************************************************************************
|
||||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
modification, are permitted provided that the following conditions are
|
modification, are permitted provided that the following conditions are
|
||||||
|
|
@ -27,12 +27,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min)
|
static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) {
|
||||||
{
|
|
||||||
BLASLONG imin;
|
BLASLONG imin;
|
||||||
|
|
||||||
__asm__ volatile (
|
__asm__("vl %%v0,0(%[x])\n\t"
|
||||||
"vl %%v0,0(%3) \n\t"
|
|
||||||
"vleig %%v1,0,0\n\t"
|
"vleig %%v1,0,0\n\t"
|
||||||
"vleig %%v1,1,1\n\t"
|
"vleig %%v1,1,1\n\t"
|
||||||
"vrepig %%v2,16\n\t"
|
"vrepig %%v2,16\n\t"
|
||||||
|
|
@ -53,20 +51,18 @@ static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min)
|
||||||
"vleig %%v30,13,1\n\t"
|
"vleig %%v30,13,1\n\t"
|
||||||
"vleig %%v31,14,0\n\t"
|
"vleig %%v31,14,0\n\t"
|
||||||
"vleig %%v31,15,1\n\t"
|
"vleig %%v31,15,1\n\t"
|
||||||
"srlg %%r0,%2,5 \n\t"
|
"srlg %[n],%[n],5\n\t"
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 1, 1024(%%r1,%3) \n\t"
|
"pfd 1, 1024(%%r1,%[x])\n\t"
|
||||||
|
"vl %%v16,0(%%r1,%[x])\n\t"
|
||||||
"vl %%v16,0(%%r1,%3) \n\t"
|
"vl %%v17,16(%%r1,%[x])\n\t"
|
||||||
"vl %%v17,16(%%r1,%3) \n\t"
|
"vl %%v18,32(%%r1,%[x])\n\t"
|
||||||
"vl %%v18,32(%%r1,%3) \n\t"
|
"vl %%v19,48(%%r1,%[x])\n\t"
|
||||||
"vl %%v19,48(%%r1,%3) \n\t"
|
"vl %%v20,64(%%r1,%[x])\n\t"
|
||||||
"vl %%v20,64(%%r1,%3) \n\t"
|
"vl %%v21,80(%%r1,%[x])\n\t"
|
||||||
"vl %%v21,80(%%r1,%3) \n\t"
|
"vl %%v22,96(%%r1,%[x])\n\t"
|
||||||
"vl %%v22,96(%%r1,%3) \n\t"
|
"vl %%v23,112(%%r1,%[x])\n\t"
|
||||||
"vl %%v23,112(%%r1,%3) \n\t"
|
|
||||||
|
|
||||||
"vfchedb %%v4,%%v17,%%v16\n\t"
|
"vfchedb %%v4,%%v17,%%v16\n\t"
|
||||||
"vfchedb %%v5,%%v19,%%v18\n\t"
|
"vfchedb %%v5,%%v19,%%v18\n\t"
|
||||||
"vfchedb %%v6,%%v21,%%v20\n\t"
|
"vfchedb %%v6,%%v21,%%v20\n\t"
|
||||||
|
|
@ -79,33 +75,28 @@ static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min)
|
||||||
"vsel %%v6,%%v28,%%v29,%%v6\n\t"
|
"vsel %%v6,%%v28,%%v29,%%v6\n\t"
|
||||||
"vsel %%v19,%%v22,%%v23,%%v7\n\t"
|
"vsel %%v19,%%v22,%%v23,%%v7\n\t"
|
||||||
"vsel %%v7,%%v30,%%v31,%%v7\n\t"
|
"vsel %%v7,%%v30,%%v31,%%v7\n\t"
|
||||||
|
|
||||||
"vfchedb %%v20,%%v17,%%v16\n\t"
|
"vfchedb %%v20,%%v17,%%v16\n\t"
|
||||||
"vfchedb %%v21,%%v19,%%v18\n\t"
|
"vfchedb %%v21,%%v19,%%v18\n\t"
|
||||||
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
|
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
|
||||||
"vsel %%v4,%%v4,%%v5,%%v20\n\t"
|
"vsel %%v4,%%v4,%%v5,%%v20\n\t"
|
||||||
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
|
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
|
||||||
"vsel %%v5,%%v6,%%v7,%%v21\n\t"
|
"vsel %%v5,%%v6,%%v7,%%v21\n\t"
|
||||||
|
|
||||||
"vfchedb %%v18,%%v17,%%v16\n\t"
|
"vfchedb %%v18,%%v17,%%v16\n\t"
|
||||||
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
|
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
|
||||||
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
|
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
|
||||||
"vag %%v4,%%v4,%%v3\n\t"
|
"vag %%v4,%%v4,%%v3\n\t"
|
||||||
|
|
||||||
"vfchedb %%v5,%%v16,%%v0\n\t"
|
"vfchedb %%v5,%%v16,%%v0\n\t"
|
||||||
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
|
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
|
||||||
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
|
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
|
||||||
"vag %%v3,%%v3,%%v2\n\t"
|
"vag %%v3,%%v3,%%v2\n\t"
|
||||||
|
"vl %%v16,128(%%r1,%[x])\n\t"
|
||||||
"vl %%v16,128(%%r1,%3) \n\t"
|
"vl %%v17,144(%%r1,%[x])\n\t"
|
||||||
"vl %%v17,144(%%r1,%3) \n\t"
|
"vl %%v18,160(%%r1,%[x])\n\t"
|
||||||
"vl %%v18,160(%%r1,%3) \n\t"
|
"vl %%v19,176(%%r1,%[x])\n\t"
|
||||||
"vl %%v19,176(%%r1,%3) \n\t"
|
"vl %%v20,192(%%r1,%[x])\n\t"
|
||||||
"vl %%v20,192(%%r1,%3) \n\t"
|
"vl %%v21,208(%%r1,%[x])\n\t"
|
||||||
"vl %%v21,208(%%r1,%3) \n\t"
|
"vl %%v22,224(%%r1,%[x])\n\t"
|
||||||
"vl %%v22,224(%%r1,%3) \n\t"
|
"vl %%v23,240(%%r1,%[x])\n\t"
|
||||||
"vl %%v23,240(%%r1,%3) \n\t"
|
|
||||||
|
|
||||||
"vfchedb %%v4,%%v17,%%v16\n\t"
|
"vfchedb %%v4,%%v17,%%v16\n\t"
|
||||||
"vfchedb %%v5,%%v19,%%v18\n\t"
|
"vfchedb %%v5,%%v19,%%v18\n\t"
|
||||||
"vfchedb %%v6,%%v21,%%v20\n\t"
|
"vfchedb %%v6,%%v21,%%v20\n\t"
|
||||||
|
|
@ -118,47 +109,43 @@ static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min)
|
||||||
"vsel %%v6,%%v28,%%v29,%%v6\n\t"
|
"vsel %%v6,%%v28,%%v29,%%v6\n\t"
|
||||||
"vsel %%v19,%%v22,%%v23,%%v7\n\t"
|
"vsel %%v19,%%v22,%%v23,%%v7\n\t"
|
||||||
"vsel %%v7,%%v30,%%v31,%%v7\n\t"
|
"vsel %%v7,%%v30,%%v31,%%v7\n\t"
|
||||||
|
|
||||||
"vfchedb %%v20,%%v17,%%v16\n\t"
|
"vfchedb %%v20,%%v17,%%v16\n\t"
|
||||||
"vfchedb %%v21,%%v19,%%v18\n\t"
|
"vfchedb %%v21,%%v19,%%v18\n\t"
|
||||||
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
|
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
|
||||||
"vsel %%v4,%%v4,%%v5,%%v20\n\t"
|
"vsel %%v4,%%v4,%%v5,%%v20\n\t"
|
||||||
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
|
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
|
||||||
"vsel %%v5,%%v6,%%v7,%%v21\n\t"
|
"vsel %%v5,%%v6,%%v7,%%v21\n\t"
|
||||||
|
|
||||||
"vfchedb %%v18,%%v17,%%v16\n\t"
|
"vfchedb %%v18,%%v17,%%v16\n\t"
|
||||||
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
|
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
|
||||||
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
|
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
|
||||||
"vag %%v4,%%v4,%%v3\n\t"
|
"vag %%v4,%%v4,%%v3\n\t"
|
||||||
|
|
||||||
"vfchedb %%v5,%%v16,%%v0\n\t"
|
"vfchedb %%v5,%%v16,%%v0\n\t"
|
||||||
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
|
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
|
||||||
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
|
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
|
||||||
"vag %%v3,%%v3,%%v2\n\t"
|
"vag %%v3,%%v3,%%v2\n\t"
|
||||||
|
|
||||||
"agfi %%r1, 256\n\t"
|
"agfi %%r1, 256\n\t"
|
||||||
"brctg %%r0, 0b \n\t"
|
"brctg %[n], 0b\n\t"
|
||||||
|
|
||||||
"vrepg %%v2,%%v0,1\n\t"
|
"vrepg %%v2,%%v0,1\n\t"
|
||||||
"vrepg %%v3,%%v1,1\n\t"
|
"vrepg %%v3,%%v1,1\n\t"
|
||||||
"wfcdb %%v2,%%v0\n\t"
|
"wfcdb %%v2,%%v0\n\t"
|
||||||
"jne 1f\n\t"
|
"jne 1f\n\t"
|
||||||
"vsteg %%v0,%1,0 \n\t"
|
"vsteg %%v0,%[min],0\n\t"
|
||||||
"vmnlg %%v0,%%v1,%%v3\n\t"
|
"vmnlg %%v0,%%v1,%%v3\n\t"
|
||||||
"vlgvg %0,%%v0,0 \n\t"
|
"vlgvg %[imin],%%v0,0\n\t"
|
||||||
"j 2f\n\t"
|
"j 2f\n\t"
|
||||||
"1:\n\t"
|
"1:\n\t"
|
||||||
"wfchdb %%v4,%%v0,%%v2\n\t"
|
"wfchdb %%v4,%%v0,%%v2\n\t"
|
||||||
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
|
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
|
||||||
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
|
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
|
||||||
"std %%f0,%1 \n\t"
|
"std %%f0,%[min]\n\t"
|
||||||
"vlgvg %0,%%v1,0 \n\t"
|
"vlgvg %[imin],%%v1,0\n\t"
|
||||||
"2:\n\t"
|
"2:\n\t"
|
||||||
"nop"
|
"nop"
|
||||||
:"=r"(imin),"=m"(*min)
|
: [imin] "=r"(imin),[min] "=Q"(*min),[n] "+&r"(n)
|
||||||
:"r"(n),"ZR"((const FLOAT (*)[n])x)
|
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
|
||||||
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
|
||||||
);
|
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
|
||||||
|
"v27", "v28", "v29", "v30", "v31");
|
||||||
|
|
||||||
return imin;
|
return imin;
|
||||||
}
|
}
|
||||||
|
|
@ -169,7 +156,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
FLOAT minf = 0.0;
|
FLOAT minf = 0.0;
|
||||||
BLASLONG min = 0;
|
BLASLONG min = 0;
|
||||||
|
|
||||||
if (n <= 0 || inc_x <= 0) return (min);
|
if (n <= 0 || inc_x <= 0)
|
||||||
|
return (min);
|
||||||
|
|
||||||
if (inc_x == 1) {
|
if (inc_x == 1) {
|
||||||
|
|
||||||
|
|
@ -179,9 +167,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
min = idmin_kernel_32(n1, x, &minf);
|
min = idmin_kernel_32(n1, x, &minf);
|
||||||
|
|
||||||
i = n1;
|
i = n1;
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
minf = x[0];
|
minf = x[0];
|
||||||
i++;
|
i++;
|
||||||
}
|
}
|
||||||
|
|
@ -226,7 +212,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
while (j < n) {
|
while (j < n) {
|
||||||
if (x[i] < minf) {
|
if (x[i] < minf) {
|
||||||
min = j;
|
min = j;
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/***************************************************************************
|
/***************************************************************************
|
||||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
modification, are permitted provided that the following conditions are
|
modification, are permitted provided that the following conditions are
|
||||||
|
|
@ -28,18 +28,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
|
|
||||||
#if defined(DOUBLE)
|
|
||||||
#define ABS fabs
|
|
||||||
#else
|
|
||||||
#define ABS fabsf
|
#define ABS fabsf
|
||||||
#endif
|
|
||||||
|
|
||||||
static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax)
|
static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax) {
|
||||||
{
|
|
||||||
BLASLONG iamax;
|
BLASLONG iamax;
|
||||||
|
|
||||||
__asm__ volatile (
|
__asm__("vl %%v0,0(%[x])\n\t"
|
||||||
"vl %%v0,0(%3) \n\t"
|
|
||||||
"vflpsb %%v0,%%v0\n\t"
|
"vflpsb %%v0,%%v0\n\t"
|
||||||
"vleig %%v1,0,0\n\t"
|
"vleig %%v1,0,0\n\t"
|
||||||
"vleig %%v1,2,1\n\t"
|
"vleig %%v1,2,1\n\t"
|
||||||
|
|
@ -79,19 +73,18 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax)
|
||||||
"vleif %%v31,29,1\n\t"
|
"vleif %%v31,29,1\n\t"
|
||||||
"vleif %%v31,30,2\n\t"
|
"vleif %%v31,30,2\n\t"
|
||||||
"vleif %%v31,31,3\n\t"
|
"vleif %%v31,31,3\n\t"
|
||||||
"srlg %%r0,%2,6 \n\t"
|
"srlg %[n],%[n],6\n\t"
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 1, 1024(%%r1,%3) \n\t"
|
"pfd 1, 1024(%%r1,%[x])\n\t"
|
||||||
|
"vl %%v16,0(%%r1,%[x])\n\t"
|
||||||
"vl %%v16,0(%%r1,%3) \n\t"
|
"vl %%v17,16(%%r1,%[x])\n\t"
|
||||||
"vl %%v17,16(%%r1,%3) \n\t"
|
"vl %%v18,32(%%r1,%[x])\n\t"
|
||||||
"vl %%v18,32(%%r1,%3) \n\t"
|
"vl %%v19,48(%%r1,%[x])\n\t"
|
||||||
"vl %%v19,48(%%r1,%3) \n\t"
|
"vl %%v20,64(%%r1,%[x])\n\t"
|
||||||
"vl %%v20,64(%%r1,%3) \n\t"
|
"vl %%v21,80(%%r1,%[x])\n\t"
|
||||||
"vl %%v21,80(%%r1,%3) \n\t"
|
"vl %%v22,96(%%r1,%[x])\n\t"
|
||||||
"vl %%v22,96(%%r1,%3) \n\t"
|
"vl %%v23,112(%%r1,%[x])\n\t"
|
||||||
"vl %%v23,112(%%r1,%3) \n\t"
|
|
||||||
"vflpsb %%v16, %%v16\n\t"
|
"vflpsb %%v16, %%v16\n\t"
|
||||||
"vflpsb %%v17, %%v17\n\t"
|
"vflpsb %%v17, %%v17\n\t"
|
||||||
"vflpsb %%v18, %%v18\n\t"
|
"vflpsb %%v18, %%v18\n\t"
|
||||||
|
|
@ -100,7 +93,6 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax)
|
||||||
"vflpsb %%v21, %%v21\n\t"
|
"vflpsb %%v21, %%v21\n\t"
|
||||||
"vflpsb %%v22, %%v22\n\t"
|
"vflpsb %%v22, %%v22\n\t"
|
||||||
"vflpsb %%v23, %%v23\n\t"
|
"vflpsb %%v23, %%v23\n\t"
|
||||||
|
|
||||||
"vfchesb %%v5,%%v16,%%v17\n\t"
|
"vfchesb %%v5,%%v16,%%v17\n\t"
|
||||||
"vfchesb %%v6,%%v18,%%v19\n\t"
|
"vfchesb %%v6,%%v18,%%v19\n\t"
|
||||||
"vfchesb %%v7,%%v20,%%v21\n\t"
|
"vfchesb %%v7,%%v20,%%v21\n\t"
|
||||||
|
|
@ -113,14 +105,12 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax)
|
||||||
"vsel %%v7,%%v28,%%v29,%%v7\n\t"
|
"vsel %%v7,%%v28,%%v29,%%v7\n\t"
|
||||||
"vsel %%v19,%%v22,%%v23,%%v8\n\t"
|
"vsel %%v19,%%v22,%%v23,%%v8\n\t"
|
||||||
"vsel %%v8,%%v30,%%v31,%%v8\n\t"
|
"vsel %%v8,%%v30,%%v31,%%v8\n\t"
|
||||||
|
|
||||||
"vfchesb %%v20,%%v16,%%v17\n\t"
|
"vfchesb %%v20,%%v16,%%v17\n\t"
|
||||||
"vfchesb %%v21,%%v18,%%v19\n\t"
|
"vfchesb %%v21,%%v18,%%v19\n\t"
|
||||||
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
|
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
|
||||||
"vsel %%v5,%%v5,%%v6,%%v20\n\t"
|
"vsel %%v5,%%v5,%%v6,%%v20\n\t"
|
||||||
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
|
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
|
||||||
"vsel %%v6,%%v7,%%v8,%%v21\n\t"
|
"vsel %%v6,%%v7,%%v8,%%v21\n\t"
|
||||||
|
|
||||||
"vfchesb %%v18,%%v16,%%v17\n\t"
|
"vfchesb %%v18,%%v16,%%v17\n\t"
|
||||||
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
|
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
|
||||||
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
|
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
|
||||||
|
|
@ -128,7 +118,6 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax)
|
||||||
"vesrlg %%v5,%%v5,32\n\t"
|
"vesrlg %%v5,%%v5,32\n\t"
|
||||||
"vag %%v5,%%v5,%%v4\n\t"
|
"vag %%v5,%%v5,%%v4\n\t"
|
||||||
"vag %%v6,%%v6,%%v4\n\t"
|
"vag %%v6,%%v6,%%v4\n\t"
|
||||||
|
|
||||||
"vfchesb %%v7,%%v0,%%v16\n\t"
|
"vfchesb %%v7,%%v0,%%v16\n\t"
|
||||||
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
|
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
|
||||||
"vsegf %%v8,%%v7\n\t"
|
"vsegf %%v8,%%v7\n\t"
|
||||||
|
|
@ -137,15 +126,14 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax)
|
||||||
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
|
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
|
||||||
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
|
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
|
||||||
"vag %%v4,%%v4,%%v3\n\t"
|
"vag %%v4,%%v4,%%v3\n\t"
|
||||||
|
"vl %%v16,128(%%r1,%[x])\n\t"
|
||||||
"vl %%v16,128(%%r1,%3) \n\t"
|
"vl %%v17,144(%%r1,%[x])\n\t"
|
||||||
"vl %%v17,144(%%r1,%3) \n\t"
|
"vl %%v18,160(%%r1,%[x])\n\t"
|
||||||
"vl %%v18,160(%%r1,%3) \n\t"
|
"vl %%v19,176(%%r1,%[x])\n\t"
|
||||||
"vl %%v19,176(%%r1,%3) \n\t"
|
"vl %%v20,192(%%r1,%[x])\n\t"
|
||||||
"vl %%v20,192(%%r1,%3) \n\t"
|
"vl %%v21,208(%%r1,%[x])\n\t"
|
||||||
"vl %%v21,208(%%r1,%3) \n\t"
|
"vl %%v22,224(%%r1,%[x])\n\t"
|
||||||
"vl %%v22,224(%%r1,%3) \n\t"
|
"vl %%v23,240(%%r1,%[x])\n\t"
|
||||||
"vl %%v23,240(%%r1,%3) \n\t"
|
|
||||||
"vflpsb %%v16, %%v16\n\t"
|
"vflpsb %%v16, %%v16\n\t"
|
||||||
"vflpsb %%v17, %%v17\n\t"
|
"vflpsb %%v17, %%v17\n\t"
|
||||||
"vflpsb %%v18, %%v18\n\t"
|
"vflpsb %%v18, %%v18\n\t"
|
||||||
|
|
@ -154,7 +142,6 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax)
|
||||||
"vflpsb %%v21, %%v21\n\t"
|
"vflpsb %%v21, %%v21\n\t"
|
||||||
"vflpsb %%v22, %%v22\n\t"
|
"vflpsb %%v22, %%v22\n\t"
|
||||||
"vflpsb %%v23, %%v23\n\t"
|
"vflpsb %%v23, %%v23\n\t"
|
||||||
|
|
||||||
"vfchesb %%v5,%%v16,%%v17\n\t"
|
"vfchesb %%v5,%%v16,%%v17\n\t"
|
||||||
"vfchesb %%v6,%%v18,%%v19\n\t"
|
"vfchesb %%v6,%%v18,%%v19\n\t"
|
||||||
"vfchesb %%v7,%%v20,%%v21\n\t"
|
"vfchesb %%v7,%%v20,%%v21\n\t"
|
||||||
|
|
@ -167,14 +154,12 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax)
|
||||||
"vsel %%v7,%%v28,%%v29,%%v7\n\t"
|
"vsel %%v7,%%v28,%%v29,%%v7\n\t"
|
||||||
"vsel %%v19,%%v22,%%v23,%%v8\n\t"
|
"vsel %%v19,%%v22,%%v23,%%v8\n\t"
|
||||||
"vsel %%v8,%%v30,%%v31,%%v8\n\t"
|
"vsel %%v8,%%v30,%%v31,%%v8\n\t"
|
||||||
|
|
||||||
"vfchesb %%v20,%%v16,%%v17\n\t"
|
"vfchesb %%v20,%%v16,%%v17\n\t"
|
||||||
"vfchesb %%v21,%%v18,%%v19\n\t"
|
"vfchesb %%v21,%%v18,%%v19\n\t"
|
||||||
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
|
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
|
||||||
"vsel %%v5,%%v5,%%v6,%%v20\n\t"
|
"vsel %%v5,%%v5,%%v6,%%v20\n\t"
|
||||||
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
|
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
|
||||||
"vsel %%v6,%%v7,%%v8,%%v21\n\t"
|
"vsel %%v6,%%v7,%%v8,%%v21\n\t"
|
||||||
|
|
||||||
"vfchesb %%v18,%%v16,%%v17\n\t"
|
"vfchesb %%v18,%%v16,%%v17\n\t"
|
||||||
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
|
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
|
||||||
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
|
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
|
||||||
|
|
@ -182,7 +167,6 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax)
|
||||||
"vesrlg %%v5,%%v5,32\n\t"
|
"vesrlg %%v5,%%v5,32\n\t"
|
||||||
"vag %%v5,%%v5,%%v4\n\t"
|
"vag %%v5,%%v5,%%v4\n\t"
|
||||||
"vag %%v6,%%v6,%%v4\n\t"
|
"vag %%v6,%%v6,%%v4\n\t"
|
||||||
|
|
||||||
"vfchesb %%v7,%%v0,%%v16\n\t"
|
"vfchesb %%v7,%%v0,%%v16\n\t"
|
||||||
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
|
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
|
||||||
"vsegf %%v8,%%v7\n\t"
|
"vsegf %%v8,%%v7\n\t"
|
||||||
|
|
@ -191,10 +175,8 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax)
|
||||||
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
|
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
|
||||||
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
|
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
|
||||||
"vag %%v4,%%v4,%%v3\n\t"
|
"vag %%v4,%%v4,%%v3\n\t"
|
||||||
|
|
||||||
"agfi %%r1, 256\n\t"
|
"agfi %%r1, 256\n\t"
|
||||||
"brctg %%r0, 0b \n\t"
|
"brctg %[n], 0b\n\t"
|
||||||
|
|
||||||
"veslg %%v3,%%v0,32\n\t"
|
"veslg %%v3,%%v0,32\n\t"
|
||||||
"vfchsb %%v4,%%v0,%%v3\n\t"
|
"vfchsb %%v4,%%v0,%%v3\n\t"
|
||||||
"vchlg %%v5,%%v2,%%v1\n\t"
|
"vchlg %%v5,%%v2,%%v1\n\t"
|
||||||
|
|
@ -205,14 +187,13 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax)
|
||||||
"vesrlg %%v4,%%v4,32\n\t"
|
"vesrlg %%v4,%%v4,32\n\t"
|
||||||
"vsegf %%v4,%%v4\n\t"
|
"vsegf %%v4,%%v4\n\t"
|
||||||
"vsel %%v1,%%v1,%%v2,%%v4\n\t"
|
"vsel %%v1,%%v1,%%v2,%%v4\n\t"
|
||||||
|
|
||||||
"vrepf %%v2,%%v0,2\n\t"
|
"vrepf %%v2,%%v0,2\n\t"
|
||||||
"vrepg %%v3,%%v1,1\n\t"
|
"vrepg %%v3,%%v1,1\n\t"
|
||||||
"wfcsb %%v2,%%v0\n\t"
|
"wfcsb %%v2,%%v0\n\t"
|
||||||
"jne 1f\n\t"
|
"jne 1f\n\t"
|
||||||
"vstef %%v0,%1,0 \n\t"
|
"vstef %%v0,%[amax],0\n\t"
|
||||||
"vmnlg %%v0,%%v1,%%v3\n\t"
|
"vmnlg %%v0,%%v1,%%v3\n\t"
|
||||||
"vlgvg %0,%%v0,0 \n\t"
|
"vlgvg %[iamax],%%v0,0\n\t"
|
||||||
"j 2f\n\t"
|
"j 2f\n\t"
|
||||||
"1:\n\t"
|
"1:\n\t"
|
||||||
"wfchsb %%v4,%%v2,%%v0\n\t"
|
"wfchsb %%v4,%%v2,%%v0\n\t"
|
||||||
|
|
@ -220,14 +201,15 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax)
|
||||||
"vsegf %%v4,%%v4\n\t"
|
"vsegf %%v4,%%v4\n\t"
|
||||||
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
|
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
|
||||||
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
|
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
|
||||||
"ste %%f0,%1 \n\t"
|
"ste %%f0,%[amax]\n\t"
|
||||||
"vlgvg %0,%%v1,0 \n\t"
|
"vlgvg %[iamax],%%v1,0\n\t"
|
||||||
"2:\n\t"
|
"2:\n\t"
|
||||||
"nop"
|
"nop"
|
||||||
:"=r"(iamax),"=m"(*amax)
|
: [iamax] "=r"(iamax),[amax] "=Q"(*amax),[n] "+&r"(n)
|
||||||
:"r"(n),"ZR"((const FLOAT (*)[n])x)
|
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
|
||||||
:"memory","cc","r0","r1","v0","v1","v2","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
: "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16",
|
||||||
);
|
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
|
||||||
|
"v27", "v28", "v29", "v30", "v31");
|
||||||
|
|
||||||
return iamax;
|
return iamax;
|
||||||
}
|
}
|
||||||
|
|
@ -238,7 +220,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
FLOAT maxf = 0.0;
|
FLOAT maxf = 0.0;
|
||||||
BLASLONG max = 0;
|
BLASLONG max = 0;
|
||||||
|
|
||||||
if (n <= 0 || inc_x <= 0) return (max);
|
if (n <= 0 || inc_x <= 0)
|
||||||
|
return (max);
|
||||||
|
|
||||||
if (inc_x == 1) {
|
if (inc_x == 1) {
|
||||||
|
|
||||||
|
|
@ -248,9 +231,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
max = isamax_kernel_64(n1, x, &maxf);
|
max = isamax_kernel_64(n1, x, &maxf);
|
||||||
|
|
||||||
i = n1;
|
i = n1;
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
maxf = ABS(x[0]);
|
maxf = ABS(x[0]);
|
||||||
i++;
|
i++;
|
||||||
}
|
}
|
||||||
|
|
@ -295,7 +276,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
while (j < n) {
|
while (j < n) {
|
||||||
if (ABS(x[i]) > maxf) {
|
if (ABS(x[i]) > maxf) {
|
||||||
max = j;
|
max = j;
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/***************************************************************************
|
/***************************************************************************
|
||||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
modification, are permitted provided that the following conditions are
|
modification, are permitted provided that the following conditions are
|
||||||
|
|
@ -28,18 +28,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
|
|
||||||
#if defined(DOUBLE)
|
|
||||||
#define ABS fabs
|
|
||||||
#else
|
|
||||||
#define ABS fabsf
|
#define ABS fabsf
|
||||||
#endif
|
|
||||||
|
|
||||||
static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin)
|
static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin) {
|
||||||
{
|
|
||||||
BLASLONG iamin;
|
BLASLONG iamin;
|
||||||
|
|
||||||
__asm__ volatile (
|
__asm__("vl %%v0,0(%[x])\n\t"
|
||||||
"vl %%v0,0(%3) \n\t"
|
|
||||||
"vflpsb %%v0,%%v0\n\t"
|
"vflpsb %%v0,%%v0\n\t"
|
||||||
"vleig %%v1,0,0\n\t"
|
"vleig %%v1,0,0\n\t"
|
||||||
"vleig %%v1,2,1\n\t"
|
"vleig %%v1,2,1\n\t"
|
||||||
|
|
@ -79,19 +73,18 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin)
|
||||||
"vleif %%v31,29,1\n\t"
|
"vleif %%v31,29,1\n\t"
|
||||||
"vleif %%v31,30,2\n\t"
|
"vleif %%v31,30,2\n\t"
|
||||||
"vleif %%v31,31,3\n\t"
|
"vleif %%v31,31,3\n\t"
|
||||||
"srlg %%r0,%2,6 \n\t"
|
"srlg %[n],%[n],6\n\t"
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 1, 1024(%%r1,%3) \n\t"
|
"pfd 1, 1024(%%r1,%[x])\n\t"
|
||||||
|
"vl %%v16,0(%%r1,%[x])\n\t"
|
||||||
"vl %%v16,0(%%r1,%3) \n\t"
|
"vl %%v17,16(%%r1,%[x])\n\t"
|
||||||
"vl %%v17,16(%%r1,%3) \n\t"
|
"vl %%v18,32(%%r1,%[x])\n\t"
|
||||||
"vl %%v18,32(%%r1,%3) \n\t"
|
"vl %%v19,48(%%r1,%[x])\n\t"
|
||||||
"vl %%v19,48(%%r1,%3) \n\t"
|
"vl %%v20,64(%%r1,%[x])\n\t"
|
||||||
"vl %%v20,64(%%r1,%3) \n\t"
|
"vl %%v21,80(%%r1,%[x])\n\t"
|
||||||
"vl %%v21,80(%%r1,%3) \n\t"
|
"vl %%v22,96(%%r1,%[x])\n\t"
|
||||||
"vl %%v22,96(%%r1,%3) \n\t"
|
"vl %%v23,112(%%r1,%[x])\n\t"
|
||||||
"vl %%v23,112(%%r1,%3) \n\t"
|
|
||||||
"vflpsb %%v16, %%v16\n\t"
|
"vflpsb %%v16, %%v16\n\t"
|
||||||
"vflpsb %%v17, %%v17\n\t"
|
"vflpsb %%v17, %%v17\n\t"
|
||||||
"vflpsb %%v18, %%v18\n\t"
|
"vflpsb %%v18, %%v18\n\t"
|
||||||
|
|
@ -100,7 +93,6 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin)
|
||||||
"vflpsb %%v21, %%v21\n\t"
|
"vflpsb %%v21, %%v21\n\t"
|
||||||
"vflpsb %%v22, %%v22\n\t"
|
"vflpsb %%v22, %%v22\n\t"
|
||||||
"vflpsb %%v23, %%v23\n\t"
|
"vflpsb %%v23, %%v23\n\t"
|
||||||
|
|
||||||
"vfchesb %%v5,%%v17,%%v16\n\t"
|
"vfchesb %%v5,%%v17,%%v16\n\t"
|
||||||
"vfchesb %%v6,%%v19,%%v18\n\t"
|
"vfchesb %%v6,%%v19,%%v18\n\t"
|
||||||
"vfchesb %%v7,%%v21,%%v20\n\t"
|
"vfchesb %%v7,%%v21,%%v20\n\t"
|
||||||
|
|
@ -113,14 +105,12 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin)
|
||||||
"vsel %%v7,%%v28,%%v29,%%v7\n\t"
|
"vsel %%v7,%%v28,%%v29,%%v7\n\t"
|
||||||
"vsel %%v19,%%v22,%%v23,%%v8\n\t"
|
"vsel %%v19,%%v22,%%v23,%%v8\n\t"
|
||||||
"vsel %%v8,%%v30,%%v31,%%v8\n\t"
|
"vsel %%v8,%%v30,%%v31,%%v8\n\t"
|
||||||
|
|
||||||
"vfchesb %%v20,%%v17,%%v16\n\t"
|
"vfchesb %%v20,%%v17,%%v16\n\t"
|
||||||
"vfchesb %%v21,%%v19,%%v18\n\t"
|
"vfchesb %%v21,%%v19,%%v18\n\t"
|
||||||
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
|
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
|
||||||
"vsel %%v5,%%v5,%%v6,%%v20\n\t"
|
"vsel %%v5,%%v5,%%v6,%%v20\n\t"
|
||||||
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
|
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
|
||||||
"vsel %%v6,%%v7,%%v8,%%v21\n\t"
|
"vsel %%v6,%%v7,%%v8,%%v21\n\t"
|
||||||
|
|
||||||
"vfchesb %%v18,%%v17,%%v16\n\t"
|
"vfchesb %%v18,%%v17,%%v16\n\t"
|
||||||
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
|
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
|
||||||
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
|
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
|
||||||
|
|
@ -128,7 +118,6 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin)
|
||||||
"vesrlg %%v5,%%v5,32\n\t"
|
"vesrlg %%v5,%%v5,32\n\t"
|
||||||
"vag %%v5,%%v5,%%v4\n\t"
|
"vag %%v5,%%v5,%%v4\n\t"
|
||||||
"vag %%v6,%%v6,%%v4\n\t"
|
"vag %%v6,%%v6,%%v4\n\t"
|
||||||
|
|
||||||
"vfchesb %%v7,%%v16,%%v0\n\t"
|
"vfchesb %%v7,%%v16,%%v0\n\t"
|
||||||
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
|
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
|
||||||
"vsegf %%v8,%%v7\n\t"
|
"vsegf %%v8,%%v7\n\t"
|
||||||
|
|
@ -137,15 +126,14 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin)
|
||||||
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
|
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
|
||||||
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
|
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
|
||||||
"vag %%v4,%%v4,%%v3\n\t"
|
"vag %%v4,%%v4,%%v3\n\t"
|
||||||
|
"vl %%v16,128(%%r1,%[x])\n\t"
|
||||||
"vl %%v16,128(%%r1,%3) \n\t"
|
"vl %%v17,144(%%r1,%[x])\n\t"
|
||||||
"vl %%v17,144(%%r1,%3) \n\t"
|
"vl %%v18,160(%%r1,%[x])\n\t"
|
||||||
"vl %%v18,160(%%r1,%3) \n\t"
|
"vl %%v19,176(%%r1,%[x])\n\t"
|
||||||
"vl %%v19,176(%%r1,%3) \n\t"
|
"vl %%v20,192(%%r1,%[x])\n\t"
|
||||||
"vl %%v20,192(%%r1,%3) \n\t"
|
"vl %%v21,208(%%r1,%[x])\n\t"
|
||||||
"vl %%v21,208(%%r1,%3) \n\t"
|
"vl %%v22,224(%%r1,%[x])\n\t"
|
||||||
"vl %%v22,224(%%r1,%3) \n\t"
|
"vl %%v23,240(%%r1,%[x])\n\t"
|
||||||
"vl %%v23,240(%%r1,%3) \n\t"
|
|
||||||
"vflpsb %%v16, %%v16\n\t"
|
"vflpsb %%v16, %%v16\n\t"
|
||||||
"vflpsb %%v17, %%v17\n\t"
|
"vflpsb %%v17, %%v17\n\t"
|
||||||
"vflpsb %%v18, %%v18\n\t"
|
"vflpsb %%v18, %%v18\n\t"
|
||||||
|
|
@ -154,7 +142,6 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin)
|
||||||
"vflpsb %%v21, %%v21\n\t"
|
"vflpsb %%v21, %%v21\n\t"
|
||||||
"vflpsb %%v22, %%v22\n\t"
|
"vflpsb %%v22, %%v22\n\t"
|
||||||
"vflpsb %%v23, %%v23\n\t"
|
"vflpsb %%v23, %%v23\n\t"
|
||||||
|
|
||||||
"vfchesb %%v5,%%v17,%%v16\n\t"
|
"vfchesb %%v5,%%v17,%%v16\n\t"
|
||||||
"vfchesb %%v6,%%v19,%%v18\n\t"
|
"vfchesb %%v6,%%v19,%%v18\n\t"
|
||||||
"vfchesb %%v7,%%v21,%%v20\n\t"
|
"vfchesb %%v7,%%v21,%%v20\n\t"
|
||||||
|
|
@ -167,14 +154,12 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin)
|
||||||
"vsel %%v7,%%v28,%%v29,%%v7\n\t"
|
"vsel %%v7,%%v28,%%v29,%%v7\n\t"
|
||||||
"vsel %%v19,%%v22,%%v23,%%v8\n\t"
|
"vsel %%v19,%%v22,%%v23,%%v8\n\t"
|
||||||
"vsel %%v8,%%v30,%%v31,%%v8\n\t"
|
"vsel %%v8,%%v30,%%v31,%%v8\n\t"
|
||||||
|
|
||||||
"vfchesb %%v20,%%v17,%%v16\n\t"
|
"vfchesb %%v20,%%v17,%%v16\n\t"
|
||||||
"vfchesb %%v21,%%v19,%%v18\n\t"
|
"vfchesb %%v21,%%v19,%%v18\n\t"
|
||||||
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
|
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
|
||||||
"vsel %%v5,%%v5,%%v6,%%v20\n\t"
|
"vsel %%v5,%%v5,%%v6,%%v20\n\t"
|
||||||
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
|
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
|
||||||
"vsel %%v6,%%v7,%%v8,%%v21\n\t"
|
"vsel %%v6,%%v7,%%v8,%%v21\n\t"
|
||||||
|
|
||||||
"vfchesb %%v18,%%v17,%%v16\n\t"
|
"vfchesb %%v18,%%v17,%%v16\n\t"
|
||||||
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
|
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
|
||||||
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
|
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
|
||||||
|
|
@ -182,7 +167,6 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin)
|
||||||
"vesrlg %%v5,%%v5,32\n\t"
|
"vesrlg %%v5,%%v5,32\n\t"
|
||||||
"vag %%v5,%%v5,%%v4\n\t"
|
"vag %%v5,%%v5,%%v4\n\t"
|
||||||
"vag %%v6,%%v6,%%v4\n\t"
|
"vag %%v6,%%v6,%%v4\n\t"
|
||||||
|
|
||||||
"vfchesb %%v7,%%v16,%%v0\n\t"
|
"vfchesb %%v7,%%v16,%%v0\n\t"
|
||||||
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
|
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
|
||||||
"vsegf %%v8,%%v7\n\t"
|
"vsegf %%v8,%%v7\n\t"
|
||||||
|
|
@ -191,10 +175,8 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin)
|
||||||
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
|
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
|
||||||
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
|
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
|
||||||
"vag %%v4,%%v4,%%v3\n\t"
|
"vag %%v4,%%v4,%%v3\n\t"
|
||||||
|
|
||||||
"agfi %%r1, 256\n\t"
|
"agfi %%r1, 256\n\t"
|
||||||
"brctg %%r0, 0b \n\t"
|
"brctg %[n], 0b\n\t"
|
||||||
|
|
||||||
"veslg %%v3,%%v0,32\n\t"
|
"veslg %%v3,%%v0,32\n\t"
|
||||||
"vfchsb %%v4,%%v3,%%v0\n\t"
|
"vfchsb %%v4,%%v3,%%v0\n\t"
|
||||||
"vchlg %%v5,%%v2,%%v1\n\t"
|
"vchlg %%v5,%%v2,%%v1\n\t"
|
||||||
|
|
@ -205,14 +187,13 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin)
|
||||||
"vesrlg %%v4,%%v4,32\n\t"
|
"vesrlg %%v4,%%v4,32\n\t"
|
||||||
"vsegf %%v4,%%v4\n\t"
|
"vsegf %%v4,%%v4\n\t"
|
||||||
"vsel %%v1,%%v1,%%v2,%%v4\n\t"
|
"vsel %%v1,%%v1,%%v2,%%v4\n\t"
|
||||||
|
|
||||||
"vrepf %%v2,%%v0,2\n\t"
|
"vrepf %%v2,%%v0,2\n\t"
|
||||||
"vrepg %%v3,%%v1,1\n\t"
|
"vrepg %%v3,%%v1,1\n\t"
|
||||||
"wfcsb %%v2,%%v0\n\t"
|
"wfcsb %%v2,%%v0\n\t"
|
||||||
"jne 1f\n\t"
|
"jne 1f\n\t"
|
||||||
"vstef %%v0,%1,0 \n\t"
|
"vstef %%v0,%[amin],0\n\t"
|
||||||
"vmnlg %%v0,%%v1,%%v3\n\t"
|
"vmnlg %%v0,%%v1,%%v3\n\t"
|
||||||
"vlgvg %0,%%v0,0 \n\t"
|
"vlgvg %[iamin],%%v0,0\n\t"
|
||||||
"j 2f\n\t"
|
"j 2f\n\t"
|
||||||
"1:\n\t"
|
"1:\n\t"
|
||||||
"wfchsb %%v4,%%v0,%%v2\n\t"
|
"wfchsb %%v4,%%v0,%%v2\n\t"
|
||||||
|
|
@ -220,14 +201,15 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin)
|
||||||
"vsegf %%v4,%%v4\n\t"
|
"vsegf %%v4,%%v4\n\t"
|
||||||
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
|
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
|
||||||
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
|
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
|
||||||
"ste %%f0,%1 \n\t"
|
"ste %%f0,%[amin]\n\t"
|
||||||
"vlgvg %0,%%v1,0 \n\t"
|
"vlgvg %[iamin],%%v1,0\n\t"
|
||||||
"2:\n\t"
|
"2:\n\t"
|
||||||
"nop"
|
"nop"
|
||||||
:"=r"(iamin),"=m"(*amin)
|
: [iamin] "=r"(iamin),[amin] "=Q"(*amin),[n] "+&r"(n)
|
||||||
:"r"(n),"ZR"((const FLOAT (*)[n])x)
|
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
|
||||||
:"memory","cc","r0","r1","v0","v1","v2","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
: "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16",
|
||||||
);
|
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
|
||||||
|
"v27", "v28", "v29", "v30", "v31");
|
||||||
|
|
||||||
return iamin;
|
return iamin;
|
||||||
}
|
}
|
||||||
|
|
@ -238,7 +220,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
FLOAT minf = 0.0;
|
FLOAT minf = 0.0;
|
||||||
BLASLONG min = 0;
|
BLASLONG min = 0;
|
||||||
|
|
||||||
if (n <= 0 || inc_x <= 0) return (min);
|
if (n <= 0 || inc_x <= 0)
|
||||||
|
return (min);
|
||||||
|
|
||||||
if (inc_x == 1) {
|
if (inc_x == 1) {
|
||||||
|
|
||||||
|
|
@ -248,9 +231,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
min = isamin_kernel_64(n1, x, &minf);
|
min = isamin_kernel_64(n1, x, &minf);
|
||||||
|
|
||||||
i = n1;
|
i = n1;
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
minf = ABS(x[0]);
|
minf = ABS(x[0]);
|
||||||
i++;
|
i++;
|
||||||
}
|
}
|
||||||
|
|
@ -295,7 +276,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
while (j < n) {
|
while (j < n) {
|
||||||
if (ABS(x[i]) < minf) {
|
if (ABS(x[i]) < minf) {
|
||||||
min = j;
|
min = j;
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/***************************************************************************
|
/***************************************************************************
|
||||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
modification, are permitted provided that the following conditions are
|
modification, are permitted provided that the following conditions are
|
||||||
|
|
@ -27,12 +27,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max)
|
static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max) {
|
||||||
{
|
|
||||||
BLASLONG imax;
|
BLASLONG imax;
|
||||||
|
|
||||||
__asm__ volatile (
|
__asm__("vl %%v0,0(%[x])\n\t"
|
||||||
"vl %%v0,0(%3) \n\t"
|
|
||||||
"vleig %%v1,0,0\n\t"
|
"vleig %%v1,0,0\n\t"
|
||||||
"vleig %%v1,2,1\n\t"
|
"vleig %%v1,2,1\n\t"
|
||||||
"vleig %%v2,1,0\n\t"
|
"vleig %%v2,1,0\n\t"
|
||||||
|
|
@ -71,20 +69,18 @@ static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max)
|
||||||
"vleif %%v31,29,1\n\t"
|
"vleif %%v31,29,1\n\t"
|
||||||
"vleif %%v31,30,2\n\t"
|
"vleif %%v31,30,2\n\t"
|
||||||
"vleif %%v31,31,3\n\t"
|
"vleif %%v31,31,3\n\t"
|
||||||
"srlg %%r0,%2,6 \n\t"
|
"srlg %[n],%[n],6\n\t"
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 1, 1024(%%r1,%3) \n\t"
|
"pfd 1, 1024(%%r1,%[x])\n\t"
|
||||||
|
"vl %%v16,0(%%r1,%[x])\n\t"
|
||||||
"vl %%v16,0(%%r1,%3) \n\t"
|
"vl %%v17,16(%%r1,%[x])\n\t"
|
||||||
"vl %%v17,16(%%r1,%3) \n\t"
|
"vl %%v18,32(%%r1,%[x])\n\t"
|
||||||
"vl %%v18,32(%%r1,%3) \n\t"
|
"vl %%v19,48(%%r1,%[x])\n\t"
|
||||||
"vl %%v19,48(%%r1,%3) \n\t"
|
"vl %%v20,64(%%r1,%[x])\n\t"
|
||||||
"vl %%v20,64(%%r1,%3) \n\t"
|
"vl %%v21,80(%%r1,%[x])\n\t"
|
||||||
"vl %%v21,80(%%r1,%3) \n\t"
|
"vl %%v22,96(%%r1,%[x])\n\t"
|
||||||
"vl %%v22,96(%%r1,%3) \n\t"
|
"vl %%v23,112(%%r1,%[x])\n\t"
|
||||||
"vl %%v23,112(%%r1,%3) \n\t"
|
|
||||||
|
|
||||||
"vfchesb %%v5,%%v16,%%v17\n\t"
|
"vfchesb %%v5,%%v16,%%v17\n\t"
|
||||||
"vfchesb %%v6,%%v18,%%v19\n\t"
|
"vfchesb %%v6,%%v18,%%v19\n\t"
|
||||||
"vfchesb %%v7,%%v20,%%v21\n\t"
|
"vfchesb %%v7,%%v20,%%v21\n\t"
|
||||||
|
|
@ -97,14 +93,12 @@ static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max)
|
||||||
"vsel %%v7,%%v28,%%v29,%%v7\n\t"
|
"vsel %%v7,%%v28,%%v29,%%v7\n\t"
|
||||||
"vsel %%v19,%%v22,%%v23,%%v8\n\t"
|
"vsel %%v19,%%v22,%%v23,%%v8\n\t"
|
||||||
"vsel %%v8,%%v30,%%v31,%%v8\n\t"
|
"vsel %%v8,%%v30,%%v31,%%v8\n\t"
|
||||||
|
|
||||||
"vfchesb %%v20,%%v16,%%v17\n\t"
|
"vfchesb %%v20,%%v16,%%v17\n\t"
|
||||||
"vfchesb %%v21,%%v18,%%v19\n\t"
|
"vfchesb %%v21,%%v18,%%v19\n\t"
|
||||||
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
|
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
|
||||||
"vsel %%v5,%%v5,%%v6,%%v20\n\t"
|
"vsel %%v5,%%v5,%%v6,%%v20\n\t"
|
||||||
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
|
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
|
||||||
"vsel %%v6,%%v7,%%v8,%%v21\n\t"
|
"vsel %%v6,%%v7,%%v8,%%v21\n\t"
|
||||||
|
|
||||||
"vfchesb %%v18,%%v16,%%v17\n\t"
|
"vfchesb %%v18,%%v16,%%v17\n\t"
|
||||||
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
|
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
|
||||||
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
|
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
|
||||||
|
|
@ -112,7 +106,6 @@ static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max)
|
||||||
"vesrlg %%v5,%%v5,32\n\t"
|
"vesrlg %%v5,%%v5,32\n\t"
|
||||||
"vag %%v5,%%v5,%%v4\n\t"
|
"vag %%v5,%%v5,%%v4\n\t"
|
||||||
"vag %%v6,%%v6,%%v4\n\t"
|
"vag %%v6,%%v6,%%v4\n\t"
|
||||||
|
|
||||||
"vfchesb %%v7,%%v0,%%v16\n\t"
|
"vfchesb %%v7,%%v0,%%v16\n\t"
|
||||||
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
|
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
|
||||||
"vsegf %%v8,%%v7\n\t"
|
"vsegf %%v8,%%v7\n\t"
|
||||||
|
|
@ -121,16 +114,14 @@ static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max)
|
||||||
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
|
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
|
||||||
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
|
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
|
||||||
"vag %%v4,%%v4,%%v3\n\t"
|
"vag %%v4,%%v4,%%v3\n\t"
|
||||||
|
"vl %%v16,128(%%r1,%[x])\n\t"
|
||||||
"vl %%v16,128(%%r1,%3) \n\t"
|
"vl %%v17,144(%%r1,%[x])\n\t"
|
||||||
"vl %%v17,144(%%r1,%3) \n\t"
|
"vl %%v18,160(%%r1,%[x])\n\t"
|
||||||
"vl %%v18,160(%%r1,%3) \n\t"
|
"vl %%v19,176(%%r1,%[x])\n\t"
|
||||||
"vl %%v19,176(%%r1,%3) \n\t"
|
"vl %%v20,192(%%r1,%[x])\n\t"
|
||||||
"vl %%v20,192(%%r1,%3) \n\t"
|
"vl %%v21,208(%%r1,%[x])\n\t"
|
||||||
"vl %%v21,208(%%r1,%3) \n\t"
|
"vl %%v22,224(%%r1,%[x])\n\t"
|
||||||
"vl %%v22,224(%%r1,%3) \n\t"
|
"vl %%v23,240(%%r1,%[x])\n\t"
|
||||||
"vl %%v23,240(%%r1,%3) \n\t"
|
|
||||||
|
|
||||||
"vfchesb %%v5,%%v16,%%v17\n\t"
|
"vfchesb %%v5,%%v16,%%v17\n\t"
|
||||||
"vfchesb %%v6,%%v18,%%v19\n\t"
|
"vfchesb %%v6,%%v18,%%v19\n\t"
|
||||||
"vfchesb %%v7,%%v20,%%v21\n\t"
|
"vfchesb %%v7,%%v20,%%v21\n\t"
|
||||||
|
|
@ -143,14 +134,12 @@ static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max)
|
||||||
"vsel %%v7,%%v28,%%v29,%%v7\n\t"
|
"vsel %%v7,%%v28,%%v29,%%v7\n\t"
|
||||||
"vsel %%v19,%%v22,%%v23,%%v8\n\t"
|
"vsel %%v19,%%v22,%%v23,%%v8\n\t"
|
||||||
"vsel %%v8,%%v30,%%v31,%%v8\n\t"
|
"vsel %%v8,%%v30,%%v31,%%v8\n\t"
|
||||||
|
|
||||||
"vfchesb %%v20,%%v16,%%v17\n\t"
|
"vfchesb %%v20,%%v16,%%v17\n\t"
|
||||||
"vfchesb %%v21,%%v18,%%v19\n\t"
|
"vfchesb %%v21,%%v18,%%v19\n\t"
|
||||||
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
|
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
|
||||||
"vsel %%v5,%%v5,%%v6,%%v20\n\t"
|
"vsel %%v5,%%v5,%%v6,%%v20\n\t"
|
||||||
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
|
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
|
||||||
"vsel %%v6,%%v7,%%v8,%%v21\n\t"
|
"vsel %%v6,%%v7,%%v8,%%v21\n\t"
|
||||||
|
|
||||||
"vfchesb %%v18,%%v16,%%v17\n\t"
|
"vfchesb %%v18,%%v16,%%v17\n\t"
|
||||||
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
|
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
|
||||||
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
|
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
|
||||||
|
|
@ -158,7 +147,6 @@ static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max)
|
||||||
"vesrlg %%v5,%%v5,32\n\t"
|
"vesrlg %%v5,%%v5,32\n\t"
|
||||||
"vag %%v5,%%v5,%%v4\n\t"
|
"vag %%v5,%%v5,%%v4\n\t"
|
||||||
"vag %%v6,%%v6,%%v4\n\t"
|
"vag %%v6,%%v6,%%v4\n\t"
|
||||||
|
|
||||||
"vfchesb %%v7,%%v0,%%v16\n\t"
|
"vfchesb %%v7,%%v0,%%v16\n\t"
|
||||||
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
|
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
|
||||||
"vsegf %%v8,%%v7\n\t"
|
"vsegf %%v8,%%v7\n\t"
|
||||||
|
|
@ -167,10 +155,8 @@ static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max)
|
||||||
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
|
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
|
||||||
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
|
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
|
||||||
"vag %%v4,%%v4,%%v3\n\t"
|
"vag %%v4,%%v4,%%v3\n\t"
|
||||||
|
|
||||||
"agfi %%r1, 256\n\t"
|
"agfi %%r1, 256\n\t"
|
||||||
"brctg %%r0, 0b \n\t"
|
"brctg %[n], 0b\n\t"
|
||||||
|
|
||||||
"veslg %%v3,%%v0,32\n\t"
|
"veslg %%v3,%%v0,32\n\t"
|
||||||
"vfchsb %%v4,%%v0,%%v3\n\t"
|
"vfchsb %%v4,%%v0,%%v3\n\t"
|
||||||
"vchlg %%v5,%%v2,%%v1\n\t"
|
"vchlg %%v5,%%v2,%%v1\n\t"
|
||||||
|
|
@ -181,14 +167,13 @@ static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max)
|
||||||
"vesrlg %%v4,%%v4,32\n\t"
|
"vesrlg %%v4,%%v4,32\n\t"
|
||||||
"vsegf %%v4,%%v4\n\t"
|
"vsegf %%v4,%%v4\n\t"
|
||||||
"vsel %%v1,%%v1,%%v2,%%v4\n\t"
|
"vsel %%v1,%%v1,%%v2,%%v4\n\t"
|
||||||
|
|
||||||
"vrepf %%v2,%%v0,2\n\t"
|
"vrepf %%v2,%%v0,2\n\t"
|
||||||
"vrepg %%v3,%%v1,1\n\t"
|
"vrepg %%v3,%%v1,1\n\t"
|
||||||
"wfcsb %%v2,%%v0\n\t"
|
"wfcsb %%v2,%%v0\n\t"
|
||||||
"jne 1f\n\t"
|
"jne 1f\n\t"
|
||||||
"vstef %%v0,%1,0 \n\t"
|
"vstef %%v0,%[max],0\n\t"
|
||||||
"vmnlg %%v0,%%v1,%%v3\n\t"
|
"vmnlg %%v0,%%v1,%%v3\n\t"
|
||||||
"vlgvg %0,%%v0,0 \n\t"
|
"vlgvg %[imax],%%v0,0\n\t"
|
||||||
"j 2f\n\t"
|
"j 2f\n\t"
|
||||||
"1:\n\t"
|
"1:\n\t"
|
||||||
"wfchsb %%v4,%%v2,%%v0\n\t"
|
"wfchsb %%v4,%%v2,%%v0\n\t"
|
||||||
|
|
@ -196,14 +181,15 @@ static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max)
|
||||||
"vsegf %%v4,%%v4\n\t"
|
"vsegf %%v4,%%v4\n\t"
|
||||||
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
|
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
|
||||||
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
|
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
|
||||||
"ste %%f0,%1 \n\t"
|
"ste %%f0,%[max]\n\t"
|
||||||
"vlgvg %0,%%v1,0 \n\t"
|
"vlgvg %[imax],%%v1,0\n\t"
|
||||||
"2:\n\t"
|
"2:\n\t"
|
||||||
"nop"
|
"nop"
|
||||||
:"=r"(imax),"=m"(*max)
|
: [imax] "=r"(imax),[max] "=Q"(*max),[n] "+&r"(n)
|
||||||
:"r"(n),"ZR"((const FLOAT (*)[n])x)
|
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
|
||||||
:"memory","cc","r0","r1","v0","v1","v2","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
: "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16",
|
||||||
);
|
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
|
||||||
|
"v27", "v28", "v29", "v30", "v31");
|
||||||
|
|
||||||
return imax;
|
return imax;
|
||||||
}
|
}
|
||||||
|
|
@ -214,7 +200,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
FLOAT maxf = 0.0;
|
FLOAT maxf = 0.0;
|
||||||
BLASLONG max = 0;
|
BLASLONG max = 0;
|
||||||
|
|
||||||
if (n <= 0 || inc_x <= 0) return (max);
|
if (n <= 0 || inc_x <= 0)
|
||||||
|
return (max);
|
||||||
|
|
||||||
if (inc_x == 1) {
|
if (inc_x == 1) {
|
||||||
|
|
||||||
|
|
@ -224,9 +211,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
max = ismax_kernel_64(n1, x, &maxf);
|
max = ismax_kernel_64(n1, x, &maxf);
|
||||||
|
|
||||||
i = n1;
|
i = n1;
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
maxf = x[0];
|
maxf = x[0];
|
||||||
i++;
|
i++;
|
||||||
}
|
}
|
||||||
|
|
@ -271,7 +256,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
while (j < n) {
|
while (j < n) {
|
||||||
if (x[i] > maxf) {
|
if (x[i] > maxf) {
|
||||||
max = j;
|
max = j;
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/***************************************************************************
|
/***************************************************************************
|
||||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
modification, are permitted provided that the following conditions are
|
modification, are permitted provided that the following conditions are
|
||||||
|
|
@ -27,12 +27,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min)
|
static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min) {
|
||||||
{
|
|
||||||
BLASLONG imin;
|
BLASLONG imin;
|
||||||
|
|
||||||
__asm__ volatile (
|
__asm__("vl %%v0,0(%[x])\n\t"
|
||||||
"vl %%v0,0(%3) \n\t"
|
|
||||||
"vleig %%v1,0,0\n\t"
|
"vleig %%v1,0,0\n\t"
|
||||||
"vleig %%v1,2,1\n\t"
|
"vleig %%v1,2,1\n\t"
|
||||||
"vleig %%v2,1,0\n\t"
|
"vleig %%v2,1,0\n\t"
|
||||||
|
|
@ -71,20 +69,18 @@ static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min)
|
||||||
"vleif %%v31,29,1\n\t"
|
"vleif %%v31,29,1\n\t"
|
||||||
"vleif %%v31,30,2\n\t"
|
"vleif %%v31,30,2\n\t"
|
||||||
"vleif %%v31,31,3\n\t"
|
"vleif %%v31,31,3\n\t"
|
||||||
"srlg %%r0,%2,6 \n\t"
|
"srlg %[n],%[n],6\n\t"
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 1, 1024(%%r1,%3) \n\t"
|
"pfd 1, 1024(%%r1,%[x])\n\t"
|
||||||
|
"vl %%v16,0(%%r1,%[x])\n\t"
|
||||||
"vl %%v16,0(%%r1,%3) \n\t"
|
"vl %%v17,16(%%r1,%[x])\n\t"
|
||||||
"vl %%v17,16(%%r1,%3) \n\t"
|
"vl %%v18,32(%%r1,%[x])\n\t"
|
||||||
"vl %%v18,32(%%r1,%3) \n\t"
|
"vl %%v19,48(%%r1,%[x])\n\t"
|
||||||
"vl %%v19,48(%%r1,%3) \n\t"
|
"vl %%v20,64(%%r1,%[x])\n\t"
|
||||||
"vl %%v20,64(%%r1,%3) \n\t"
|
"vl %%v21,80(%%r1,%[x])\n\t"
|
||||||
"vl %%v21,80(%%r1,%3) \n\t"
|
"vl %%v22,96(%%r1,%[x])\n\t"
|
||||||
"vl %%v22,96(%%r1,%3) \n\t"
|
"vl %%v23,112(%%r1,%[x])\n\t"
|
||||||
"vl %%v23,112(%%r1,%3) \n\t"
|
|
||||||
|
|
||||||
"vfchesb %%v5,%%v17,%%v16\n\t"
|
"vfchesb %%v5,%%v17,%%v16\n\t"
|
||||||
"vfchesb %%v6,%%v19,%%v18\n\t"
|
"vfchesb %%v6,%%v19,%%v18\n\t"
|
||||||
"vfchesb %%v7,%%v21,%%v20\n\t"
|
"vfchesb %%v7,%%v21,%%v20\n\t"
|
||||||
|
|
@ -97,14 +93,12 @@ static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min)
|
||||||
"vsel %%v7,%%v28,%%v29,%%v7\n\t"
|
"vsel %%v7,%%v28,%%v29,%%v7\n\t"
|
||||||
"vsel %%v19,%%v22,%%v23,%%v8\n\t"
|
"vsel %%v19,%%v22,%%v23,%%v8\n\t"
|
||||||
"vsel %%v8,%%v30,%%v31,%%v8\n\t"
|
"vsel %%v8,%%v30,%%v31,%%v8\n\t"
|
||||||
|
|
||||||
"vfchesb %%v20,%%v17,%%v16\n\t"
|
"vfchesb %%v20,%%v17,%%v16\n\t"
|
||||||
"vfchesb %%v21,%%v19,%%v18\n\t"
|
"vfchesb %%v21,%%v19,%%v18\n\t"
|
||||||
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
|
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
|
||||||
"vsel %%v5,%%v5,%%v6,%%v20\n\t"
|
"vsel %%v5,%%v5,%%v6,%%v20\n\t"
|
||||||
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
|
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
|
||||||
"vsel %%v6,%%v7,%%v8,%%v21\n\t"
|
"vsel %%v6,%%v7,%%v8,%%v21\n\t"
|
||||||
|
|
||||||
"vfchesb %%v18,%%v17,%%v16\n\t"
|
"vfchesb %%v18,%%v17,%%v16\n\t"
|
||||||
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
|
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
|
||||||
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
|
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
|
||||||
|
|
@ -112,7 +106,6 @@ static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min)
|
||||||
"vesrlg %%v5,%%v5,32\n\t"
|
"vesrlg %%v5,%%v5,32\n\t"
|
||||||
"vag %%v5,%%v5,%%v4\n\t"
|
"vag %%v5,%%v5,%%v4\n\t"
|
||||||
"vag %%v6,%%v6,%%v4\n\t"
|
"vag %%v6,%%v6,%%v4\n\t"
|
||||||
|
|
||||||
"vfchesb %%v7,%%v16,%%v0\n\t"
|
"vfchesb %%v7,%%v16,%%v0\n\t"
|
||||||
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
|
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
|
||||||
"vsegf %%v8,%%v7\n\t"
|
"vsegf %%v8,%%v7\n\t"
|
||||||
|
|
@ -121,16 +114,14 @@ static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min)
|
||||||
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
|
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
|
||||||
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
|
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
|
||||||
"vag %%v4,%%v4,%%v3\n\t"
|
"vag %%v4,%%v4,%%v3\n\t"
|
||||||
|
"vl %%v16,128(%%r1,%[x])\n\t"
|
||||||
"vl %%v16,128(%%r1,%3) \n\t"
|
"vl %%v17,144(%%r1,%[x])\n\t"
|
||||||
"vl %%v17,144(%%r1,%3) \n\t"
|
"vl %%v18,160(%%r1,%[x])\n\t"
|
||||||
"vl %%v18,160(%%r1,%3) \n\t"
|
"vl %%v19,176(%%r1,%[x])\n\t"
|
||||||
"vl %%v19,176(%%r1,%3) \n\t"
|
"vl %%v20,192(%%r1,%[x])\n\t"
|
||||||
"vl %%v20,192(%%r1,%3) \n\t"
|
"vl %%v21,208(%%r1,%[x])\n\t"
|
||||||
"vl %%v21,208(%%r1,%3) \n\t"
|
"vl %%v22,224(%%r1,%[x])\n\t"
|
||||||
"vl %%v22,224(%%r1,%3) \n\t"
|
"vl %%v23,240(%%r1,%[x])\n\t"
|
||||||
"vl %%v23,240(%%r1,%3) \n\t"
|
|
||||||
|
|
||||||
"vfchesb %%v5,%%v17,%%v16\n\t"
|
"vfchesb %%v5,%%v17,%%v16\n\t"
|
||||||
"vfchesb %%v6,%%v19,%%v18\n\t"
|
"vfchesb %%v6,%%v19,%%v18\n\t"
|
||||||
"vfchesb %%v7,%%v21,%%v20\n\t"
|
"vfchesb %%v7,%%v21,%%v20\n\t"
|
||||||
|
|
@ -143,14 +134,12 @@ static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min)
|
||||||
"vsel %%v7,%%v28,%%v29,%%v7\n\t"
|
"vsel %%v7,%%v28,%%v29,%%v7\n\t"
|
||||||
"vsel %%v19,%%v22,%%v23,%%v8\n\t"
|
"vsel %%v19,%%v22,%%v23,%%v8\n\t"
|
||||||
"vsel %%v8,%%v30,%%v31,%%v8\n\t"
|
"vsel %%v8,%%v30,%%v31,%%v8\n\t"
|
||||||
|
|
||||||
"vfchesb %%v20,%%v17,%%v16\n\t"
|
"vfchesb %%v20,%%v17,%%v16\n\t"
|
||||||
"vfchesb %%v21,%%v19,%%v18\n\t"
|
"vfchesb %%v21,%%v19,%%v18\n\t"
|
||||||
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
|
"vsel %%v16,%%v16,%%v17,%%v20\n\t"
|
||||||
"vsel %%v5,%%v5,%%v6,%%v20\n\t"
|
"vsel %%v5,%%v5,%%v6,%%v20\n\t"
|
||||||
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
|
"vsel %%v17,%%v18,%%v19,%%v21\n\t"
|
||||||
"vsel %%v6,%%v7,%%v8,%%v21\n\t"
|
"vsel %%v6,%%v7,%%v8,%%v21\n\t"
|
||||||
|
|
||||||
"vfchesb %%v18,%%v17,%%v16\n\t"
|
"vfchesb %%v18,%%v17,%%v16\n\t"
|
||||||
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
|
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
|
||||||
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
|
"vsel %%v5,%%v5,%%v6,%%v18\n\t"
|
||||||
|
|
@ -158,7 +147,6 @@ static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min)
|
||||||
"vesrlg %%v5,%%v5,32\n\t"
|
"vesrlg %%v5,%%v5,32\n\t"
|
||||||
"vag %%v5,%%v5,%%v4\n\t"
|
"vag %%v5,%%v5,%%v4\n\t"
|
||||||
"vag %%v6,%%v6,%%v4\n\t"
|
"vag %%v6,%%v6,%%v4\n\t"
|
||||||
|
|
||||||
"vfchesb %%v7,%%v16,%%v0\n\t"
|
"vfchesb %%v7,%%v16,%%v0\n\t"
|
||||||
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
|
"vsel %%v0,%%v0,%%v16,%%v7\n\t"
|
||||||
"vsegf %%v8,%%v7\n\t"
|
"vsegf %%v8,%%v7\n\t"
|
||||||
|
|
@ -167,10 +155,8 @@ static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min)
|
||||||
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
|
"vsel %%v1,%%v1,%%v5,%%v7\n\t"
|
||||||
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
|
"vsel %%v2,%%v2,%%v6,%%v8\n\t"
|
||||||
"vag %%v4,%%v4,%%v3\n\t"
|
"vag %%v4,%%v4,%%v3\n\t"
|
||||||
|
|
||||||
"agfi %%r1, 256\n\t"
|
"agfi %%r1, 256\n\t"
|
||||||
"brctg %%r0, 0b \n\t"
|
"brctg %[n], 0b\n\t"
|
||||||
|
|
||||||
"veslg %%v3,%%v0,32\n\t"
|
"veslg %%v3,%%v0,32\n\t"
|
||||||
"vfchsb %%v4,%%v3,%%v0\n\t"
|
"vfchsb %%v4,%%v3,%%v0\n\t"
|
||||||
"vchlg %%v5,%%v2,%%v1\n\t"
|
"vchlg %%v5,%%v2,%%v1\n\t"
|
||||||
|
|
@ -181,14 +167,13 @@ static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min)
|
||||||
"vesrlg %%v4,%%v4,32\n\t"
|
"vesrlg %%v4,%%v4,32\n\t"
|
||||||
"vsegf %%v4,%%v4\n\t"
|
"vsegf %%v4,%%v4\n\t"
|
||||||
"vsel %%v1,%%v1,%%v2,%%v4\n\t"
|
"vsel %%v1,%%v1,%%v2,%%v4\n\t"
|
||||||
|
|
||||||
"vrepf %%v2,%%v0,2\n\t"
|
"vrepf %%v2,%%v0,2\n\t"
|
||||||
"vrepg %%v3,%%v1,1\n\t"
|
"vrepg %%v3,%%v1,1\n\t"
|
||||||
"wfcsb %%v2,%%v0\n\t"
|
"wfcsb %%v2,%%v0\n\t"
|
||||||
"jne 1f\n\t"
|
"jne 1f\n\t"
|
||||||
"vstef %%v0,%1,0 \n\t"
|
"vstef %%v0,%[min],0\n\t"
|
||||||
"vmnlg %%v0,%%v1,%%v3\n\t"
|
"vmnlg %%v0,%%v1,%%v3\n\t"
|
||||||
"vlgvg %0,%%v0,0 \n\t"
|
"vlgvg %[imin],%%v0,0\n\t"
|
||||||
"j 2f\n\t"
|
"j 2f\n\t"
|
||||||
"1:\n\t"
|
"1:\n\t"
|
||||||
"wfchsb %%v4,%%v0,%%v2\n\t"
|
"wfchsb %%v4,%%v0,%%v2\n\t"
|
||||||
|
|
@ -196,14 +181,15 @@ static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min)
|
||||||
"vsegf %%v4,%%v4\n\t"
|
"vsegf %%v4,%%v4\n\t"
|
||||||
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
|
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
|
||||||
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
|
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
|
||||||
"ste %%f0,%1 \n\t"
|
"ste %%f0,%[min]\n\t"
|
||||||
"vlgvg %0,%%v1,0 \n\t"
|
"vlgvg %[imin],%%v1,0\n\t"
|
||||||
"2:\n\t"
|
"2:\n\t"
|
||||||
"nop"
|
"nop"
|
||||||
:"=r"(imin),"=m"(*min)
|
: [imin] "=r"(imin),[min] "=Q"(*min),[n] "+&r"(n)
|
||||||
:"r"(n),"ZR"((const FLOAT (*)[n])x)
|
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
|
||||||
:"memory","cc","r0","r1","v0","v1","v2","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
: "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16",
|
||||||
);
|
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
|
||||||
|
"v27", "v28", "v29", "v30", "v31");
|
||||||
|
|
||||||
return imin;
|
return imin;
|
||||||
}
|
}
|
||||||
|
|
@ -214,7 +200,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
FLOAT minf = 0.0;
|
FLOAT minf = 0.0;
|
||||||
BLASLONG min = 0;
|
BLASLONG min = 0;
|
||||||
|
|
||||||
if (n <= 0 || inc_x <= 0) return (min);
|
if (n <= 0 || inc_x <= 0)
|
||||||
|
return (min);
|
||||||
|
|
||||||
if (inc_x == 1) {
|
if (inc_x == 1) {
|
||||||
|
|
||||||
|
|
@ -224,9 +211,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
min = ismin_kernel_64(n1, x, &minf);
|
min = ismin_kernel_64(n1, x, &minf);
|
||||||
|
|
||||||
i = n1;
|
i = n1;
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
minf = x[0];
|
minf = x[0];
|
||||||
i++;
|
i++;
|
||||||
}
|
}
|
||||||
|
|
@ -271,7 +256,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
while (j < n) {
|
while (j < n) {
|
||||||
if (x[i] < minf) {
|
if (x[i] < minf) {
|
||||||
min = j;
|
min = j;
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/***************************************************************************
|
/***************************************************************************
|
||||||
Copyright (c) 2017, The OpenBLAS Project
|
Copyright (c) 2019, The OpenBLAS Project
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
modification, are permitted provided that the following conditions are
|
modification, are permitted provided that the following conditions are
|
||||||
|
|
@ -28,22 +28,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
|
|
||||||
#if defined(DOUBLE)
|
#define CABS1(x,i) (fabs(x[i]) + fabs(x[i + 1]))
|
||||||
#define ABS fabs
|
|
||||||
#else
|
|
||||||
#define ABS fabsf
|
|
||||||
#endif
|
|
||||||
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))
|
|
||||||
|
|
||||||
static BLASLONG izamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amax)
|
static BLASLONG izamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amax) {
|
||||||
{
|
|
||||||
BLASLONG iamax;
|
BLASLONG iamax;
|
||||||
|
|
||||||
__asm__ volatile (
|
__asm__("vleg %%v0,0(%[x]),0\n\t"
|
||||||
"vleg %%v0,0(%3),0 \n\t"
|
"vleg %%v1,8(%[x]),0\n\t"
|
||||||
"vleg %%v1,8(%3),0 \n\t"
|
"vleg %%v0,16(%[x]),1\n\t"
|
||||||
"vleg %%v0,16(%3),1 \n\t"
|
"vleg %%v1,24(%[x]),1\n\t"
|
||||||
"vleg %%v1,24(%3),1 \n\t"
|
|
||||||
"vflpdb %%v0,%%v0\n\t"
|
"vflpdb %%v0,%%v0\n\t"
|
||||||
"vflpdb %%v1,%%v1\n\t"
|
"vflpdb %%v1,%%v1\n\t"
|
||||||
"vfadb %%v0,%%v0,%%v1\n\t"
|
"vfadb %%v0,%%v0,%%v1\n\t"
|
||||||
|
|
@ -59,27 +52,26 @@ static BLASLONG izamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amax)
|
||||||
"vleig %%v26,5,1\n\t"
|
"vleig %%v26,5,1\n\t"
|
||||||
"vleig %%v27,6,0\n\t"
|
"vleig %%v27,6,0\n\t"
|
||||||
"vleig %%v27,7,1\n\t"
|
"vleig %%v27,7,1\n\t"
|
||||||
"srlg %%r0,%2,4 \n\t"
|
"srlg %[n],%[n],4\n\t"
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 1, 1024(%%r1,%3) \n\t"
|
"pfd 1, 1024(%%r1,%[x])\n\t"
|
||||||
|
"vleg %%v16,0(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v16,0(%%r1,%3),0 \n\t"
|
"vleg %%v17,8(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v17,8(%%r1,%3),0 \n\t"
|
"vleg %%v16,16(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v16,16(%%r1,%3),1 \n\t"
|
"vleg %%v17,24(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v17,24(%%r1,%3),1 \n\t"
|
"vleg %%v18,32(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v18,32(%%r1,%3),0 \n\t"
|
"vleg %%v19,40(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v19,40(%%r1,%3),0 \n\t"
|
"vleg %%v18,48(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v18,48(%%r1,%3),1 \n\t"
|
"vleg %%v19,56(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v19,56(%%r1,%3),1 \n\t"
|
"vleg %%v20,64(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v20,64(%%r1,%3),0 \n\t"
|
"vleg %%v21,72(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v21,72(%%r1,%3),0 \n\t"
|
"vleg %%v20,80(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v20,80(%%r1,%3),1 \n\t"
|
"vleg %%v21,88(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v21,88(%%r1,%3),1 \n\t"
|
"vleg %%v22,96(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v22,96(%%r1,%3),0 \n\t"
|
"vleg %%v23,104(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v23,104(%%r1,%3),0 \n\t"
|
"vleg %%v22,112(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v22,112(%%r1,%3),1 \n\t"
|
"vleg %%v23,120(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v23,120(%%r1,%3),1 \n\t"
|
|
||||||
"vflpdb %%v16, %%v16\n\t"
|
"vflpdb %%v16, %%v16\n\t"
|
||||||
"vflpdb %%v17, %%v17\n\t"
|
"vflpdb %%v17, %%v17\n\t"
|
||||||
"vflpdb %%v18, %%v18\n\t"
|
"vflpdb %%v18, %%v18\n\t"
|
||||||
|
|
@ -92,40 +84,36 @@ static BLASLONG izamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amax)
|
||||||
"vfadb %%v17,%%v18,%%v19\n\t"
|
"vfadb %%v17,%%v18,%%v19\n\t"
|
||||||
"vfadb %%v18,%%v20,%%v21\n\t"
|
"vfadb %%v18,%%v20,%%v21\n\t"
|
||||||
"vfadb %%v19,%%v22,%%v23\n\t"
|
"vfadb %%v19,%%v22,%%v23\n\t"
|
||||||
|
|
||||||
"vfchedb %%v4,%%v16,%%v17\n\t"
|
"vfchedb %%v4,%%v16,%%v17\n\t"
|
||||||
"vfchedb %%v5,%%v18,%%v19\n\t"
|
"vfchedb %%v5,%%v18,%%v19\n\t"
|
||||||
"vsel %%v16,%%v16,%%v17,%%v4\n\t"
|
"vsel %%v16,%%v16,%%v17,%%v4\n\t"
|
||||||
"vsel %%v4,%%v24,%%v25,%%v4\n\t"
|
"vsel %%v4,%%v24,%%v25,%%v4\n\t"
|
||||||
"vsel %%v17,%%v18,%%v19,%%v5\n\t"
|
"vsel %%v17,%%v18,%%v19,%%v5\n\t"
|
||||||
"vsel %%v5,%%v26,%%v27,%%v5\n\t"
|
"vsel %%v5,%%v26,%%v27,%%v5\n\t"
|
||||||
|
|
||||||
"vfchedb %%v18,%%v16,%%v17\n\t"
|
"vfchedb %%v18,%%v16,%%v17\n\t"
|
||||||
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
|
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
|
||||||
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
|
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
|
||||||
"vag %%v4,%%v4,%%v3\n\t"
|
"vag %%v4,%%v4,%%v3\n\t"
|
||||||
|
|
||||||
"vfchedb %%v5,%%v0,%%v16\n\t"
|
"vfchedb %%v5,%%v0,%%v16\n\t"
|
||||||
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
|
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
|
||||||
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
|
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
|
||||||
"vag %%v3,%%v3,%%v2\n\t"
|
"vag %%v3,%%v3,%%v2\n\t"
|
||||||
|
"vleg %%v16,128(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v16,128(%%r1,%3),0 \n\t"
|
"vleg %%v17,136(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v17,136(%%r1,%3),0 \n\t"
|
"vleg %%v16,144(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v16,144(%%r1,%3),1 \n\t"
|
"vleg %%v17,152(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v17,152(%%r1,%3),1 \n\t"
|
"vleg %%v18,160(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v18,160(%%r1,%3),0 \n\t"
|
"vleg %%v19,168(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v19,168(%%r1,%3),0 \n\t"
|
"vleg %%v18,176(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v18,176(%%r1,%3),1 \n\t"
|
"vleg %%v19,184(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v19,184(%%r1,%3),1 \n\t"
|
"vleg %%v20,192(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v20,192(%%r1,%3),0 \n\t"
|
"vleg %%v21,200(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v21,200(%%r1,%3),0 \n\t"
|
"vleg %%v20,208(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v20,208(%%r1,%3),1 \n\t"
|
"vleg %%v21,216(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v21,216(%%r1,%3),1 \n\t"
|
"vleg %%v22,224(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v22,224(%%r1,%3),0 \n\t"
|
"vleg %%v23,232(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v23,232(%%r1,%3),0 \n\t"
|
"vleg %%v22,240(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v22,240(%%r1,%3),1 \n\t"
|
"vleg %%v23,248(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v23,248(%%r1,%3),1 \n\t"
|
|
||||||
"vflpdb %%v16, %%v16\n\t"
|
"vflpdb %%v16, %%v16\n\t"
|
||||||
"vflpdb %%v17, %%v17\n\t"
|
"vflpdb %%v17, %%v17\n\t"
|
||||||
"vflpdb %%v18, %%v18\n\t"
|
"vflpdb %%v18, %%v18\n\t"
|
||||||
|
|
@ -138,60 +126,55 @@ static BLASLONG izamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amax)
|
||||||
"vfadb %%v17,%%v18,%%v19\n\t"
|
"vfadb %%v17,%%v18,%%v19\n\t"
|
||||||
"vfadb %%v18,%%v20,%%v21\n\t"
|
"vfadb %%v18,%%v20,%%v21\n\t"
|
||||||
"vfadb %%v19,%%v22,%%v23\n\t"
|
"vfadb %%v19,%%v22,%%v23\n\t"
|
||||||
|
|
||||||
"vfchedb %%v4,%%v16,%%v17\n\t"
|
"vfchedb %%v4,%%v16,%%v17\n\t"
|
||||||
"vfchedb %%v5,%%v18,%%v19\n\t"
|
"vfchedb %%v5,%%v18,%%v19\n\t"
|
||||||
"vsel %%v16,%%v16,%%v17,%%v4\n\t"
|
"vsel %%v16,%%v16,%%v17,%%v4\n\t"
|
||||||
"vsel %%v4,%%v24,%%v25,%%v4\n\t"
|
"vsel %%v4,%%v24,%%v25,%%v4\n\t"
|
||||||
"vsel %%v17,%%v18,%%v19,%%v5\n\t"
|
"vsel %%v17,%%v18,%%v19,%%v5\n\t"
|
||||||
"vsel %%v5,%%v26,%%v27,%%v5\n\t"
|
"vsel %%v5,%%v26,%%v27,%%v5\n\t"
|
||||||
|
|
||||||
"vfchedb %%v18,%%v16,%%v17\n\t"
|
"vfchedb %%v18,%%v16,%%v17\n\t"
|
||||||
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
|
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
|
||||||
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
|
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
|
||||||
"vag %%v4,%%v4,%%v3\n\t"
|
"vag %%v4,%%v4,%%v3\n\t"
|
||||||
|
|
||||||
"vfchedb %%v5,%%v0,%%v16\n\t"
|
"vfchedb %%v5,%%v0,%%v16\n\t"
|
||||||
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
|
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
|
||||||
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
|
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
|
||||||
"vag %%v3,%%v3,%%v2\n\t"
|
"vag %%v3,%%v3,%%v2\n\t"
|
||||||
|
|
||||||
"agfi %%r1, 256\n\t"
|
"agfi %%r1, 256\n\t"
|
||||||
"brctg %%r0, 0b \n\t"
|
"brctg %[n], 0b\n\t"
|
||||||
|
|
||||||
"vrepg %%v2,%%v0,1\n\t"
|
"vrepg %%v2,%%v0,1\n\t"
|
||||||
"vrepg %%v3,%%v1,1\n\t"
|
"vrepg %%v3,%%v1,1\n\t"
|
||||||
"wfcdb %%v2,%%v0\n\t"
|
"wfcdb %%v2,%%v0\n\t"
|
||||||
"jne 1f\n\t"
|
"jne 1f\n\t"
|
||||||
"vsteg %%v0,%1,0 \n\t"
|
"vsteg %%v0,%[amax],0\n\t"
|
||||||
"vmnlg %%v0,%%v1,%%v3\n\t"
|
"vmnlg %%v0,%%v1,%%v3\n\t"
|
||||||
"vlgvg %0,%%v0,0 \n\t"
|
"vlgvg %[iamax],%%v0,0\n\t"
|
||||||
"j 2f\n\t"
|
"j 2f\n\t"
|
||||||
"1:\n\t"
|
"1:\n\t"
|
||||||
"wfchdb %%v4,%%v2,%%v0\n\t"
|
"wfchdb %%v4,%%v2,%%v0\n\t"
|
||||||
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
|
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
|
||||||
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
|
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
|
||||||
"std %%f0,%1 \n\t"
|
"std %%f0,%[amax]\n\t"
|
||||||
"vlgvg %0,%%v1,0 \n\t"
|
"vlgvg %[iamax],%%v1,0\n\t"
|
||||||
"2:\n\t"
|
"2:\n\t"
|
||||||
"nop"
|
"nop"
|
||||||
:"=r"(iamax),"=m"(*amax)
|
: [iamax] "=r"(iamax),[amax] "=Q"(*amax),[n] "+&r"(n)
|
||||||
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x)
|
: "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
|
||||||
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27"
|
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18",
|
||||||
);
|
"v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27");
|
||||||
|
|
||||||
return iamax;
|
return iamax;
|
||||||
}
|
}
|
||||||
|
|
||||||
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
{
|
|
||||||
BLASLONG i = 0;
|
BLASLONG i = 0;
|
||||||
BLASLONG ix = 0;
|
BLASLONG ix = 0;
|
||||||
FLOAT maxf = 0;
|
FLOAT maxf = 0;
|
||||||
BLASLONG max = 0;
|
BLASLONG max = 0;
|
||||||
BLASLONG inc_x2;
|
BLASLONG inc_x2;
|
||||||
|
|
||||||
if (n <= 0 || inc_x <= 0) return(max);
|
if (n <= 0 || inc_x <= 0)
|
||||||
|
return (max);
|
||||||
|
|
||||||
if (inc_x == 1) {
|
if (inc_x == 1) {
|
||||||
|
|
||||||
|
|
@ -201,18 +184,14 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||||
max = izamax_kernel_16(n1, x, &maxf);
|
max = izamax_kernel_16(n1, x, &maxf);
|
||||||
ix = n1 * 2;
|
ix = n1 * 2;
|
||||||
i = n1;
|
i = n1;
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
maxf = CABS1(x, 0);
|
maxf = CABS1(x, 0);
|
||||||
ix += 2;
|
ix += 2;
|
||||||
i++;
|
i++;
|
||||||
}
|
}
|
||||||
|
|
||||||
while(i < n)
|
while (i < n) {
|
||||||
{
|
if (CABS1(x, ix) > maxf) {
|
||||||
if( CABS1(x,ix) > maxf )
|
|
||||||
{
|
|
||||||
max = i;
|
max = i;
|
||||||
maxf = CABS1(x, ix);
|
maxf = CABS1(x, ix);
|
||||||
}
|
}
|
||||||
|
|
@ -226,13 +205,35 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||||
max = 0;
|
max = 0;
|
||||||
maxf = CABS1(x, 0);
|
maxf = CABS1(x, 0);
|
||||||
inc_x2 = 2 * inc_x;
|
inc_x2 = 2 * inc_x;
|
||||||
ix += inc_x2;
|
|
||||||
i++;
|
|
||||||
|
|
||||||
while(i < n)
|
BLASLONG n1 = n & -4;
|
||||||
{
|
while (i < n1) {
|
||||||
if( CABS1(x,ix) > maxf )
|
|
||||||
{
|
if (CABS1(x, ix) > maxf) {
|
||||||
|
max = i;
|
||||||
|
maxf = CABS1(x, ix);
|
||||||
|
}
|
||||||
|
if (CABS1(x, ix + inc_x2) > maxf) {
|
||||||
|
max = i + 1;
|
||||||
|
maxf = CABS1(x, ix + inc_x2);
|
||||||
|
}
|
||||||
|
if (CABS1(x, ix + 2 * inc_x2) > maxf) {
|
||||||
|
max = i + 2;
|
||||||
|
maxf = CABS1(x, ix + 2 * inc_x2);
|
||||||
|
}
|
||||||
|
if (CABS1(x, ix + 3 * inc_x2) > maxf) {
|
||||||
|
max = i + 3;
|
||||||
|
maxf = CABS1(x, ix + 3 * inc_x2);
|
||||||
|
}
|
||||||
|
|
||||||
|
ix += inc_x2 * 4;
|
||||||
|
|
||||||
|
i += 4;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
while (i < n) {
|
||||||
|
if (CABS1(x, ix) > maxf) {
|
||||||
max = i;
|
max = i;
|
||||||
maxf = CABS1(x, ix);
|
maxf = CABS1(x, ix);
|
||||||
}
|
}
|
||||||
|
|
@ -242,5 +243,3 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||||
return (max + 1);
|
return (max + 1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/***************************************************************************
|
/***************************************************************************
|
||||||
Copyright (c) 2017, The OpenBLAS Project
|
Copyright (c) 2019, The OpenBLAS Project
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
modification, are permitted provided that the following conditions are
|
modification, are permitted provided that the following conditions are
|
||||||
|
|
@ -28,22 +28,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
|
|
||||||
#if defined(DOUBLE)
|
#define CABS1(x,i) (fabs(x[i]) + fabs(x[i + 1]))
|
||||||
#define ABS fabs
|
|
||||||
#else
|
|
||||||
#define ABS fabsf
|
|
||||||
#endif
|
|
||||||
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))
|
|
||||||
|
|
||||||
static BLASLONG izamin_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amin)
|
static BLASLONG izamin_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amin) {
|
||||||
{
|
|
||||||
BLASLONG iamin;
|
BLASLONG iamin;
|
||||||
|
|
||||||
__asm__ volatile (
|
__asm__("vleg %%v0,0(%[x]),0\n\t"
|
||||||
"vleg %%v0,0(%3),0 \n\t"
|
"vleg %%v1,8(%[x]),0\n\t"
|
||||||
"vleg %%v1,8(%3),0 \n\t"
|
"vleg %%v0,16(%[x]),1\n\t"
|
||||||
"vleg %%v0,16(%3),1 \n\t"
|
"vleg %%v1,24(%[x]),1\n\t"
|
||||||
"vleg %%v1,24(%3),1 \n\t"
|
|
||||||
"vflpdb %%v0,%%v0\n\t"
|
"vflpdb %%v0,%%v0\n\t"
|
||||||
"vflpdb %%v1,%%v1\n\t"
|
"vflpdb %%v1,%%v1\n\t"
|
||||||
"vfadb %%v0,%%v0,%%v1\n\t"
|
"vfadb %%v0,%%v0,%%v1\n\t"
|
||||||
|
|
@ -59,27 +52,26 @@ static BLASLONG izamin_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amin)
|
||||||
"vleig %%v26,5,1\n\t"
|
"vleig %%v26,5,1\n\t"
|
||||||
"vleig %%v27,6,0\n\t"
|
"vleig %%v27,6,0\n\t"
|
||||||
"vleig %%v27,7,1\n\t"
|
"vleig %%v27,7,1\n\t"
|
||||||
"srlg %%r0,%2,4 \n\t"
|
"srlg %[n],%[n],4\n\t"
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 1, 1024(%%r1,%3) \n\t"
|
"pfd 1, 1024(%%r1,%[x])\n\t"
|
||||||
|
"vleg %%v16,0(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v16,0(%%r1,%3),0 \n\t"
|
"vleg %%v17,8(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v17,8(%%r1,%3),0 \n\t"
|
"vleg %%v16,16(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v16,16(%%r1,%3),1 \n\t"
|
"vleg %%v17,24(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v17,24(%%r1,%3),1 \n\t"
|
"vleg %%v18,32(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v18,32(%%r1,%3),0 \n\t"
|
"vleg %%v19,40(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v19,40(%%r1,%3),0 \n\t"
|
"vleg %%v18,48(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v18,48(%%r1,%3),1 \n\t"
|
"vleg %%v19,56(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v19,56(%%r1,%3),1 \n\t"
|
"vleg %%v20,64(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v20,64(%%r1,%3),0 \n\t"
|
"vleg %%v21,72(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v21,72(%%r1,%3),0 \n\t"
|
"vleg %%v20,80(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v20,80(%%r1,%3),1 \n\t"
|
"vleg %%v21,88(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v21,88(%%r1,%3),1 \n\t"
|
"vleg %%v22,96(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v22,96(%%r1,%3),0 \n\t"
|
"vleg %%v23,104(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v23,104(%%r1,%3),0 \n\t"
|
"vleg %%v22,112(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v22,112(%%r1,%3),1 \n\t"
|
"vleg %%v23,120(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v23,120(%%r1,%3),1 \n\t"
|
|
||||||
"vflpdb %%v16, %%v16\n\t"
|
"vflpdb %%v16, %%v16\n\t"
|
||||||
"vflpdb %%v17, %%v17\n\t"
|
"vflpdb %%v17, %%v17\n\t"
|
||||||
"vflpdb %%v18, %%v18\n\t"
|
"vflpdb %%v18, %%v18\n\t"
|
||||||
|
|
@ -92,40 +84,36 @@ static BLASLONG izamin_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amin)
|
||||||
"vfadb %%v17,%%v18,%%v19\n\t"
|
"vfadb %%v17,%%v18,%%v19\n\t"
|
||||||
"vfadb %%v18,%%v20,%%v21\n\t"
|
"vfadb %%v18,%%v20,%%v21\n\t"
|
||||||
"vfadb %%v19,%%v22,%%v23\n\t"
|
"vfadb %%v19,%%v22,%%v23\n\t"
|
||||||
|
|
||||||
"vfchedb %%v4,%%v17,%%v16\n\t"
|
"vfchedb %%v4,%%v17,%%v16\n\t"
|
||||||
"vfchedb %%v5,%%v19,%%v18\n\t"
|
"vfchedb %%v5,%%v19,%%v18\n\t"
|
||||||
"vsel %%v16,%%v16,%%v17,%%v4\n\t"
|
"vsel %%v16,%%v16,%%v17,%%v4\n\t"
|
||||||
"vsel %%v4,%%v24,%%v25,%%v4\n\t"
|
"vsel %%v4,%%v24,%%v25,%%v4\n\t"
|
||||||
"vsel %%v17,%%v18,%%v19,%%v5\n\t"
|
"vsel %%v17,%%v18,%%v19,%%v5\n\t"
|
||||||
"vsel %%v5,%%v26,%%v27,%%v5\n\t"
|
"vsel %%v5,%%v26,%%v27,%%v5\n\t"
|
||||||
|
|
||||||
"vfchedb %%v18,%%v17,%%v16\n\t"
|
"vfchedb %%v18,%%v17,%%v16\n\t"
|
||||||
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
|
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
|
||||||
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
|
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
|
||||||
"vag %%v4,%%v4,%%v3\n\t"
|
"vag %%v4,%%v4,%%v3\n\t"
|
||||||
|
|
||||||
"vfchedb %%v5,%%v16,%%v0\n\t"
|
"vfchedb %%v5,%%v16,%%v0\n\t"
|
||||||
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
|
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
|
||||||
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
|
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
|
||||||
"vag %%v3,%%v3,%%v2\n\t"
|
"vag %%v3,%%v3,%%v2\n\t"
|
||||||
|
"vleg %%v16,128(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v16,128(%%r1,%3),0 \n\t"
|
"vleg %%v17,136(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v17,136(%%r1,%3),0 \n\t"
|
"vleg %%v16,144(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v16,144(%%r1,%3),1 \n\t"
|
"vleg %%v17,152(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v17,152(%%r1,%3),1 \n\t"
|
"vleg %%v18,160(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v18,160(%%r1,%3),0 \n\t"
|
"vleg %%v19,168(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v19,168(%%r1,%3),0 \n\t"
|
"vleg %%v18,176(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v18,176(%%r1,%3),1 \n\t"
|
"vleg %%v19,184(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v19,184(%%r1,%3),1 \n\t"
|
"vleg %%v20,192(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v20,192(%%r1,%3),0 \n\t"
|
"vleg %%v21,200(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v21,200(%%r1,%3),0 \n\t"
|
"vleg %%v20,208(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v20,208(%%r1,%3),1 \n\t"
|
"vleg %%v21,216(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v21,216(%%r1,%3),1 \n\t"
|
"vleg %%v22,224(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v22,224(%%r1,%3),0 \n\t"
|
"vleg %%v23,232(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v23,232(%%r1,%3),0 \n\t"
|
"vleg %%v22,240(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v22,240(%%r1,%3),1 \n\t"
|
"vleg %%v23,248(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v23,248(%%r1,%3),1 \n\t"
|
|
||||||
"vflpdb %%v16, %%v16\n\t"
|
"vflpdb %%v16, %%v16\n\t"
|
||||||
"vflpdb %%v17, %%v17\n\t"
|
"vflpdb %%v17, %%v17\n\t"
|
||||||
"vflpdb %%v18, %%v18\n\t"
|
"vflpdb %%v18, %%v18\n\t"
|
||||||
|
|
@ -138,60 +126,55 @@ static BLASLONG izamin_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amin)
|
||||||
"vfadb %%v17,%%v18,%%v19\n\t"
|
"vfadb %%v17,%%v18,%%v19\n\t"
|
||||||
"vfadb %%v18,%%v20,%%v21\n\t"
|
"vfadb %%v18,%%v20,%%v21\n\t"
|
||||||
"vfadb %%v19,%%v22,%%v23\n\t"
|
"vfadb %%v19,%%v22,%%v23\n\t"
|
||||||
|
|
||||||
"vfchedb %%v4,%%v17,%%v16\n\t"
|
"vfchedb %%v4,%%v17,%%v16\n\t"
|
||||||
"vfchedb %%v5,%%v19,%%v18\n\t"
|
"vfchedb %%v5,%%v19,%%v18\n\t"
|
||||||
"vsel %%v16,%%v16,%%v17,%%v4\n\t"
|
"vsel %%v16,%%v16,%%v17,%%v4\n\t"
|
||||||
"vsel %%v4,%%v24,%%v25,%%v4\n\t"
|
"vsel %%v4,%%v24,%%v25,%%v4\n\t"
|
||||||
"vsel %%v17,%%v18,%%v19,%%v5\n\t"
|
"vsel %%v17,%%v18,%%v19,%%v5\n\t"
|
||||||
"vsel %%v5,%%v26,%%v27,%%v5\n\t"
|
"vsel %%v5,%%v26,%%v27,%%v5\n\t"
|
||||||
|
|
||||||
"vfchedb %%v18,%%v17,%%v16\n\t"
|
"vfchedb %%v18,%%v17,%%v16\n\t"
|
||||||
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
|
"vsel %%v16,%%v16,%%v17,%%v18\n\t"
|
||||||
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
|
"vsel %%v4,%%v4,%%v5,%%v18\n\t"
|
||||||
"vag %%v4,%%v4,%%v3\n\t"
|
"vag %%v4,%%v4,%%v3\n\t"
|
||||||
|
|
||||||
"vfchedb %%v5,%%v16,%%v0\n\t"
|
"vfchedb %%v5,%%v16,%%v0\n\t"
|
||||||
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
|
"vsel %%v0,%%v0,%%v16,%%v5\n\t"
|
||||||
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
|
"vsel %%v1,%%v1,%%v4,%%v5\n\t"
|
||||||
"vag %%v3,%%v3,%%v2\n\t"
|
"vag %%v3,%%v3,%%v2\n\t"
|
||||||
|
|
||||||
"agfi %%r1, 256\n\t"
|
"agfi %%r1, 256\n\t"
|
||||||
"brctg %%r0, 0b \n\t"
|
"brctg %[n], 0b\n\t"
|
||||||
|
|
||||||
"vrepg %%v2,%%v0,1\n\t"
|
"vrepg %%v2,%%v0,1\n\t"
|
||||||
"vrepg %%v3,%%v1,1\n\t"
|
"vrepg %%v3,%%v1,1\n\t"
|
||||||
"wfcdb %%v2,%%v0\n\t"
|
"wfcdb %%v2,%%v0\n\t"
|
||||||
"jne 1f\n\t"
|
"jne 1f\n\t"
|
||||||
"vsteg %%v0,%1,0 \n\t"
|
"vsteg %%v0,%[amin],0\n\t"
|
||||||
"vmnlg %%v0,%%v1,%%v3\n\t"
|
"vmnlg %%v0,%%v1,%%v3\n\t"
|
||||||
"vlgvg %0,%%v0,0 \n\t"
|
"vlgvg %[iamin],%%v0,0\n\t"
|
||||||
"j 2f\n\t"
|
"j 2f\n\t"
|
||||||
"1:\n\t"
|
"1:\n\t"
|
||||||
"wfchdb %%v4,%%v0,%%v2\n\t"
|
"wfchdb %%v4,%%v0,%%v2\n\t"
|
||||||
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
|
"vsel %%v1,%%v3,%%v1,%%v4\n\t"
|
||||||
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
|
"vsel %%v0,%%v2,%%v0,%%v4\n\t"
|
||||||
"std %%f0,%1 \n\t"
|
"std %%f0,%[amin]\n\t"
|
||||||
"vlgvg %0,%%v1,0 \n\t"
|
"vlgvg %[iamin],%%v1,0\n\t"
|
||||||
"2:\n\t"
|
"2:\n\t"
|
||||||
"nop"
|
"nop"
|
||||||
:"=r"(iamin),"=m"(*amin)
|
: [iamin] "=r"(iamin),[amin] "=Q"(*amin),[n] "+&r"(n)
|
||||||
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x)
|
: "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
|
||||||
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27"
|
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18",
|
||||||
);
|
"v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27");
|
||||||
|
|
||||||
return iamin;
|
return iamin;
|
||||||
}
|
}
|
||||||
|
|
||||||
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
{
|
|
||||||
BLASLONG i = 0;
|
BLASLONG i = 0;
|
||||||
BLASLONG ix = 0;
|
BLASLONG ix = 0;
|
||||||
FLOAT minf = 0;
|
FLOAT minf = 0;
|
||||||
BLASLONG min = 0;
|
BLASLONG min = 0;
|
||||||
BLASLONG inc_x2;
|
BLASLONG inc_x2;
|
||||||
|
|
||||||
if (n <= 0 || inc_x <= 0) return(min);
|
if (n <= 0 || inc_x <= 0)
|
||||||
|
return (min);
|
||||||
|
|
||||||
if (inc_x == 1) {
|
if (inc_x == 1) {
|
||||||
|
|
||||||
|
|
@ -201,18 +184,14 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||||
min = izamin_kernel_16(n1, x, &minf);
|
min = izamin_kernel_16(n1, x, &minf);
|
||||||
ix = n1 * 2;
|
ix = n1 * 2;
|
||||||
i = n1;
|
i = n1;
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
minf = CABS1(x, 0);
|
minf = CABS1(x, 0);
|
||||||
ix += 2;
|
ix += 2;
|
||||||
i++;
|
i++;
|
||||||
}
|
}
|
||||||
|
|
||||||
while(i < n)
|
while (i < n) {
|
||||||
{
|
if (CABS1(x, ix) < minf) {
|
||||||
if( CABS1(x,ix) < minf )
|
|
||||||
{
|
|
||||||
min = i;
|
min = i;
|
||||||
minf = CABS1(x, ix);
|
minf = CABS1(x, ix);
|
||||||
}
|
}
|
||||||
|
|
@ -226,13 +205,35 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||||
min = 0;
|
min = 0;
|
||||||
minf = CABS1(x, 0);
|
minf = CABS1(x, 0);
|
||||||
inc_x2 = 2 * inc_x;
|
inc_x2 = 2 * inc_x;
|
||||||
ix += inc_x2;
|
|
||||||
i++;
|
|
||||||
|
|
||||||
while(i < n)
|
BLASLONG n1 = n & -4;
|
||||||
{
|
while (i < n1) {
|
||||||
if( CABS1(x,ix) < minf )
|
|
||||||
{
|
if (CABS1(x, ix) < minf) {
|
||||||
|
min = i;
|
||||||
|
minf = CABS1(x, ix);
|
||||||
|
}
|
||||||
|
if (CABS1(x, ix + inc_x2) < minf) {
|
||||||
|
min = i + 1;
|
||||||
|
minf = CABS1(x, ix + inc_x2);
|
||||||
|
}
|
||||||
|
if (CABS1(x, ix + 2 * inc_x2) < minf) {
|
||||||
|
min = i + 2;
|
||||||
|
minf = CABS1(x, ix + 2 * inc_x2);
|
||||||
|
}
|
||||||
|
if (CABS1(x, ix + 3 * inc_x2) < minf) {
|
||||||
|
min = i + 3;
|
||||||
|
minf = CABS1(x, ix + 3 * inc_x2);
|
||||||
|
}
|
||||||
|
|
||||||
|
ix += inc_x2 * 4;
|
||||||
|
|
||||||
|
i += 4;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
while (i < n) {
|
||||||
|
if (CABS1(x, ix) < minf) {
|
||||||
min = i;
|
min = i;
|
||||||
minf = CABS1(x, ix);
|
minf = CABS1(x, ix);
|
||||||
}
|
}
|
||||||
|
|
@ -242,5 +243,3 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||||
return (min + 1);
|
return (min + 1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/***************************************************************************
|
/***************************************************************************
|
||||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
modification, are permitted provided that the following conditions are
|
modification, are permitted provided that the following conditions are
|
||||||
|
|
@ -28,40 +28,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
|
|
||||||
#if defined(DOUBLE)
|
|
||||||
#define ABS fabs
|
|
||||||
#else
|
|
||||||
#define ABS fabsf
|
#define ABS fabsf
|
||||||
#endif
|
|
||||||
|
|
||||||
static FLOAT samax_kernel_64(BLASLONG n, FLOAT *x)
|
static FLOAT samax_kernel_64(BLASLONG n, FLOAT *x) {
|
||||||
{
|
|
||||||
FLOAT amax;
|
FLOAT amax;
|
||||||
|
|
||||||
__asm__ volatile (
|
__asm__("vl %%v0,0(%[x])\n\t"
|
||||||
"vl %%v0,0(%2) \n\t"
|
"srlg %[n],%[n],6\n\t"
|
||||||
"srlg %%r0,%1,6 \n\t"
|
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 1, 1024(%%r1,%2) \n\t"
|
"pfd 1, 1024(%%r1,%[x])\n\t"
|
||||||
|
"vl %%v16,0(%%r1,%[x])\n\t"
|
||||||
"vl %%v16,0(%%r1,%2) \n\t"
|
"vl %%v17,16(%%r1,%[x])\n\t"
|
||||||
"vl %%v17,16(%%r1,%2) \n\t"
|
"vl %%v18,32(%%r1,%[x])\n\t"
|
||||||
"vl %%v18,32(%%r1,%2) \n\t"
|
"vl %%v19,48(%%r1,%[x])\n\t"
|
||||||
"vl %%v19,48(%%r1,%2) \n\t"
|
"vl %%v20,64(%%r1,%[x])\n\t"
|
||||||
"vl %%v20,64(%%r1,%2) \n\t"
|
"vl %%v21,80(%%r1,%[x])\n\t"
|
||||||
"vl %%v21,80(%%r1,%2) \n\t"
|
"vl %%v22,96(%%r1,%[x])\n\t"
|
||||||
"vl %%v22,96(%%r1,%2) \n\t"
|
"vl %%v23,112(%%r1,%[x])\n\t"
|
||||||
"vl %%v23,112(%%r1,%2) \n\t"
|
"vl %%v24,128(%%r1,%[x])\n\t"
|
||||||
"vl %%v24,128(%%r1,%2) \n\t"
|
"vl %%v25,144(%%r1,%[x])\n\t"
|
||||||
"vl %%v25,144(%%r1,%2) \n\t"
|
"vl %%v26,160(%%r1,%[x])\n\t"
|
||||||
"vl %%v26,160(%%r1,%2) \n\t"
|
"vl %%v27,176(%%r1,%[x])\n\t"
|
||||||
"vl %%v27,176(%%r1,%2) \n\t"
|
"vl %%v28,192(%%r1,%[x])\n\t"
|
||||||
"vl %%v28,192(%%r1,%2) \n\t"
|
"vl %%v29,208(%%r1,%[x])\n\t"
|
||||||
"vl %%v29,208(%%r1,%2) \n\t"
|
"vl %%v30,224(%%r1,%[x])\n\t"
|
||||||
"vl %%v30,224(%%r1,%2) \n\t"
|
"vl %%v31,240(%%r1,%[x])\n\t"
|
||||||
"vl %%v31,240(%%r1,%2) \n\t"
|
|
||||||
|
|
||||||
"vfmaxsb %%v16,%%v16,%%v24,8\n\t"
|
"vfmaxsb %%v16,%%v16,%%v24,8\n\t"
|
||||||
"vfmaxsb %%v17,%%v17,%%v25,8\n\t"
|
"vfmaxsb %%v17,%%v17,%%v25,8\n\t"
|
||||||
"vfmaxsb %%v18,%%v18,%%v26,8\n\t"
|
"vfmaxsb %%v18,%%v18,%%v26,8\n\t"
|
||||||
|
|
@ -70,32 +62,25 @@ static FLOAT samax_kernel_64(BLASLONG n, FLOAT *x)
|
||||||
"vfmaxsb %%v21,%%v21,%%v29,8\n\t"
|
"vfmaxsb %%v21,%%v21,%%v29,8\n\t"
|
||||||
"vfmaxsb %%v22,%%v22,%%v30,8\n\t"
|
"vfmaxsb %%v22,%%v22,%%v30,8\n\t"
|
||||||
"vfmaxsb %%v23,%%v23,%%v31,8\n\t"
|
"vfmaxsb %%v23,%%v23,%%v31,8\n\t"
|
||||||
|
|
||||||
"vfmaxsb %%v16,%%v16,%%v20,8\n\t"
|
"vfmaxsb %%v16,%%v16,%%v20,8\n\t"
|
||||||
"vfmaxsb %%v17,%%v17,%%v21,8\n\t"
|
"vfmaxsb %%v17,%%v17,%%v21,8\n\t"
|
||||||
"vfmaxsb %%v18,%%v18,%%v22,8\n\t"
|
"vfmaxsb %%v18,%%v18,%%v22,8\n\t"
|
||||||
"vfmaxsb %%v19,%%v19,%%v23,8\n\t"
|
"vfmaxsb %%v19,%%v19,%%v23,8\n\t"
|
||||||
|
|
||||||
"vfmaxsb %%v16,%%v16,%%v18,8\n\t"
|
"vfmaxsb %%v16,%%v16,%%v18,8\n\t"
|
||||||
"vfmaxsb %%v17,%%v17,%%v19,8\n\t"
|
"vfmaxsb %%v17,%%v17,%%v19,8\n\t"
|
||||||
|
|
||||||
"vfmaxsb %%v16,%%v16,%%v17,8\n\t"
|
"vfmaxsb %%v16,%%v16,%%v17,8\n\t"
|
||||||
|
|
||||||
"vfmaxsb %%v0,%%v0,%%v16,8\n\t"
|
"vfmaxsb %%v0,%%v0,%%v16,8\n\t"
|
||||||
|
|
||||||
"agfi %%r1, 256\n\t"
|
"agfi %%r1, 256\n\t"
|
||||||
"brctg %%r0, 0b \n\t"
|
"brctg %[n], 0b\n\t"
|
||||||
|
|
||||||
"veslg %%v16,%%v0,32\n\t"
|
"veslg %%v16,%%v0,32\n\t"
|
||||||
"vfmaxsb %%v0,%%v0,%%v16,8\n\t"
|
"vfmaxsb %%v0,%%v0,%%v16,8\n\t"
|
||||||
|
|
||||||
"vrepf %%v16,%%v0,2\n\t"
|
"vrepf %%v16,%%v0,2\n\t"
|
||||||
"wfmaxsb %%v0,%%v0,%%v16,8\n\t"
|
"wfmaxsb %%v0,%%v0,%%v16,8\n\t"
|
||||||
"lper %0,%%f0 "
|
"lper %[amax],%%f0"
|
||||||
:"=f"(amax)
|
: [amax] "=f"(amax),[n] "+&r"(n)
|
||||||
:"r"(n),"ZR"((const FLOAT (*)[n])x)
|
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
|
||||||
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
|
||||||
);
|
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
|
||||||
|
|
||||||
return amax;
|
return amax;
|
||||||
}
|
}
|
||||||
|
|
@ -105,7 +90,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
BLASLONG j = 0;
|
BLASLONG j = 0;
|
||||||
FLOAT maxf = 0.0;
|
FLOAT maxf = 0.0;
|
||||||
|
|
||||||
if (n <= 0 || inc_x <= 0) return (maxf);
|
if (n <= 0 || inc_x <= 0)
|
||||||
|
return (maxf);
|
||||||
|
|
||||||
if (inc_x == 1) {
|
if (inc_x == 1) {
|
||||||
|
|
||||||
|
|
@ -115,9 +101,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
maxf = samax_kernel_64(n1, x);
|
maxf = samax_kernel_64(n1, x);
|
||||||
|
|
||||||
i = n1;
|
i = n1;
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
maxf = ABS(x[0]);
|
maxf = ABS(x[0]);
|
||||||
i++;
|
i++;
|
||||||
}
|
}
|
||||||
|
|
@ -156,7 +140,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
while (j < n) {
|
while (j < n) {
|
||||||
if (ABS(x[i]) > maxf) {
|
if (ABS(x[i]) > maxf) {
|
||||||
maxf = ABS(x[i]);
|
maxf = ABS(x[i]);
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/***************************************************************************
|
/***************************************************************************
|
||||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
modification, are permitted provided that the following conditions are
|
modification, are permitted provided that the following conditions are
|
||||||
|
|
@ -28,40 +28,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
|
|
||||||
#if defined(DOUBLE)
|
|
||||||
#define ABS fabs
|
|
||||||
#else
|
|
||||||
#define ABS fabsf
|
#define ABS fabsf
|
||||||
#endif
|
|
||||||
|
|
||||||
static FLOAT samin_kernel_64(BLASLONG n, FLOAT *x)
|
static FLOAT samin_kernel_64(BLASLONG n, FLOAT *x) {
|
||||||
{
|
|
||||||
FLOAT amin;
|
FLOAT amin;
|
||||||
|
|
||||||
__asm__ volatile (
|
__asm__("vl %%v0,0(%[x])\n\t"
|
||||||
"vl %%v0,0(%2) \n\t"
|
"srlg %[n],%[n],6\n\t"
|
||||||
"srlg %%r0,%1,6 \n\t"
|
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 1, 1024(%%r1,%2) \n\t"
|
"pfd 1, 1024(%%r1,%[x])\n\t"
|
||||||
|
"vl %%v16,0(%%r1,%[x])\n\t"
|
||||||
"vl %%v16,0(%%r1,%2) \n\t"
|
"vl %%v17,16(%%r1,%[x])\n\t"
|
||||||
"vl %%v17,16(%%r1,%2) \n\t"
|
"vl %%v18,32(%%r1,%[x])\n\t"
|
||||||
"vl %%v18,32(%%r1,%2) \n\t"
|
"vl %%v19,48(%%r1,%[x])\n\t"
|
||||||
"vl %%v19,48(%%r1,%2) \n\t"
|
"vl %%v20,64(%%r1,%[x])\n\t"
|
||||||
"vl %%v20,64(%%r1,%2) \n\t"
|
"vl %%v21,80(%%r1,%[x])\n\t"
|
||||||
"vl %%v21,80(%%r1,%2) \n\t"
|
"vl %%v22,96(%%r1,%[x])\n\t"
|
||||||
"vl %%v22,96(%%r1,%2) \n\t"
|
"vl %%v23,112(%%r1,%[x])\n\t"
|
||||||
"vl %%v23,112(%%r1,%2) \n\t"
|
"vl %%v24,128(%%r1,%[x])\n\t"
|
||||||
"vl %%v24,128(%%r1,%2) \n\t"
|
"vl %%v25,144(%%r1,%[x])\n\t"
|
||||||
"vl %%v25,144(%%r1,%2) \n\t"
|
"vl %%v26,160(%%r1,%[x])\n\t"
|
||||||
"vl %%v26,160(%%r1,%2) \n\t"
|
"vl %%v27,176(%%r1,%[x])\n\t"
|
||||||
"vl %%v27,176(%%r1,%2) \n\t"
|
"vl %%v28,192(%%r1,%[x])\n\t"
|
||||||
"vl %%v28,192(%%r1,%2) \n\t"
|
"vl %%v29,208(%%r1,%[x])\n\t"
|
||||||
"vl %%v29,208(%%r1,%2) \n\t"
|
"vl %%v30,224(%%r1,%[x])\n\t"
|
||||||
"vl %%v30,224(%%r1,%2) \n\t"
|
"vl %%v31,240(%%r1,%[x])\n\t"
|
||||||
"vl %%v31,240(%%r1,%2) \n\t"
|
|
||||||
|
|
||||||
"vfminsb %%v16,%%v16,%%v24,8\n\t"
|
"vfminsb %%v16,%%v16,%%v24,8\n\t"
|
||||||
"vfminsb %%v17,%%v17,%%v25,8\n\t"
|
"vfminsb %%v17,%%v17,%%v25,8\n\t"
|
||||||
"vfminsb %%v18,%%v18,%%v26,8\n\t"
|
"vfminsb %%v18,%%v18,%%v26,8\n\t"
|
||||||
|
|
@ -70,32 +62,25 @@ static FLOAT samin_kernel_64(BLASLONG n, FLOAT *x)
|
||||||
"vfminsb %%v21,%%v21,%%v29,8\n\t"
|
"vfminsb %%v21,%%v21,%%v29,8\n\t"
|
||||||
"vfminsb %%v22,%%v22,%%v30,8\n\t"
|
"vfminsb %%v22,%%v22,%%v30,8\n\t"
|
||||||
"vfminsb %%v23,%%v23,%%v31,8\n\t"
|
"vfminsb %%v23,%%v23,%%v31,8\n\t"
|
||||||
|
|
||||||
"vfminsb %%v16,%%v16,%%v20,8\n\t"
|
"vfminsb %%v16,%%v16,%%v20,8\n\t"
|
||||||
"vfminsb %%v17,%%v17,%%v21,8\n\t"
|
"vfminsb %%v17,%%v17,%%v21,8\n\t"
|
||||||
"vfminsb %%v18,%%v18,%%v22,8\n\t"
|
"vfminsb %%v18,%%v18,%%v22,8\n\t"
|
||||||
"vfminsb %%v19,%%v19,%%v23,8\n\t"
|
"vfminsb %%v19,%%v19,%%v23,8\n\t"
|
||||||
|
|
||||||
"vfminsb %%v16,%%v16,%%v18,8\n\t"
|
"vfminsb %%v16,%%v16,%%v18,8\n\t"
|
||||||
"vfminsb %%v17,%%v17,%%v19,8\n\t"
|
"vfminsb %%v17,%%v17,%%v19,8\n\t"
|
||||||
|
|
||||||
"vfminsb %%v16,%%v16,%%v17,8\n\t"
|
"vfminsb %%v16,%%v16,%%v17,8\n\t"
|
||||||
|
|
||||||
"vfminsb %%v0,%%v0,%%v16,8\n\t"
|
"vfminsb %%v0,%%v0,%%v16,8\n\t"
|
||||||
|
|
||||||
"agfi %%r1, 256\n\t"
|
"agfi %%r1, 256\n\t"
|
||||||
"brctg %%r0, 0b \n\t"
|
"brctg %[n], 0b\n\t"
|
||||||
|
|
||||||
"veslg %%v16,%%v0,32\n\t"
|
"veslg %%v16,%%v0,32\n\t"
|
||||||
"vfminsb %%v0,%%v0,%%v16,8\n\t"
|
"vfminsb %%v0,%%v0,%%v16,8\n\t"
|
||||||
|
|
||||||
"vrepf %%v16,%%v0,2\n\t"
|
"vrepf %%v16,%%v0,2\n\t"
|
||||||
"wfminsb %%v0,%%v0,%%v16,8\n\t"
|
"wfminsb %%v0,%%v0,%%v16,8\n\t"
|
||||||
"lper %0,%%f0 "
|
"lper %[amin],%%f0"
|
||||||
:"=f"(amin)
|
: [amin] "=f"(amin),[n] "+&r"(n)
|
||||||
:"r"(n),"ZR"((const FLOAT (*)[n])x)
|
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
|
||||||
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
|
||||||
);
|
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
|
||||||
|
|
||||||
return amin;
|
return amin;
|
||||||
}
|
}
|
||||||
|
|
@ -105,7 +90,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
BLASLONG j = 0;
|
BLASLONG j = 0;
|
||||||
FLOAT minf = 0.0;
|
FLOAT minf = 0.0;
|
||||||
|
|
||||||
if (n <= 0 || inc_x <= 0) return (minf);
|
if (n <= 0 || inc_x <= 0)
|
||||||
|
return (minf);
|
||||||
|
|
||||||
if (inc_x == 1) {
|
if (inc_x == 1) {
|
||||||
|
|
||||||
|
|
@ -115,9 +101,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
minf = samin_kernel_64(n1, x);
|
minf = samin_kernel_64(n1, x);
|
||||||
|
|
||||||
i = n1;
|
i = n1;
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
minf = ABS(x[0]);
|
minf = ABS(x[0]);
|
||||||
i++;
|
i++;
|
||||||
}
|
}
|
||||||
|
|
@ -156,7 +140,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
while (j < n) {
|
while (j < n) {
|
||||||
if (ABS(x[i]) < minf) {
|
if (ABS(x[i]) < minf) {
|
||||||
minf = ABS(x[i]);
|
minf = ABS(x[i]);
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/***************************************************************************
|
/***************************************************************************
|
||||||
Copyright (c) 2013-2018, The OpenBLAS Project
|
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
modification, are permitted provided that the following conditions are
|
modification, are permitted provided that the following conditions are
|
||||||
|
|
@ -28,34 +28,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
|
|
||||||
#if defined(DOUBLE)
|
|
||||||
#define ABS fabs
|
|
||||||
#else
|
|
||||||
#define ABS fabsf
|
#define ABS fabsf
|
||||||
#endif
|
|
||||||
|
|
||||||
static FLOAT sasum_kernel_64(BLASLONG n, FLOAT *x)
|
static FLOAT sasum_kernel_64(BLASLONG n, FLOAT *x) {
|
||||||
{
|
|
||||||
FLOAT asum;
|
FLOAT asum;
|
||||||
|
|
||||||
__asm__ (
|
__asm__("vzero %%v24\n\t"
|
||||||
"vzero %%v0 \n\t"
|
"vzero %%v25\n\t"
|
||||||
"vzero %%v1 \n\t"
|
"vzero %%v26\n\t"
|
||||||
"vzero %%v2 \n\t"
|
"vzero %%v27\n\t"
|
||||||
"vzero %%v3 \n\t"
|
"vzero %%v28\n\t"
|
||||||
"srlg %%r0,%1,6 \n\t"
|
"vzero %%v29\n\t"
|
||||||
|
"vzero %%v30\n\t"
|
||||||
|
"vzero %%v31\n\t"
|
||||||
|
"srlg %[n],%[n],6\n\t"
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 1, 1024(%%r1,%2) \n\t"
|
"pfd 1, 1024(%%r1,%[x])\n\t"
|
||||||
"vl %%v16, 0(%%r1,%2) \n\t"
|
"vl %%v16, 0(%%r1,%[x])\n\t"
|
||||||
"vl %%v17, 16(%%r1,%2) \n\t"
|
"vl %%v17, 16(%%r1,%[x])\n\t"
|
||||||
"vl %%v18, 32(%%r1,%2) \n\t"
|
"vl %%v18, 32(%%r1,%[x])\n\t"
|
||||||
"vl %%v19, 48(%%r1,%2) \n\t"
|
"vl %%v19, 48(%%r1,%[x])\n\t"
|
||||||
"vl %%v20, 64(%%r1,%2) \n\t"
|
"vl %%v20, 64(%%r1,%[x])\n\t"
|
||||||
"vl %%v21, 80(%%r1,%2) \n\t"
|
"vl %%v21, 80(%%r1,%[x])\n\t"
|
||||||
"vl %%v22, 96(%%r1,%2) \n\t"
|
"vl %%v22, 96(%%r1,%[x])\n\t"
|
||||||
"vl %%v23, 112(%%r1,%2) \n\t"
|
"vl %%v23, 112(%%r1,%[x])\n\t"
|
||||||
|
|
||||||
"vflpsb %%v16, %%v16\n\t"
|
"vflpsb %%v16, %%v16\n\t"
|
||||||
"vflpsb %%v17, %%v17\n\t"
|
"vflpsb %%v17, %%v17\n\t"
|
||||||
"vflpsb %%v18, %%v18\n\t"
|
"vflpsb %%v18, %%v18\n\t"
|
||||||
|
|
@ -64,25 +61,22 @@ static FLOAT sasum_kernel_64(BLASLONG n, FLOAT *x)
|
||||||
"vflpsb %%v21, %%v21\n\t"
|
"vflpsb %%v21, %%v21\n\t"
|
||||||
"vflpsb %%v22, %%v22\n\t"
|
"vflpsb %%v22, %%v22\n\t"
|
||||||
"vflpsb %%v23, %%v23\n\t"
|
"vflpsb %%v23, %%v23\n\t"
|
||||||
|
"vfasb %%v24,%%v24,%%v16\n\t"
|
||||||
"vfasb %%v0,%%v0,%%v16 \n\t"
|
"vfasb %%v25,%%v25,%%v17\n\t"
|
||||||
"vfasb %%v1,%%v1,%%v17 \n\t"
|
"vfasb %%v26,%%v26,%%v18\n\t"
|
||||||
"vfasb %%v2,%%v2,%%v18 \n\t"
|
"vfasb %%v27,%%v27,%%v19\n\t"
|
||||||
"vfasb %%v3,%%v3,%%v19 \n\t"
|
"vfasb %%v28,%%v28,%%v20\n\t"
|
||||||
"vfasb %%v0,%%v0,%%v20 \n\t"
|
"vfasb %%v29,%%v29,%%v21\n\t"
|
||||||
"vfasb %%v1,%%v1,%%v21 \n\t"
|
"vfasb %%v30,%%v30,%%v22\n\t"
|
||||||
"vfasb %%v2,%%v2,%%v22 \n\t"
|
"vfasb %%v31,%%v31,%%v23\n\t"
|
||||||
"vfasb %%v3,%%v3,%%v23 \n\t"
|
"vl %%v16, 128(%%r1,%[x])\n\t"
|
||||||
|
"vl %%v17, 144(%%r1,%[x])\n\t"
|
||||||
"vl %%v16, 128(%%r1,%2) \n\t"
|
"vl %%v18, 160(%%r1,%[x])\n\t"
|
||||||
"vl %%v17, 144(%%r1,%2) \n\t"
|
"vl %%v19, 176(%%r1,%[x])\n\t"
|
||||||
"vl %%v18, 160(%%r1,%2) \n\t"
|
"vl %%v20, 192(%%r1,%[x])\n\t"
|
||||||
"vl %%v19, 176(%%r1,%2) \n\t"
|
"vl %%v21, 208(%%r1,%[x])\n\t"
|
||||||
"vl %%v20, 192(%%r1,%2) \n\t"
|
"vl %%v22, 224(%%r1,%[x])\n\t"
|
||||||
"vl %%v21, 208(%%r1,%2) \n\t"
|
"vl %%v23, 240(%%r1,%[x])\n\t"
|
||||||
"vl %%v22, 224(%%r1,%2) \n\t"
|
|
||||||
"vl %%v23, 240(%%r1,%2) \n\t"
|
|
||||||
|
|
||||||
"vflpsb %%v16, %%v16\n\t"
|
"vflpsb %%v16, %%v16\n\t"
|
||||||
"vflpsb %%v17, %%v17\n\t"
|
"vflpsb %%v17, %%v17\n\t"
|
||||||
"vflpsb %%v18, %%v18\n\t"
|
"vflpsb %%v18, %%v18\n\t"
|
||||||
|
|
@ -91,30 +85,32 @@ static FLOAT sasum_kernel_64(BLASLONG n, FLOAT *x)
|
||||||
"vflpsb %%v21, %%v21\n\t"
|
"vflpsb %%v21, %%v21\n\t"
|
||||||
"vflpsb %%v22, %%v22\n\t"
|
"vflpsb %%v22, %%v22\n\t"
|
||||||
"vflpsb %%v23, %%v23\n\t"
|
"vflpsb %%v23, %%v23\n\t"
|
||||||
|
"vfasb %%v24,%%v24,%%v16\n\t"
|
||||||
"vfasb %%v0,%%v0,%%v16 \n\t"
|
"vfasb %%v25,%%v25,%%v17\n\t"
|
||||||
"vfasb %%v1,%%v1,%%v17 \n\t"
|
"vfasb %%v26,%%v26,%%v18\n\t"
|
||||||
"vfasb %%v2,%%v2,%%v18 \n\t"
|
"vfasb %%v27,%%v27,%%v19\n\t"
|
||||||
"vfasb %%v3,%%v3,%%v19 \n\t"
|
"vfasb %%v28,%%v28,%%v20\n\t"
|
||||||
"vfasb %%v0,%%v0,%%v20 \n\t"
|
"vfasb %%v29,%%v29,%%v21\n\t"
|
||||||
"vfasb %%v1,%%v1,%%v21 \n\t"
|
"vfasb %%v30,%%v30,%%v22\n\t"
|
||||||
"vfasb %%v2,%%v2,%%v22 \n\t"
|
"vfasb %%v31,%%v31,%%v23\n\t"
|
||||||
"vfasb %%v3,%%v3,%%v23 \n\t"
|
|
||||||
|
|
||||||
"agfi %%r1,256\n\t"
|
"agfi %%r1,256\n\t"
|
||||||
"brctg %%r0,0b \n\t"
|
"brctg %[n],0b\n\t"
|
||||||
"vfasb %%v0,%%v0,%%v1 \n\t"
|
"vfasb %%v24,%%v24,%%v25\n\t"
|
||||||
"vfasb %%v0,%%v0,%%v2 \n\t"
|
"vfasb %%v24,%%v24,%%v26\n\t"
|
||||||
"vfasb %%v0,%%v0,%%v3 \n\t"
|
"vfasb %%v24,%%v24,%%v27\n\t"
|
||||||
"veslg %%v1,%%v0,32 \n\t"
|
"vfasb %%v24,%%v24,%%v28\n\t"
|
||||||
"vfasb %%v0,%%v0,%%v1 \n\t"
|
"vfasb %%v24,%%v24,%%v29\n\t"
|
||||||
"vrepf %%v1,%%v0,2 \n\t"
|
"vfasb %%v24,%%v24,%%v30\n\t"
|
||||||
"aebr %%f0,%%f1 \n\t"
|
"vfasb %%v24,%%v24,%%v31\n\t"
|
||||||
"ler %0,%%f0 "
|
"veslg %%v25,%%v24,32\n\t"
|
||||||
:"=f"(asum)
|
"vfasb %%v24,%%v24,%%v25\n\t"
|
||||||
:"r"(n),"ZR"((const FLOAT (*)[n])x)
|
"vrepf %%v25,%%v24,2\n\t"
|
||||||
:"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23"
|
"vfasb %%v24,%%v24,%%v25\n\t"
|
||||||
);
|
"vstef %%v24,%[asum],0"
|
||||||
|
: [asum] "=Q"(asum),[n] "+&r"(n)
|
||||||
|
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
|
||||||
|
: "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
|
||||||
|
"v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
|
||||||
|
|
||||||
return asum;
|
return asum;
|
||||||
}
|
}
|
||||||
|
|
@ -125,7 +121,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
FLOAT sumf = 0.0;
|
FLOAT sumf = 0.0;
|
||||||
BLASLONG n1;
|
BLASLONG n1;
|
||||||
|
|
||||||
if (n <= 0 || inc_x <= 0) return sumf;
|
if (n <= 0 || inc_x <= 0)
|
||||||
|
return sumf;
|
||||||
|
|
||||||
if (inc_x == 1) {
|
if (inc_x == 1) {
|
||||||
|
|
||||||
|
|
@ -166,9 +163,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
j++;
|
j++;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
return sumf;
|
return sumf;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/***************************************************************************
|
/***************************************************************************
|
||||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
modification, are permitted provided that the following conditions are
|
modification, are permitted provided that the following conditions are
|
||||||
|
|
@ -27,107 +27,96 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) {
|
||||||
{
|
__asm__("vlrepf %%v0,%[alpha]\n\t"
|
||||||
__asm__ volatile(
|
"srlg %[n],%[n],6\n\t"
|
||||||
"vlrepf %%v0,%3 \n\t"
|
|
||||||
"srlg %%r0,%0,6 \n\t"
|
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 1, 1024(%%r1,%1) \n\t"
|
"pfd 1, 1024(%%r1,%[x])\n\t"
|
||||||
"pfd 2, 1024(%%r1,%2) \n\t"
|
"pfd 2, 1024(%%r1,%[y])\n\t"
|
||||||
|
"vl %%v16,0(%%r1,%[x])\n\t"
|
||||||
"vl %%v16,0(%%r1,%1) \n\t"
|
"vl %%v17,16(%%r1,%[x])\n\t"
|
||||||
"vl %%v17,16(%%r1,%1) \n\t"
|
"vl %%v18,32(%%r1,%[x])\n\t"
|
||||||
"vl %%v18,32(%%r1,%1) \n\t"
|
"vl %%v19,48(%%r1,%[x])\n\t"
|
||||||
"vl %%v19,48(%%r1,%1) \n\t"
|
"vl %%v20,0(%%r1,%[y])\n\t"
|
||||||
"vl %%v20,0(%%r1,%2) \n\t"
|
"vl %%v21,16(%%r1,%[y])\n\t"
|
||||||
"vl %%v21,16(%%r1,%2) \n\t"
|
"vl %%v22,32(%%r1,%[y])\n\t"
|
||||||
"vl %%v22,32(%%r1,%2) \n\t"
|
"vl %%v23,48(%%r1,%[y])\n\t"
|
||||||
"vl %%v23,48(%%r1,%2) \n\t"
|
"vl %%v24,64(%%r1,%[x])\n\t"
|
||||||
|
"vl %%v25,80(%%r1,%[x])\n\t"
|
||||||
|
"vl %%v26,96(%%r1,%[x])\n\t"
|
||||||
|
"vl %%v27,112(%%r1,%[x])\n\t"
|
||||||
|
"vl %%v28,64(%%r1,%[y])\n\t"
|
||||||
|
"vl %%v29,80(%%r1,%[y])\n\t"
|
||||||
|
"vl %%v30,96(%%r1,%[y])\n\t"
|
||||||
|
"vl %%v31,112(%%r1,%[y])\n\t"
|
||||||
"vfmasb %%v16,%%v0,%%v16,%%v20\n\t"
|
"vfmasb %%v16,%%v0,%%v16,%%v20\n\t"
|
||||||
"vfmasb %%v17,%%v0,%%v17,%%v21\n\t"
|
"vfmasb %%v17,%%v0,%%v17,%%v21\n\t"
|
||||||
"vfmasb %%v18,%%v0,%%v18,%%v22\n\t"
|
"vfmasb %%v18,%%v0,%%v18,%%v22\n\t"
|
||||||
"vfmasb %%v19,%%v0,%%v19,%%v23\n\t"
|
"vfmasb %%v19,%%v0,%%v19,%%v23\n\t"
|
||||||
|
"vfmasb %%v24,%%v0,%%v24,%%v28\n\t"
|
||||||
"vl %%v24,64(%%r1,%1) \n\t"
|
"vfmasb %%v25,%%v0,%%v25,%%v29\n\t"
|
||||||
"vl %%v25,80(%%r1,%1) \n\t"
|
"vfmasb %%v26,%%v0,%%v26,%%v30\n\t"
|
||||||
"vl %%v26,96(%%r1,%1) \n\t"
|
"vfmasb %%v27,%%v0,%%v27,%%v31\n\t"
|
||||||
"vl %%v27,112(%%r1,%1) \n\t"
|
"vst %%v16,0(%%r1,%[y])\n\t"
|
||||||
"vl %%v28,64(%%r1,%2) \n\t"
|
"vst %%v17,16(%%r1,%[y])\n\t"
|
||||||
"vl %%v29,80(%%r1,%2) \n\t"
|
"vst %%v18,32(%%r1,%[y])\n\t"
|
||||||
"vl %%v30,96(%%r1,%2) \n\t"
|
"vst %%v19,48(%%r1,%[y])\n\t"
|
||||||
"vl %%v31,112(%%r1,%2) \n\t"
|
"vst %%v24,64(%%r1,%[y])\n\t"
|
||||||
|
"vst %%v25,80(%%r1,%[y])\n\t"
|
||||||
"vfmasb %%v20,%%v0,%%v24,%%v28 \n\t"
|
"vst %%v26,96(%%r1,%[y])\n\t"
|
||||||
"vfmasb %%v21,%%v0,%%v25,%%v29 \n\t"
|
"vst %%v27,112(%%r1,%[y])\n\t"
|
||||||
"vfmasb %%v22,%%v0,%%v26,%%v30 \n\t"
|
"vl %%v16,128(%%r1,%[x])\n\t"
|
||||||
"vfmasb %%v23,%%v0,%%v27,%%v31 \n\t"
|
"vl %%v17,144(%%r1,%[x])\n\t"
|
||||||
|
"vl %%v18,160(%%r1,%[x])\n\t"
|
||||||
"vst %%v16,0(%%r1,%2) \n\t"
|
"vl %%v19,176(%%r1,%[x])\n\t"
|
||||||
"vst %%v17,16(%%r1,%2) \n\t"
|
"vl %%v20,128(%%r1,%[y])\n\t"
|
||||||
"vst %%v18,32(%%r1,%2) \n\t"
|
"vl %%v21,144(%%r1,%[y])\n\t"
|
||||||
"vst %%v19,48(%%r1,%2) \n\t"
|
"vl %%v22,160(%%r1,%[y])\n\t"
|
||||||
"vst %%v20,64(%%r1,%2) \n\t"
|
"vl %%v23,176(%%r1,%[y])\n\t"
|
||||||
"vst %%v21,80(%%r1,%2) \n\t"
|
"vl %%v24,192(%%r1,%[x])\n\t"
|
||||||
"vst %%v22,96(%%r1,%2) \n\t"
|
"vl %%v25,208(%%r1,%[x])\n\t"
|
||||||
"vst %%v23,112(%%r1,%2) \n\t"
|
"vl %%v26,224(%%r1,%[x])\n\t"
|
||||||
|
"vl %%v27,240(%%r1,%[x])\n\t"
|
||||||
"vl %%v16,128(%%r1,%1) \n\t"
|
"vl %%v28,192(%%r1,%[y])\n\t"
|
||||||
"vl %%v17,144(%%r1,%1) \n\t"
|
"vl %%v29,208(%%r1,%[y])\n\t"
|
||||||
"vl %%v18,160(%%r1,%1) \n\t"
|
"vl %%v30,224(%%r1,%[y])\n\t"
|
||||||
"vl %%v19,176(%%r1,%1) \n\t"
|
"vl %%v31,240(%%r1,%[y])\n\t"
|
||||||
"vl %%v20,128(%%r1,%2) \n\t"
|
|
||||||
"vl %%v21,144(%%r1,%2) \n\t"
|
|
||||||
"vl %%v22,160(%%r1,%2) \n\t"
|
|
||||||
"vl %%v23,176(%%r1,%2) \n\t"
|
|
||||||
|
|
||||||
"vfmasb %%v16,%%v0,%%v16,%%v20\n\t"
|
"vfmasb %%v16,%%v0,%%v16,%%v20\n\t"
|
||||||
"vfmasb %%v17,%%v0,%%v17,%%v21\n\t"
|
"vfmasb %%v17,%%v0,%%v17,%%v21\n\t"
|
||||||
"vfmasb %%v18,%%v0,%%v18,%%v22\n\t"
|
"vfmasb %%v18,%%v0,%%v18,%%v22\n\t"
|
||||||
"vfmasb %%v19,%%v0,%%v19,%%v23\n\t"
|
"vfmasb %%v19,%%v0,%%v19,%%v23\n\t"
|
||||||
|
"vfmasb %%v24,%%v0,%%v24,%%v28\n\t"
|
||||||
"vl %%v24,192(%%r1,%1) \n\t"
|
"vfmasb %%v25,%%v0,%%v25,%%v29\n\t"
|
||||||
"vl %%v25,208(%%r1,%1) \n\t"
|
"vfmasb %%v26,%%v0,%%v26,%%v30\n\t"
|
||||||
"vl %%v26,224(%%r1,%1) \n\t"
|
"vfmasb %%v27,%%v0,%%v27,%%v31\n\t"
|
||||||
"vl %%v27,240(%%r1,%1) \n\t"
|
"vst %%v16,128(%%r1,%[y])\n\t"
|
||||||
"vl %%v28,192(%%r1,%2) \n\t"
|
"vst %%v17,144(%%r1,%[y])\n\t"
|
||||||
"vl %%v29,208(%%r1,%2) \n\t"
|
"vst %%v18,160(%%r1,%[y])\n\t"
|
||||||
"vl %%v30,224(%%r1,%2) \n\t"
|
"vst %%v19,176(%%r1,%[y])\n\t"
|
||||||
"vl %%v31,240(%%r1,%2) \n\t"
|
"vst %%v24,192(%%r1,%[y])\n\t"
|
||||||
|
"vst %%v25,208(%%r1,%[y])\n\t"
|
||||||
"vfmasb %%v20,%%v0,%%v24,%%v28 \n\t"
|
"vst %%v26,224(%%r1,%[y])\n\t"
|
||||||
"vfmasb %%v21,%%v0,%%v25,%%v29 \n\t"
|
"vst %%v27,240(%%r1,%[y])\n\t"
|
||||||
"vfmasb %%v22,%%v0,%%v26,%%v30 \n\t"
|
|
||||||
"vfmasb %%v23,%%v0,%%v27,%%v31 \n\t"
|
|
||||||
|
|
||||||
"vst %%v16,128(%%r1,%2) \n\t"
|
|
||||||
"vst %%v17,144(%%r1,%2) \n\t"
|
|
||||||
"vst %%v18,160(%%r1,%2) \n\t"
|
|
||||||
"vst %%v19,176(%%r1,%2) \n\t"
|
|
||||||
"vst %%v20,192(%%r1,%2) \n\t"
|
|
||||||
"vst %%v21,208(%%r1,%2) \n\t"
|
|
||||||
"vst %%v22,224(%%r1,%2) \n\t"
|
|
||||||
"vst %%v23,240(%%r1,%2) \n\t"
|
|
||||||
|
|
||||||
"agfi %%r1,256\n\t"
|
"agfi %%r1,256\n\t"
|
||||||
"brctg %%r0,0b "
|
"brctg %[n],0b"
|
||||||
:
|
: "+m"(*(struct { FLOAT x[n]; } *) y),[n] "+&r"(n)
|
||||||
:"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y),"m"(*alpha)
|
: [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),
|
||||||
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
[alpha] "Q"(*alpha)
|
||||||
);
|
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
|
||||||
|
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
|
||||||
}
|
}
|
||||||
|
|
||||||
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
|
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x,
|
||||||
{
|
BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
|
||||||
|
BLASLONG dummy2) {
|
||||||
BLASLONG i = 0;
|
BLASLONG i = 0;
|
||||||
BLASLONG ix = 0, iy = 0;
|
BLASLONG ix = 0, iy = 0;
|
||||||
|
|
||||||
if ( n <= 0 ) return 0 ;
|
if (n <= 0)
|
||||||
|
return 0;
|
||||||
|
|
||||||
if ( (inc_x == 1) && (inc_y == 1) )
|
if ((inc_x == 1) && (inc_y == 1)) {
|
||||||
{
|
|
||||||
|
|
||||||
BLASLONG n1 = n & -64;
|
BLASLONG n1 = n & -64;
|
||||||
|
|
||||||
|
|
@ -135,8 +124,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
|
||||||
saxpy_kernel_64(n1, x, y, &da);
|
saxpy_kernel_64(n1, x, y, &da);
|
||||||
|
|
||||||
i = n1;
|
i = n1;
|
||||||
while(i < n)
|
while (i < n) {
|
||||||
{
|
|
||||||
|
|
||||||
y[i] += da * x[i];
|
y[i] += da * x[i];
|
||||||
i++;
|
i++;
|
||||||
|
|
@ -144,13 +132,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
|
||||||
}
|
}
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
BLASLONG n1 = n & -4;
|
BLASLONG n1 = n & -4;
|
||||||
|
|
||||||
while(i < n1)
|
while (i < n1) {
|
||||||
{
|
|
||||||
|
|
||||||
FLOAT m1 = da * x[ix];
|
FLOAT m1 = da * x[ix];
|
||||||
FLOAT m2 = da * x[ix + inc_x];
|
FLOAT m2 = da * x[ix + inc_x];
|
||||||
|
|
@ -168,8 +154,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
while(i < n)
|
while (i < n) {
|
||||||
{
|
|
||||||
|
|
||||||
y[iy] += da * x[ix];
|
y[iy] += da * x[ix];
|
||||||
ix += inc_x;
|
ix += inc_x;
|
||||||
|
|
@ -180,5 +165,3 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/***************************************************************************
|
/***************************************************************************
|
||||||
Copyright (c) 2013-2018, The OpenBLAS Project
|
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
modification, are permitted provided that the following conditions are
|
modification, are permitted provided that the following conditions are
|
||||||
|
|
@ -27,30 +27,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
static void scopy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y)
|
static void scopy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y) {
|
||||||
{
|
__asm__("srlg %[n],%[n],6\n\t"
|
||||||
__asm__ volatile (
|
|
||||||
"lgr %%r1,%1 \n\t"
|
|
||||||
"lgr %%r2,%2 \n\t"
|
|
||||||
"srlg %%r0,%0,6 \n\t"
|
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 1, 1024(%%r1) \n\t"
|
"pfd 1, 1024(%[x])\n\t"
|
||||||
"pfd 2, 1024(%%r2) \n\t"
|
"pfd 2, 1024(%[y])\n\t"
|
||||||
"mvc 0(256,%%r2),0(%%r1) \n\t"
|
"mvc 0(256,%[y]),0(%[x])\n\t"
|
||||||
"agfi %%r1,256 \n\t"
|
"la %[x],256(%[x])\n\t"
|
||||||
"agfi %%r2,256 \n\t"
|
"la %[y],256(%[y])\n\t"
|
||||||
"brctg %%r0,0b "
|
"brctg %[n],0b"
|
||||||
:
|
: "=m"(*(struct { FLOAT x[n]; } *) y),[x] "+&a"(x),[y] "+&a"(y),[n] "+&r"(n)
|
||||||
:"r"(n),"a"((const FLOAT (*)[n])x),"a"((FLOAT (*)[n])y)
|
: "m"(*(const struct { FLOAT x[n]; } *) x)
|
||||||
:"memory","cc","r0","r1","r2"
|
: "cc");
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
|
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
|
||||||
BLASLONG i = 0;
|
BLASLONG i = 0;
|
||||||
BLASLONG ix = 0, iy = 0;
|
BLASLONG ix = 0, iy = 0;
|
||||||
|
|
||||||
if (n <= 0) return 0;
|
if (n <= 0)
|
||||||
|
return 0;
|
||||||
|
|
||||||
if ((inc_x == 1) && (inc_y == 1)) {
|
if ((inc_x == 1) && (inc_y == 1)) {
|
||||||
|
|
||||||
|
|
@ -66,7 +62,6 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
|
|
||||||
while (i < n) {
|
while (i < n) {
|
||||||
|
|
@ -81,5 +76,4 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
|
||||||
}
|
}
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/***************************************************************************
|
/***************************************************************************
|
||||||
Copyright (c) 2013-2018,The OpenBLAS Project
|
Copyright (c) 2013-2019,The OpenBLAS Project
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
Redistribution and use in source and binary forms,with or without
|
Redistribution and use in source and binary forms,with or without
|
||||||
modification,are permitted provided that the following conditions are
|
modification,are permitted provided that the following conditions are
|
||||||
|
|
@ -27,72 +27,82 @@ USE OF THIS SOFTWARE,EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
static FLOAT sdot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
|
static FLOAT sdot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) {
|
||||||
{
|
|
||||||
FLOAT dot;
|
FLOAT dot;
|
||||||
|
|
||||||
__asm__ volatile (
|
__asm__("vzero %%v0\n\t"
|
||||||
"vzero %%v0 \n\t"
|
"vzero %%v1\n\t"
|
||||||
"srlg %%r0,%1,5 \n\t"
|
"vzero %%v2\n\t"
|
||||||
|
"vzero %%v3\n\t"
|
||||||
|
"vzero %%v4\n\t"
|
||||||
|
"vzero %%v5\n\t"
|
||||||
|
"vzero %%v6\n\t"
|
||||||
|
"vzero %%v7\n\t"
|
||||||
|
"srlg %[n],%[n],5\n\t"
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 1,1024(%%r1,%2) \n\t"
|
"pfd 1,1024(%%r1,%[x])\n\t"
|
||||||
"pfd 1,1024(%%r1,%3) \n\t"
|
"pfd 1,1024(%%r1,%[y])\n\t"
|
||||||
|
"vl %%v16,0(%%r1,%[x])\n\t"
|
||||||
"vl %%v16,0(%%r1,%2) \n\t"
|
"vl %%v17,16(%%r1,%[x])\n\t"
|
||||||
"vl %%v17,16(%%r1,%2) \n\t"
|
"vl %%v18,32(%%r1,%[x])\n\t"
|
||||||
"vl %%v18,32(%%r1,%2) \n\t"
|
"vl %%v19,48(%%r1,%[x])\n\t"
|
||||||
"vl %%v19,48(%%r1,%2) \n\t"
|
"vl %%v20,64(%%r1,%[x])\n\t"
|
||||||
"vl %%v20,64(%%r1,%2) \n\t"
|
"vl %%v21,80(%%r1,%[x])\n\t"
|
||||||
"vl %%v21,80(%%r1,%2) \n\t"
|
"vl %%v22,96(%%r1,%[x])\n\t"
|
||||||
"vl %%v22,96(%%r1,%2) \n\t"
|
"vl %%v23,112(%%r1,%[x])\n\t"
|
||||||
"vl %%v23,112(%%r1,%2) \n\t"
|
"vl %%v24,0(%%r1,%[y])\n\t"
|
||||||
|
"vl %%v25,16(%%r1,%[y])\n\t"
|
||||||
"vl %%v24,0(%%r1,%3) \n\t"
|
"vl %%v26,32(%%r1,%[y])\n\t"
|
||||||
|
"vl %%v27,48(%%r1,%[y])\n\t"
|
||||||
|
"vl %%v28,64(%%r1,%[y])\n\t"
|
||||||
|
"vl %%v29,80(%%r1,%[y])\n\t"
|
||||||
|
"vl %%v30,96(%%r1,%[y])\n\t"
|
||||||
|
"vl %%v31,112(%%r1,%[y])\n\t"
|
||||||
"vfmasb %%v0,%%v16,%%v24,%%v0\n\t"
|
"vfmasb %%v0,%%v16,%%v24,%%v0\n\t"
|
||||||
"vl %%v25,16(%%r1,%3) \n\t"
|
"vfmasb %%v1,%%v17,%%v25,%%v1\n\t"
|
||||||
"vfmasb %%v0,%%v17,%%v25,%%v0 \n\t"
|
"vfmasb %%v2,%%v18,%%v26,%%v2\n\t"
|
||||||
"vl %%v26,32(%%r1,%3) \n\t"
|
"vfmasb %%v3,%%v19,%%v27,%%v3\n\t"
|
||||||
"vfmasb %%v0,%%v18,%%v26,%%v0 \n\t"
|
"vfmasb %%v4,%%v20,%%v28,%%v4\n\t"
|
||||||
"vl %%v27,48(%%r1,%3) \n\t"
|
"vfmasb %%v5,%%v21,%%v29,%%v5\n\t"
|
||||||
"vfmasb %%v0,%%v19,%%v27,%%v0 \n\t"
|
"vfmasb %%v6,%%v22,%%v30,%%v6\n\t"
|
||||||
"vl %%v28,64(%%r1,%3) \n\t"
|
"vfmasb %%v7,%%v23,%%v31,%%v7\n\t"
|
||||||
"vfmasb %%v0,%%v20,%%v28,%%v0 \n\t"
|
|
||||||
"vl %%v29,80(%%r1,%3) \n\t"
|
|
||||||
"vfmasb %%v0,%%v21,%%v29,%%v0 \n\t"
|
|
||||||
"vl %%v30,96(%%r1,%3) \n\t"
|
|
||||||
"vfmasb %%v0,%%v22,%%v30,%%v0 \n\t"
|
|
||||||
"vl %%v31,112(%%r1,%3) \n\t"
|
|
||||||
"vfmasb %%v0,%%v23,%%v31,%%v0 \n\t"
|
|
||||||
|
|
||||||
"agfi %%r1,128\n\t"
|
"agfi %%r1,128\n\t"
|
||||||
"brctg %%r0,0b \n\t"
|
"brctg %[n],0b\n\t"
|
||||||
|
"vfasb %%v0,%%v0,%%v1\n\t"
|
||||||
|
"vfasb %%v0,%%v0,%%v2\n\t"
|
||||||
|
"vfasb %%v0,%%v0,%%v3\n\t"
|
||||||
|
"vfasb %%v0,%%v0,%%v4\n\t"
|
||||||
|
"vfasb %%v0,%%v0,%%v5\n\t"
|
||||||
|
"vfasb %%v0,%%v0,%%v6\n\t"
|
||||||
|
"vfasb %%v0,%%v0,%%v7\n\t"
|
||||||
"vrepf %%v1,%%v0,1\n\t"
|
"vrepf %%v1,%%v0,1\n\t"
|
||||||
"vrepf %%v2,%%v0,2\n\t"
|
"vrepf %%v2,%%v0,2\n\t"
|
||||||
"vrepf %%v3,%%v0,3\n\t"
|
"vrepf %%v3,%%v0,3\n\t"
|
||||||
"aebr %%f0,%%f1\n\t"
|
"aebr %%f0,%%f1\n\t"
|
||||||
"aebr %%f0,%%f2\n\t"
|
"aebr %%f0,%%f2\n\t"
|
||||||
"aebr %%f0,%%f3\n\t"
|
"aebr %%f0,%%f3\n\t"
|
||||||
"ler %0,%%f0 "
|
"ler %[dot],%%f0"
|
||||||
:"=f"(dot)
|
: [dot] "=f"(dot),[n] "+&r"(n)
|
||||||
:"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((const FLOAT (*)[n])y)
|
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),
|
||||||
:"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
"m"(*(const struct { FLOAT x[n]; } *) y),[y] "a"(y)
|
||||||
);
|
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
|
||||||
|
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
|
||||||
|
"v27", "v28", "v29", "v30", "v31");
|
||||||
|
|
||||||
return dot;
|
return dot;
|
||||||
}
|
}
|
||||||
|
|
||||||
FLOAT CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y)
|
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
|
||||||
{
|
|
||||||
BLASLONG i = 0;
|
BLASLONG i = 0;
|
||||||
BLASLONG ix = 0, iy = 0;
|
BLASLONG ix = 0, iy = 0;
|
||||||
|
|
||||||
FLOAT dot = 0.0;
|
FLOAT dot = 0.0;
|
||||||
|
|
||||||
if ( n <= 0 ) return(dot);
|
if (n <= 0)
|
||||||
|
return (dot);
|
||||||
|
|
||||||
if ( (inc_x == 1) && (inc_y == 1) )
|
if ((inc_x == 1) && (inc_y == 1)) {
|
||||||
{
|
|
||||||
|
|
||||||
BLASLONG n1 = n & -32;
|
BLASLONG n1 = n & -32;
|
||||||
|
|
||||||
|
|
@ -100,8 +110,7 @@ FLOAT CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y)
|
||||||
dot = sdot_kernel_32(n1, x, y);
|
dot = sdot_kernel_32(n1, x, y);
|
||||||
|
|
||||||
i = n1;
|
i = n1;
|
||||||
while(i < n)
|
while (i < n) {
|
||||||
{
|
|
||||||
|
|
||||||
dot += y[i] * x[i];
|
dot += y[i] * x[i];
|
||||||
i++;
|
i++;
|
||||||
|
|
@ -109,13 +118,11 @@ FLOAT CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y)
|
||||||
}
|
}
|
||||||
return (dot);
|
return (dot);
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
BLASLONG n1 = n & -2;
|
BLASLONG n1 = n & -2;
|
||||||
|
|
||||||
while(i < n1)
|
while (i < n1) {
|
||||||
{
|
|
||||||
|
|
||||||
dot += y[iy] * x[ix] + y[iy + inc_y] * x[ix + inc_x];
|
dot += y[iy] * x[ix] + y[iy + inc_y] * x[ix + inc_x];
|
||||||
ix += inc_x * 2;
|
ix += inc_x * 2;
|
||||||
|
|
@ -124,8 +131,7 @@ FLOAT CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y)
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
while(i < n)
|
while (i < n) {
|
||||||
{
|
|
||||||
|
|
||||||
dot += y[iy] * x[ix];
|
dot += y[iy] * x[ix];
|
||||||
ix += inc_x;
|
ix += inc_x;
|
||||||
|
|
@ -136,5 +142,3 @@ FLOAT CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y)
|
||||||
return (dot);
|
return (dot);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/***************************************************************************
|
/***************************************************************************
|
||||||
Copyright (c) 2017, The OpenBLAS Project
|
Copyright (c) 2019, The OpenBLAS Project
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
modification, are permitted provided that the following conditions are
|
modification, are permitted provided that the following conditions are
|
||||||
|
|
@ -29,364 +29,329 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#define NBMAX 2048
|
#define NBMAX 2048
|
||||||
|
|
||||||
static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
|
static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y,
|
||||||
{
|
FLOAT *alpha) {
|
||||||
__asm__ volatile (
|
register FLOAT *ap0 = ap[0];
|
||||||
"vlrepf %%v0,0(%5) \n\t"
|
register FLOAT *ap1 = ap[1];
|
||||||
"vlrepf %%v1,4(%5) \n\t"
|
register FLOAT *ap2 = ap[2];
|
||||||
"vlrepf %%v2,8(%5) \n\t"
|
register FLOAT *ap3 = ap[3];
|
||||||
"vlrepf %%v3,12(%5) \n\t"
|
|
||||||
"vlrepf %%v4,%7 \n\t"
|
__asm__("vlrepf %%v0,0(%[x])\n\t"
|
||||||
|
"vlrepf %%v1,4(%[x])\n\t"
|
||||||
|
"vlrepf %%v2,8(%[x])\n\t"
|
||||||
|
"vlrepf %%v3,12(%[x])\n\t"
|
||||||
|
"vlrepf %%v4,%[alpha]\n\t"
|
||||||
"vfmsb %%v0,%%v0,%%v4\n\t"
|
"vfmsb %%v0,%%v0,%%v4\n\t"
|
||||||
"vfmsb %%v1,%%v1,%%v4\n\t"
|
"vfmsb %%v1,%%v1,%%v4\n\t"
|
||||||
"vfmsb %%v2,%%v2,%%v4\n\t"
|
"vfmsb %%v2,%%v2,%%v4\n\t"
|
||||||
"vfmsb %%v3,%%v3,%%v4\n\t"
|
"vfmsb %%v3,%%v3,%%v4\n\t"
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
|
|
||||||
"lghi %%r0,-32\n\t"
|
"lghi %%r0,-32\n\t"
|
||||||
"ngr %%r0,%0 \n\t"
|
"ngr %%r0,%[n]\n\t"
|
||||||
"ltgr %%r0,%%r0\n\t"
|
"ltgr %%r0,%%r0\n\t"
|
||||||
"jz 1f\n\t"
|
"jz 1f\n\t"
|
||||||
|
|
||||||
"srlg %%r0,%%r0,5\n\t"
|
"srlg %%r0,%%r0,5\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 1,1024(%%r1,%1) \n\t"
|
"pfd 1,1024(%%r1,%[ap0])\n\t"
|
||||||
"pfd 1,1024(%%r1,%2) \n\t"
|
"pfd 1,1024(%%r1,%[ap1])\n\t"
|
||||||
"pfd 1,1024(%%r1,%3) \n\t"
|
"pfd 1,1024(%%r1,%[ap2])\n\t"
|
||||||
"pfd 1,1024(%%r1,%4) \n\t"
|
"pfd 1,1024(%%r1,%[ap3])\n\t"
|
||||||
"pfd 2,1024(%%r1,%6) \n\t"
|
"pfd 2,1024(%%r1,%[y])\n\t"
|
||||||
|
"vl %%v16,0(%%r1,%[ap0])\n\t"
|
||||||
"vl %%v16,0(%%r1,%1) \n\t"
|
"vl %%v17,0(%%r1,%[ap1])\n\t"
|
||||||
"vl %%v17,0(%%r1,%2) \n\t"
|
"vl %%v18,0(%%r1,%[ap2])\n\t"
|
||||||
"vl %%v18,0(%%r1,%3) \n\t"
|
"vl %%v19,0(%%r1,%[ap3])\n\t"
|
||||||
"vl %%v19,0(%%r1,%4) \n\t"
|
"vl %%v20,16(%%r1,%[ap0])\n\t"
|
||||||
"vl %%v20,16(%%r1,%1) \n\t"
|
"vl %%v21,16(%%r1,%[ap1])\n\t"
|
||||||
"vl %%v21,16(%%r1,%2) \n\t"
|
"vl %%v22,16(%%r1,%[ap2])\n\t"
|
||||||
"vl %%v22,16(%%r1,%3) \n\t"
|
"vl %%v23,16(%%r1,%[ap3])\n\t"
|
||||||
"vl %%v23,16(%%r1,%4) \n\t"
|
"vl %%v24,32(%%r1,%[ap0])\n\t"
|
||||||
"vl %%v24,32(%%r1,%1) \n\t"
|
"vl %%v25,32(%%r1,%[ap1])\n\t"
|
||||||
"vl %%v25,32(%%r1,%2) \n\t"
|
"vl %%v26,32(%%r1,%[ap2])\n\t"
|
||||||
"vl %%v26,32(%%r1,%3) \n\t"
|
"vl %%v27,32(%%r1,%[ap3])\n\t"
|
||||||
"vl %%v27,32(%%r1,%4) \n\t"
|
"vl %%v28,48(%%r1,%[ap0])\n\t"
|
||||||
"vl %%v28,48(%%r1,%1) \n\t"
|
"vl %%v29,48(%%r1,%[ap1])\n\t"
|
||||||
"vl %%v29,48(%%r1,%2) \n\t"
|
"vl %%v30,48(%%r1,%[ap2])\n\t"
|
||||||
"vl %%v30,48(%%r1,%3) \n\t"
|
"vl %%v31,48(%%r1,%[ap3])\n\t"
|
||||||
"vl %%v31,48(%%r1,%4) \n\t"
|
"vl %%v4,0(%%r1,%[y])\n\t"
|
||||||
|
"vl %%v5,16(%%r1,%[y])\n\t"
|
||||||
"vl %%v4,0(%%r1,%6) \n\t"
|
"vl %%v6,32(%%r1,%[y])\n\t"
|
||||||
|
"vl %%v7,48(%%r1,%[y])\n\t"
|
||||||
"vfmasb %%v4,%%v16,%%v0,%%v4\n\t"
|
"vfmasb %%v4,%%v16,%%v0,%%v4\n\t"
|
||||||
|
"vfmasb %%v5,%%v20,%%v0,%%v5\n\t"
|
||||||
|
"vfmasb %%v6,%%v24,%%v0,%%v6\n\t"
|
||||||
|
"vfmasb %%v7,%%v28,%%v0,%%v7\n\t"
|
||||||
"vfmasb %%v4,%%v17,%%v1,%%v4\n\t"
|
"vfmasb %%v4,%%v17,%%v1,%%v4\n\t"
|
||||||
|
"vfmasb %%v5,%%v21,%%v1,%%v5\n\t"
|
||||||
|
"vfmasb %%v6,%%v25,%%v1,%%v6\n\t"
|
||||||
|
"vfmasb %%v7,%%v29,%%v1,%%v7\n\t"
|
||||||
"vfmasb %%v4,%%v18,%%v2,%%v4\n\t"
|
"vfmasb %%v4,%%v18,%%v2,%%v4\n\t"
|
||||||
|
"vfmasb %%v5,%%v22,%%v2,%%v5\n\t"
|
||||||
|
"vfmasb %%v6,%%v26,%%v2,%%v6\n\t"
|
||||||
|
"vfmasb %%v7,%%v30,%%v2,%%v7\n\t"
|
||||||
"vfmasb %%v4,%%v19,%%v3,%%v4\n\t"
|
"vfmasb %%v4,%%v19,%%v3,%%v4\n\t"
|
||||||
"vst %%v4,0(%%r1,%6) \n\t"
|
"vfmasb %%v5,%%v23,%%v3,%%v5\n\t"
|
||||||
|
"vfmasb %%v6,%%v27,%%v3,%%v6\n\t"
|
||||||
"vl %%v4,16(%%r1,%6) \n\t"
|
"vfmasb %%v7,%%v31,%%v3,%%v7\n\t"
|
||||||
"vfmasb %%v4,%%v20,%%v0,%%v4 \n\t"
|
"vst %%v4,0(%%r1,%[y])\n\t"
|
||||||
"vfmasb %%v4,%%v21,%%v1,%%v4 \n\t"
|
"vst %%v5,16(%%r1,%[y])\n\t"
|
||||||
"vfmasb %%v4,%%v22,%%v2,%%v4 \n\t"
|
"vst %%v6,32(%%r1,%[y])\n\t"
|
||||||
"vfmasb %%v4,%%v23,%%v3,%%v4 \n\t"
|
"vst %%v7,48(%%r1,%[y])\n\t"
|
||||||
"vst %%v4,16(%%r1,%6) \n\t"
|
"vl %%v16,64(%%r1,%[ap0])\n\t"
|
||||||
|
"vl %%v17,64(%%r1,%[ap1])\n\t"
|
||||||
"vl %%v4,32(%%r1,%6) \n\t"
|
"vl %%v18,64(%%r1,%[ap2])\n\t"
|
||||||
"vfmasb %%v4,%%v24,%%v0,%%v4 \n\t"
|
"vl %%v19,64(%%r1,%[ap3])\n\t"
|
||||||
"vfmasb %%v4,%%v25,%%v1,%%v4 \n\t"
|
"vl %%v20,80(%%r1,%[ap0])\n\t"
|
||||||
"vfmasb %%v4,%%v26,%%v2,%%v4 \n\t"
|
"vl %%v21,80(%%r1,%[ap1])\n\t"
|
||||||
"vfmasb %%v4,%%v27,%%v3,%%v4 \n\t"
|
"vl %%v22,80(%%r1,%[ap2])\n\t"
|
||||||
"vst %%v4,32(%%r1,%6) \n\t"
|
"vl %%v23,80(%%r1,%[ap3])\n\t"
|
||||||
|
"vl %%v24,96(%%r1,%[ap0])\n\t"
|
||||||
"vl %%v4,48(%%r1,%6) \n\t"
|
"vl %%v25,96(%%r1,%[ap1])\n\t"
|
||||||
"vfmasb %%v4,%%v28,%%v0,%%v4 \n\t"
|
"vl %%v26,96(%%r1,%[ap2])\n\t"
|
||||||
"vfmasb %%v4,%%v29,%%v1,%%v4 \n\t"
|
"vl %%v27,96(%%r1,%[ap3])\n\t"
|
||||||
"vfmasb %%v4,%%v30,%%v2,%%v4 \n\t"
|
"vl %%v28,112(%%r1,%[ap0])\n\t"
|
||||||
"vfmasb %%v4,%%v31,%%v3,%%v4 \n\t"
|
"vl %%v29,112(%%r1,%[ap1])\n\t"
|
||||||
"vst %%v4,48(%%r1,%6) \n\t"
|
"vl %%v30,112(%%r1,%[ap2])\n\t"
|
||||||
|
"vl %%v31,112(%%r1,%[ap3])\n\t"
|
||||||
"vl %%v16,64(%%r1,%1) \n\t"
|
"vl %%v4,64(%%r1,%[y])\n\t"
|
||||||
"vl %%v17,64(%%r1,%2) \n\t"
|
"vl %%v5,80(%%r1,%[y])\n\t"
|
||||||
"vl %%v18,64(%%r1,%3) \n\t"
|
"vl %%v6,96(%%r1,%[y])\n\t"
|
||||||
"vl %%v19,64(%%r1,%4) \n\t"
|
"vl %%v7,112(%%r1,%[y])\n\t"
|
||||||
"vl %%v20,80(%%r1,%1) \n\t"
|
|
||||||
"vl %%v21,80(%%r1,%2) \n\t"
|
|
||||||
"vl %%v22,80(%%r1,%3) \n\t"
|
|
||||||
"vl %%v23,80(%%r1,%4) \n\t"
|
|
||||||
"vl %%v24,96(%%r1,%1) \n\t"
|
|
||||||
"vl %%v25,96(%%r1,%2) \n\t"
|
|
||||||
"vl %%v26,96(%%r1,%3) \n\t"
|
|
||||||
"vl %%v27,96(%%r1,%4) \n\t"
|
|
||||||
"vl %%v28,112(%%r1,%1) \n\t"
|
|
||||||
"vl %%v29,112(%%r1,%2) \n\t"
|
|
||||||
"vl %%v30,112(%%r1,%3) \n\t"
|
|
||||||
"vl %%v31,112(%%r1,%4) \n\t"
|
|
||||||
|
|
||||||
"vl %%v4,64(%%r1,%6) \n\t"
|
|
||||||
"vfmasb %%v4,%%v16,%%v0,%%v4\n\t"
|
"vfmasb %%v4,%%v16,%%v0,%%v4\n\t"
|
||||||
|
"vfmasb %%v5,%%v20,%%v0,%%v5\n\t"
|
||||||
|
"vfmasb %%v6,%%v24,%%v0,%%v6\n\t"
|
||||||
|
"vfmasb %%v7,%%v28,%%v0,%%v7\n\t"
|
||||||
"vfmasb %%v4,%%v17,%%v1,%%v4\n\t"
|
"vfmasb %%v4,%%v17,%%v1,%%v4\n\t"
|
||||||
|
"vfmasb %%v5,%%v21,%%v1,%%v5\n\t"
|
||||||
|
"vfmasb %%v6,%%v25,%%v1,%%v6\n\t"
|
||||||
|
"vfmasb %%v7,%%v29,%%v1,%%v7\n\t"
|
||||||
"vfmasb %%v4,%%v18,%%v2,%%v4\n\t"
|
"vfmasb %%v4,%%v18,%%v2,%%v4\n\t"
|
||||||
|
"vfmasb %%v5,%%v22,%%v2,%%v5\n\t"
|
||||||
|
"vfmasb %%v6,%%v26,%%v2,%%v6\n\t"
|
||||||
|
"vfmasb %%v7,%%v30,%%v2,%%v7\n\t"
|
||||||
"vfmasb %%v4,%%v19,%%v3,%%v4\n\t"
|
"vfmasb %%v4,%%v19,%%v3,%%v4\n\t"
|
||||||
"vst %%v4,64(%%r1,%6) \n\t"
|
"vfmasb %%v5,%%v23,%%v3,%%v5\n\t"
|
||||||
|
"vfmasb %%v6,%%v27,%%v3,%%v6\n\t"
|
||||||
"vl %%v4,80(%%r1,%6) \n\t"
|
"vfmasb %%v7,%%v31,%%v3,%%v7\n\t"
|
||||||
"vfmasb %%v4,%%v20,%%v0,%%v4 \n\t"
|
"vst %%v4,64(%%r1,%[y])\n\t"
|
||||||
"vfmasb %%v4,%%v21,%%v1,%%v4 \n\t"
|
"vst %%v5,80(%%r1,%[y])\n\t"
|
||||||
"vfmasb %%v4,%%v22,%%v2,%%v4 \n\t"
|
"vst %%v6,96(%%r1,%[y])\n\t"
|
||||||
"vfmasb %%v4,%%v23,%%v3,%%v4 \n\t"
|
"vst %%v7,112(%%r1,%[y])\n\t"
|
||||||
"vst %%v4,80(%%r1,%6) \n\t"
|
|
||||||
|
|
||||||
"vl %%v4,96(%%r1,%6) \n\t"
|
|
||||||
"vfmasb %%v4,%%v24,%%v0,%%v4 \n\t"
|
|
||||||
"vfmasb %%v4,%%v25,%%v1,%%v4 \n\t"
|
|
||||||
"vfmasb %%v4,%%v26,%%v2,%%v4 \n\t"
|
|
||||||
"vfmasb %%v4,%%v27,%%v3,%%v4 \n\t"
|
|
||||||
"vst %%v4,96(%%r1,%6) \n\t"
|
|
||||||
|
|
||||||
"vl %%v4,112(%%r1,%6) \n\t"
|
|
||||||
"vfmasb %%v4,%%v28,%%v0,%%v4 \n\t"
|
|
||||||
"vfmasb %%v4,%%v29,%%v1,%%v4 \n\t"
|
|
||||||
"vfmasb %%v4,%%v30,%%v2,%%v4 \n\t"
|
|
||||||
"vfmasb %%v4,%%v31,%%v3,%%v4 \n\t"
|
|
||||||
"vst %%v4,112(%%r1,%6) \n\t"
|
|
||||||
|
|
||||||
"agfi %%r1,128\n\t"
|
"agfi %%r1,128\n\t"
|
||||||
"brctg %%r0,0b\n\t"
|
"brctg %%r0,0b\n\t"
|
||||||
|
|
||||||
"1:\n\t"
|
"1:\n\t"
|
||||||
"lghi %%r0,28\n\t"
|
"lghi %%r0,28\n\t"
|
||||||
"ngr %%r0,%0 \n\t"
|
"ngr %%r0,%[n]\n\t"
|
||||||
"ltgr %%r0,%%r0\n\t"
|
"ltgr %%r0,%%r0\n\t"
|
||||||
"jz 3f\n\t"
|
"jz 3f\n\t"
|
||||||
|
|
||||||
"srlg %%r0,%%r0,2\n\t"
|
"srlg %%r0,%%r0,2\n\t"
|
||||||
"2:\n\t"
|
"2:\n\t"
|
||||||
"vl %%v16,0(%%r1,%1) \n\t"
|
"vl %%v16,0(%%r1,%[ap0])\n\t"
|
||||||
"vl %%v17,0(%%r1,%2) \n\t"
|
"vl %%v17,0(%%r1,%[ap1])\n\t"
|
||||||
"vl %%v18,0(%%r1,%3) \n\t"
|
"vl %%v18,0(%%r1,%[ap2])\n\t"
|
||||||
"vl %%v19,0(%%r1,%4) \n\t"
|
"vl %%v19,0(%%r1,%[ap3])\n\t"
|
||||||
|
"vl %%v4,0(%%r1,%[y])\n\t"
|
||||||
"vl %%v4,0(%%r1,%6) \n\t"
|
|
||||||
"vfmasb %%v4,%%v16,%%v0,%%v4\n\t"
|
"vfmasb %%v4,%%v16,%%v0,%%v4\n\t"
|
||||||
"vfmasb %%v4,%%v17,%%v1,%%v4\n\t"
|
"vfmasb %%v4,%%v17,%%v1,%%v4\n\t"
|
||||||
"vfmasb %%v4,%%v18,%%v2,%%v4\n\t"
|
"vfmasb %%v4,%%v18,%%v2,%%v4\n\t"
|
||||||
"vfmasb %%v4,%%v19,%%v3,%%v4\n\t"
|
"vfmasb %%v4,%%v19,%%v3,%%v4\n\t"
|
||||||
"vst %%v4,0(%%r1,%6) \n\t"
|
"vst %%v4,0(%%r1,%[y])\n\t"
|
||||||
|
|
||||||
"agfi %%r1,16\n\t"
|
"agfi %%r1,16\n\t"
|
||||||
"brctg %%r0,2b\n\t"
|
"brctg %%r0,2b\n\t"
|
||||||
|
|
||||||
"3:\n\t"
|
"3:\n\t"
|
||||||
"nop"
|
"nop"
|
||||||
:
|
: "+m"(*(struct { FLOAT x[n]; } *) y)
|
||||||
:"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])ap[2]),"ZR"((const FLOAT (*)[n])ap[3]),"ZQ"((const FLOAT (*)[4])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha)
|
: [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0),
|
||||||
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
"m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1),
|
||||||
);
|
"m"(*(const struct { FLOAT x[n]; } *) ap2),[ap2] "a"(ap2),
|
||||||
|
"m"(*(const struct { FLOAT x[n]; } *) ap3),[ap3] "a"(ap3),
|
||||||
|
"m"(*(const struct { FLOAT x[4]; } *) x),[x] "a"(x),[alpha] "Q"(*alpha),
|
||||||
|
[n] "r"(n)
|
||||||
|
: "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
|
||||||
|
"v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
|
||||||
|
"v26", "v27", "v28", "v29", "v30", "v31");
|
||||||
}
|
}
|
||||||
|
|
||||||
static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
|
static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y,
|
||||||
{
|
FLOAT *alpha) {
|
||||||
__asm__ volatile (
|
register FLOAT *ap0 = ap[0];
|
||||||
"vlrepf %%v0,0(%3) \n\t"
|
register FLOAT *ap1 = ap[1];
|
||||||
"vlrepf %%v1,4(%3) \n\t"
|
|
||||||
"vlrepf %%v2,%5 \n\t"
|
__asm__("vlrepf %%v0,0(%[x])\n\t"
|
||||||
|
"vlrepf %%v1,4(%[x])\n\t"
|
||||||
|
"vlrepf %%v2,%[alpha]\n\t"
|
||||||
"vfmsb %%v0,%%v0,%%v2\n\t"
|
"vfmsb %%v0,%%v0,%%v2\n\t"
|
||||||
"vfmsb %%v1,%%v1,%%v2\n\t"
|
"vfmsb %%v1,%%v1,%%v2\n\t"
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
|
|
||||||
"lghi %%r0,-32\n\t"
|
"lghi %%r0,-32\n\t"
|
||||||
"ngr %%r0,%0 \n\t"
|
"ngr %%r0,%[n]\n\t"
|
||||||
"ltgr %%r0,%%r0\n\t"
|
"ltgr %%r0,%%r0\n\t"
|
||||||
"jz 1f\n\t"
|
"jz 1f\n\t"
|
||||||
|
|
||||||
"srlg %%r0,%%r0,5\n\t"
|
"srlg %%r0,%%r0,5\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 1,1024(%%r1,%1) \n\t"
|
"pfd 1,1024(%%r1,%[ap0])\n\t"
|
||||||
"pfd 1,1024(%%r1,%2) \n\t"
|
"pfd 1,1024(%%r1,%[ap1])\n\t"
|
||||||
"pfd 2,1024(%%r1,%4) \n\t"
|
"pfd 2,1024(%%r1,%[y])\n\t"
|
||||||
|
"vl %%v16,0(%%r1,%[ap0])\n\t"
|
||||||
"vl %%v16,0(%%r1,%1) \n\t"
|
"vl %%v17,0(%%r1,%[ap1])\n\t"
|
||||||
"vl %%v17,0(%%r1,%2) \n\t"
|
"vl %%v18,16(%%r1,%[ap0])\n\t"
|
||||||
"vl %%v18,16(%%r1,%1) \n\t"
|
"vl %%v19,16(%%r1,%[ap1])\n\t"
|
||||||
"vl %%v19,16(%%r1,%2) \n\t"
|
"vl %%v20,32(%%r1,%[ap0])\n\t"
|
||||||
"vl %%v20,32(%%r1,%1) \n\t"
|
"vl %%v21,32(%%r1,%[ap1])\n\t"
|
||||||
"vl %%v21,32(%%r1,%2) \n\t"
|
"vl %%v22,48(%%r1,%[ap0])\n\t"
|
||||||
"vl %%v22,48(%%r1,%1) \n\t"
|
"vl %%v23,48(%%r1,%[ap1])\n\t"
|
||||||
"vl %%v23,48(%%r1,%2) \n\t"
|
"vl %%v24,64(%%r1,%[ap0])\n\t"
|
||||||
"vl %%v24,64(%%r1,%1) \n\t"
|
"vl %%v25,64(%%r1,%[ap1])\n\t"
|
||||||
"vl %%v25,64(%%r1,%2) \n\t"
|
"vl %%v26,80(%%r1,%[ap0])\n\t"
|
||||||
"vl %%v26,80(%%r1,%1) \n\t"
|
"vl %%v27,80(%%r1,%[ap1])\n\t"
|
||||||
"vl %%v27,80(%%r1,%2) \n\t"
|
"vl %%v28,96(%%r1,%[ap0])\n\t"
|
||||||
"vl %%v28,96(%%r1,%1) \n\t"
|
"vl %%v29,96(%%r1,%[ap1])\n\t"
|
||||||
"vl %%v29,96(%%r1,%2) \n\t"
|
"vl %%v30,112(%%r1,%[ap0])\n\t"
|
||||||
"vl %%v30,112(%%r1,%1) \n\t"
|
"vl %%v31,112(%%r1,%[ap1])\n\t"
|
||||||
"vl %%v31,112(%%r1,%2) \n\t"
|
"vl %%v2,0(%%r1,%[y])\n\t"
|
||||||
|
"vl %%v3,16(%%r1,%[y])\n\t"
|
||||||
"vl %%v2,0(%%r1,%4) \n\t"
|
"vl %%v4,32(%%r1,%[y])\n\t"
|
||||||
|
"vl %%v5,48(%%r1,%[y])\n\t"
|
||||||
|
"vl %%v6,64(%%r1,%[y])\n\t"
|
||||||
|
"vl %%v7,80(%%r1,%[y])\n\t"
|
||||||
|
"vl %%v8,96(%%r1,%[y])\n\t"
|
||||||
|
"vl %%v9,112(%%r1,%[y])\n\t"
|
||||||
"vfmasb %%v2,%%v16,%%v0,%%v2\n\t"
|
"vfmasb %%v2,%%v16,%%v0,%%v2\n\t"
|
||||||
|
"vfmasb %%v3,%%v18,%%v0,%%v3\n\t"
|
||||||
|
"vfmasb %%v4,%%v20,%%v0,%%v4\n\t"
|
||||||
|
"vfmasb %%v5,%%v22,%%v0,%%v5\n\t"
|
||||||
|
"vfmasb %%v6,%%v24,%%v0,%%v6\n\t"
|
||||||
|
"vfmasb %%v7,%%v26,%%v0,%%v7\n\t"
|
||||||
|
"vfmasb %%v8,%%v28,%%v0,%%v8\n\t"
|
||||||
|
"vfmasb %%v9,%%v30,%%v0,%%v9\n\t"
|
||||||
"vfmasb %%v2,%%v17,%%v1,%%v2\n\t"
|
"vfmasb %%v2,%%v17,%%v1,%%v2\n\t"
|
||||||
"vst %%v2,0(%%r1,%4) \n\t"
|
"vfmasb %%v3,%%v19,%%v1,%%v3\n\t"
|
||||||
|
"vfmasb %%v4,%%v21,%%v1,%%v4\n\t"
|
||||||
"vl %%v2,16(%%r1,%4) \n\t"
|
"vfmasb %%v5,%%v23,%%v1,%%v5\n\t"
|
||||||
"vfmasb %%v2,%%v18,%%v0,%%v2 \n\t"
|
"vfmasb %%v6,%%v25,%%v1,%%v6\n\t"
|
||||||
"vfmasb %%v2,%%v19,%%v1,%%v2 \n\t"
|
"vfmasb %%v7,%%v27,%%v1,%%v7\n\t"
|
||||||
"vst %%v2,16(%%r1,%4) \n\t"
|
"vfmasb %%v8,%%v29,%%v1,%%v8\n\t"
|
||||||
|
"vfmasb %%v9,%%v31,%%v1,%%v9\n\t"
|
||||||
"vl %%v2,32(%%r1,%4) \n\t"
|
"vst %%v2,0(%%r1,%[y])\n\t"
|
||||||
"vfmasb %%v2,%%v20,%%v0,%%v2 \n\t"
|
"vst %%v3,16(%%r1,%[y])\n\t"
|
||||||
"vfmasb %%v2,%%v21,%%v1,%%v2 \n\t"
|
"vst %%v4,32(%%r1,%[y])\n\t"
|
||||||
"vst %%v2,32(%%r1,%4) \n\t"
|
"vst %%v5,48(%%r1,%[y])\n\t"
|
||||||
|
"vst %%v6,64(%%r1,%[y])\n\t"
|
||||||
"vl %%v2,48(%%r1,%4) \n\t"
|
"vst %%v7,80(%%r1,%[y])\n\t"
|
||||||
"vfmasb %%v2,%%v22,%%v0,%%v2 \n\t"
|
"vst %%v8,96(%%r1,%[y])\n\t"
|
||||||
"vfmasb %%v2,%%v23,%%v1,%%v2 \n\t"
|
"vst %%v9,112(%%r1,%[y])\n\t"
|
||||||
"vst %%v2,48(%%r1,%4) \n\t"
|
|
||||||
|
|
||||||
"vl %%v2,64(%%r1,%4) \n\t"
|
|
||||||
"vfmasb %%v2,%%v24,%%v0,%%v2 \n\t"
|
|
||||||
"vfmasb %%v2,%%v25,%%v1,%%v2 \n\t"
|
|
||||||
"vst %%v2,64(%%r1,%4) \n\t"
|
|
||||||
|
|
||||||
"vl %%v2,80(%%r1,%4) \n\t"
|
|
||||||
"vfmasb %%v2,%%v26,%%v0,%%v2 \n\t"
|
|
||||||
"vfmasb %%v2,%%v27,%%v1,%%v2 \n\t"
|
|
||||||
"vst %%v2,80(%%r1,%4) \n\t"
|
|
||||||
|
|
||||||
"vl %%v2,96(%%r1,%4) \n\t"
|
|
||||||
"vfmasb %%v2,%%v28,%%v0,%%v2 \n\t"
|
|
||||||
"vfmasb %%v2,%%v29,%%v1,%%v2 \n\t"
|
|
||||||
"vst %%v2,96(%%r1,%4) \n\t"
|
|
||||||
|
|
||||||
"vl %%v2,112(%%r1,%4) \n\t"
|
|
||||||
"vfmasb %%v2,%%v30,%%v0,%%v2 \n\t"
|
|
||||||
"vfmasb %%v2,%%v31,%%v1,%%v2 \n\t"
|
|
||||||
"vst %%v2,112(%%r1,%4) \n\t"
|
|
||||||
|
|
||||||
"agfi %%r1,128\n\t"
|
"agfi %%r1,128\n\t"
|
||||||
"brctg %%r0,0b\n\t"
|
"brctg %%r0,0b\n\t"
|
||||||
|
|
||||||
"1:\n\t"
|
"1:\n\t"
|
||||||
"lghi %%r0,28\n\t"
|
"lghi %%r0,28\n\t"
|
||||||
"ngr %%r0,%0 \n\t"
|
"ngr %%r0,%[n]\n\t"
|
||||||
"ltgr %%r0,%%r0\n\t"
|
"ltgr %%r0,%%r0\n\t"
|
||||||
"jz 3f\n\t"
|
"jz 3f\n\t"
|
||||||
|
|
||||||
"srlg %%r0,%%r0,2\n\t"
|
"srlg %%r0,%%r0,2\n\t"
|
||||||
"2:\n\t"
|
"2:\n\t"
|
||||||
"vl %%v16,0(%%r1,%1) \n\t"
|
"vl %%v16,0(%%r1,%[ap0])\n\t"
|
||||||
"vl %%v17,0(%%r1,%2) \n\t"
|
"vl %%v17,0(%%r1,%[ap1])\n\t"
|
||||||
|
"vl %%v2,0(%%r1,%[y])\n\t"
|
||||||
"vl %%v2,0(%%r1,%4) \n\t"
|
|
||||||
"vfmasb %%v2,%%v16,%%v0,%%v2\n\t"
|
"vfmasb %%v2,%%v16,%%v0,%%v2\n\t"
|
||||||
"vfmasb %%v2,%%v17,%%v1,%%v2\n\t"
|
"vfmasb %%v2,%%v17,%%v1,%%v2\n\t"
|
||||||
"vst %%v2,0(%%r1,%4) \n\t"
|
"vst %%v2,0(%%r1,%[y])\n\t"
|
||||||
|
|
||||||
"agfi %%r1,16\n\t"
|
"agfi %%r1,16\n\t"
|
||||||
"brctg %%r0,2b\n\t"
|
"brctg %%r0,2b\n\t"
|
||||||
|
|
||||||
"3:\n\t"
|
"3:\n\t"
|
||||||
"nop"
|
"nop"
|
||||||
:
|
: "+m"(*(struct { FLOAT x[n]; } *) y)
|
||||||
:"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZQ"((const FLOAT (*)[2])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha)
|
: [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0),
|
||||||
:"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
"m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1),
|
||||||
);
|
"m"(*(const struct { FLOAT x[2]; } *) x),[x] "a"(x),[alpha] "Q"(*alpha),
|
||||||
|
[n] "r"(n)
|
||||||
|
: "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
|
||||||
|
"v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
|
||||||
|
"v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
|
||||||
}
|
}
|
||||||
|
|
||||||
static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *xo, FLOAT *y, FLOAT *alpha)
|
static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y,
|
||||||
{
|
FLOAT *alpha) {
|
||||||
__asm__ volatile (
|
__asm__("vlrepf %%v0,0(%[x])\n\t"
|
||||||
"vlrepf %%v0,0(%2) \n\t"
|
"vlrepf %%v16,%[alpha]\n\t"
|
||||||
"vlrepf %%v1,%4 \n\t"
|
"vfmsb %%v0,%%v0,%%v16\n\t"
|
||||||
"vfmsb %%v0,%%v0,%%v1 \n\t"
|
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
|
|
||||||
"lghi %%r0,-32\n\t"
|
"lghi %%r0,-32\n\t"
|
||||||
"ngr %%r0,%0 \n\t"
|
"ngr %%r0,%[n]\n\t"
|
||||||
"ltgr %%r0,%%r0\n\t"
|
"ltgr %%r0,%%r0\n\t"
|
||||||
"jz 1f\n\t"
|
"jz 1f\n\t"
|
||||||
|
|
||||||
"srlg %%r0,%%r0,5\n\t"
|
"srlg %%r0,%%r0,5\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 1,1024(%%r1,%1) \n\t"
|
"pfd 1,1024(%%r1,%[a0])\n\t"
|
||||||
"pfd 2,1024(%%r1,%3) \n\t"
|
"pfd 2,1024(%%r1,%[y])\n\t"
|
||||||
|
"vl %%v16,0(%%r1,%[a0])\n\t"
|
||||||
"vl %%v16,0(%%r1,%1) \n\t"
|
"vl %%v17,16(%%r1,%[a0])\n\t"
|
||||||
"vl %%v17,16(%%r1,%1) \n\t"
|
"vl %%v18,32(%%r1,%[a0])\n\t"
|
||||||
"vl %%v18,32(%%r1,%1) \n\t"
|
"vl %%v19,48(%%r1,%[a0])\n\t"
|
||||||
"vl %%v19,48(%%r1,%1) \n\t"
|
"vl %%v20,64(%%r1,%[a0])\n\t"
|
||||||
"vl %%v20,64(%%r1,%1) \n\t"
|
"vl %%v21,80(%%r1,%[a0])\n\t"
|
||||||
"vl %%v21,80(%%r1,%1) \n\t"
|
"vl %%v22,96(%%r1,%[a0])\n\t"
|
||||||
"vl %%v22,96(%%r1,%1) \n\t"
|
"vl %%v23,112(%%r1,%[a0])\n\t"
|
||||||
"vl %%v23,112(%%r1,%1) \n\t"
|
"vl %%v24,0(%%r1,%[y])\n\t"
|
||||||
|
"vl %%v25,16(%%r1,%[y])\n\t"
|
||||||
"vl %%v1,0(%%r1,%3) \n\t"
|
"vl %%v26,32(%%r1,%[y])\n\t"
|
||||||
"vfmasb %%v1,%%v16,%%v0,%%v1 \n\t"
|
"vl %%v27,48(%%r1,%[y])\n\t"
|
||||||
"vst %%v1,0(%%r1,%3) \n\t"
|
"vl %%v28,64(%%r1,%[y])\n\t"
|
||||||
|
"vl %%v29,80(%%r1,%[y])\n\t"
|
||||||
"vl %%v1,16(%%r1,%3) \n\t"
|
"vl %%v30,96(%%r1,%[y])\n\t"
|
||||||
"vfmasb %%v1,%%v17,%%v0,%%v1 \n\t"
|
"vl %%v31,112(%%r1,%[y])\n\t"
|
||||||
"vst %%v1,16(%%r1,%3) \n\t"
|
"vfmasb %%v24,%%v16,%%v0,%%v24\n\t"
|
||||||
|
"vfmasb %%v25,%%v17,%%v0,%%v25\n\t"
|
||||||
"vl %%v1,32(%%r1,%3) \n\t"
|
"vfmasb %%v26,%%v18,%%v0,%%v26\n\t"
|
||||||
"vfmasb %%v1,%%v18,%%v0,%%v1 \n\t"
|
"vfmasb %%v27,%%v19,%%v0,%%v27\n\t"
|
||||||
"vst %%v1,32(%%r1,%3) \n\t"
|
"vfmasb %%v28,%%v20,%%v0,%%v28\n\t"
|
||||||
|
"vfmasb %%v29,%%v21,%%v0,%%v29\n\t"
|
||||||
"vl %%v1,48(%%r1,%3) \n\t"
|
"vfmasb %%v30,%%v22,%%v0,%%v30\n\t"
|
||||||
"vfmasb %%v1,%%v19,%%v0,%%v1 \n\t"
|
"vfmasb %%v31,%%v23,%%v0,%%v31\n\t"
|
||||||
"vst %%v1,48(%%r1,%3) \n\t"
|
"vst %%v24,0(%%r1,%[y])\n\t"
|
||||||
|
"vst %%v25,16(%%r1,%[y])\n\t"
|
||||||
"vl %%v1,64(%%r1,%3) \n\t"
|
"vst %%v26,32(%%r1,%[y])\n\t"
|
||||||
"vfmasb %%v1,%%v20,%%v0,%%v1 \n\t"
|
"vst %%v27,48(%%r1,%[y])\n\t"
|
||||||
"vst %%v1,64(%%r1,%3) \n\t"
|
"vst %%v28,64(%%r1,%[y])\n\t"
|
||||||
|
"vst %%v29,80(%%r1,%[y])\n\t"
|
||||||
"vl %%v1,80(%%r1,%3) \n\t"
|
"vst %%v30,96(%%r1,%[y])\n\t"
|
||||||
"vfmasb %%v1,%%v21,%%v0,%%v1 \n\t"
|
"vst %%v31,112(%%r1,%[y])\n\t"
|
||||||
"vst %%v1,80(%%r1,%3) \n\t"
|
|
||||||
|
|
||||||
"vl %%v1,96(%%r1,%3) \n\t"
|
|
||||||
"vfmasb %%v1,%%v22,%%v0,%%v1 \n\t"
|
|
||||||
"vst %%v1,96(%%r1,%3) \n\t"
|
|
||||||
|
|
||||||
"vl %%v1,112(%%r1,%3) \n\t"
|
|
||||||
"vfmasb %%v1,%%v23,%%v0,%%v1 \n\t"
|
|
||||||
"vst %%v1,112(%%r1,%3) \n\t"
|
|
||||||
|
|
||||||
"agfi %%r1,128\n\t"
|
"agfi %%r1,128\n\t"
|
||||||
"brctg %%r0,0b\n\t"
|
"brctg %%r0,0b\n\t"
|
||||||
|
|
||||||
"1:\n\t"
|
"1:\n\t"
|
||||||
"lghi %%r0,28\n\t"
|
"lghi %%r0,28\n\t"
|
||||||
"ngr %%r0,%0 \n\t"
|
"ngr %%r0,%[n]\n\t"
|
||||||
"ltgr %%r0,%%r0\n\t"
|
"ltgr %%r0,%%r0\n\t"
|
||||||
"jz 3f\n\t"
|
"jz 3f\n\t"
|
||||||
|
|
||||||
"srlg %%r0,%%r0,2\n\t"
|
"srlg %%r0,%%r0,2\n\t"
|
||||||
"2:\n\t"
|
"2:\n\t"
|
||||||
"vl %%v16,0(%%r1,%1) \n\t"
|
"vl %%v16,0(%%r1,%[a0])\n\t"
|
||||||
|
"vl %%v17,0(%%r1,%[y])\n\t"
|
||||||
"vl %%v1,0(%%r1,%3) \n\t"
|
"vfmasb %%v17,%%v16,%%v0,%%v17\n\t"
|
||||||
"vfmasb %%v1,%%v16,%%v0,%%v1 \n\t"
|
"vst %%v17,0(%%r1,%[y])\n\t"
|
||||||
"vst %%v1,0(%%r1,%3) \n\t"
|
|
||||||
|
|
||||||
"agfi %%r1,16\n\t"
|
"agfi %%r1,16\n\t"
|
||||||
"brctg %%r0,2b\n\t"
|
"brctg %%r0,2b\n\t"
|
||||||
|
|
||||||
"3:\n\t"
|
"3:\n\t"
|
||||||
"nop"
|
"nop"
|
||||||
:
|
: "+m"(*(struct { FLOAT x[n]; } *) y)
|
||||||
:"r"(n),"ZR"((const FLOAT (*)[n])a0),"ZQ"((const FLOAT (*)[1])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha)
|
: [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) a0),[a0] "a"(a0),
|
||||||
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
"m"(*(const FLOAT (*)[1]) x),[x] "a"(x),[alpha] "Q"(*alpha),
|
||||||
);
|
[n] "r"(n)
|
||||||
|
: "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21",
|
||||||
|
"v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
|
||||||
|
"v31");
|
||||||
}
|
}
|
||||||
|
|
||||||
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
|
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) {
|
||||||
{
|
|
||||||
BLASLONG i;
|
BLASLONG i;
|
||||||
for (i = 0; i < n; i++)
|
for (i = 0; i < n; i++) {
|
||||||
{
|
|
||||||
*dest += src[i];
|
*dest += src[i];
|
||||||
dest += inc_dest;
|
dest += inc_dest;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
|
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a,
|
||||||
{
|
BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
|
||||||
|
FLOAT *buffer) {
|
||||||
BLASLONG i;
|
BLASLONG i;
|
||||||
FLOAT *a_ptr;
|
FLOAT *a_ptr;
|
||||||
FLOAT *x_ptr;
|
FLOAT *x_ptr;
|
||||||
|
|
@ -400,8 +365,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
|
||||||
BLASLONG lda4 = lda << 2;
|
BLASLONG lda4 = lda << 2;
|
||||||
FLOAT xbuffer[8], *ybuffer;
|
FLOAT xbuffer[8], *ybuffer;
|
||||||
|
|
||||||
if ( m < 1 ) return(0);
|
if (m < 1)
|
||||||
if ( n < 1 ) return(0);
|
return (0);
|
||||||
|
if (n < 1)
|
||||||
|
return (0);
|
||||||
|
|
||||||
ybuffer = buffer;
|
ybuffer = buffer;
|
||||||
|
|
||||||
|
|
@ -416,13 +383,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
|
||||||
|
|
||||||
BLASLONG NB = NBMAX;
|
BLASLONG NB = NBMAX;
|
||||||
|
|
||||||
while ( NB == NBMAX )
|
while (NB == NBMAX) {
|
||||||
{
|
|
||||||
|
|
||||||
m1 -= NB;
|
m1 -= NB;
|
||||||
if ( m1 < 0)
|
if (m1 < 0) {
|
||||||
{
|
if (m2 == 0)
|
||||||
if ( m2 == 0 ) break;
|
break;
|
||||||
NB = m2;
|
NB = m2;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -439,12 +405,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
|
||||||
else
|
else
|
||||||
ybuffer = y_ptr;
|
ybuffer = y_ptr;
|
||||||
|
|
||||||
if ( inc_x == 1 )
|
if (inc_x == 1) {
|
||||||
{
|
|
||||||
|
|
||||||
|
for (i = 0; i < n1; i++) {
|
||||||
for( i = 0; i < n1 ; i++)
|
|
||||||
{
|
|
||||||
sgemv_kernel_4x4(NB, ap, x_ptr, ybuffer, &alpha);
|
sgemv_kernel_4x4(NB, ap, x_ptr, ybuffer, &alpha);
|
||||||
ap[0] += lda4;
|
ap[0] += lda4;
|
||||||
ap[1] += lda4;
|
ap[1] += lda4;
|
||||||
|
|
@ -454,29 +417,22 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
|
||||||
x_ptr += 4;
|
x_ptr += 4;
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( n2 & 2 )
|
if (n2 & 2) {
|
||||||
{
|
|
||||||
sgemv_kernel_4x2(NB, ap, x_ptr, ybuffer, &alpha);
|
sgemv_kernel_4x2(NB, ap, x_ptr, ybuffer, &alpha);
|
||||||
a_ptr += lda * 2;
|
a_ptr += lda * 2;
|
||||||
x_ptr += 2;
|
x_ptr += 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (n2 & 1) {
|
||||||
if ( n2 & 1 )
|
|
||||||
{
|
|
||||||
sgemv_kernel_4x1(NB, a_ptr, x_ptr, ybuffer, &alpha);
|
sgemv_kernel_4x1(NB, a_ptr, x_ptr, ybuffer, &alpha);
|
||||||
/* a_ptr += lda;
|
/* a_ptr += lda;
|
||||||
x_ptr += 1; */
|
x_ptr += 1; */
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
} else {
|
||||||
|
|
||||||
}
|
for (i = 0; i < n1; i++) {
|
||||||
else
|
|
||||||
{
|
|
||||||
|
|
||||||
for( i = 0; i < n1 ; i++)
|
|
||||||
{
|
|
||||||
xbuffer[0] = x_ptr[0];
|
xbuffer[0] = x_ptr[0];
|
||||||
x_ptr += inc_x;
|
x_ptr += inc_x;
|
||||||
xbuffer[1] = x_ptr[0];
|
xbuffer[1] = x_ptr[0];
|
||||||
|
|
@ -493,8 +449,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
|
||||||
a_ptr += lda4;
|
a_ptr += lda4;
|
||||||
}
|
}
|
||||||
|
|
||||||
for( i = 0; i < n2 ; i++)
|
for (i = 0; i < n2; i++) {
|
||||||
{
|
|
||||||
xbuffer[0] = x_ptr[0];
|
xbuffer[0] = x_ptr[0];
|
||||||
x_ptr += inc_x;
|
x_ptr += inc_x;
|
||||||
sgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, &alpha);
|
sgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, &alpha);
|
||||||
|
|
@ -505,30 +460,26 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
|
||||||
}
|
}
|
||||||
|
|
||||||
a += NB;
|
a += NB;
|
||||||
if ( inc_y != 1 )
|
if (inc_y != 1) {
|
||||||
{
|
|
||||||
add_y(NB, ybuffer, y_ptr, inc_y);
|
add_y(NB, ybuffer, y_ptr, inc_y);
|
||||||
y_ptr += NB * inc_y;
|
y_ptr += NB * inc_y;
|
||||||
}
|
} else
|
||||||
else
|
|
||||||
y_ptr += NB;
|
y_ptr += NB;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( m3 == 0 ) return(0);
|
if (m3 == 0)
|
||||||
|
return (0);
|
||||||
|
|
||||||
if ( m3 == 3 )
|
if (m3 == 3) {
|
||||||
{
|
|
||||||
a_ptr = a;
|
a_ptr = a;
|
||||||
x_ptr = x;
|
x_ptr = x;
|
||||||
FLOAT temp0 = 0.0;
|
FLOAT temp0 = 0.0;
|
||||||
FLOAT temp1 = 0.0;
|
FLOAT temp1 = 0.0;
|
||||||
FLOAT temp2 = 0.0;
|
FLOAT temp2 = 0.0;
|
||||||
if ( lda == 3 && inc_x ==1 )
|
if (lda == 3 && inc_x == 1) {
|
||||||
{
|
|
||||||
|
|
||||||
for( i = 0; i < ( n & -4 ); i+=4 )
|
for (i = 0; i < (n & -4); i += 4) {
|
||||||
{
|
|
||||||
|
|
||||||
temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1];
|
temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1];
|
||||||
temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1];
|
temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1];
|
||||||
|
|
@ -542,8 +493,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
|
||||||
x_ptr += 4;
|
x_ptr += 4;
|
||||||
}
|
}
|
||||||
|
|
||||||
for( ; i < n; i++ )
|
for (; i < n; i++) {
|
||||||
{
|
|
||||||
temp0 += a_ptr[0] * x_ptr[0];
|
temp0 += a_ptr[0] * x_ptr[0];
|
||||||
temp1 += a_ptr[1] * x_ptr[0];
|
temp1 += a_ptr[1] * x_ptr[0];
|
||||||
temp2 += a_ptr[2] * x_ptr[0];
|
temp2 += a_ptr[2] * x_ptr[0];
|
||||||
|
|
@ -551,19 +501,15 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
|
||||||
x_ptr++;
|
x_ptr++;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
|
|
||||||
for( i = 0; i < n; i++ )
|
for (i = 0; i < n; i++) {
|
||||||
{
|
|
||||||
temp0 += a_ptr[0] * x_ptr[0];
|
temp0 += a_ptr[0] * x_ptr[0];
|
||||||
temp1 += a_ptr[1] * x_ptr[0];
|
temp1 += a_ptr[1] * x_ptr[0];
|
||||||
temp2 += a_ptr[2] * x_ptr[0];
|
temp2 += a_ptr[2] * x_ptr[0];
|
||||||
a_ptr += lda;
|
a_ptr += lda;
|
||||||
x_ptr += inc_x;
|
x_ptr += inc_x;
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
@ -575,18 +521,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
|
||||||
return (0);
|
return (0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (m3 == 2) {
|
||||||
if ( m3 == 2 )
|
|
||||||
{
|
|
||||||
a_ptr = a;
|
a_ptr = a;
|
||||||
x_ptr = x;
|
x_ptr = x;
|
||||||
FLOAT temp0 = 0.0;
|
FLOAT temp0 = 0.0;
|
||||||
FLOAT temp1 = 0.0;
|
FLOAT temp1 = 0.0;
|
||||||
if ( lda == 2 && inc_x ==1 )
|
if (lda == 2 && inc_x == 1) {
|
||||||
{
|
|
||||||
|
|
||||||
for( i = 0; i < (n & -4) ; i+=4 )
|
for (i = 0; i < (n & -4); i += 4) {
|
||||||
{
|
|
||||||
temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1];
|
temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1];
|
||||||
temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1];
|
temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1];
|
||||||
temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3];
|
temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3];
|
||||||
|
|
@ -596,27 +538,21 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
for (; i < n; i++) {
|
||||||
for( ; i < n; i++ )
|
|
||||||
{
|
|
||||||
temp0 += a_ptr[0] * x_ptr[0];
|
temp0 += a_ptr[0] * x_ptr[0];
|
||||||
temp1 += a_ptr[1] * x_ptr[0];
|
temp1 += a_ptr[1] * x_ptr[0];
|
||||||
a_ptr += 2;
|
a_ptr += 2;
|
||||||
x_ptr++;
|
x_ptr++;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
|
|
||||||
for( i = 0; i < n; i++ )
|
for (i = 0; i < n; i++) {
|
||||||
{
|
|
||||||
temp0 += a_ptr[0] * x_ptr[0];
|
temp0 += a_ptr[0] * x_ptr[0];
|
||||||
temp1 += a_ptr[1] * x_ptr[0];
|
temp1 += a_ptr[1] * x_ptr[0];
|
||||||
a_ptr += lda;
|
a_ptr += lda;
|
||||||
x_ptr += inc_x;
|
x_ptr += inc_x;
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
@ -626,31 +562,27 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
|
||||||
return (0);
|
return (0);
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( m3 == 1 )
|
if (m3 == 1) {
|
||||||
{
|
|
||||||
a_ptr = a;
|
a_ptr = a;
|
||||||
x_ptr = x;
|
x_ptr = x;
|
||||||
FLOAT temp = 0.0;
|
FLOAT temp = 0.0;
|
||||||
if ( lda == 1 && inc_x ==1 )
|
if (lda == 1 && inc_x == 1) {
|
||||||
{
|
|
||||||
|
|
||||||
for( i = 0; i < (n & -4); i+=4 )
|
for (i = 0; i < (n & -4); i += 4) {
|
||||||
{
|
temp +=
|
||||||
temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3];
|
a_ptr[i] * x_ptr[i] + a_ptr[i + 1] * x_ptr[i + 1] + a_ptr[i +
|
||||||
|
2] *
|
||||||
|
x_ptr[i + 2] + a_ptr[i + 3] * x_ptr[i + 3];
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
for( ; i < n; i++ )
|
for (; i < n; i++) {
|
||||||
{
|
|
||||||
temp += a_ptr[i] * x_ptr[i];
|
temp += a_ptr[i] * x_ptr[i];
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
|
|
||||||
for( i = 0; i < n; i++ )
|
for (i = 0; i < n; i++) {
|
||||||
{
|
|
||||||
temp += a_ptr[0] * x_ptr[0];
|
temp += a_ptr[0] * x_ptr[0];
|
||||||
a_ptr += lda;
|
a_ptr += lda;
|
||||||
x_ptr += inc_x;
|
x_ptr += inc_x;
|
||||||
|
|
@ -661,8 +593,5 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
|
||||||
return (0);
|
return (0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
return (0);
|
return (0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
File diff suppressed because it is too large
Load Diff
|
|
@ -1,5 +1,5 @@
|
||||||
/***************************************************************************
|
/***************************************************************************
|
||||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
modification, are permitted provided that the following conditions are
|
modification, are permitted provided that the following conditions are
|
||||||
|
|
@ -27,34 +27,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
static FLOAT smax_kernel_64(BLASLONG n, FLOAT *x)
|
static FLOAT smax_kernel_64(BLASLONG n, FLOAT *x) {
|
||||||
{
|
|
||||||
FLOAT max;
|
FLOAT max;
|
||||||
|
|
||||||
__asm__ volatile (
|
__asm__("vl %%v0,0(%[x])\n\t"
|
||||||
"vl %%v0,0(%2) \n\t"
|
"srlg %[n],%[n],6\n\t"
|
||||||
"srlg %%r0,%1,6 \n\t"
|
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 1, 1024(%%r1,%2) \n\t"
|
"pfd 1, 1024(%%r1,%[x])\n\t"
|
||||||
|
"vl %%v16,0(%%r1,%[x])\n\t"
|
||||||
"vl %%v16,0(%%r1,%2) \n\t"
|
"vl %%v17,16(%%r1,%[x])\n\t"
|
||||||
"vl %%v17,16(%%r1,%2) \n\t"
|
"vl %%v18,32(%%r1,%[x])\n\t"
|
||||||
"vl %%v18,32(%%r1,%2) \n\t"
|
"vl %%v19,48(%%r1,%[x])\n\t"
|
||||||
"vl %%v19,48(%%r1,%2) \n\t"
|
"vl %%v20,64(%%r1,%[x])\n\t"
|
||||||
"vl %%v20,64(%%r1,%2) \n\t"
|
"vl %%v21,80(%%r1,%[x])\n\t"
|
||||||
"vl %%v21,80(%%r1,%2) \n\t"
|
"vl %%v22,96(%%r1,%[x])\n\t"
|
||||||
"vl %%v22,96(%%r1,%2) \n\t"
|
"vl %%v23,112(%%r1,%[x])\n\t"
|
||||||
"vl %%v23,112(%%r1,%2) \n\t"
|
"vl %%v24,128(%%r1,%[x])\n\t"
|
||||||
"vl %%v24,128(%%r1,%2) \n\t"
|
"vl %%v25,144(%%r1,%[x])\n\t"
|
||||||
"vl %%v25,144(%%r1,%2) \n\t"
|
"vl %%v26,160(%%r1,%[x])\n\t"
|
||||||
"vl %%v26,160(%%r1,%2) \n\t"
|
"vl %%v27,176(%%r1,%[x])\n\t"
|
||||||
"vl %%v27,176(%%r1,%2) \n\t"
|
"vl %%v28,192(%%r1,%[x])\n\t"
|
||||||
"vl %%v28,192(%%r1,%2) \n\t"
|
"vl %%v29,208(%%r1,%[x])\n\t"
|
||||||
"vl %%v29,208(%%r1,%2) \n\t"
|
"vl %%v30,224(%%r1,%[x])\n\t"
|
||||||
"vl %%v30,224(%%r1,%2) \n\t"
|
"vl %%v31,240(%%r1,%[x])\n\t"
|
||||||
"vl %%v31,240(%%r1,%2) \n\t"
|
|
||||||
|
|
||||||
"vfmaxsb %%v16,%%v16,%%v24,0\n\t"
|
"vfmaxsb %%v16,%%v16,%%v24,0\n\t"
|
||||||
"vfmaxsb %%v17,%%v17,%%v25,0\n\t"
|
"vfmaxsb %%v17,%%v17,%%v25,0\n\t"
|
||||||
"vfmaxsb %%v18,%%v18,%%v26,0\n\t"
|
"vfmaxsb %%v18,%%v18,%%v26,0\n\t"
|
||||||
|
|
@ -63,32 +59,25 @@ static FLOAT smax_kernel_64(BLASLONG n, FLOAT *x)
|
||||||
"vfmaxsb %%v21,%%v21,%%v29,0\n\t"
|
"vfmaxsb %%v21,%%v21,%%v29,0\n\t"
|
||||||
"vfmaxsb %%v22,%%v22,%%v30,0\n\t"
|
"vfmaxsb %%v22,%%v22,%%v30,0\n\t"
|
||||||
"vfmaxsb %%v23,%%v23,%%v31,0\n\t"
|
"vfmaxsb %%v23,%%v23,%%v31,0\n\t"
|
||||||
|
|
||||||
"vfmaxsb %%v16,%%v16,%%v20,0\n\t"
|
"vfmaxsb %%v16,%%v16,%%v20,0\n\t"
|
||||||
"vfmaxsb %%v17,%%v17,%%v21,0\n\t"
|
"vfmaxsb %%v17,%%v17,%%v21,0\n\t"
|
||||||
"vfmaxsb %%v18,%%v18,%%v22,0\n\t"
|
"vfmaxsb %%v18,%%v18,%%v22,0\n\t"
|
||||||
"vfmaxsb %%v19,%%v19,%%v23,0\n\t"
|
"vfmaxsb %%v19,%%v19,%%v23,0\n\t"
|
||||||
|
|
||||||
"vfmaxsb %%v16,%%v16,%%v18,0\n\t"
|
"vfmaxsb %%v16,%%v16,%%v18,0\n\t"
|
||||||
"vfmaxsb %%v17,%%v17,%%v19,0\n\t"
|
"vfmaxsb %%v17,%%v17,%%v19,0\n\t"
|
||||||
|
|
||||||
"vfmaxsb %%v16,%%v16,%%v17,0\n\t"
|
"vfmaxsb %%v16,%%v16,%%v17,0\n\t"
|
||||||
|
|
||||||
"vfmaxsb %%v0,%%v0,%%v16,0\n\t"
|
"vfmaxsb %%v0,%%v0,%%v16,0\n\t"
|
||||||
|
|
||||||
"agfi %%r1, 256\n\t"
|
"agfi %%r1, 256\n\t"
|
||||||
"brctg %%r0, 0b \n\t"
|
"brctg %[n], 0b\n\t"
|
||||||
|
|
||||||
"veslg %%v16,%%v0,32\n\t"
|
"veslg %%v16,%%v0,32\n\t"
|
||||||
"vfmaxsb %%v0,%%v0,%%v16,0\n\t"
|
"vfmaxsb %%v0,%%v0,%%v16,0\n\t"
|
||||||
|
|
||||||
"vrepf %%v16,%%v0,2\n\t"
|
"vrepf %%v16,%%v0,2\n\t"
|
||||||
"wfmaxsb %%v0,%%v0,%%v16,0\n\t"
|
"wfmaxsb %%v0,%%v0,%%v16,0\n\t"
|
||||||
"ler %0,%%f0 "
|
"ler %[max],%%f0"
|
||||||
:"=f"(max)
|
: [max] "=f"(max),[n] "+&r"(n)
|
||||||
:"r"(n),"ZR"((const FLOAT (*)[n])x)
|
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
|
||||||
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
|
||||||
);
|
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
|
||||||
|
|
||||||
return max;
|
return max;
|
||||||
}
|
}
|
||||||
|
|
@ -98,7 +87,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
BLASLONG j = 0;
|
BLASLONG j = 0;
|
||||||
FLOAT maxf = 0.0;
|
FLOAT maxf = 0.0;
|
||||||
|
|
||||||
if (n <= 0 || inc_x <= 0) return (maxf);
|
if (n <= 0 || inc_x <= 0)
|
||||||
|
return (maxf);
|
||||||
|
|
||||||
if (inc_x == 1) {
|
if (inc_x == 1) {
|
||||||
|
|
||||||
|
|
@ -108,9 +98,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
maxf = smax_kernel_64(n1, x);
|
maxf = smax_kernel_64(n1, x);
|
||||||
|
|
||||||
i = n1;
|
i = n1;
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
maxf = x[0];
|
maxf = x[0];
|
||||||
i++;
|
i++;
|
||||||
}
|
}
|
||||||
|
|
@ -149,7 +137,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
while (j < n) {
|
while (j < n) {
|
||||||
if (x[i] > maxf) {
|
if (x[i] > maxf) {
|
||||||
maxf = x[i];
|
maxf = x[i];
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/***************************************************************************
|
/***************************************************************************
|
||||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
modification, are permitted provided that the following conditions are
|
modification, are permitted provided that the following conditions are
|
||||||
|
|
@ -27,34 +27,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
static FLOAT smin_kernel_64(BLASLONG n, FLOAT *x)
|
static FLOAT smin_kernel_64(BLASLONG n, FLOAT *x) {
|
||||||
{
|
|
||||||
FLOAT min;
|
FLOAT min;
|
||||||
|
|
||||||
__asm__ volatile (
|
__asm__("vl %%v0,0(%[x])\n\t"
|
||||||
"vl %%v0,0(%2) \n\t"
|
"srlg %[n],%[n],6\n\t"
|
||||||
"srlg %%r0,%1,6 \n\t"
|
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 1, 1024(%%r1,%2) \n\t"
|
"pfd 1, 1024(%%r1,%[x])\n\t"
|
||||||
|
"vl %%v16,0(%%r1,%[x])\n\t"
|
||||||
"vl %%v16,0(%%r1,%2) \n\t"
|
"vl %%v17,16(%%r1,%[x])\n\t"
|
||||||
"vl %%v17,16(%%r1,%2) \n\t"
|
"vl %%v18,32(%%r1,%[x])\n\t"
|
||||||
"vl %%v18,32(%%r1,%2) \n\t"
|
"vl %%v19,48(%%r1,%[x])\n\t"
|
||||||
"vl %%v19,48(%%r1,%2) \n\t"
|
"vl %%v20,64(%%r1,%[x])\n\t"
|
||||||
"vl %%v20,64(%%r1,%2) \n\t"
|
"vl %%v21,80(%%r1,%[x])\n\t"
|
||||||
"vl %%v21,80(%%r1,%2) \n\t"
|
"vl %%v22,96(%%r1,%[x])\n\t"
|
||||||
"vl %%v22,96(%%r1,%2) \n\t"
|
"vl %%v23,112(%%r1,%[x])\n\t"
|
||||||
"vl %%v23,112(%%r1,%2) \n\t"
|
"vl %%v24,128(%%r1,%[x])\n\t"
|
||||||
"vl %%v24,128(%%r1,%2) \n\t"
|
"vl %%v25,144(%%r1,%[x])\n\t"
|
||||||
"vl %%v25,144(%%r1,%2) \n\t"
|
"vl %%v26,160(%%r1,%[x])\n\t"
|
||||||
"vl %%v26,160(%%r1,%2) \n\t"
|
"vl %%v27,176(%%r1,%[x])\n\t"
|
||||||
"vl %%v27,176(%%r1,%2) \n\t"
|
"vl %%v28,192(%%r1,%[x])\n\t"
|
||||||
"vl %%v28,192(%%r1,%2) \n\t"
|
"vl %%v29,208(%%r1,%[x])\n\t"
|
||||||
"vl %%v29,208(%%r1,%2) \n\t"
|
"vl %%v30,224(%%r1,%[x])\n\t"
|
||||||
"vl %%v30,224(%%r1,%2) \n\t"
|
"vl %%v31,240(%%r1,%[x])\n\t"
|
||||||
"vl %%v31,240(%%r1,%2) \n\t"
|
|
||||||
|
|
||||||
"vfminsb %%v16,%%v16,%%v24,0\n\t"
|
"vfminsb %%v16,%%v16,%%v24,0\n\t"
|
||||||
"vfminsb %%v17,%%v17,%%v25,0\n\t"
|
"vfminsb %%v17,%%v17,%%v25,0\n\t"
|
||||||
"vfminsb %%v18,%%v18,%%v26,0\n\t"
|
"vfminsb %%v18,%%v18,%%v26,0\n\t"
|
||||||
|
|
@ -63,32 +59,25 @@ static FLOAT smin_kernel_64(BLASLONG n, FLOAT *x)
|
||||||
"vfminsb %%v21,%%v21,%%v29,0\n\t"
|
"vfminsb %%v21,%%v21,%%v29,0\n\t"
|
||||||
"vfminsb %%v22,%%v22,%%v30,0\n\t"
|
"vfminsb %%v22,%%v22,%%v30,0\n\t"
|
||||||
"vfminsb %%v23,%%v23,%%v31,0\n\t"
|
"vfminsb %%v23,%%v23,%%v31,0\n\t"
|
||||||
|
|
||||||
"vfminsb %%v16,%%v16,%%v20,0\n\t"
|
"vfminsb %%v16,%%v16,%%v20,0\n\t"
|
||||||
"vfminsb %%v17,%%v17,%%v21,0\n\t"
|
"vfminsb %%v17,%%v17,%%v21,0\n\t"
|
||||||
"vfminsb %%v18,%%v18,%%v22,0\n\t"
|
"vfminsb %%v18,%%v18,%%v22,0\n\t"
|
||||||
"vfminsb %%v19,%%v19,%%v23,0\n\t"
|
"vfminsb %%v19,%%v19,%%v23,0\n\t"
|
||||||
|
|
||||||
"vfminsb %%v16,%%v16,%%v18,0\n\t"
|
"vfminsb %%v16,%%v16,%%v18,0\n\t"
|
||||||
"vfminsb %%v17,%%v17,%%v19,0\n\t"
|
"vfminsb %%v17,%%v17,%%v19,0\n\t"
|
||||||
|
|
||||||
"vfminsb %%v16,%%v16,%%v17,0\n\t"
|
"vfminsb %%v16,%%v16,%%v17,0\n\t"
|
||||||
|
|
||||||
"vfminsb %%v0,%%v0,%%v16,0\n\t"
|
"vfminsb %%v0,%%v0,%%v16,0\n\t"
|
||||||
|
|
||||||
"agfi %%r1, 256\n\t"
|
"agfi %%r1, 256\n\t"
|
||||||
"brctg %%r0, 0b \n\t"
|
"brctg %[n], 0b\n\t"
|
||||||
|
|
||||||
"veslg %%v16,%%v0,32\n\t"
|
"veslg %%v16,%%v0,32\n\t"
|
||||||
"vfminsb %%v0,%%v0,%%v16,0\n\t"
|
"vfminsb %%v0,%%v0,%%v16,0\n\t"
|
||||||
|
|
||||||
"vrepf %%v16,%%v0,2\n\t"
|
"vrepf %%v16,%%v0,2\n\t"
|
||||||
"wfminsb %%v0,%%v0,%%v16,0\n\t"
|
"wfminsb %%v0,%%v0,%%v16,0\n\t"
|
||||||
"ler %0,%%f0 "
|
"ler %[min],%%f0"
|
||||||
:"=f"(min)
|
: [min] "=f"(min),[n] "+&r"(n)
|
||||||
:"r"(n),"ZR"((const FLOAT (*)[n])x)
|
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
|
||||||
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
|
||||||
);
|
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
|
||||||
|
|
||||||
return min;
|
return min;
|
||||||
}
|
}
|
||||||
|
|
@ -98,7 +87,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
BLASLONG j = 0;
|
BLASLONG j = 0;
|
||||||
FLOAT minf = 0.0;
|
FLOAT minf = 0.0;
|
||||||
|
|
||||||
if (n <= 0 || inc_x <= 0) return (minf);
|
if (n <= 0 || inc_x <= 0)
|
||||||
|
return (minf);
|
||||||
|
|
||||||
if (inc_x == 1) {
|
if (inc_x == 1) {
|
||||||
|
|
||||||
|
|
@ -108,9 +98,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
minf = smin_kernel_64(n1, x);
|
minf = smin_kernel_64(n1, x);
|
||||||
|
|
||||||
i = n1;
|
i = n1;
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
minf = x[0];
|
minf = x[0];
|
||||||
i++;
|
i++;
|
||||||
}
|
}
|
||||||
|
|
@ -149,7 +137,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
while (j < n) {
|
while (j < n) {
|
||||||
if (x[i] < minf) {
|
if (x[i] < minf) {
|
||||||
minf = x[i];
|
minf = x[i];
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/***************************************************************************
|
/***************************************************************************
|
||||||
Copyright (c) 2013-2018, The OpenBLAS Project
|
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
modification, are permitted provided that the following conditions are
|
modification, are permitted provided that the following conditions are
|
||||||
|
|
@ -27,25 +27,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
static void srot_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
|
static void srot_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) {
|
||||||
{
|
__asm__("vlrepf %%v0,%[c]\n\t"
|
||||||
__asm__ (
|
"vlrepf %%v1,%[s]\n\t"
|
||||||
"vlrepf %%v0,%3 \n\t"
|
"srlg %[n],%[n],6\n\t"
|
||||||
"vlrepf %%v1,%4 \n\t"
|
|
||||||
"srlg %%r0,%0,6 \n\t"
|
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 2, 1024(%%r1,%1) \n\t"
|
"pfd 2, 1024(%%r1,%[x])\n\t"
|
||||||
"pfd 2, 1024(%%r1,%2) \n\t"
|
"pfd 2, 1024(%%r1,%[y])\n\t"
|
||||||
"vl %%v24, 0(%%r1,%1) \n\t"
|
"vl %%v24, 0(%%r1,%[x])\n\t"
|
||||||
"vl %%v25, 16(%%r1,%1) \n\t"
|
"vl %%v25, 16(%%r1,%[x])\n\t"
|
||||||
"vl %%v26, 32(%%r1,%1) \n\t"
|
"vl %%v26, 32(%%r1,%[x])\n\t"
|
||||||
"vl %%v27, 48(%%r1,%1) \n\t"
|
"vl %%v27, 48(%%r1,%[x])\n\t"
|
||||||
"vl %%v16, 0(%%r1,%2) \n\t"
|
"vl %%v16, 0(%%r1,%[y])\n\t"
|
||||||
"vl %%v17, 16(%%r1,%2) \n\t"
|
"vl %%v17, 16(%%r1,%[y])\n\t"
|
||||||
"vl %%v18, 32(%%r1,%2) \n\t"
|
"vl %%v18, 32(%%r1,%[y])\n\t"
|
||||||
"vl %%v19, 48(%%r1,%2) \n\t"
|
"vl %%v19, 48(%%r1,%[y])\n\t"
|
||||||
|
|
||||||
"vfmsb %%v28,%%v24,%%v0\n\t"
|
"vfmsb %%v28,%%v24,%%v0\n\t"
|
||||||
"vfmsb %%v29,%%v25,%%v0\n\t"
|
"vfmsb %%v29,%%v25,%%v0\n\t"
|
||||||
"vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
|
"vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
|
||||||
|
|
@ -63,25 +60,22 @@ static void srot_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
|
||||||
"vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
|
"vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
|
||||||
"vfmasb %%v31,%%v19,%%v1,%%v31\n\t"
|
"vfmasb %%v31,%%v19,%%v1,%%v31\n\t"
|
||||||
"vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
|
"vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
|
||||||
|
"vst %%v28, 0(%%r1,%[x])\n\t"
|
||||||
"vst %%v28, 0(%%r1,%1) \n\t"
|
"vst %%v29, 16(%%r1,%[x])\n\t"
|
||||||
"vst %%v29, 16(%%r1,%1) \n\t"
|
"vst %%v30, 32(%%r1,%[x])\n\t"
|
||||||
"vst %%v30, 32(%%r1,%1) \n\t"
|
"vst %%v31, 48(%%r1,%[x])\n\t"
|
||||||
"vst %%v31, 48(%%r1,%1) \n\t"
|
"vst %%v20, 0(%%r1,%[y])\n\t"
|
||||||
"vst %%v20, 0(%%r1,%2) \n\t"
|
"vst %%v21, 16(%%r1,%[y])\n\t"
|
||||||
"vst %%v21, 16(%%r1,%2) \n\t"
|
"vst %%v22, 32(%%r1,%[y])\n\t"
|
||||||
"vst %%v22, 32(%%r1,%2) \n\t"
|
"vst %%v23, 48(%%r1,%[y])\n\t"
|
||||||
"vst %%v23, 48(%%r1,%2) \n\t"
|
"vl %%v24, 64(%%r1,%[x])\n\t"
|
||||||
|
"vl %%v25, 80(%%r1,%[x])\n\t"
|
||||||
"vl %%v24, 64(%%r1,%1) \n\t"
|
"vl %%v26, 96(%%r1,%[x])\n\t"
|
||||||
"vl %%v25, 80(%%r1,%1) \n\t"
|
"vl %%v27, 112(%%r1,%[x])\n\t"
|
||||||
"vl %%v26, 96(%%r1,%1) \n\t"
|
"vl %%v16, 64(%%r1,%[y])\n\t"
|
||||||
"vl %%v27, 112(%%r1,%1) \n\t"
|
"vl %%v17, 80(%%r1,%[y])\n\t"
|
||||||
"vl %%v16, 64(%%r1,%2) \n\t"
|
"vl %%v18, 96(%%r1,%[y])\n\t"
|
||||||
"vl %%v17, 80(%%r1,%2) \n\t"
|
"vl %%v19, 112(%%r1,%[y])\n\t"
|
||||||
"vl %%v18, 96(%%r1,%2) \n\t"
|
|
||||||
"vl %%v19, 112(%%r1,%2) \n\t"
|
|
||||||
|
|
||||||
"vfmsb %%v28,%%v24,%%v0\n\t"
|
"vfmsb %%v28,%%v24,%%v0\n\t"
|
||||||
"vfmsb %%v29,%%v25,%%v0\n\t"
|
"vfmsb %%v29,%%v25,%%v0\n\t"
|
||||||
"vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
|
"vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
|
||||||
|
|
@ -99,25 +93,22 @@ static void srot_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
|
||||||
"vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
|
"vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
|
||||||
"vfmasb %%v31,%%v19,%%v1,%%v31\n\t"
|
"vfmasb %%v31,%%v19,%%v1,%%v31\n\t"
|
||||||
"vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
|
"vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
|
||||||
|
"vst %%v28, 64(%%r1,%[x])\n\t"
|
||||||
"vst %%v28, 64(%%r1,%1) \n\t"
|
"vst %%v29, 80(%%r1,%[x])\n\t"
|
||||||
"vst %%v29, 80(%%r1,%1) \n\t"
|
"vst %%v30, 96(%%r1,%[x])\n\t"
|
||||||
"vst %%v30, 96(%%r1,%1) \n\t"
|
"vst %%v31, 112(%%r1,%[x])\n\t"
|
||||||
"vst %%v31, 112(%%r1,%1) \n\t"
|
"vst %%v20, 64(%%r1,%[y])\n\t"
|
||||||
"vst %%v20, 64(%%r1,%2) \n\t"
|
"vst %%v21, 80(%%r1,%[y])\n\t"
|
||||||
"vst %%v21, 80(%%r1,%2) \n\t"
|
"vst %%v22, 96(%%r1,%[y])\n\t"
|
||||||
"vst %%v22, 96(%%r1,%2) \n\t"
|
"vst %%v23, 112(%%r1,%[y])\n\t"
|
||||||
"vst %%v23, 112(%%r1,%2) \n\t"
|
"vl %%v24, 128(%%r1,%[x])\n\t"
|
||||||
|
"vl %%v25, 144(%%r1,%[x])\n\t"
|
||||||
"vl %%v24, 128(%%r1,%1) \n\t"
|
"vl %%v26, 160(%%r1,%[x])\n\t"
|
||||||
"vl %%v25, 144(%%r1,%1) \n\t"
|
"vl %%v27, 176(%%r1,%[x])\n\t"
|
||||||
"vl %%v26, 160(%%r1,%1) \n\t"
|
"vl %%v16, 128(%%r1,%[y])\n\t"
|
||||||
"vl %%v27, 176(%%r1,%1) \n\t"
|
"vl %%v17, 144(%%r1,%[y])\n\t"
|
||||||
"vl %%v16, 128(%%r1,%2) \n\t"
|
"vl %%v18, 160(%%r1,%[y])\n\t"
|
||||||
"vl %%v17, 144(%%r1,%2) \n\t"
|
"vl %%v19, 176(%%r1,%[y])\n\t"
|
||||||
"vl %%v18, 160(%%r1,%2) \n\t"
|
|
||||||
"vl %%v19, 176(%%r1,%2) \n\t"
|
|
||||||
|
|
||||||
"vfmsb %%v28,%%v24,%%v0\n\t"
|
"vfmsb %%v28,%%v24,%%v0\n\t"
|
||||||
"vfmsb %%v29,%%v25,%%v0\n\t"
|
"vfmsb %%v29,%%v25,%%v0\n\t"
|
||||||
"vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
|
"vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
|
||||||
|
|
@ -135,25 +126,22 @@ static void srot_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
|
||||||
"vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
|
"vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
|
||||||
"vfmasb %%v31,%%v19,%%v1,%%v31\n\t"
|
"vfmasb %%v31,%%v19,%%v1,%%v31\n\t"
|
||||||
"vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
|
"vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
|
||||||
|
"vst %%v28, 128(%%r1,%[x])\n\t"
|
||||||
"vst %%v28, 128(%%r1,%1) \n\t"
|
"vst %%v29, 144(%%r1,%[x])\n\t"
|
||||||
"vst %%v29, 144(%%r1,%1) \n\t"
|
"vst %%v30, 160(%%r1,%[x])\n\t"
|
||||||
"vst %%v30, 160(%%r1,%1) \n\t"
|
"vst %%v31, 176(%%r1,%[x])\n\t"
|
||||||
"vst %%v31, 176(%%r1,%1) \n\t"
|
"vst %%v20, 128(%%r1,%[y])\n\t"
|
||||||
"vst %%v20, 128(%%r1,%2) \n\t"
|
"vst %%v21, 144(%%r1,%[y])\n\t"
|
||||||
"vst %%v21, 144(%%r1,%2) \n\t"
|
"vst %%v22, 160(%%r1,%[y])\n\t"
|
||||||
"vst %%v22, 160(%%r1,%2) \n\t"
|
"vst %%v23, 176(%%r1,%[y])\n\t"
|
||||||
"vst %%v23, 176(%%r1,%2) \n\t"
|
"vl %%v24, 192(%%r1,%[x])\n\t"
|
||||||
|
"vl %%v25, 208(%%r1,%[x])\n\t"
|
||||||
"vl %%v24, 192(%%r1,%1) \n\t"
|
"vl %%v26, 224(%%r1,%[x])\n\t"
|
||||||
"vl %%v25, 208(%%r1,%1) \n\t"
|
"vl %%v27, 240(%%r1,%[x])\n\t"
|
||||||
"vl %%v26, 224(%%r1,%1) \n\t"
|
"vl %%v16, 192(%%r1,%[y])\n\t"
|
||||||
"vl %%v27, 240(%%r1,%1) \n\t"
|
"vl %%v17, 208(%%r1,%[y])\n\t"
|
||||||
"vl %%v16, 192(%%r1,%2) \n\t"
|
"vl %%v18, 224(%%r1,%[y])\n\t"
|
||||||
"vl %%v17, 208(%%r1,%2) \n\t"
|
"vl %%v19, 240(%%r1,%[y])\n\t"
|
||||||
"vl %%v18, 224(%%r1,%2) \n\t"
|
|
||||||
"vl %%v19, 240(%%r1,%2) \n\t"
|
|
||||||
|
|
||||||
"vfmsb %%v28,%%v24,%%v0\n\t"
|
"vfmsb %%v28,%%v24,%%v0\n\t"
|
||||||
"vfmsb %%v29,%%v25,%%v0\n\t"
|
"vfmsb %%v29,%%v25,%%v0\n\t"
|
||||||
"vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
|
"vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
|
||||||
|
|
@ -171,39 +159,38 @@ static void srot_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
|
||||||
"vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
|
"vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
|
||||||
"vfmasb %%v31,%%v19,%%v1,%%v31\n\t"
|
"vfmasb %%v31,%%v19,%%v1,%%v31\n\t"
|
||||||
"vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
|
"vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
|
||||||
|
"vst %%v28, 192(%%r1,%[x])\n\t"
|
||||||
"vst %%v28, 192(%%r1,%1) \n\t"
|
"vst %%v29, 208(%%r1,%[x])\n\t"
|
||||||
"vst %%v29, 208(%%r1,%1) \n\t"
|
"vst %%v30, 224(%%r1,%[x])\n\t"
|
||||||
"vst %%v30, 224(%%r1,%1) \n\t"
|
"vst %%v31, 240(%%r1,%[x])\n\t"
|
||||||
"vst %%v31, 240(%%r1,%1) \n\t"
|
"vst %%v20, 192(%%r1,%[y])\n\t"
|
||||||
"vst %%v20, 192(%%r1,%2) \n\t"
|
"vst %%v21, 208(%%r1,%[y])\n\t"
|
||||||
"vst %%v21, 208(%%r1,%2) \n\t"
|
"vst %%v22, 224(%%r1,%[y])\n\t"
|
||||||
"vst %%v22, 224(%%r1,%2) \n\t"
|
"vst %%v23, 240(%%r1,%[y])\n\t"
|
||||||
"vst %%v23, 240(%%r1,%2) \n\t"
|
|
||||||
|
|
||||||
"agfi %%r1,256\n\t"
|
"agfi %%r1,256\n\t"
|
||||||
"brctg %%r0,0b "
|
"brctg %[n],0b"
|
||||||
:
|
: "+m"(*(struct { FLOAT x[n]; } *) x), "+m"(*(struct { FLOAT x[n]; } *) y),
|
||||||
:"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y),"m"(*c),"m"(*s)
|
[n] "+&r"(n)
|
||||||
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
: [x] "a"(x),[y] "a"(y),[c] "Q"(*c),[s] "Q"(*s)
|
||||||
);
|
: "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
|
||||||
|
"v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
|
||||||
|
"v31");
|
||||||
}
|
}
|
||||||
|
|
||||||
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
|
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
|
||||||
{
|
FLOAT c, FLOAT s) {
|
||||||
BLASLONG i = 0;
|
BLASLONG i = 0;
|
||||||
BLASLONG ix = 0, iy = 0;
|
BLASLONG ix = 0, iy = 0;
|
||||||
|
|
||||||
FLOAT temp;
|
FLOAT temp;
|
||||||
|
|
||||||
if ( n <= 0 ) return(0);
|
if (n <= 0)
|
||||||
|
return (0);
|
||||||
|
|
||||||
if ( (inc_x == 1) && (inc_y == 1) )
|
if ((inc_x == 1) && (inc_y == 1)) {
|
||||||
{
|
|
||||||
|
|
||||||
BLASLONG n1 = n & -64;
|
BLASLONG n1 = n & -64;
|
||||||
if ( n1 > 0 )
|
if (n1 > 0) {
|
||||||
{
|
|
||||||
FLOAT cosa, sina;
|
FLOAT cosa, sina;
|
||||||
cosa = c;
|
cosa = c;
|
||||||
sina = s;
|
sina = s;
|
||||||
|
|
@ -211,8 +198,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
|
||||||
i = n1;
|
i = n1;
|
||||||
}
|
}
|
||||||
|
|
||||||
while(i < n)
|
while (i < n) {
|
||||||
{
|
|
||||||
temp = c * x[i] + s * y[i];
|
temp = c * x[i] + s * y[i];
|
||||||
y[i] = c * y[i] - s * x[i];
|
y[i] = c * y[i] - s * x[i];
|
||||||
x[i] = temp;
|
x[i] = temp;
|
||||||
|
|
@ -221,13 +207,9 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
} else {
|
||||||
|
|
||||||
}
|
while (i < n) {
|
||||||
else
|
|
||||||
{
|
|
||||||
|
|
||||||
while(i < n)
|
|
||||||
{
|
|
||||||
temp = c * x[ix] + s * y[iy];
|
temp = c * x[ix] + s * y[iy];
|
||||||
y[iy] = c * y[iy] - s * x[ix];
|
y[iy] = c * y[iy] - s * x[ix];
|
||||||
x[ix] = temp;
|
x[ix] = temp;
|
||||||
|
|
@ -242,5 +224,3 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
|
||||||
return (0);
|
return (0);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/***************************************************************************
|
/***************************************************************************
|
||||||
Copyright (c) 2013-2018, The OpenBLAS Project
|
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
modification, are permitted provided that the following conditions are
|
modification, are permitted provided that the following conditions are
|
||||||
|
|
@ -27,128 +27,106 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
static void sscal_kernel_32(BLASLONG n, FLOAT da, FLOAT *x)
|
static void sscal_kernel_32(BLASLONG n, FLOAT da, FLOAT *x) {
|
||||||
{
|
__asm__("vlrepf %%v0,%[da]\n\t"
|
||||||
__asm__ volatile (
|
"srlg %[n],%[n],5\n\t"
|
||||||
"vlrepf %%v0,%1 \n\t"
|
|
||||||
"srlg %%r0,%0,5 \n\t"
|
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 2, 1024(%%r1,%2) \n\t"
|
"pfd 2, 1024(%%r1,%[x])\n\t"
|
||||||
"vl %%v24, 0(%%r1,%2) \n\t"
|
"vl %%v24,0(%%r1,%[x])\n\t"
|
||||||
"vfmsb %%v24,%%v24,%%v0\n\t"
|
"vfmsb %%v24,%%v24,%%v0\n\t"
|
||||||
"vst %%v24, 0(%%r1,%2) \n\t"
|
"vst %%v24,0(%%r1,%[x])\n\t"
|
||||||
"vl %%v25, 16(%%r1,%2) \n\t"
|
"vl %%v25,16(%%r1,%[x])\n\t"
|
||||||
"vfmsb %%v25,%%v25,%%v0\n\t"
|
"vfmsb %%v25,%%v25,%%v0\n\t"
|
||||||
"vst %%v25, 16(%%r1,%2) \n\t"
|
"vst %%v25,16(%%r1,%[x])\n\t"
|
||||||
"vl %%v26, 32(%%r1,%2) \n\t"
|
"vl %%v26,32(%%r1,%[x])\n\t"
|
||||||
"vfmsb %%v26,%%v26,%%v0\n\t"
|
"vfmsb %%v26,%%v26,%%v0\n\t"
|
||||||
"vst %%v26, 32(%%r1,%2) \n\t"
|
"vst %%v26,32(%%r1,%[x])\n\t"
|
||||||
"vl %%v27, 48(%%r1,%2) \n\t"
|
"vl %%v27,48(%%r1,%[x])\n\t"
|
||||||
"vfmsb %%v27,%%v27,%%v0\n\t"
|
"vfmsb %%v27,%%v27,%%v0\n\t"
|
||||||
"vst %%v27, 48(%%r1,%2) \n\t"
|
"vst %%v27,48(%%r1,%[x])\n\t"
|
||||||
"vl %%v24, 64(%%r1,%2) \n\t"
|
"vl %%v28,64(%%r1,%[x])\n\t"
|
||||||
"vfmsb %%v24,%%v24,%%v0 \n\t"
|
"vfmsb %%v28,%%v28,%%v0\n\t"
|
||||||
"vst %%v24, 64(%%r1,%2) \n\t"
|
"vst %%v28,64(%%r1,%[x])\n\t"
|
||||||
"vl %%v25, 80(%%r1,%2) \n\t"
|
"vl %%v29,80(%%r1,%[x])\n\t"
|
||||||
"vfmsb %%v25,%%v25,%%v0 \n\t"
|
"vfmsb %%v29,%%v29,%%v0\n\t"
|
||||||
"vst %%v25, 80(%%r1,%2) \n\t"
|
"vst %%v29,80(%%r1,%[x])\n\t"
|
||||||
"vl %%v26, 96(%%r1,%2) \n\t"
|
"vl %%v30,96(%%r1,%[x])\n\t"
|
||||||
"vfmsb %%v26,%%v26,%%v0 \n\t"
|
"vfmsb %%v30,%%v30,%%v0\n\t"
|
||||||
"vst %%v26, 96(%%r1,%2) \n\t"
|
"vst %%v30,96(%%r1,%[x])\n\t"
|
||||||
"vl %%v27, 112(%%r1,%2) \n\t"
|
"vl %%v31,112(%%r1,%[x])\n\t"
|
||||||
"vfmsb %%v27,%%v27,%%v0 \n\t"
|
"vfmsb %%v31,%%v31,%%v0\n\t"
|
||||||
"vst %%v27, 112(%%r1,%2) \n\t"
|
"vst %%v31,112(%%r1,%[x])\n\t"
|
||||||
"agfi %%r1,128\n\t"
|
"agfi %%r1,128\n\t"
|
||||||
"brctg %%r0,0b "
|
"brctg %[n],0b"
|
||||||
:
|
: "+m"(*(struct { FLOAT x[n]; } *) x),[n] "+&r"(n)
|
||||||
:"r"(n),"m"(da),"ZR"((FLOAT (*)[n])x)
|
: [x] "a"(x),[da] "Q"(da)
|
||||||
:"memory","cc","r0","r1","v0","v24","v25","v26","v27"
|
: "cc", "r1", "v0", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
|
||||||
);
|
"v31");
|
||||||
}
|
}
|
||||||
|
|
||||||
static void sscal_kernel_32_zero(BLASLONG n, FLOAT *x)
|
static void sscal_kernel_32_zero(BLASLONG n, FLOAT *x) {
|
||||||
{
|
__asm__("vzero %%v0\n\t"
|
||||||
__asm__ volatile(
|
"srlg %[n],%[n],5\n\t"
|
||||||
"vzero %%v24 \n\t"
|
|
||||||
"vzero %%v25 \n\t"
|
|
||||||
"vzero %%v26 \n\t"
|
|
||||||
"vzero %%v27 \n\t"
|
|
||||||
"srlg %%r0,%0,5 \n\t"
|
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 2, 1024(%%r1,%1) \n\t"
|
"pfd 2, 1024(%%r1,%[x])\n\t"
|
||||||
|
"vst %%v0,0(%%r1,%[x])\n\t"
|
||||||
"vst %%v24,0(%%r1,%1) \n\t"
|
"vst %%v0,16(%%r1,%[x])\n\t"
|
||||||
"vst %%v25,16(%%r1,%1) \n\t"
|
"vst %%v0,32(%%r1,%[x])\n\t"
|
||||||
"vst %%v26,32(%%r1,%1) \n\t"
|
"vst %%v0,48(%%r1,%[x])\n\t"
|
||||||
"vst %%v27,48(%%r1,%1) \n\t"
|
"vst %%v0,64(%%r1,%[x])\n\t"
|
||||||
"vst %%v24,64(%%r1,%1) \n\t"
|
"vst %%v0,80(%%r1,%[x])\n\t"
|
||||||
"vst %%v25,80(%%r1,%1) \n\t"
|
"vst %%v0,96(%%r1,%[x])\n\t"
|
||||||
"vst %%v26,96(%%r1,%1) \n\t"
|
"vst %%v0,112(%%r1,%[x])\n\t"
|
||||||
"vst %%v27,112(%%r1,%1) \n\t"
|
|
||||||
|
|
||||||
"agfi %%r1,128\n\t"
|
"agfi %%r1,128\n\t"
|
||||||
"brctg %%r0,0b "
|
"brctg %[n],0b"
|
||||||
:
|
: "=m"(*(struct { FLOAT x[n]; } *) x),[n] "+&r"(n)
|
||||||
:"r"(n),"ZR"((FLOAT (*)[n])x)
|
: [x] "a"(x)
|
||||||
:"memory","cc","r0","r1","v24","v25","v26","v27"
|
: "cc", "r1", "v0");
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
|
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x,
|
||||||
{
|
BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
|
||||||
|
BLASLONG dummy2) {
|
||||||
BLASLONG i = 0, j = 0;
|
BLASLONG i = 0, j = 0;
|
||||||
if (n <= 0 || inc_x <= 0)
|
if (n <= 0 || inc_x <= 0)
|
||||||
return (0);
|
return (0);
|
||||||
|
|
||||||
|
if (inc_x == 1) {
|
||||||
|
|
||||||
if ( inc_x == 1 )
|
if (da == 0.0) {
|
||||||
{
|
|
||||||
|
|
||||||
if ( da == 0.0 )
|
|
||||||
{
|
|
||||||
|
|
||||||
BLASLONG n1 = n & -32;
|
BLASLONG n1 = n & -32;
|
||||||
if ( n1 > 0 )
|
if (n1 > 0) {
|
||||||
{
|
|
||||||
|
|
||||||
sscal_kernel_32_zero(n1, x);
|
sscal_kernel_32_zero(n1, x);
|
||||||
j = n1;
|
j = n1;
|
||||||
}
|
}
|
||||||
|
|
||||||
while(j < n)
|
while (j < n) {
|
||||||
{
|
|
||||||
|
|
||||||
x[j] = 0.0;
|
x[j] = 0.0;
|
||||||
j++;
|
j++;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
|
|
||||||
BLASLONG n1 = n & -32;
|
BLASLONG n1 = n & -32;
|
||||||
if ( n1 > 0 )
|
if (n1 > 0) {
|
||||||
{
|
|
||||||
sscal_kernel_32(n1, da, x);
|
sscal_kernel_32(n1, da, x);
|
||||||
j = n1;
|
j = n1;
|
||||||
}
|
}
|
||||||
while(j < n)
|
while (j < n) {
|
||||||
{
|
|
||||||
|
|
||||||
x[j] = da * x[j];
|
x[j] = da * x[j];
|
||||||
j++;
|
j++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
} else {
|
||||||
|
|
||||||
}
|
if (da == 0.0) {
|
||||||
else
|
|
||||||
{
|
|
||||||
|
|
||||||
if ( da == 0.0 )
|
|
||||||
{
|
|
||||||
|
|
||||||
BLASLONG n1 = n & -2;
|
BLASLONG n1 = n & -2;
|
||||||
|
|
||||||
|
|
@ -161,17 +139,14 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
|
||||||
j += 2;
|
j += 2;
|
||||||
|
|
||||||
}
|
}
|
||||||
while(j < n)
|
while (j < n) {
|
||||||
{
|
|
||||||
|
|
||||||
x[i] = 0.0;
|
x[i] = 0.0;
|
||||||
i += inc_x;
|
i += inc_x;
|
||||||
j++;
|
j++;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
BLASLONG n1 = n & -2;
|
BLASLONG n1 = n & -2;
|
||||||
|
|
||||||
while (j < n1) {
|
while (j < n1) {
|
||||||
|
|
@ -184,8 +159,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
while(j < n)
|
while (j < n) {
|
||||||
{
|
|
||||||
|
|
||||||
x[i] = da * x[i];
|
x[i] = da * x[i];
|
||||||
i += inc_x;
|
i += inc_x;
|
||||||
|
|
@ -197,5 +171,3 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/***************************************************************************
|
/***************************************************************************
|
||||||
Copyright (c) 2013-2018, The OpenBLAS Project
|
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
modification, are permitted provided that the following conditions are
|
modification, are permitted provided that the following conditions are
|
||||||
|
|
@ -27,111 +27,105 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
static void sswap_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y)
|
static void sswap_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y) {
|
||||||
{
|
__asm__("srlg %[n],%[n],6\n\t"
|
||||||
__asm__ volatile(
|
|
||||||
"srlg %%r0,%0,6 \n\t"
|
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 2, 1024(%%r1,%1) \n\t"
|
"pfd 2, 1024(%%r1,%[x])\n\t"
|
||||||
"pfd 2, 1024(%%r1,%2) \n\t"
|
"pfd 2, 1024(%%r1,%[y])\n\t"
|
||||||
|
"vl %%v16, 0(%%r1,%[x])\n\t"
|
||||||
"vl %%v16, 0(%%r1,%1) \n\t"
|
"vl %%v17, 16(%%r1,%[x])\n\t"
|
||||||
"vl %%v17, 16(%%r1,%1) \n\t"
|
"vl %%v18, 32(%%r1,%[x])\n\t"
|
||||||
"vl %%v18, 32(%%r1,%1) \n\t"
|
"vl %%v19, 48(%%r1,%[x])\n\t"
|
||||||
"vl %%v19, 48(%%r1,%1) \n\t"
|
"vl %%v20, 64(%%r1,%[x])\n\t"
|
||||||
"vl %%v20, 64(%%r1,%1) \n\t"
|
"vl %%v21, 80(%%r1,%[x])\n\t"
|
||||||
"vl %%v21, 80(%%r1,%1) \n\t"
|
"vl %%v22, 96(%%r1,%[x])\n\t"
|
||||||
"vl %%v22, 96(%%r1,%1) \n\t"
|
"vl %%v23, 112(%%r1,%[x])\n\t"
|
||||||
"vl %%v23, 112(%%r1,%1) \n\t"
|
"vl %%v24, 128(%%r1,%[x])\n\t"
|
||||||
"vl %%v24, 128(%%r1,%1) \n\t"
|
"vl %%v25, 144(%%r1,%[x])\n\t"
|
||||||
"vl %%v25, 144(%%r1,%1) \n\t"
|
"vl %%v26, 160(%%r1,%[x])\n\t"
|
||||||
"vl %%v26, 160(%%r1,%1) \n\t"
|
"vl %%v27, 176(%%r1,%[x])\n\t"
|
||||||
"vl %%v27, 176(%%r1,%1) \n\t"
|
"vl %%v28, 192(%%r1,%[x])\n\t"
|
||||||
"vl %%v28, 192(%%r1,%1) \n\t"
|
"vl %%v29, 208(%%r1,%[x])\n\t"
|
||||||
"vl %%v29, 208(%%r1,%1) \n\t"
|
"vl %%v30, 224(%%r1,%[x])\n\t"
|
||||||
"vl %%v30, 224(%%r1,%1) \n\t"
|
"vl %%v31, 240(%%r1,%[x])\n\t"
|
||||||
"vl %%v31, 240(%%r1,%1) \n\t"
|
"vl %%v0, 0(%%r1,%[y])\n\t"
|
||||||
|
"vl %%v1, 16(%%r1,%[y])\n\t"
|
||||||
"vl %%v0, 0(%%r1,%2) \n\t"
|
"vl %%v2, 32(%%r1,%[y])\n\t"
|
||||||
"vl %%v1, 16(%%r1,%2) \n\t"
|
"vl %%v3, 48(%%r1,%[y])\n\t"
|
||||||
"vl %%v2, 32(%%r1,%2) \n\t"
|
"vl %%v4, 64(%%r1,%[y])\n\t"
|
||||||
"vl %%v3, 48(%%r1,%2) \n\t"
|
"vl %%v5, 80(%%r1,%[y])\n\t"
|
||||||
"vl %%v4, 64(%%r1,%2) \n\t"
|
"vl %%v6, 96(%%r1,%[y])\n\t"
|
||||||
"vl %%v5, 80(%%r1,%2) \n\t"
|
"vl %%v7, 112(%%r1,%[y])\n\t"
|
||||||
"vl %%v6, 96(%%r1,%2) \n\t"
|
"vst %%v0, 0(%%r1,%[x])\n\t"
|
||||||
"vl %%v7, 112(%%r1,%2) \n\t"
|
"vst %%v1, 16(%%r1,%[x])\n\t"
|
||||||
"vst %%v0, 0(%%r1,%1) \n\t"
|
"vst %%v2, 32(%%r1,%[x])\n\t"
|
||||||
"vst %%v1, 16(%%r1,%1) \n\t"
|
"vst %%v3, 48(%%r1,%[x])\n\t"
|
||||||
"vst %%v2, 32(%%r1,%1) \n\t"
|
"vst %%v4, 64(%%r1,%[x])\n\t"
|
||||||
"vst %%v3, 48(%%r1,%1) \n\t"
|
"vst %%v5, 80(%%r1,%[x])\n\t"
|
||||||
"vst %%v4, 64(%%r1,%1) \n\t"
|
"vst %%v6, 96(%%r1,%[x])\n\t"
|
||||||
"vst %%v5, 80(%%r1,%1) \n\t"
|
"vst %%v7, 112(%%r1,%[x])\n\t"
|
||||||
"vst %%v6, 96(%%r1,%1) \n\t"
|
"vl %%v0, 128(%%r1,%[y])\n\t"
|
||||||
"vst %%v7, 112(%%r1,%1) \n\t"
|
"vl %%v1, 144(%%r1,%[y])\n\t"
|
||||||
|
"vl %%v2, 160(%%r1,%[y])\n\t"
|
||||||
"vl %%v0, 128(%%r1,%2) \n\t"
|
"vl %%v3, 176(%%r1,%[y])\n\t"
|
||||||
"vl %%v1, 144(%%r1,%2) \n\t"
|
"vl %%v4, 192(%%r1,%[y])\n\t"
|
||||||
"vl %%v2, 160(%%r1,%2) \n\t"
|
"vl %%v5, 208(%%r1,%[y])\n\t"
|
||||||
"vl %%v3, 176(%%r1,%2) \n\t"
|
"vl %%v6, 224(%%r1,%[y])\n\t"
|
||||||
"vl %%v4, 192(%%r1,%2) \n\t"
|
"vl %%v7, 240(%%r1,%[y])\n\t"
|
||||||
"vl %%v5, 208(%%r1,%2) \n\t"
|
"vst %%v0, 128(%%r1,%[x])\n\t"
|
||||||
"vl %%v6, 224(%%r1,%2) \n\t"
|
"vst %%v1, 144(%%r1,%[x])\n\t"
|
||||||
"vl %%v7, 240(%%r1,%2) \n\t"
|
"vst %%v2, 160(%%r1,%[x])\n\t"
|
||||||
"vst %%v0, 128(%%r1,%1) \n\t"
|
"vst %%v3, 176(%%r1,%[x])\n\t"
|
||||||
"vst %%v1, 144(%%r1,%1) \n\t"
|
"vst %%v4, 192(%%r1,%[x])\n\t"
|
||||||
"vst %%v2, 160(%%r1,%1) \n\t"
|
"vst %%v5, 208(%%r1,%[x])\n\t"
|
||||||
"vst %%v3, 176(%%r1,%1) \n\t"
|
"vst %%v6, 224(%%r1,%[x])\n\t"
|
||||||
"vst %%v4, 192(%%r1,%1) \n\t"
|
"vst %%v7, 240(%%r1,%[x])\n\t"
|
||||||
"vst %%v5, 208(%%r1,%1) \n\t"
|
"vst %%v16, 0(%%r1,%[y])\n\t"
|
||||||
"vst %%v6, 224(%%r1,%1) \n\t"
|
"vst %%v17, 16(%%r1,%[y])\n\t"
|
||||||
"vst %%v7, 240(%%r1,%1) \n\t"
|
"vst %%v18, 32(%%r1,%[y])\n\t"
|
||||||
|
"vst %%v19, 48(%%r1,%[y])\n\t"
|
||||||
"vst %%v16, 0(%%r1,%2) \n\t"
|
"vst %%v20, 64(%%r1,%[y])\n\t"
|
||||||
"vst %%v17, 16(%%r1,%2) \n\t"
|
"vst %%v21, 80(%%r1,%[y])\n\t"
|
||||||
"vst %%v18, 32(%%r1,%2) \n\t"
|
"vst %%v22, 96(%%r1,%[y])\n\t"
|
||||||
"vst %%v19, 48(%%r1,%2) \n\t"
|
"vst %%v23, 112(%%r1,%[y])\n\t"
|
||||||
"vst %%v20, 64(%%r1,%2) \n\t"
|
"vst %%v24, 128(%%r1,%[y])\n\t"
|
||||||
"vst %%v21, 80(%%r1,%2) \n\t"
|
"vst %%v25, 144(%%r1,%[y])\n\t"
|
||||||
"vst %%v22, 96(%%r1,%2) \n\t"
|
"vst %%v26, 160(%%r1,%[y])\n\t"
|
||||||
"vst %%v23, 112(%%r1,%2) \n\t"
|
"vst %%v27, 176(%%r1,%[y])\n\t"
|
||||||
"vst %%v24, 128(%%r1,%2) \n\t"
|
"vst %%v28, 192(%%r1,%[y])\n\t"
|
||||||
"vst %%v25, 144(%%r1,%2) \n\t"
|
"vst %%v29, 208(%%r1,%[y])\n\t"
|
||||||
"vst %%v26, 160(%%r1,%2) \n\t"
|
"vst %%v30, 224(%%r1,%[y])\n\t"
|
||||||
"vst %%v27, 176(%%r1,%2) \n\t"
|
"vst %%v31, 240(%%r1,%[y])\n\t"
|
||||||
"vst %%v28, 192(%%r1,%2) \n\t"
|
|
||||||
"vst %%v29, 208(%%r1,%2) \n\t"
|
|
||||||
"vst %%v30, 224(%%r1,%2) \n\t"
|
|
||||||
"vst %%v31, 240(%%r1,%2) \n\t"
|
|
||||||
|
|
||||||
"agfi %%r1,256\n\t"
|
"agfi %%r1,256\n\t"
|
||||||
"brctg %%r0,0b "
|
"brctg %[n],0b"
|
||||||
:
|
: "+m"(*(struct { FLOAT x[n]; } *) x), "+m"(*(struct { FLOAT x[n]; } *) y),
|
||||||
:"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y)
|
[n] "+&r"(n)
|
||||||
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
: [x] "a"(x),[y] "a"(y)
|
||||||
);
|
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
|
||||||
|
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
|
||||||
|
"v27", "v28", "v29", "v30", "v31");
|
||||||
}
|
}
|
||||||
|
|
||||||
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
|
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,
|
||||||
{
|
BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
|
||||||
|
BLASLONG dummy2) {
|
||||||
BLASLONG i = 0;
|
BLASLONG i = 0;
|
||||||
BLASLONG ix = 0, iy = 0;
|
BLASLONG ix = 0, iy = 0;
|
||||||
FLOAT temp;
|
FLOAT temp;
|
||||||
|
|
||||||
if ( n <= 0 ) return(0);
|
if (n <= 0)
|
||||||
|
return (0);
|
||||||
|
|
||||||
if ( (inc_x == 1) && (inc_y == 1 ))
|
if ((inc_x == 1) && (inc_y == 1)) {
|
||||||
{
|
|
||||||
|
|
||||||
BLASLONG n1 = n & -64;
|
BLASLONG n1 = n & -64;
|
||||||
if ( n1 > 0 )
|
if (n1 > 0) {
|
||||||
{
|
|
||||||
sswap_kernel_64(n1, x, y);
|
sswap_kernel_64(n1, x, y);
|
||||||
i = n1;
|
i = n1;
|
||||||
}
|
}
|
||||||
|
|
||||||
while(i < n)
|
while (i < n) {
|
||||||
{
|
|
||||||
temp = y[i];
|
temp = y[i];
|
||||||
y[i] = x[i];
|
y[i] = x[i];
|
||||||
x[i] = temp;
|
x[i] = temp;
|
||||||
|
|
@ -139,13 +133,9 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
} else {
|
||||||
|
|
||||||
}
|
while (i < n) {
|
||||||
else
|
|
||||||
{
|
|
||||||
|
|
||||||
while(i < n)
|
|
||||||
{
|
|
||||||
temp = y[iy];
|
temp = y[iy];
|
||||||
y[iy] = x[ix];
|
y[iy] = x[ix];
|
||||||
x[ix] = temp;
|
x[ix] = temp;
|
||||||
|
|
@ -158,7 +148,4 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,
|
||||||
}
|
}
|
||||||
return (0);
|
return (0);
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/***************************************************************************
|
/***************************************************************************
|
||||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
modification, are permitted provided that the following conditions are
|
modification, are permitted provided that the following conditions are
|
||||||
|
|
@ -28,64 +28,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
|
|
||||||
#if defined(DOUBLE)
|
#define CABS1(x,i) (fabs(x[i]) + fabs(x[i + 1]))
|
||||||
#define ABS fabs
|
|
||||||
#else
|
|
||||||
#define ABS fabsf
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))
|
static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x) {
|
||||||
|
|
||||||
static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x)
|
|
||||||
{
|
|
||||||
FLOAT amax;
|
FLOAT amax;
|
||||||
|
|
||||||
__asm__ volatile (
|
__asm__("vleg %%v0,0(%[x]),0\n\t"
|
||||||
"vleg %%v0,0(%2),0 \n\t"
|
"vleg %%v16,8(%[x]),0\n\t"
|
||||||
"vleg %%v16,8(%2),0 \n\t"
|
"vleg %%v0,16(%[x]),1\n\t"
|
||||||
"vleg %%v0,16(%2),1 \n\t"
|
"vleg %%v16,24(%[x]),1\n\t"
|
||||||
"vleg %%v16,24(%2),1 \n\t"
|
|
||||||
"vflpdb %%v0,%%v0\n\t"
|
"vflpdb %%v0,%%v0\n\t"
|
||||||
"vflpdb %%v16,%%v16\n\t"
|
"vflpdb %%v16,%%v16\n\t"
|
||||||
"vfadb %%v0,%%v0,%%v16\n\t"
|
"vfadb %%v0,%%v0,%%v16\n\t"
|
||||||
"srlg %%r0,%1,4 \n\t"
|
"srlg %[n],%[n],4\n\t"
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 1, 1024(%%r1,%2) \n\t"
|
"pfd 1, 1024(%%r1,%[x])\n\t"
|
||||||
|
"vleg %%v16,0(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v16,0(%%r1,%2),0 \n\t"
|
"vleg %%v17,8(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v17,8(%%r1,%2),0 \n\t"
|
"vleg %%v16,16(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v16,16(%%r1,%2),1 \n\t"
|
"vleg %%v17,24(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v17,24(%%r1,%2),1 \n\t"
|
"vleg %%v18,32(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v18,32(%%r1,%2),0 \n\t"
|
"vleg %%v19,40(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v19,40(%%r1,%2),0 \n\t"
|
"vleg %%v18,48(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v18,48(%%r1,%2),1 \n\t"
|
"vleg %%v19,56(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v19,56(%%r1,%2),1 \n\t"
|
"vleg %%v20,64(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v20,64(%%r1,%2),0 \n\t"
|
"vleg %%v21,72(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v21,72(%%r1,%2),0 \n\t"
|
"vleg %%v20,80(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v20,80(%%r1,%2),1 \n\t"
|
"vleg %%v21,88(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v21,88(%%r1,%2),1 \n\t"
|
"vleg %%v22,96(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v22,96(%%r1,%2),0 \n\t"
|
"vleg %%v23,104(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v23,104(%%r1,%2),0 \n\t"
|
"vleg %%v22,112(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v22,112(%%r1,%2),1 \n\t"
|
"vleg %%v23,120(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v23,120(%%r1,%2),1 \n\t"
|
"vleg %%v24,128(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v24,128(%%r1,%2),0 \n\t"
|
"vleg %%v25,136(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v25,136(%%r1,%2),0 \n\t"
|
"vleg %%v24,144(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v24,144(%%r1,%2),1 \n\t"
|
"vleg %%v25,152(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v25,152(%%r1,%2),1 \n\t"
|
"vleg %%v26,160(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v26,160(%%r1,%2),0 \n\t"
|
"vleg %%v27,168(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v27,168(%%r1,%2),0 \n\t"
|
"vleg %%v26,176(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v26,176(%%r1,%2),1 \n\t"
|
"vleg %%v27,184(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v27,184(%%r1,%2),1 \n\t"
|
"vleg %%v28,192(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v28,192(%%r1,%2),0 \n\t"
|
"vleg %%v29,200(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v29,200(%%r1,%2),0 \n\t"
|
"vleg %%v28,208(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v28,208(%%r1,%2),1 \n\t"
|
"vleg %%v29,216(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v29,216(%%r1,%2),1 \n\t"
|
"vleg %%v30,224(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v30,224(%%r1,%2),0 \n\t"
|
"vleg %%v31,232(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v31,232(%%r1,%2),0 \n\t"
|
"vleg %%v30,240(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v30,240(%%r1,%2),1 \n\t"
|
"vleg %%v31,248(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v31,248(%%r1,%2),1 \n\t"
|
|
||||||
|
|
||||||
"vflpdb %%v16,%%v16\n\t"
|
"vflpdb %%v16,%%v16\n\t"
|
||||||
"vflpdb %%v17,%%v17\n\t"
|
"vflpdb %%v17,%%v17\n\t"
|
||||||
"vflpdb %%v18,%%v18\n\t"
|
"vflpdb %%v18,%%v18\n\t"
|
||||||
|
|
@ -102,7 +92,6 @@ static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x)
|
||||||
"vflpdb %%v29,%%v29\n\t"
|
"vflpdb %%v29,%%v29\n\t"
|
||||||
"vflpdb %%v30,%%v30\n\t"
|
"vflpdb %%v30,%%v30\n\t"
|
||||||
"vflpdb %%v31,%%v31\n\t"
|
"vflpdb %%v31,%%v31\n\t"
|
||||||
|
|
||||||
"vfadb %%v16,%%v16,%%v17\n\t"
|
"vfadb %%v16,%%v16,%%v17\n\t"
|
||||||
"vfadb %%v18,%%v18,%%v19\n\t"
|
"vfadb %%v18,%%v18,%%v19\n\t"
|
||||||
"vfadb %%v20,%%v20,%%v21\n\t"
|
"vfadb %%v20,%%v20,%%v21\n\t"
|
||||||
|
|
@ -111,29 +100,23 @@ static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x)
|
||||||
"vfadb %%v26,%%v26,%%v27\n\t"
|
"vfadb %%v26,%%v26,%%v27\n\t"
|
||||||
"vfadb %%v28,%%v28,%%v29\n\t"
|
"vfadb %%v28,%%v28,%%v29\n\t"
|
||||||
"vfadb %%v30,%%v30,%%v31\n\t"
|
"vfadb %%v30,%%v30,%%v31\n\t"
|
||||||
|
|
||||||
"vfmaxdb %%v16,%%v16,%%v24,0\n\t"
|
"vfmaxdb %%v16,%%v16,%%v24,0\n\t"
|
||||||
"vfmaxdb %%v18,%%v18,%%v26,0\n\t"
|
"vfmaxdb %%v18,%%v18,%%v26,0\n\t"
|
||||||
"vfmaxdb %%v20,%%v20,%%v28,0\n\t"
|
"vfmaxdb %%v20,%%v20,%%v28,0\n\t"
|
||||||
"vfmaxdb %%v22,%%v22,%%v30,0\n\t"
|
"vfmaxdb %%v22,%%v22,%%v30,0\n\t"
|
||||||
|
|
||||||
"vfmaxdb %%v16,%%v16,%%v20,0\n\t"
|
"vfmaxdb %%v16,%%v16,%%v20,0\n\t"
|
||||||
"vfmaxdb %%v18,%%v18,%%v22,0\n\t"
|
"vfmaxdb %%v18,%%v18,%%v22,0\n\t"
|
||||||
|
|
||||||
"vfmaxdb %%v16,%%v16,%%v18,0\n\t"
|
"vfmaxdb %%v16,%%v16,%%v18,0\n\t"
|
||||||
|
|
||||||
"vfmaxdb %%v0,%%v0,%%v16,0\n\t"
|
"vfmaxdb %%v0,%%v0,%%v16,0\n\t"
|
||||||
|
|
||||||
"agfi %%r1, 256\n\t"
|
"agfi %%r1, 256\n\t"
|
||||||
"brctg %%r0, 0b \n\t"
|
"brctg %[n], 0b\n\t"
|
||||||
|
|
||||||
"vrepg %%v16,%%v0,1\n\t"
|
"vrepg %%v16,%%v0,1\n\t"
|
||||||
"wfmaxdb %%v0,%%v0,%%v16,0\n\t"
|
"wfmaxdb %%v0,%%v0,%%v16,0\n\t"
|
||||||
"ldr %0,%%f0 "
|
"ldr %[amax],%%f0"
|
||||||
:"=f"(amax)
|
: [amax] "=f"(amax),[n] "+&r"(n)
|
||||||
:"r"(n),"ZR"((const FLOAT (*)[n])x)
|
: "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
|
||||||
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
|
||||||
);
|
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
|
||||||
|
|
||||||
return amax;
|
return amax;
|
||||||
}
|
}
|
||||||
|
|
@ -144,7 +127,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
FLOAT maxf = 0.0;
|
FLOAT maxf = 0.0;
|
||||||
BLASLONG inc_x2;
|
BLASLONG inc_x2;
|
||||||
|
|
||||||
if (n <= 0 || inc_x <= 0) return (maxf);
|
if (n <= 0 || inc_x <= 0)
|
||||||
|
return (maxf);
|
||||||
|
|
||||||
if (inc_x == 1) {
|
if (inc_x == 1) {
|
||||||
|
|
||||||
|
|
@ -154,9 +138,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
maxf = zamax_kernel_16(n1, x);
|
maxf = zamax_kernel_16(n1, x);
|
||||||
ix = n1 * 2;
|
ix = n1 * 2;
|
||||||
i = n1;
|
i = n1;
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
maxf = CABS1(x, 0);
|
maxf = CABS1(x, 0);
|
||||||
ix += 2;
|
ix += 2;
|
||||||
i++;
|
i++;
|
||||||
|
|
@ -198,7 +180,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
while (i < n) {
|
while (i < n) {
|
||||||
if (CABS1(x, ix) > maxf) {
|
if (CABS1(x, ix) > maxf) {
|
||||||
maxf = CABS1(x, ix);
|
maxf = CABS1(x, ix);
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/***************************************************************************
|
/***************************************************************************
|
||||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
modification, are permitted provided that the following conditions are
|
modification, are permitted provided that the following conditions are
|
||||||
|
|
@ -28,47 +28,38 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
|
|
||||||
#if defined(DOUBLE)
|
#define CABS1(x,i) (fabs(x[i]) + fabs(x[i + 1]))
|
||||||
#define ABS fabs
|
|
||||||
#else
|
|
||||||
#define ABS fabsf
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))
|
static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x) {
|
||||||
|
|
||||||
static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x)
|
|
||||||
{
|
|
||||||
FLOAT amax;
|
FLOAT amax;
|
||||||
|
|
||||||
__asm__ volatile (
|
__asm__("vleg %%v0,0(%[x]),0\n\t"
|
||||||
"vleg %%v0,0(%2),0 \n\t"
|
"vleg %%v16,8(%[x]),0\n\t"
|
||||||
"vleg %%v16,8(%2),0 \n\t"
|
"vleg %%v0,16(%[x]),1\n\t"
|
||||||
"vleg %%v0,16(%2),1 \n\t"
|
"vleg %%v16,24(%[x]),1\n\t"
|
||||||
"vleg %%v16,24(%2),1 \n\t"
|
|
||||||
"vflpdb %%v0,%%v0\n\t"
|
"vflpdb %%v0,%%v0\n\t"
|
||||||
"vflpdb %%v16,%%v16\n\t"
|
"vflpdb %%v16,%%v16\n\t"
|
||||||
"vfadb %%v0,%%v0,%%v16\n\t"
|
"vfadb %%v0,%%v0,%%v16\n\t"
|
||||||
"srlg %%r0,%1,4 \n\t"
|
"srlg %[n],%[n],4\n\t"
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 1, 1024(%%r1,%2) \n\t"
|
"pfd 1, 1024(%%r1,%[x])\n\t"
|
||||||
|
"vleg %%v16,0(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v16,0(%%r1,%2),0 \n\t"
|
"vleg %%v17,8(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v17,8(%%r1,%2),0 \n\t"
|
"vleg %%v16,16(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v16,16(%%r1,%2),1 \n\t"
|
"vleg %%v17,24(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v17,24(%%r1,%2),1 \n\t"
|
"vleg %%v18,32(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v18,32(%%r1,%2),0 \n\t"
|
"vleg %%v19,40(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v19,40(%%r1,%2),0 \n\t"
|
"vleg %%v18,48(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v18,48(%%r1,%2),1 \n\t"
|
"vleg %%v19,56(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v19,56(%%r1,%2),1 \n\t"
|
"vleg %%v20,64(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v20,64(%%r1,%2),0 \n\t"
|
"vleg %%v21,72(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v21,72(%%r1,%2),0 \n\t"
|
"vleg %%v20,80(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v20,80(%%r1,%2),1 \n\t"
|
"vleg %%v21,88(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v21,88(%%r1,%2),1 \n\t"
|
"vleg %%v22,96(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v22,96(%%r1,%2),0 \n\t"
|
"vleg %%v23,104(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v23,104(%%r1,%2),0 \n\t"
|
"vleg %%v22,112(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v22,112(%%r1,%2),1 \n\t"
|
"vleg %%v23,120(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v23,120(%%r1,%2),1 \n\t"
|
|
||||||
"vflpdb %%v16, %%v16\n\t"
|
"vflpdb %%v16, %%v16\n\t"
|
||||||
"vflpdb %%v17, %%v17\n\t"
|
"vflpdb %%v17, %%v17\n\t"
|
||||||
"vflpdb %%v18, %%v18\n\t"
|
"vflpdb %%v18, %%v18\n\t"
|
||||||
|
|
@ -81,34 +72,30 @@ static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x)
|
||||||
"vfadb %%v17,%%v18,%%v19\n\t"
|
"vfadb %%v17,%%v18,%%v19\n\t"
|
||||||
"vfadb %%v18,%%v20,%%v21\n\t"
|
"vfadb %%v18,%%v20,%%v21\n\t"
|
||||||
"vfadb %%v19,%%v22,%%v23\n\t"
|
"vfadb %%v19,%%v22,%%v23\n\t"
|
||||||
|
|
||||||
"vfchdb %%v24,%%v16,%%v17\n\t"
|
"vfchdb %%v24,%%v16,%%v17\n\t"
|
||||||
"vfchdb %%v25,%%v18,%%v19\n\t"
|
"vfchdb %%v25,%%v18,%%v19\n\t"
|
||||||
"vsel %%v24,%%v16,%%v17,%%v24\n\t"
|
"vsel %%v24,%%v16,%%v17,%%v24\n\t"
|
||||||
"vsel %%v25,%%v18,%%v19,%%v25\n\t"
|
"vsel %%v25,%%v18,%%v19,%%v25\n\t"
|
||||||
|
|
||||||
"vfchdb %%v26,%%v24,%%v25\n\t"
|
"vfchdb %%v26,%%v24,%%v25\n\t"
|
||||||
"vsel %%v26,%%v24,%%v25,%%v26\n\t"
|
"vsel %%v26,%%v24,%%v25,%%v26\n\t"
|
||||||
|
|
||||||
"vfchdb %%v27,%%v26,%%v0\n\t"
|
"vfchdb %%v27,%%v26,%%v0\n\t"
|
||||||
"vsel %%v0,%%v26,%%v0,%%v27\n\t"
|
"vsel %%v0,%%v26,%%v0,%%v27\n\t"
|
||||||
|
"vleg %%v16,128(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v16,128(%%r1,%2),0 \n\t"
|
"vleg %%v17,136(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v17,136(%%r1,%2),0 \n\t"
|
"vleg %%v16,144(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v16,144(%%r1,%2),1 \n\t"
|
"vleg %%v17,152(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v17,152(%%r1,%2),1 \n\t"
|
"vleg %%v18,160(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v18,160(%%r1,%2),0 \n\t"
|
"vleg %%v19,168(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v19,168(%%r1,%2),0 \n\t"
|
"vleg %%v18,176(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v18,176(%%r1,%2),1 \n\t"
|
"vleg %%v19,184(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v19,184(%%r1,%2),1 \n\t"
|
"vleg %%v20,192(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v20,192(%%r1,%2),0 \n\t"
|
"vleg %%v21,200(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v21,200(%%r1,%2),0 \n\t"
|
"vleg %%v20,208(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v20,208(%%r1,%2),1 \n\t"
|
"vleg %%v21,216(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v21,216(%%r1,%2),1 \n\t"
|
"vleg %%v22,224(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v22,224(%%r1,%2),0 \n\t"
|
"vleg %%v23,232(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v23,232(%%r1,%2),0 \n\t"
|
"vleg %%v22,240(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v22,240(%%r1,%2),1 \n\t"
|
"vleg %%v23,248(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v23,248(%%r1,%2),1 \n\t"
|
|
||||||
"vflpdb %%v16, %%v16\n\t"
|
"vflpdb %%v16, %%v16\n\t"
|
||||||
"vflpdb %%v17, %%v17\n\t"
|
"vflpdb %%v17, %%v17\n\t"
|
||||||
"vflpdb %%v18, %%v18\n\t"
|
"vflpdb %%v18, %%v18\n\t"
|
||||||
|
|
@ -121,29 +108,24 @@ static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x)
|
||||||
"vfadb %%v17,%%v18,%%v19\n\t"
|
"vfadb %%v17,%%v18,%%v19\n\t"
|
||||||
"vfadb %%v18,%%v20,%%v21\n\t"
|
"vfadb %%v18,%%v20,%%v21\n\t"
|
||||||
"vfadb %%v19,%%v22,%%v23\n\t"
|
"vfadb %%v19,%%v22,%%v23\n\t"
|
||||||
|
|
||||||
"vfchdb %%v24,%%v16,%%v17\n\t"
|
"vfchdb %%v24,%%v16,%%v17\n\t"
|
||||||
"vfchdb %%v25,%%v18,%%v19\n\t"
|
"vfchdb %%v25,%%v18,%%v19\n\t"
|
||||||
"vsel %%v24,%%v16,%%v17,%%v24\n\t"
|
"vsel %%v24,%%v16,%%v17,%%v24\n\t"
|
||||||
"vsel %%v25,%%v18,%%v19,%%v25\n\t"
|
"vsel %%v25,%%v18,%%v19,%%v25\n\t"
|
||||||
|
|
||||||
"vfchdb %%v26,%%v24,%%v25\n\t"
|
"vfchdb %%v26,%%v24,%%v25\n\t"
|
||||||
"vsel %%v26,%%v24,%%v25,%%v26\n\t"
|
"vsel %%v26,%%v24,%%v25,%%v26\n\t"
|
||||||
|
|
||||||
"vfchdb %%v27,%%v26,%%v0\n\t"
|
"vfchdb %%v27,%%v26,%%v0\n\t"
|
||||||
"vsel %%v0,%%v26,%%v0,%%v27\n\t"
|
"vsel %%v0,%%v26,%%v0,%%v27\n\t"
|
||||||
|
|
||||||
"agfi %%r1, 256\n\t"
|
"agfi %%r1, 256\n\t"
|
||||||
"brctg %%r0, 0b \n\t"
|
"brctg %[n], 0b\n\t"
|
||||||
|
|
||||||
"vrepg %%v16,%%v0,1\n\t"
|
"vrepg %%v16,%%v0,1\n\t"
|
||||||
"wfchdb %%v17,%%v0,%%v16\n\t"
|
"wfchdb %%v17,%%v0,%%v16\n\t"
|
||||||
"vsel %%v0,%%v0,%%v16,%%v17\n\t"
|
"vsel %%v0,%%v0,%%v16,%%v17\n\t"
|
||||||
"ldr %0,%%f0 "
|
"ldr %[amax],%%f0"
|
||||||
:"=f"(amax)
|
: [amax] "=f"(amax),[n] "+&r"(n)
|
||||||
:"r"(n),"ZR"((const FLOAT (*)[n])x)
|
: "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
|
||||||
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27"
|
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
|
||||||
);
|
"v23", "v24", "v25", "v26", "v27");
|
||||||
|
|
||||||
return amax;
|
return amax;
|
||||||
}
|
}
|
||||||
|
|
@ -154,7 +136,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
FLOAT maxf = 0.0;
|
FLOAT maxf = 0.0;
|
||||||
BLASLONG inc_x2;
|
BLASLONG inc_x2;
|
||||||
|
|
||||||
if (n <= 0 || inc_x <= 0) return (maxf);
|
if (n <= 0 || inc_x <= 0)
|
||||||
|
return (maxf);
|
||||||
|
|
||||||
if (inc_x == 1) {
|
if (inc_x == 1) {
|
||||||
|
|
||||||
|
|
@ -164,9 +147,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
maxf = zamax_kernel_16(n1, x);
|
maxf = zamax_kernel_16(n1, x);
|
||||||
ix = n1 * 2;
|
ix = n1 * 2;
|
||||||
i = n1;
|
i = n1;
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
maxf = CABS1(x, 0);
|
maxf = CABS1(x, 0);
|
||||||
ix += 2;
|
ix += 2;
|
||||||
i++;
|
i++;
|
||||||
|
|
@ -208,7 +189,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
while (i < n) {
|
while (i < n) {
|
||||||
if (CABS1(x, ix) > maxf) {
|
if (CABS1(x, ix) > maxf) {
|
||||||
maxf = CABS1(x, ix);
|
maxf = CABS1(x, ix);
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/***************************************************************************
|
/***************************************************************************
|
||||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
modification, are permitted provided that the following conditions are
|
modification, are permitted provided that the following conditions are
|
||||||
|
|
@ -28,64 +28,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
|
|
||||||
#if defined(DOUBLE)
|
#define CABS1(x,i) (fabs(x[i]) + fabs(x[i + 1]))
|
||||||
#define ABS fabs
|
|
||||||
#else
|
|
||||||
#define ABS fabsf
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))
|
static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x) {
|
||||||
|
|
||||||
static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x)
|
|
||||||
{
|
|
||||||
FLOAT amin;
|
FLOAT amin;
|
||||||
|
|
||||||
__asm__ volatile (
|
__asm__("vleg %%v0,0(%[x]),0\n\t"
|
||||||
"vleg %%v0,0(%2),0 \n\t"
|
"vleg %%v16,8(%[x]),0\n\t"
|
||||||
"vleg %%v16,8(%2),0 \n\t"
|
"vleg %%v0,16(%[x]),1\n\t"
|
||||||
"vleg %%v0,16(%2),1 \n\t"
|
"vleg %%v16,24(%[x]),1\n\t"
|
||||||
"vleg %%v16,24(%2),1 \n\t"
|
|
||||||
"vflpdb %%v0,%%v0\n\t"
|
"vflpdb %%v0,%%v0\n\t"
|
||||||
"vflpdb %%v16,%%v16\n\t"
|
"vflpdb %%v16,%%v16\n\t"
|
||||||
"vfadb %%v0,%%v0,%%v16\n\t"
|
"vfadb %%v0,%%v0,%%v16\n\t"
|
||||||
"srlg %%r0,%1,4 \n\t"
|
"srlg %[n],%[n],4\n\t"
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 1, 1024(%%r1,%2) \n\t"
|
"pfd 1, 1024(%%r1,%[x])\n\t"
|
||||||
|
"vleg %%v16,0(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v16,0(%%r1,%2),0 \n\t"
|
"vleg %%v17,8(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v17,8(%%r1,%2),0 \n\t"
|
"vleg %%v16,16(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v16,16(%%r1,%2),1 \n\t"
|
"vleg %%v17,24(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v17,24(%%r1,%2),1 \n\t"
|
"vleg %%v18,32(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v18,32(%%r1,%2),0 \n\t"
|
"vleg %%v19,40(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v19,40(%%r1,%2),0 \n\t"
|
"vleg %%v18,48(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v18,48(%%r1,%2),1 \n\t"
|
"vleg %%v19,56(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v19,56(%%r1,%2),1 \n\t"
|
"vleg %%v20,64(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v20,64(%%r1,%2),0 \n\t"
|
"vleg %%v21,72(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v21,72(%%r1,%2),0 \n\t"
|
"vleg %%v20,80(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v20,80(%%r1,%2),1 \n\t"
|
"vleg %%v21,88(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v21,88(%%r1,%2),1 \n\t"
|
"vleg %%v22,96(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v22,96(%%r1,%2),0 \n\t"
|
"vleg %%v23,104(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v23,104(%%r1,%2),0 \n\t"
|
"vleg %%v22,112(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v22,112(%%r1,%2),1 \n\t"
|
"vleg %%v23,120(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v23,120(%%r1,%2),1 \n\t"
|
"vleg %%v24,128(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v24,128(%%r1,%2),0 \n\t"
|
"vleg %%v25,136(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v25,136(%%r1,%2),0 \n\t"
|
"vleg %%v24,144(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v24,144(%%r1,%2),1 \n\t"
|
"vleg %%v25,152(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v25,152(%%r1,%2),1 \n\t"
|
"vleg %%v26,160(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v26,160(%%r1,%2),0 \n\t"
|
"vleg %%v27,168(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v27,168(%%r1,%2),0 \n\t"
|
"vleg %%v26,176(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v26,176(%%r1,%2),1 \n\t"
|
"vleg %%v27,184(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v27,184(%%r1,%2),1 \n\t"
|
"vleg %%v28,192(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v28,192(%%r1,%2),0 \n\t"
|
"vleg %%v29,200(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v29,200(%%r1,%2),0 \n\t"
|
"vleg %%v28,208(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v28,208(%%r1,%2),1 \n\t"
|
"vleg %%v29,216(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v29,216(%%r1,%2),1 \n\t"
|
"vleg %%v30,224(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v30,224(%%r1,%2),0 \n\t"
|
"vleg %%v31,232(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v31,232(%%r1,%2),0 \n\t"
|
"vleg %%v30,240(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v30,240(%%r1,%2),1 \n\t"
|
"vleg %%v31,248(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v31,248(%%r1,%2),1 \n\t"
|
|
||||||
|
|
||||||
"vflpdb %%v16,%%v16\n\t"
|
"vflpdb %%v16,%%v16\n\t"
|
||||||
"vflpdb %%v17,%%v17\n\t"
|
"vflpdb %%v17,%%v17\n\t"
|
||||||
"vflpdb %%v18,%%v18\n\t"
|
"vflpdb %%v18,%%v18\n\t"
|
||||||
|
|
@ -102,7 +92,6 @@ static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x)
|
||||||
"vflpdb %%v29,%%v29\n\t"
|
"vflpdb %%v29,%%v29\n\t"
|
||||||
"vflpdb %%v30,%%v30\n\t"
|
"vflpdb %%v30,%%v30\n\t"
|
||||||
"vflpdb %%v31,%%v31\n\t"
|
"vflpdb %%v31,%%v31\n\t"
|
||||||
|
|
||||||
"vfadb %%v16,%%v16,%%v17\n\t"
|
"vfadb %%v16,%%v16,%%v17\n\t"
|
||||||
"vfadb %%v18,%%v18,%%v19\n\t"
|
"vfadb %%v18,%%v18,%%v19\n\t"
|
||||||
"vfadb %%v20,%%v20,%%v21\n\t"
|
"vfadb %%v20,%%v20,%%v21\n\t"
|
||||||
|
|
@ -111,29 +100,23 @@ static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x)
|
||||||
"vfadb %%v26,%%v26,%%v27\n\t"
|
"vfadb %%v26,%%v26,%%v27\n\t"
|
||||||
"vfadb %%v28,%%v28,%%v29\n\t"
|
"vfadb %%v28,%%v28,%%v29\n\t"
|
||||||
"vfadb %%v30,%%v30,%%v31\n\t"
|
"vfadb %%v30,%%v30,%%v31\n\t"
|
||||||
|
|
||||||
"vfmindb %%v16,%%v16,%%v24,0\n\t"
|
"vfmindb %%v16,%%v16,%%v24,0\n\t"
|
||||||
"vfmindb %%v18,%%v18,%%v26,0\n\t"
|
"vfmindb %%v18,%%v18,%%v26,0\n\t"
|
||||||
"vfmindb %%v20,%%v20,%%v28,0\n\t"
|
"vfmindb %%v20,%%v20,%%v28,0\n\t"
|
||||||
"vfmindb %%v22,%%v22,%%v30,0\n\t"
|
"vfmindb %%v22,%%v22,%%v30,0\n\t"
|
||||||
|
|
||||||
"vfmindb %%v16,%%v16,%%v20,0\n\t"
|
"vfmindb %%v16,%%v16,%%v20,0\n\t"
|
||||||
"vfmindb %%v18,%%v18,%%v22,0\n\t"
|
"vfmindb %%v18,%%v18,%%v22,0\n\t"
|
||||||
|
|
||||||
"vfmindb %%v16,%%v16,%%v18,0\n\t"
|
"vfmindb %%v16,%%v16,%%v18,0\n\t"
|
||||||
|
|
||||||
"vfmindb %%v0,%%v0,%%v16,0\n\t"
|
"vfmindb %%v0,%%v0,%%v16,0\n\t"
|
||||||
|
|
||||||
"agfi %%r1, 256\n\t"
|
"agfi %%r1, 256\n\t"
|
||||||
"brctg %%r0, 0b \n\t"
|
"brctg %[n], 0b\n\t"
|
||||||
|
|
||||||
"vrepg %%v16,%%v0,1\n\t"
|
"vrepg %%v16,%%v0,1\n\t"
|
||||||
"wfmindb %%v0,%%v0,%%v16,0\n\t"
|
"wfmindb %%v0,%%v0,%%v16,0\n\t"
|
||||||
"ldr %0,%%f0 "
|
"ldr %[amin],%%f0"
|
||||||
:"=f"(amin)
|
: [amin] "=f"(amin),[n] "+&r"(n)
|
||||||
:"r"(n),"ZR"((const FLOAT (*)[n])x)
|
: "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
|
||||||
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
|
||||||
);
|
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
|
||||||
|
|
||||||
return amin;
|
return amin;
|
||||||
}
|
}
|
||||||
|
|
@ -144,7 +127,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
FLOAT minf = 0.0;
|
FLOAT minf = 0.0;
|
||||||
BLASLONG inc_x2;
|
BLASLONG inc_x2;
|
||||||
|
|
||||||
if (n <= 0 || inc_x <= 0) return (minf);
|
if (n <= 0 || inc_x <= 0)
|
||||||
|
return (minf);
|
||||||
|
|
||||||
if (inc_x == 1) {
|
if (inc_x == 1) {
|
||||||
|
|
||||||
|
|
@ -154,9 +138,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
minf = zamin_kernel_16(n1, x);
|
minf = zamin_kernel_16(n1, x);
|
||||||
ix = n1 * 2;
|
ix = n1 * 2;
|
||||||
i = n1;
|
i = n1;
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
minf = CABS1(x, 0);
|
minf = CABS1(x, 0);
|
||||||
ix += 2;
|
ix += 2;
|
||||||
i++;
|
i++;
|
||||||
|
|
@ -198,7 +180,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
while (i < n) {
|
while (i < n) {
|
||||||
if (CABS1(x, ix) < minf) {
|
if (CABS1(x, ix) < minf) {
|
||||||
minf = CABS1(x, ix);
|
minf = CABS1(x, ix);
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/***************************************************************************
|
/***************************************************************************
|
||||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
modification, are permitted provided that the following conditions are
|
modification, are permitted provided that the following conditions are
|
||||||
|
|
@ -28,47 +28,38 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
|
|
||||||
#if defined(DOUBLE)
|
#define CABS1(x,i) (fabs(x[i]) + fabs(x[i + 1]))
|
||||||
#define ABS fabs
|
|
||||||
#else
|
|
||||||
#define ABS fabsf
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))
|
static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x) {
|
||||||
|
|
||||||
static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x)
|
|
||||||
{
|
|
||||||
FLOAT amin;
|
FLOAT amin;
|
||||||
|
|
||||||
__asm__ volatile (
|
__asm__("vleg %%v0,0(%[x]),0\n\t"
|
||||||
"vleg %%v0,0(%2),0 \n\t"
|
"vleg %%v16,8(%[x]),0\n\t"
|
||||||
"vleg %%v16,8(%2),0 \n\t"
|
"vleg %%v0,16(%[x]),1\n\t"
|
||||||
"vleg %%v0,16(%2),1 \n\t"
|
"vleg %%v16,24(%[x]),1\n\t"
|
||||||
"vleg %%v16,24(%2),1 \n\t"
|
|
||||||
"vflpdb %%v0,%%v0\n\t"
|
"vflpdb %%v0,%%v0\n\t"
|
||||||
"vflpdb %%v16,%%v16\n\t"
|
"vflpdb %%v16,%%v16\n\t"
|
||||||
"vfadb %%v0,%%v0,%%v16\n\t"
|
"vfadb %%v0,%%v0,%%v16\n\t"
|
||||||
"srlg %%r0,%1,4 \n\t"
|
"srlg %[n],%[n],4\n\t"
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 1, 1024(%%r1,%2) \n\t"
|
"pfd 1, 1024(%%r1,%[x])\n\t"
|
||||||
|
"vleg %%v16,0(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v16,0(%%r1,%2),0 \n\t"
|
"vleg %%v17,8(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v17,8(%%r1,%2),0 \n\t"
|
"vleg %%v16,16(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v16,16(%%r1,%2),1 \n\t"
|
"vleg %%v17,24(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v17,24(%%r1,%2),1 \n\t"
|
"vleg %%v18,32(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v18,32(%%r1,%2),0 \n\t"
|
"vleg %%v19,40(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v19,40(%%r1,%2),0 \n\t"
|
"vleg %%v18,48(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v18,48(%%r1,%2),1 \n\t"
|
"vleg %%v19,56(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v19,56(%%r1,%2),1 \n\t"
|
"vleg %%v20,64(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v20,64(%%r1,%2),0 \n\t"
|
"vleg %%v21,72(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v21,72(%%r1,%2),0 \n\t"
|
"vleg %%v20,80(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v20,80(%%r1,%2),1 \n\t"
|
"vleg %%v21,88(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v21,88(%%r1,%2),1 \n\t"
|
"vleg %%v22,96(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v22,96(%%r1,%2),0 \n\t"
|
"vleg %%v23,104(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v23,104(%%r1,%2),0 \n\t"
|
"vleg %%v22,112(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v22,112(%%r1,%2),1 \n\t"
|
"vleg %%v23,120(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v23,120(%%r1,%2),1 \n\t"
|
|
||||||
"vflpdb %%v16, %%v16\n\t"
|
"vflpdb %%v16, %%v16\n\t"
|
||||||
"vflpdb %%v17, %%v17\n\t"
|
"vflpdb %%v17, %%v17\n\t"
|
||||||
"vflpdb %%v18, %%v18\n\t"
|
"vflpdb %%v18, %%v18\n\t"
|
||||||
|
|
@ -81,34 +72,30 @@ static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x)
|
||||||
"vfadb %%v17,%%v18,%%v19\n\t"
|
"vfadb %%v17,%%v18,%%v19\n\t"
|
||||||
"vfadb %%v18,%%v20,%%v21\n\t"
|
"vfadb %%v18,%%v20,%%v21\n\t"
|
||||||
"vfadb %%v19,%%v22,%%v23\n\t"
|
"vfadb %%v19,%%v22,%%v23\n\t"
|
||||||
|
|
||||||
"vfchdb %%v24,%%v17,%%v16\n\t"
|
"vfchdb %%v24,%%v17,%%v16\n\t"
|
||||||
"vfchdb %%v25,%%v19,%%v18\n\t"
|
"vfchdb %%v25,%%v19,%%v18\n\t"
|
||||||
"vsel %%v24,%%v16,%%v17,%%v24\n\t"
|
"vsel %%v24,%%v16,%%v17,%%v24\n\t"
|
||||||
"vsel %%v25,%%v18,%%v19,%%v25\n\t"
|
"vsel %%v25,%%v18,%%v19,%%v25\n\t"
|
||||||
|
|
||||||
"vfchdb %%v26,%%v25,%%v24\n\t"
|
"vfchdb %%v26,%%v25,%%v24\n\t"
|
||||||
"vsel %%v26,%%v24,%%v25,%%v26\n\t"
|
"vsel %%v26,%%v24,%%v25,%%v26\n\t"
|
||||||
|
|
||||||
"vfchdb %%v27,%%v0,%%v26\n\t"
|
"vfchdb %%v27,%%v0,%%v26\n\t"
|
||||||
"vsel %%v0,%%v26,%%v0,%%v27\n\t"
|
"vsel %%v0,%%v26,%%v0,%%v27\n\t"
|
||||||
|
"vleg %%v16,128(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v16,128(%%r1,%2),0 \n\t"
|
"vleg %%v17,136(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v17,136(%%r1,%2),0 \n\t"
|
"vleg %%v16,144(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v16,144(%%r1,%2),1 \n\t"
|
"vleg %%v17,152(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v17,152(%%r1,%2),1 \n\t"
|
"vleg %%v18,160(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v18,160(%%r1,%2),0 \n\t"
|
"vleg %%v19,168(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v19,168(%%r1,%2),0 \n\t"
|
"vleg %%v18,176(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v18,176(%%r1,%2),1 \n\t"
|
"vleg %%v19,184(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v19,184(%%r1,%2),1 \n\t"
|
"vleg %%v20,192(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v20,192(%%r1,%2),0 \n\t"
|
"vleg %%v21,200(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v21,200(%%r1,%2),0 \n\t"
|
"vleg %%v20,208(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v20,208(%%r1,%2),1 \n\t"
|
"vleg %%v21,216(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v21,216(%%r1,%2),1 \n\t"
|
"vleg %%v22,224(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v22,224(%%r1,%2),0 \n\t"
|
"vleg %%v23,232(%%r1,%[x]),0\n\t"
|
||||||
"vleg %%v23,232(%%r1,%2),0 \n\t"
|
"vleg %%v22,240(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v22,240(%%r1,%2),1 \n\t"
|
"vleg %%v23,248(%%r1,%[x]),1\n\t"
|
||||||
"vleg %%v23,248(%%r1,%2),1 \n\t"
|
|
||||||
"vflpdb %%v16, %%v16\n\t"
|
"vflpdb %%v16, %%v16\n\t"
|
||||||
"vflpdb %%v17, %%v17\n\t"
|
"vflpdb %%v17, %%v17\n\t"
|
||||||
"vflpdb %%v18, %%v18\n\t"
|
"vflpdb %%v18, %%v18\n\t"
|
||||||
|
|
@ -121,29 +108,24 @@ static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x)
|
||||||
"vfadb %%v17,%%v18,%%v19\n\t"
|
"vfadb %%v17,%%v18,%%v19\n\t"
|
||||||
"vfadb %%v18,%%v20,%%v21\n\t"
|
"vfadb %%v18,%%v20,%%v21\n\t"
|
||||||
"vfadb %%v19,%%v22,%%v23\n\t"
|
"vfadb %%v19,%%v22,%%v23\n\t"
|
||||||
|
|
||||||
"vfchdb %%v24,%%v17,%%v16\n\t"
|
"vfchdb %%v24,%%v17,%%v16\n\t"
|
||||||
"vfchdb %%v25,%%v19,%%v18\n\t"
|
"vfchdb %%v25,%%v19,%%v18\n\t"
|
||||||
"vsel %%v24,%%v16,%%v17,%%v24\n\t"
|
"vsel %%v24,%%v16,%%v17,%%v24\n\t"
|
||||||
"vsel %%v25,%%v18,%%v19,%%v25\n\t"
|
"vsel %%v25,%%v18,%%v19,%%v25\n\t"
|
||||||
|
|
||||||
"vfchdb %%v26,%%v25,%%v24\n\t"
|
"vfchdb %%v26,%%v25,%%v24\n\t"
|
||||||
"vsel %%v26,%%v24,%%v25,%%v26\n\t"
|
"vsel %%v26,%%v24,%%v25,%%v26\n\t"
|
||||||
|
|
||||||
"vfchdb %%v27,%%v0,%%v26\n\t"
|
"vfchdb %%v27,%%v0,%%v26\n\t"
|
||||||
"vsel %%v0,%%v26,%%v0,%%v27\n\t"
|
"vsel %%v0,%%v26,%%v0,%%v27\n\t"
|
||||||
|
|
||||||
"agfi %%r1, 256\n\t"
|
"agfi %%r1, 256\n\t"
|
||||||
"brctg %%r0, 0b \n\t"
|
"brctg %[n], 0b\n\t"
|
||||||
|
|
||||||
"vrepg %%v16,%%v0,1\n\t"
|
"vrepg %%v16,%%v0,1\n\t"
|
||||||
"wfchdb %%v17,%%v16,%%v0\n\t"
|
"wfchdb %%v17,%%v16,%%v0\n\t"
|
||||||
"vsel %%v0,%%v0,%%v16,%%v17\n\t"
|
"vsel %%v0,%%v0,%%v16,%%v17\n\t"
|
||||||
"ldr %0,%%f0 "
|
"ldr %[amin],%%f0"
|
||||||
:"=f"(amin)
|
: [amin] "=f"(amin),[n] "+&r"(n)
|
||||||
:"r"(n),"ZR"((const FLOAT (*)[n])x)
|
: "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
|
||||||
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27"
|
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
|
||||||
);
|
"v23", "v24", "v25", "v26", "v27");
|
||||||
|
|
||||||
return amin;
|
return amin;
|
||||||
}
|
}
|
||||||
|
|
@ -154,7 +136,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
FLOAT minf = 0.0;
|
FLOAT minf = 0.0;
|
||||||
BLASLONG inc_x2;
|
BLASLONG inc_x2;
|
||||||
|
|
||||||
if (n <= 0 || inc_x <= 0) return (minf);
|
if (n <= 0 || inc_x <= 0)
|
||||||
|
return (minf);
|
||||||
|
|
||||||
if (inc_x == 1) {
|
if (inc_x == 1) {
|
||||||
|
|
||||||
|
|
@ -164,9 +147,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
minf = zamin_kernel_16(n1, x);
|
minf = zamin_kernel_16(n1, x);
|
||||||
ix = n1 * 2;
|
ix = n1 * 2;
|
||||||
i = n1;
|
i = n1;
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
minf = CABS1(x, 0);
|
minf = CABS1(x, 0);
|
||||||
ix += 2;
|
ix += 2;
|
||||||
i++;
|
i++;
|
||||||
|
|
@ -208,7 +189,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
while (i < n) {
|
while (i < n) {
|
||||||
if (CABS1(x, ix) < minf) {
|
if (CABS1(x, ix) < minf) {
|
||||||
minf = CABS1(x, ix);
|
minf = CABS1(x, ix);
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/***************************************************************************
|
/***************************************************************************
|
||||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
modification, are permitted provided that the following conditions are
|
modification, are permitted provided that the following conditions are
|
||||||
|
|
@ -28,34 +28,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
|
|
||||||
#if defined(DOUBLE)
|
|
||||||
#define ABS fabs
|
#define ABS fabs
|
||||||
#else
|
|
||||||
#define ABS fabsf
|
|
||||||
#endif
|
|
||||||
|
|
||||||
static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x)
|
static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x) {
|
||||||
{
|
|
||||||
FLOAT asum;
|
FLOAT asum;
|
||||||
|
|
||||||
__asm__ (
|
__asm__("vzero %%v24\n\t"
|
||||||
"vzero %%v0 \n\t"
|
"vzero %%v25\n\t"
|
||||||
"vzero %%v1 \n\t"
|
"vzero %%v26\n\t"
|
||||||
"vzero %%v2 \n\t"
|
"vzero %%v27\n\t"
|
||||||
"vzero %%v3 \n\t"
|
"vzero %%v28\n\t"
|
||||||
"srlg %%r0,%1,4 \n\t"
|
"vzero %%v29\n\t"
|
||||||
|
"vzero %%v30\n\t"
|
||||||
|
"vzero %%v31\n\t"
|
||||||
|
"srlg %[n],%[n],4\n\t"
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 1, 1024(%%r1,%2) \n\t"
|
"pfd 1, 1024(%%r1,%[x])\n\t"
|
||||||
"vl %%v16, 0(%%r1,%2) \n\t"
|
"vl %%v16, 0(%%r1,%[x])\n\t"
|
||||||
"vl %%v17, 16(%%r1,%2) \n\t"
|
"vl %%v17, 16(%%r1,%[x])\n\t"
|
||||||
"vl %%v18, 32(%%r1,%2) \n\t"
|
"vl %%v18, 32(%%r1,%[x])\n\t"
|
||||||
"vl %%v19, 48(%%r1,%2) \n\t"
|
"vl %%v19, 48(%%r1,%[x])\n\t"
|
||||||
"vl %%v20, 64(%%r1,%2) \n\t"
|
"vl %%v20, 64(%%r1,%[x])\n\t"
|
||||||
"vl %%v21, 80(%%r1,%2) \n\t"
|
"vl %%v21, 80(%%r1,%[x])\n\t"
|
||||||
"vl %%v22, 96(%%r1,%2) \n\t"
|
"vl %%v22, 96(%%r1,%[x])\n\t"
|
||||||
"vl %%v23, 112(%%r1,%2) \n\t"
|
"vl %%v23, 112(%%r1,%[x])\n\t"
|
||||||
|
|
||||||
"vflpdb %%v16, %%v16\n\t"
|
"vflpdb %%v16, %%v16\n\t"
|
||||||
"vflpdb %%v17, %%v17\n\t"
|
"vflpdb %%v17, %%v17\n\t"
|
||||||
"vflpdb %%v18, %%v18\n\t"
|
"vflpdb %%v18, %%v18\n\t"
|
||||||
|
|
@ -64,25 +61,22 @@ static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x)
|
||||||
"vflpdb %%v21, %%v21\n\t"
|
"vflpdb %%v21, %%v21\n\t"
|
||||||
"vflpdb %%v22, %%v22\n\t"
|
"vflpdb %%v22, %%v22\n\t"
|
||||||
"vflpdb %%v23, %%v23\n\t"
|
"vflpdb %%v23, %%v23\n\t"
|
||||||
|
"vfadb %%v24,%%v24,%%v16\n\t"
|
||||||
"vfadb %%v0,%%v0,%%v16 \n\t"
|
"vfadb %%v25,%%v25,%%v17\n\t"
|
||||||
"vfadb %%v1,%%v1,%%v17 \n\t"
|
"vfadb %%v26,%%v26,%%v18\n\t"
|
||||||
"vfadb %%v2,%%v2,%%v18 \n\t"
|
"vfadb %%v27,%%v27,%%v19\n\t"
|
||||||
"vfadb %%v3,%%v3,%%v19 \n\t"
|
"vfadb %%v28,%%v28,%%v20\n\t"
|
||||||
"vfadb %%v0,%%v0,%%v20 \n\t"
|
"vfadb %%v29,%%v29,%%v21\n\t"
|
||||||
"vfadb %%v1,%%v1,%%v21 \n\t"
|
"vfadb %%v30,%%v30,%%v22\n\t"
|
||||||
"vfadb %%v2,%%v2,%%v22 \n\t"
|
"vfadb %%v31,%%v31,%%v23\n\t"
|
||||||
"vfadb %%v3,%%v3,%%v23 \n\t"
|
"vl %%v16, 128(%%r1,%[x])\n\t"
|
||||||
|
"vl %%v17, 144(%%r1,%[x])\n\t"
|
||||||
"vl %%v16, 128(%%r1,%2) \n\t"
|
"vl %%v18, 160(%%r1,%[x])\n\t"
|
||||||
"vl %%v17, 144(%%r1,%2) \n\t"
|
"vl %%v19, 176(%%r1,%[x])\n\t"
|
||||||
"vl %%v18, 160(%%r1,%2) \n\t"
|
"vl %%v20, 192(%%r1,%[x])\n\t"
|
||||||
"vl %%v19, 176(%%r1,%2) \n\t"
|
"vl %%v21, 208(%%r1,%[x])\n\t"
|
||||||
"vl %%v20, 192(%%r1,%2) \n\t"
|
"vl %%v22, 224(%%r1,%[x])\n\t"
|
||||||
"vl %%v21, 208(%%r1,%2) \n\t"
|
"vl %%v23, 240(%%r1,%[x])\n\t"
|
||||||
"vl %%v22, 224(%%r1,%2) \n\t"
|
|
||||||
"vl %%v23, 240(%%r1,%2) \n\t"
|
|
||||||
|
|
||||||
"vflpdb %%v16, %%v16\n\t"
|
"vflpdb %%v16, %%v16\n\t"
|
||||||
"vflpdb %%v17, %%v17\n\t"
|
"vflpdb %%v17, %%v17\n\t"
|
||||||
"vflpdb %%v18, %%v18\n\t"
|
"vflpdb %%v18, %%v18\n\t"
|
||||||
|
|
@ -91,68 +85,64 @@ static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x)
|
||||||
"vflpdb %%v21, %%v21\n\t"
|
"vflpdb %%v21, %%v21\n\t"
|
||||||
"vflpdb %%v22, %%v22\n\t"
|
"vflpdb %%v22, %%v22\n\t"
|
||||||
"vflpdb %%v23, %%v23\n\t"
|
"vflpdb %%v23, %%v23\n\t"
|
||||||
|
"vfadb %%v24,%%v24,%%v16\n\t"
|
||||||
"vfadb %%v0,%%v0,%%v16 \n\t"
|
"vfadb %%v25,%%v25,%%v17\n\t"
|
||||||
"vfadb %%v1,%%v1,%%v17 \n\t"
|
"vfadb %%v26,%%v26,%%v18\n\t"
|
||||||
"vfadb %%v2,%%v2,%%v18 \n\t"
|
"vfadb %%v27,%%v27,%%v19\n\t"
|
||||||
"vfadb %%v3,%%v3,%%v19 \n\t"
|
"vfadb %%v28,%%v28,%%v20\n\t"
|
||||||
"vfadb %%v0,%%v0,%%v20 \n\t"
|
"vfadb %%v29,%%v29,%%v21\n\t"
|
||||||
"vfadb %%v1,%%v1,%%v21 \n\t"
|
"vfadb %%v30,%%v30,%%v22\n\t"
|
||||||
"vfadb %%v2,%%v2,%%v22 \n\t"
|
"vfadb %%v31,%%v31,%%v23\n\t"
|
||||||
"vfadb %%v3,%%v3,%%v23 \n\t"
|
|
||||||
|
|
||||||
"agfi %%r1,256\n\t"
|
"agfi %%r1,256\n\t"
|
||||||
"brctg %%r0,0b \n\t"
|
"brctg %[n],0b\n\t"
|
||||||
"vfadb %%v0,%%v0,%%v1 \n\t"
|
"vfadb %%v24,%%v24,%%v25\n\t"
|
||||||
"vfadb %%v0,%%v0,%%v2 \n\t"
|
"vfadb %%v24,%%v24,%%v26\n\t"
|
||||||
"vfadb %%v0,%%v0,%%v3 \n\t"
|
"vfadb %%v24,%%v24,%%v27\n\t"
|
||||||
"vrepg %%v1,%%v0,1 \n\t"
|
"vfadb %%v24,%%v24,%%v28\n\t"
|
||||||
"adbr %%f0,%%f1 \n\t"
|
"vfadb %%v24,%%v24,%%v29\n\t"
|
||||||
"ldr %0,%%f0 "
|
"vfadb %%v24,%%v24,%%v30\n\t"
|
||||||
:"=f"(asum)
|
"vfadb %%v24,%%v24,%%v31\n\t"
|
||||||
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x)
|
"vrepg %%v25,%%v24,1\n\t"
|
||||||
:"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23"
|
"vfadb %%v24,%%v24,%%v25\n\t"
|
||||||
);
|
"vsteg %%v24,%[asum],0"
|
||||||
|
: [asum] "=Q"(asum),[n] "+&r"(n)
|
||||||
|
: "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
|
||||||
|
: "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
|
||||||
|
"v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
|
||||||
|
|
||||||
return asum;
|
return asum;
|
||||||
}
|
}
|
||||||
|
|
||||||
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
||||||
{
|
|
||||||
BLASLONG i = 0;
|
BLASLONG i = 0;
|
||||||
BLASLONG ip = 0;
|
BLASLONG ip = 0;
|
||||||
FLOAT sumf = 0.0;
|
FLOAT sumf = 0.0;
|
||||||
BLASLONG n1;
|
BLASLONG n1;
|
||||||
BLASLONG inc_x2;
|
BLASLONG inc_x2;
|
||||||
|
|
||||||
if (n <= 0 || inc_x <= 0) return(sumf);
|
if (n <= 0 || inc_x <= 0)
|
||||||
|
return (sumf);
|
||||||
|
|
||||||
if ( inc_x == 1 )
|
if (inc_x == 1) {
|
||||||
{
|
|
||||||
|
|
||||||
n1 = n & -16;
|
n1 = n & -16;
|
||||||
if ( n1 > 0 )
|
if (n1 > 0) {
|
||||||
{
|
|
||||||
|
|
||||||
sumf = zasum_kernel_16(n1, x);
|
sumf = zasum_kernel_16(n1, x);
|
||||||
i = n1;
|
i = n1;
|
||||||
ip = 2 * n1;
|
ip = 2 * n1;
|
||||||
}
|
}
|
||||||
|
|
||||||
while(i < n)
|
while (i < n) {
|
||||||
{
|
|
||||||
sumf += ABS(x[ip]) + ABS(x[ip + 1]);
|
sumf += ABS(x[ip]) + ABS(x[ip + 1]);
|
||||||
i++;
|
i++;
|
||||||
ip += 2;
|
ip += 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
inc_x2 = 2 * inc_x;
|
inc_x2 = 2 * inc_x;
|
||||||
|
|
||||||
while(i < n)
|
while (i < n) {
|
||||||
{
|
|
||||||
sumf += ABS(x[ip]) + ABS(x[ip + 1]);
|
sumf += ABS(x[ip]) + ABS(x[ip + 1]);
|
||||||
ip += inc_x2;
|
ip += inc_x2;
|
||||||
i++;
|
i++;
|
||||||
|
|
@ -161,5 +151,3 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||||
}
|
}
|
||||||
return (sumf);
|
return (sumf);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/***************************************************************************
|
/***************************************************************************
|
||||||
Copyright (c) 2017, The OpenBLAS Project
|
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
modification, are permitted provided that the following conditions are
|
modification, are permitted provided that the following conditions are
|
||||||
|
|
@ -27,96 +27,91 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) {
|
||||||
{
|
__asm__(
|
||||||
__asm__ volatile(
|
|
||||||
#if !defined(CONJ)
|
#if !defined(CONJ)
|
||||||
"vlrepg %%v0,0(%3) \n\t"
|
"vlrepg %%v0,0(%[alpha])\n\t"
|
||||||
"vleg %%v1,8(%3),0 \n\t"
|
"vleg %%v1,8(%[alpha]),0\n\t"
|
||||||
"wflcdb %%v1,%%v1\n\t"
|
"wflcdb %%v1,%%v1\n\t"
|
||||||
"vleg %%v1,8(%3),1 \n\t"
|
"vleg %%v1,8(%[alpha]),1\n\t"
|
||||||
#else
|
#else
|
||||||
"vleg %%v0,0(%3),1 \n\t"
|
"vleg %%v0,0(%[alpha]),1\n\t"
|
||||||
"vflcdb %%v0,%%v0\n\t"
|
"vflcdb %%v0,%%v0\n\t"
|
||||||
"vleg %%v0,0(%3),0 \n\t"
|
"vleg %%v0,0(%[alpha]),0\n\t"
|
||||||
"vlrepg %%v1,8(%3) \n\t"
|
"vlrepg %%v1,8(%[alpha])\n\t"
|
||||||
#endif
|
#endif
|
||||||
"srlg %%r0,%0,3 \n\t"
|
"srlg %[n],%[n],3\n\t"
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 1, 1024(%%r1,%1) \n\t"
|
"pfd 1, 1024(%%r1,%[x])\n\t"
|
||||||
"pfd 2, 1024(%%r1,%2) \n\t"
|
"pfd 2, 1024(%%r1,%[y])\n\t"
|
||||||
|
"vl %%v8,0(%%r1,%[x])\n\t"
|
||||||
"vl %%v16,0(%%r1,%1) \n\t"
|
"vl %%v9,16(%%r1,%[x])\n\t"
|
||||||
"vl %%v17,16(%%r1,%1) \n\t"
|
"vl %%v10,32(%%r1,%[x])\n\t"
|
||||||
"vl %%v18,32(%%r1,%1) \n\t"
|
"vl %%v11,48(%%r1,%[x])\n\t"
|
||||||
"vl %%v19,48(%%r1,%1) \n\t"
|
"vl %%v12,0(%%r1,%[y])\n\t"
|
||||||
"vl %%v20,0(%%r1,%2) \n\t"
|
"vl %%v13,16(%%r1,%[y])\n\t"
|
||||||
"vl %%v21,16(%%r1,%2) \n\t"
|
"vl %%v14,32(%%r1,%[y])\n\t"
|
||||||
"vl %%v22,32(%%r1,%2) \n\t"
|
"vl %%v15,48(%%r1,%[y])\n\t"
|
||||||
"vl %%v23,48(%%r1,%2) \n\t"
|
"vl %%v16,64(%%r1,%[x])\n\t"
|
||||||
"vpdi %%v24,%%v16,%%v16,4 \n\t"
|
"vl %%v17,80(%%r1,%[x])\n\t"
|
||||||
"vpdi %%v25,%%v17,%%v17,4 \n\t"
|
"vl %%v18,96(%%r1,%[x])\n\t"
|
||||||
"vpdi %%v26,%%v18,%%v18,4 \n\t"
|
"vl %%v19,112(%%r1,%[x])\n\t"
|
||||||
"vpdi %%v27,%%v19,%%v19,4 \n\t"
|
"vl %%v20,64(%%r1,%[y])\n\t"
|
||||||
|
"vl %%v21,80(%%r1,%[y])\n\t"
|
||||||
"vfmadb %%v28,%%v16,%%v0,%%v20 \n\t"
|
"vl %%v22,96(%%r1,%[y])\n\t"
|
||||||
"vfmadb %%v29,%%v17,%%v0,%%v21 \n\t"
|
"vl %%v23,112(%%r1,%[y])\n\t"
|
||||||
"vfmadb %%v30,%%v18,%%v0,%%v22 \n\t"
|
"vpdi %%v24,%%v8,%%v8,4\n\t"
|
||||||
"vfmadb %%v31,%%v19,%%v0,%%v23 \n\t"
|
"vpdi %%v25,%%v9,%%v9,4\n\t"
|
||||||
|
"vpdi %%v26,%%v10,%%v10,4\n\t"
|
||||||
"vfmadb %%v28,%%v24,%%v1,%%v28 \n\t"
|
"vpdi %%v27,%%v11,%%v11,4\n\t"
|
||||||
"vfmadb %%v29,%%v25,%%v1,%%v29 \n\t"
|
"vpdi %%v28,%%v16,%%v16,4\n\t"
|
||||||
"vfmadb %%v30,%%v26,%%v1,%%v30 \n\t"
|
"vpdi %%v29,%%v17,%%v17,4\n\t"
|
||||||
"vfmadb %%v31,%%v27,%%v1,%%v31 \n\t"
|
"vpdi %%v30,%%v18,%%v18,4\n\t"
|
||||||
|
"vpdi %%v31,%%v19,%%v19,4\n\t"
|
||||||
"vst %%v28,0(%%r1,%2) \n\t"
|
"vfmadb %%v8,%%v8,%%v0,%%v12\n\t"
|
||||||
"vst %%v29,16(%%r1,%2) \n\t"
|
"vfmadb %%v9,%%v9,%%v0,%%v13\n\t"
|
||||||
"vst %%v30,32(%%r1,%2) \n\t"
|
"vfmadb %%v10,%%v10,%%v0,%%v14\n\t"
|
||||||
"vst %%v31,48(%%r1,%2) \n\t"
|
"vfmadb %%v11,%%v11,%%v0,%%v15\n\t"
|
||||||
|
"vfmadb %%v16,%%v16,%%v0,%%v20\n\t"
|
||||||
"vl %%v16,64(%%r1,%1) \n\t"
|
"vfmadb %%v17,%%v17,%%v0,%%v21\n\t"
|
||||||
"vl %%v17,80(%%r1,%1) \n\t"
|
"vfmadb %%v18,%%v18,%%v0,%%v22\n\t"
|
||||||
"vl %%v18,96(%%r1,%1) \n\t"
|
"vfmadb %%v19,%%v19,%%v0,%%v23\n\t"
|
||||||
"vl %%v19,112(%%r1,%1) \n\t"
|
"vfmadb %%v8,%%v24,%%v1,%%v8\n\t"
|
||||||
"vl %%v20,64(%%r1,%2) \n\t"
|
"vfmadb %%v9,%%v25,%%v1,%%v9\n\t"
|
||||||
"vl %%v21,80(%%r1,%2) \n\t"
|
"vfmadb %%v10,%%v26,%%v1,%%v10\n\t"
|
||||||
"vl %%v22,96(%%r1,%2) \n\t"
|
"vfmadb %%v11,%%v27,%%v1,%%v11\n\t"
|
||||||
"vl %%v23,112(%%r1,%2) \n\t"
|
"vfmadb %%v16,%%v28,%%v1,%%v16\n\t"
|
||||||
"vpdi %%v24,%%v16,%%v16,4 \n\t"
|
"vfmadb %%v17,%%v29,%%v1,%%v17\n\t"
|
||||||
"vpdi %%v25,%%v17,%%v17,4 \n\t"
|
"vfmadb %%v18,%%v30,%%v1,%%v18\n\t"
|
||||||
"vpdi %%v26,%%v18,%%v18,4 \n\t"
|
"vfmadb %%v19,%%v31,%%v1,%%v19\n\t"
|
||||||
"vpdi %%v27,%%v19,%%v19,4 \n\t"
|
"vst %%v8,0(%%r1,%[y])\n\t"
|
||||||
|
"vst %%v9,16(%%r1,%[y])\n\t"
|
||||||
"vfmadb %%v28,%%v16,%%v0,%%v20 \n\t"
|
"vst %%v10,32(%%r1,%[y])\n\t"
|
||||||
"vfmadb %%v29,%%v17,%%v0,%%v21 \n\t"
|
"vst %%v11,48(%%r1,%[y])\n\t"
|
||||||
"vfmadb %%v30,%%v18,%%v0,%%v22 \n\t"
|
"vst %%v16,64(%%r1,%[y])\n\t"
|
||||||
"vfmadb %%v31,%%v19,%%v0,%%v23 \n\t"
|
"vst %%v17,80(%%r1,%[y])\n\t"
|
||||||
|
"vst %%v18,96(%%r1,%[y])\n\t"
|
||||||
"vfmadb %%v28,%%v24,%%v1,%%v28 \n\t"
|
"vst %%v19,112(%%r1,%[y])\n\t"
|
||||||
"vfmadb %%v29,%%v25,%%v1,%%v29 \n\t"
|
|
||||||
"vfmadb %%v30,%%v26,%%v1,%%v30 \n\t"
|
|
||||||
"vfmadb %%v31,%%v27,%%v1,%%v31 \n\t"
|
|
||||||
|
|
||||||
"vst %%v28,64(%%r1,%2) \n\t"
|
|
||||||
"vst %%v29,80(%%r1,%2) \n\t"
|
|
||||||
"vst %%v30,96(%%r1,%2) \n\t"
|
|
||||||
"vst %%v31,112(%%r1,%2) \n\t"
|
|
||||||
|
|
||||||
"agfi %%r1,128\n\t"
|
"agfi %%r1,128\n\t"
|
||||||
"brctg %%r0,0b "
|
"brctg %[n],0b"
|
||||||
:
|
: "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n)
|
||||||
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"ZQ"((const FLOAT (*)[2])alpha)
|
: [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x),
|
||||||
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
"m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha)
|
||||||
);
|
: "cc", "r1", "v0", "v1", "v8", "v9", "v10", "v11", "v12", "v13",
|
||||||
|
"v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
|
||||||
|
"v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
|
||||||
}
|
}
|
||||||
|
|
||||||
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) {
|
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
|
||||||
|
FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
|
||||||
|
BLASLONG dummy2) {
|
||||||
BLASLONG i = 0;
|
BLASLONG i = 0;
|
||||||
BLASLONG ix = 0, iy = 0;
|
BLASLONG ix = 0, iy = 0;
|
||||||
FLOAT da[2] __attribute__ ((aligned(16)));
|
FLOAT da[2] __attribute__ ((aligned(16)));
|
||||||
|
|
||||||
if (n <= 0) return (0);
|
if (n <= 0)
|
||||||
|
return (0);
|
||||||
|
|
||||||
if ((inc_x == 1) && (inc_y == 1)) {
|
if ((inc_x == 1) && (inc_y == 1)) {
|
||||||
|
|
||||||
|
|
@ -143,7 +138,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
|
||||||
}
|
}
|
||||||
return (0);
|
return (0);
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
inc_x *= 2;
|
inc_x *= 2;
|
||||||
|
|
@ -166,5 +160,3 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
|
||||||
return (0);
|
return (0);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/***************************************************************************
|
/***************************************************************************
|
||||||
Copyright (c) 2013-2018, The OpenBLAS Project
|
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
modification, are permitted provided that the following conditions are
|
modification, are permitted provided that the following conditions are
|
||||||
|
|
@ -27,46 +27,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
static void zcopy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
|
static void zcopy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) {
|
||||||
{
|
__asm__("srlg %[n],%[n],4\n\t"
|
||||||
__asm__ volatile (
|
|
||||||
"lgr %%r1,%1 \n\t"
|
|
||||||
"lgr %%r2,%2 \n\t"
|
|
||||||
"srlg %%r0,%0,4 \n\t"
|
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 1, 1024(%%r1) \n\t"
|
"pfd 1, 1024(%[x])\n\t"
|
||||||
"pfd 2, 1024(%%r2) \n\t"
|
"pfd 2, 1024(%[y])\n\t"
|
||||||
"mvc 0(256,%%r2),0(%%r1) \n\t"
|
"mvc 0(256,%[y]),0(%[x])\n\t"
|
||||||
"agfi %%r1,256 \n\t"
|
"la %[x],256(%[x])\n\t"
|
||||||
"agfi %%r2,256 \n\t"
|
"la %[y],256(%[y])\n\t"
|
||||||
"brctg %%r0,0b "
|
"brctg %[n],0b"
|
||||||
:
|
: "=m"(*(struct { FLOAT x[n * 2]; } *) y),[x] "+&a"(x),[y] "+&a"(y),
|
||||||
:"r"(n),"a"((const FLOAT (*)[n * 2])x),"a"((FLOAT (*)[n * 2])y)
|
[n] "+&r"(n)
|
||||||
:"memory","cc","r0","r1","r2"
|
: "m"(*(const struct { FLOAT x[n * 2]; } *) x)
|
||||||
);
|
: "cc");
|
||||||
}
|
}
|
||||||
|
|
||||||
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
|
||||||
{
|
|
||||||
BLASLONG i = 0;
|
BLASLONG i = 0;
|
||||||
BLASLONG ix = 0, iy = 0;
|
BLASLONG ix = 0, iy = 0;
|
||||||
|
|
||||||
if ( n <= 0 ) return(0);
|
if (n <= 0)
|
||||||
|
return (0);
|
||||||
|
|
||||||
if ( (inc_x == 1) && (inc_y == 1 ))
|
if ((inc_x == 1) && (inc_y == 1)) {
|
||||||
{
|
|
||||||
|
|
||||||
BLASLONG n1 = n & -16;
|
BLASLONG n1 = n & -16;
|
||||||
if ( n1 > 0 )
|
if (n1 > 0) {
|
||||||
{
|
|
||||||
zcopy_kernel_16(n1, x, y);
|
zcopy_kernel_16(n1, x, y);
|
||||||
i = n1;
|
i = n1;
|
||||||
ix = n1 * 2;
|
ix = n1 * 2;
|
||||||
iy = n1 * 2;
|
iy = n1 * 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
while(i < n)
|
while (i < n) {
|
||||||
{
|
|
||||||
y[iy] = x[iy];
|
y[iy] = x[iy];
|
||||||
y[iy + 1] = x[ix + 1];
|
y[iy + 1] = x[ix + 1];
|
||||||
ix += 2;
|
ix += 2;
|
||||||
|
|
@ -75,16 +68,12 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
} else {
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
|
|
||||||
BLASLONG inc_x2 = 2 * inc_x;
|
BLASLONG inc_x2 = 2 * inc_x;
|
||||||
BLASLONG inc_y2 = 2 * inc_y;
|
BLASLONG inc_y2 = 2 * inc_y;
|
||||||
|
|
||||||
while(i < n)
|
while (i < n) {
|
||||||
{
|
|
||||||
y[iy] = x[ix];
|
y[iy] = x[ix];
|
||||||
y[iy + 1] = x[ix + 1];
|
y[iy + 1] = x[ix + 1];
|
||||||
ix += inc_x2;
|
ix += inc_x2;
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/***************************************************************************
|
/***************************************************************************
|
||||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
modification, are permitted provided that the following conditions are
|
modification, are permitted provided that the following conditions are
|
||||||
|
|
@ -27,10 +27,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
|
static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) {
|
||||||
{
|
__asm__("vzero %%v24\n\t"
|
||||||
__asm__ volatile(
|
|
||||||
"vzero %%v24 \n\t"
|
|
||||||
"vzero %%v25\n\t"
|
"vzero %%v25\n\t"
|
||||||
"vzero %%v26\n\t"
|
"vzero %%v26\n\t"
|
||||||
"vzero %%v27\n\t"
|
"vzero %%v27\n\t"
|
||||||
|
|
@ -38,25 +36,23 @@ static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
|
||||||
"vzero %%v29\n\t"
|
"vzero %%v29\n\t"
|
||||||
"vzero %%v30\n\t"
|
"vzero %%v30\n\t"
|
||||||
"vzero %%v31\n\t"
|
"vzero %%v31\n\t"
|
||||||
"srlg %%r0,%0,3 \n\t"
|
"srlg %[n],%[n],3\n\t"
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 1, 1024(%%r1,%1) \n\t"
|
"pfd 1, 1024(%%r1,%[x])\n\t"
|
||||||
"pfd 1, 1024(%%r1,%2) \n\t"
|
"pfd 1, 1024(%%r1,%[y])\n\t"
|
||||||
|
"vl %%v16, 0(%%r1,%[x])\n\t"
|
||||||
"vl %%v16, 0(%%r1,%1) \n\t"
|
"vl %%v17, 16(%%r1,%[x])\n\t"
|
||||||
"vl %%v17, 16(%%r1,%1) \n\t"
|
"vl %%v18, 32(%%r1,%[x])\n\t"
|
||||||
"vl %%v18, 32(%%r1,%1) \n\t"
|
"vl %%v19, 48(%%r1,%[x])\n\t"
|
||||||
"vl %%v19, 48(%%r1,%1) \n\t"
|
"vl %%v0, 0(%%r1,%[y])\n\t"
|
||||||
"vl %%v0, 0(%%r1,%2) \n\t"
|
"vl %%v1, 16(%%r1,%[y])\n\t"
|
||||||
"vl %%v1, 16(%%r1,%2) \n\t"
|
"vl %%v2, 32(%%r1,%[y])\n\t"
|
||||||
"vl %%v2, 32(%%r1,%2) \n\t"
|
"vl %%v3, 48(%%r1,%[y])\n\t"
|
||||||
"vl %%v3, 48(%%r1,%2) \n\t"
|
|
||||||
"vpdi %%v20,%%v16,%%v16,4\n\t"
|
"vpdi %%v20,%%v16,%%v16,4\n\t"
|
||||||
"vpdi %%v21,%%v17,%%v17,4\n\t"
|
"vpdi %%v21,%%v17,%%v17,4\n\t"
|
||||||
"vpdi %%v22,%%v18,%%v18,4\n\t"
|
"vpdi %%v22,%%v18,%%v18,4\n\t"
|
||||||
"vpdi %%v23,%%v19,%%v19,4\n\t"
|
"vpdi %%v23,%%v19,%%v19,4\n\t"
|
||||||
|
|
||||||
"vfmadb %%v24,%%v16,%%v0,%%v24\n\t"
|
"vfmadb %%v24,%%v16,%%v0,%%v24\n\t"
|
||||||
"vfmadb %%v25,%%v20,%%v0,%%v25\n\t"
|
"vfmadb %%v25,%%v20,%%v0,%%v25\n\t"
|
||||||
"vfmadb %%v26,%%v17,%%v1,%%v26\n\t"
|
"vfmadb %%v26,%%v17,%%v1,%%v26\n\t"
|
||||||
|
|
@ -65,20 +61,18 @@ static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
|
||||||
"vfmadb %%v29,%%v22,%%v2,%%v29\n\t"
|
"vfmadb %%v29,%%v22,%%v2,%%v29\n\t"
|
||||||
"vfmadb %%v30,%%v19,%%v3,%%v30\n\t"
|
"vfmadb %%v30,%%v19,%%v3,%%v30\n\t"
|
||||||
"vfmadb %%v31,%%v23,%%v3,%%v31\n\t"
|
"vfmadb %%v31,%%v23,%%v3,%%v31\n\t"
|
||||||
|
"vl %%v16, 64(%%r1,%[x])\n\t"
|
||||||
"vl %%v16, 64(%%r1,%1) \n\t"
|
"vl %%v17, 80(%%r1,%[x])\n\t"
|
||||||
"vl %%v17, 80(%%r1,%1) \n\t"
|
"vl %%v18, 96(%%r1,%[x])\n\t"
|
||||||
"vl %%v18, 96(%%r1,%1) \n\t"
|
"vl %%v19, 112(%%r1,%[x])\n\t"
|
||||||
"vl %%v19, 112(%%r1,%1) \n\t"
|
"vl %%v0, 64(%%r1,%[y])\n\t"
|
||||||
"vl %%v0, 64(%%r1,%2) \n\t"
|
"vl %%v1, 80(%%r1,%[y])\n\t"
|
||||||
"vl %%v1, 80(%%r1,%2) \n\t"
|
"vl %%v2, 96(%%r1,%[y])\n\t"
|
||||||
"vl %%v2, 96(%%r1,%2) \n\t"
|
"vl %%v3, 112(%%r1,%[y])\n\t"
|
||||||
"vl %%v3, 112(%%r1,%2) \n\t"
|
|
||||||
"vpdi %%v20,%%v16,%%v16,4\n\t"
|
"vpdi %%v20,%%v16,%%v16,4\n\t"
|
||||||
"vpdi %%v21,%%v17,%%v17,4\n\t"
|
"vpdi %%v21,%%v17,%%v17,4\n\t"
|
||||||
"vpdi %%v22,%%v18,%%v18,4\n\t"
|
"vpdi %%v22,%%v18,%%v18,4\n\t"
|
||||||
"vpdi %%v23,%%v19,%%v19,4\n\t"
|
"vpdi %%v23,%%v19,%%v19,4\n\t"
|
||||||
|
|
||||||
"vfmadb %%v24,%%v16,%%v0,%%v24\n\t"
|
"vfmadb %%v24,%%v16,%%v0,%%v24\n\t"
|
||||||
"vfmadb %%v25,%%v20,%%v0,%%v25\n\t"
|
"vfmadb %%v25,%%v20,%%v0,%%v25\n\t"
|
||||||
"vfmadb %%v26,%%v17,%%v1,%%v26\n\t"
|
"vfmadb %%v26,%%v17,%%v1,%%v26\n\t"
|
||||||
|
|
@ -87,30 +81,33 @@ static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
|
||||||
"vfmadb %%v29,%%v22,%%v2,%%v29\n\t"
|
"vfmadb %%v29,%%v22,%%v2,%%v29\n\t"
|
||||||
"vfmadb %%v30,%%v19,%%v3,%%v30\n\t"
|
"vfmadb %%v30,%%v19,%%v3,%%v30\n\t"
|
||||||
"vfmadb %%v31,%%v23,%%v3,%%v31\n\t"
|
"vfmadb %%v31,%%v23,%%v3,%%v31\n\t"
|
||||||
|
|
||||||
"agfi %%r1,128\n\t"
|
"agfi %%r1,128\n\t"
|
||||||
"brctg %%r0,0b \n\t"
|
"brctg %[n],0b\n\t"
|
||||||
"vfadb %%v24,%%v24,%%v26\n\t"
|
"vfadb %%v24,%%v24,%%v26\n\t"
|
||||||
"vfadb %%v24,%%v24,%%v28\n\t"
|
"vfadb %%v24,%%v24,%%v28\n\t"
|
||||||
"vfadb %%v24,%%v24,%%v30\n\t"
|
"vfadb %%v24,%%v24,%%v30\n\t"
|
||||||
"vfadb %%v25,%%v25,%%v27\n\t"
|
"vfadb %%v25,%%v25,%%v27\n\t"
|
||||||
"vfadb %%v25,%%v25,%%v29\n\t"
|
"vfadb %%v25,%%v25,%%v29\n\t"
|
||||||
"vfadb %%v25,%%v25,%%v31\n\t"
|
"vfadb %%v25,%%v25,%%v31\n\t"
|
||||||
"vsteg %%v24,0(%3),0 \n\t"
|
"vsteg %%v24,0(%[d]),0\n\t"
|
||||||
"vsteg %%v24,8(%3),1 \n\t"
|
"vsteg %%v24,8(%[d]),1\n\t"
|
||||||
"vsteg %%v25,16(%3),1 \n\t"
|
"vsteg %%v25,16(%[d]),1\n\t"
|
||||||
"vsteg %%v25,24(%3),0 "
|
"vsteg %%v25,24(%[d]),0"
|
||||||
:
|
: "=m"(*(struct { FLOAT x[4]; } *) d),[n] "+&r"(n)
|
||||||
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((const FLOAT (*)[n * 2])y),"ZQ"((FLOAT (*)[4])d)
|
: [d] "a"(d), "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x),
|
||||||
:"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
"m"(*(const struct { FLOAT x[n * 2]; } *) y),[y] "a"(y)
|
||||||
);
|
: "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20",
|
||||||
|
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
|
||||||
|
"v31");
|
||||||
}
|
}
|
||||||
|
|
||||||
OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
|
OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y,
|
||||||
|
BLASLONG inc_y) {
|
||||||
BLASLONG i;
|
BLASLONG i;
|
||||||
BLASLONG ix, iy;
|
BLASLONG ix, iy;
|
||||||
OPENBLAS_COMPLEX_FLOAT result;
|
OPENBLAS_COMPLEX_FLOAT result;
|
||||||
FLOAT dot[4] __attribute__ ((aligned(16))) = {0.0, 0.0, 0.0, 0.0};
|
FLOAT dot[4] __attribute__ ((aligned(16))) = {
|
||||||
|
0.0, 0.0, 0.0, 0.0};
|
||||||
|
|
||||||
if (n <= 0) {
|
if (n <= 0) {
|
||||||
CREAL(result) = 0.0;
|
CREAL(result) = 0.0;
|
||||||
|
|
@ -141,7 +138,6 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
i = 0;
|
i = 0;
|
||||||
ix = 0;
|
ix = 0;
|
||||||
|
|
@ -174,5 +170,3 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
|
||||||
return (result);
|
return (result);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/***************************************************************************
|
/***************************************************************************
|
||||||
Copyright (c) 2014, The OpenBLAS Project
|
Copyright (c) 2019, The OpenBLAS Project
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
modification, are permitted provided that the following conditions are
|
modification, are permitted provided that the following conditions are
|
||||||
|
|
@ -25,276 +25,259 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
*****************************************************************************/
|
*****************************************************************************/
|
||||||
|
|
||||||
#include <stdlib.h>
|
|
||||||
#include <stdio.h>
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
#define NBMAX 1024
|
#define NBMAX 1024
|
||||||
|
|
||||||
static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
|
static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) {
|
||||||
{
|
register FLOAT *ap0 = ap[0];
|
||||||
__asm__ volatile (
|
register FLOAT *ap1 = ap[1];
|
||||||
"vl %%v16,0(%5) \n\t"
|
register FLOAT *ap2 = ap[2];
|
||||||
"vl %%v17,16(%5) \n\t"
|
register FLOAT *ap3 = ap[3];
|
||||||
"vl %%v18,32(%5) \n\t"
|
|
||||||
"vl %%v19,48(%5) \n\t"
|
__asm__("vl %%v16,0(%[x])\n\t"
|
||||||
|
"vl %%v17,16(%[x])\n\t"
|
||||||
|
"vl %%v18,32(%[x])\n\t"
|
||||||
|
"vl %%v19,48(%[x])\n\t"
|
||||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||||
"vleg %%v20,8(%5),0 \n\t"
|
"vleg %%v20,8(%[x]),0\n\t"
|
||||||
"wflcdb %%v20,%%v20\n\t"
|
"wflcdb %%v20,%%v20\n\t"
|
||||||
"vleg %%v20,0(%5),1 \n\t"
|
"vleg %%v20,0(%[x]),1\n\t"
|
||||||
"vleg %%v21,24(%5),0 \n\t"
|
"vleg %%v21,24(%[x]),0\n\t"
|
||||||
"wflcdb %%v21,%%v21\n\t"
|
"wflcdb %%v21,%%v21\n\t"
|
||||||
"vleg %%v21,16(%5),1 \n\t"
|
"vleg %%v21,16(%[x]),1\n\t"
|
||||||
"vleg %%v22,40(%5),0 \n\t"
|
"vleg %%v22,40(%[x]),0\n\t"
|
||||||
"wflcdb %%v22,%%v22\n\t"
|
"wflcdb %%v22,%%v22\n\t"
|
||||||
"vleg %%v22,32(%5),1 \n\t"
|
"vleg %%v22,32(%[x]),1\n\t"
|
||||||
"vleg %%v23,56(%5),0 \n\t"
|
"vleg %%v23,56(%[x]),0\n\t"
|
||||||
"wflcdb %%v23,%%v23\n\t"
|
"wflcdb %%v23,%%v23\n\t"
|
||||||
"vleg %%v23,48(%5),1 \n\t"
|
"vleg %%v23,48(%[x]),1\n\t"
|
||||||
#else
|
#else
|
||||||
"vleg %%v20,0(%5),1 \n\t"
|
"vleg %%v20,0(%[x]),1\n\t"
|
||||||
"vflcdb %%v20,%%v20\n\t"
|
"vflcdb %%v20,%%v20\n\t"
|
||||||
"vleg %%v20,8(%5),0 \n\t"
|
"vleg %%v20,8(%[x]),0\n\t"
|
||||||
"vleg %%v21,16(%5),1 \n\t"
|
"vleg %%v21,16(%[x]),1\n\t"
|
||||||
"vflcdb %%v21,%%v21\n\t"
|
"vflcdb %%v21,%%v21\n\t"
|
||||||
"vleg %%v21,24(%5),0 \n\t"
|
"vleg %%v21,24(%[x]),0\n\t"
|
||||||
"vleg %%v22,32(%5),1 \n\t"
|
"vleg %%v22,32(%[x]),1\n\t"
|
||||||
"vflcdb %%v22,%%v22\n\t"
|
"vflcdb %%v22,%%v22\n\t"
|
||||||
"vleg %%v22,40(%5),0 \n\t"
|
"vleg %%v22,40(%[x]),0\n\t"
|
||||||
"vleg %%v23,48(%5),1 \n\t"
|
"vleg %%v23,48(%[x]),1\n\t"
|
||||||
"vflcdb %%v23,%%v23\n\t"
|
"vflcdb %%v23,%%v23\n\t"
|
||||||
"vleg %%v23,56(%5),0 \n\t"
|
"vleg %%v23,56(%[x]),0\n\t"
|
||||||
#endif
|
#endif
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"srlg %%r0,%0,1 \n\t"
|
"srlg %[n],%[n],1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 1,1024(%%r1,%1) \n\t"
|
"pfd 1,1024(%%r1,%[ap0])\n\t"
|
||||||
"pfd 1,1024(%%r1,%2) \n\t"
|
"pfd 1,1024(%%r1,%[ap1])\n\t"
|
||||||
"pfd 1,1024(%%r1,%3) \n\t"
|
"pfd 1,1024(%%r1,%[ap2])\n\t"
|
||||||
"pfd 1,1024(%%r1,%4) \n\t"
|
"pfd 1,1024(%%r1,%[ap3])\n\t"
|
||||||
"pfd 2,1024(%%r1,%6) \n\t"
|
"pfd 2,1024(%%r1,%[y])\n\t"
|
||||||
|
"vl %%v0,0(%%r1,%[y])\n\t"
|
||||||
"vlrepg %%v24,0(%%r1,%1) \n\t"
|
"vl %%v1,16(%%r1,%[y])\n\t"
|
||||||
"vlrepg %%v25,8(%%r1,%1) \n\t"
|
"vlrepg %%v24,0(%%r1,%[ap0])\n\t"
|
||||||
"vlrepg %%v26,0(%%r1,%2) \n\t"
|
"vlrepg %%v25,8(%%r1,%[ap0])\n\t"
|
||||||
"vlrepg %%v27,8(%%r1,%2) \n\t"
|
"vlrepg %%v26,0(%%r1,%[ap1])\n\t"
|
||||||
|
"vlrepg %%v27,8(%%r1,%[ap1])\n\t"
|
||||||
"vl %%v0,0(%%r1,%6) \n\t"
|
"vlrepg %%v28,16(%%r1,%[ap0])\n\t"
|
||||||
|
"vlrepg %%v29,24(%%r1,%[ap0])\n\t"
|
||||||
|
"vlrepg %%v30,16(%%r1,%[ap1])\n\t"
|
||||||
|
"vlrepg %%v31,24(%%r1,%[ap1])\n\t"
|
||||||
"vfmadb %%v0,%%v24,%%v16,%%v0\n\t"
|
"vfmadb %%v0,%%v24,%%v16,%%v0\n\t"
|
||||||
|
"vfmadb %%v1,%%v28,%%v16,%%v1\n\t"
|
||||||
"vfmadb %%v0,%%v25,%%v20,%%v0\n\t"
|
"vfmadb %%v0,%%v25,%%v20,%%v0\n\t"
|
||||||
|
"vfmadb %%v1,%%v29,%%v20,%%v1\n\t"
|
||||||
"vfmadb %%v0,%%v26,%%v17,%%v0\n\t"
|
"vfmadb %%v0,%%v26,%%v17,%%v0\n\t"
|
||||||
|
"vfmadb %%v1,%%v30,%%v17,%%v1\n\t"
|
||||||
"vfmadb %%v0,%%v27,%%v21,%%v0\n\t"
|
"vfmadb %%v0,%%v27,%%v21,%%v0\n\t"
|
||||||
|
"vfmadb %%v1,%%v31,%%v21,%%v1\n\t"
|
||||||
"vlrepg %%v28,0(%%r1,%3) \n\t"
|
"vlrepg %%v24,0(%%r1,%[ap2])\n\t"
|
||||||
"vlrepg %%v29,8(%%r1,%3) \n\t"
|
"vlrepg %%v25,8(%%r1,%[ap2])\n\t"
|
||||||
"vlrepg %%v30,0(%%r1,%4) \n\t"
|
"vlrepg %%v26,0(%%r1,%[ap3])\n\t"
|
||||||
"vlrepg %%v31,8(%%r1,%4) \n\t"
|
"vlrepg %%v27,8(%%r1,%[ap3])\n\t"
|
||||||
|
"vlrepg %%v28,16(%%r1,%[ap2])\n\t"
|
||||||
"vfmadb %%v0,%%v28,%%v18,%%v0 \n\t"
|
"vlrepg %%v29,24(%%r1,%[ap2])\n\t"
|
||||||
"vfmadb %%v0,%%v29,%%v22,%%v0 \n\t"
|
"vlrepg %%v30,16(%%r1,%[ap3])\n\t"
|
||||||
"vfmadb %%v0,%%v30,%%v19,%%v0 \n\t"
|
"vlrepg %%v31,24(%%r1,%[ap3])\n\t"
|
||||||
"vfmadb %%v0,%%v31,%%v23,%%v0 \n\t"
|
"vfmadb %%v0,%%v24,%%v18,%%v0\n\t"
|
||||||
"vst %%v0,0(%%r1,%6) \n\t"
|
"vfmadb %%v1,%%v28,%%v18,%%v1\n\t"
|
||||||
|
"vfmadb %%v0,%%v25,%%v22,%%v0\n\t"
|
||||||
"vlrepg %%v24,16(%%r1,%1) \n\t"
|
"vfmadb %%v1,%%v29,%%v22,%%v1\n\t"
|
||||||
"vlrepg %%v25,24(%%r1,%1) \n\t"
|
"vfmadb %%v0,%%v26,%%v19,%%v0\n\t"
|
||||||
"vlrepg %%v26,16(%%r1,%2) \n\t"
|
"vfmadb %%v1,%%v30,%%v19,%%v1\n\t"
|
||||||
"vlrepg %%v27,24(%%r1,%2) \n\t"
|
"vfmadb %%v0,%%v27,%%v23,%%v0\n\t"
|
||||||
|
"vfmadb %%v1,%%v31,%%v23,%%v1\n\t"
|
||||||
"vl %%v0,16(%%r1,%6) \n\t"
|
"vst %%v0,0(%%r1,%[y])\n\t"
|
||||||
"vfmadb %%v0,%%v24,%%v16,%%v0 \n\t"
|
"vst %%v1,16(%%r1,%[y])\n\t"
|
||||||
"vfmadb %%v0,%%v25,%%v20,%%v0 \n\t"
|
|
||||||
"vfmadb %%v0,%%v26,%%v17,%%v0 \n\t"
|
|
||||||
"vfmadb %%v0,%%v27,%%v21,%%v0 \n\t"
|
|
||||||
|
|
||||||
"vlrepg %%v28,16(%%r1,%3) \n\t"
|
|
||||||
"vlrepg %%v29,24(%%r1,%3) \n\t"
|
|
||||||
"vlrepg %%v30,16(%%r1,%4) \n\t"
|
|
||||||
"vlrepg %%v31,24(%%r1,%4) \n\t"
|
|
||||||
|
|
||||||
"vfmadb %%v0,%%v28,%%v18,%%v0 \n\t"
|
|
||||||
"vfmadb %%v0,%%v29,%%v22,%%v0 \n\t"
|
|
||||||
"vfmadb %%v0,%%v30,%%v19,%%v0 \n\t"
|
|
||||||
"vfmadb %%v0,%%v31,%%v23,%%v0 \n\t"
|
|
||||||
"vst %%v0,16(%%r1,%6) \n\t"
|
|
||||||
|
|
||||||
"agfi %%r1,32\n\t"
|
"agfi %%r1,32\n\t"
|
||||||
"brctg %%r0,0b "
|
"brctg %[n],0b"
|
||||||
:
|
: "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n)
|
||||||
:"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZR"((const FLOAT (*)[n * 2])ap[2]),"ZR"((const FLOAT (*)[n * 2])ap[3]),"ZQ"((const FLOAT (*)[8])x),"ZR"((FLOAT (*)[n * 2])y)
|
: [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0),
|
||||||
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
"m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1),
|
||||||
);
|
"m"(*(const struct { FLOAT x[n * 2]; } *) ap2),[ap2] "a"(ap2),
|
||||||
|
"m"(*(const struct { FLOAT x[n * 2]; } *) ap3),[ap3] "a"(ap3),
|
||||||
|
"m"(*(const struct { FLOAT x[8]; } *) x),[x] "a"(x)
|
||||||
|
: "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
|
||||||
|
"v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
|
||||||
|
"v31");
|
||||||
}
|
}
|
||||||
|
|
||||||
static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
|
static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) {
|
||||||
{
|
register FLOAT *ap0 = ap[0];
|
||||||
__asm__ volatile (
|
register FLOAT *ap1 = ap[1];
|
||||||
"vl %%v16,0(%3) \n\t"
|
|
||||||
"vl %%v17,16(%3) \n\t"
|
__asm__("vl %%v16,0(%[x])\n\t"
|
||||||
|
"vl %%v17,16(%[x])\n\t"
|
||||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||||
"vleg %%v18,8(%3),0 \n\t"
|
"vleg %%v18,8(%[x]),0\n\t"
|
||||||
"wflcdb %%v18,%%v18\n\t"
|
"wflcdb %%v18,%%v18\n\t"
|
||||||
"vleg %%v18,0(%3),1 \n\t"
|
"vleg %%v18,0(%[x]),1\n\t"
|
||||||
"vleg %%v19,24(%3),0 \n\t"
|
"vleg %%v19,24(%[x]),0\n\t"
|
||||||
"wflcdb %%v19,%%v19\n\t"
|
"wflcdb %%v19,%%v19\n\t"
|
||||||
"vleg %%v19,16(%3),1 \n\t"
|
"vleg %%v19,16(%[x]),1\n\t"
|
||||||
#else
|
#else
|
||||||
"vleg %%v18,0(%3),1 \n\t"
|
"vleg %%v18,0(%[x]),1\n\t"
|
||||||
"vflcdb %%v18,%%v18\n\t"
|
"vflcdb %%v18,%%v18\n\t"
|
||||||
"vleg %%v18,8(%3),0 \n\t"
|
"vleg %%v18,8(%[x]),0\n\t"
|
||||||
"vleg %%v19,16(%3),1 \n\t"
|
"vleg %%v19,16(%[x]),1\n\t"
|
||||||
"vflcdb %%v19,%%v19\n\t"
|
"vflcdb %%v19,%%v19\n\t"
|
||||||
"vleg %%v19,24(%3),0 \n\t"
|
"vleg %%v19,24(%[x]),0\n\t"
|
||||||
#endif
|
#endif
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"srlg %%r0,%0,1 \n\t"
|
"srlg %[n],%[n],1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 1,1024(%%r1,%1) \n\t"
|
"pfd 1,1024(%%r1,%[ap0])\n\t"
|
||||||
"pfd 1,1024(%%r1,%2) \n\t"
|
"pfd 1,1024(%%r1,%[ap1])\n\t"
|
||||||
"pfd 2,1024(%%r1,%4) \n\t"
|
"pfd 2,1024(%%r1,%[y])\n\t"
|
||||||
|
"vl %%v0,0(%%r1,%[y])\n\t"
|
||||||
"vlrepg %%v20,0(%%r1,%1) \n\t"
|
"vl %%v1,16(%%r1,%[y])\n\t"
|
||||||
"vlrepg %%v21,8(%%r1,%1) \n\t"
|
"vlrepg %%v20,0(%%r1,%[ap0])\n\t"
|
||||||
"vlrepg %%v22,0(%%r1,%2) \n\t"
|
"vlrepg %%v21,8(%%r1,%[ap0])\n\t"
|
||||||
"vlrepg %%v23,8(%%r1,%2) \n\t"
|
"vlrepg %%v22,0(%%r1,%[ap1])\n\t"
|
||||||
|
"vlrepg %%v23,8(%%r1,%[ap1])\n\t"
|
||||||
"vl %%v0,0(%%r1,%4) \n\t"
|
"vlrepg %%v24,16(%%r1,%[ap0])\n\t"
|
||||||
|
"vlrepg %%v25,24(%%r1,%[ap0])\n\t"
|
||||||
|
"vlrepg %%v26,16(%%r1,%[ap1])\n\t"
|
||||||
|
"vlrepg %%v27,24(%%r1,%[ap1])\n\t"
|
||||||
"vfmadb %%v0,%%v20,%%v16,%%v0\n\t"
|
"vfmadb %%v0,%%v20,%%v16,%%v0\n\t"
|
||||||
|
"vfmadb %%v1,%%v24,%%v16,%%v1\n\t"
|
||||||
"vfmadb %%v0,%%v21,%%v18,%%v0\n\t"
|
"vfmadb %%v0,%%v21,%%v18,%%v0\n\t"
|
||||||
|
"vfmadb %%v1,%%v25,%%v18,%%v1\n\t"
|
||||||
"vfmadb %%v0,%%v22,%%v17,%%v0\n\t"
|
"vfmadb %%v0,%%v22,%%v17,%%v0\n\t"
|
||||||
|
"vfmadb %%v1,%%v26,%%v17,%%v1\n\t"
|
||||||
"vfmadb %%v0,%%v23,%%v19,%%v0\n\t"
|
"vfmadb %%v0,%%v23,%%v19,%%v0\n\t"
|
||||||
"vst %%v0,0(%%r1,%4) \n\t"
|
"vfmadb %%v1,%%v27,%%v19,%%v1\n\t"
|
||||||
|
"vst %%v0,0(%%r1,%[y])\n\t"
|
||||||
"vlrepg %%v20,16(%%r1,%1) \n\t"
|
"vst %%v1,16(%%r1,%[y])\n\t"
|
||||||
"vlrepg %%v21,24(%%r1,%1) \n\t"
|
|
||||||
"vlrepg %%v22,16(%%r1,%2) \n\t"
|
|
||||||
"vlrepg %%v23,24(%%r1,%2) \n\t"
|
|
||||||
|
|
||||||
"vl %%v0,16(%%r1,%4) \n\t"
|
|
||||||
"vfmadb %%v0,%%v20,%%v16,%%v0 \n\t"
|
|
||||||
"vfmadb %%v0,%%v21,%%v18,%%v0 \n\t"
|
|
||||||
"vfmadb %%v0,%%v22,%%v17,%%v0 \n\t"
|
|
||||||
"vfmadb %%v0,%%v23,%%v19,%%v0 \n\t"
|
|
||||||
"vst %%v0,16(%%r1,%4) \n\t"
|
|
||||||
|
|
||||||
"agfi %%r1,32\n\t"
|
"agfi %%r1,32\n\t"
|
||||||
"brctg %%r0,0b "
|
"brctg %[n],0b"
|
||||||
:
|
: "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n)
|
||||||
:"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZQ"((const FLOAT (*)[4])x),"ZR"((FLOAT (*)[n * 2])y)
|
: [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0),
|
||||||
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23"
|
"m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1),
|
||||||
);
|
"m"(*(const struct { FLOAT x[4]; } *) x),[x] "a"(x)
|
||||||
|
: "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
|
||||||
|
"v22", "v23", "v24", "v25", "v26", "v27");
|
||||||
}
|
}
|
||||||
|
|
||||||
static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
|
static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) {
|
||||||
{
|
__asm__("vl %%v16,0(%[x])\n\t"
|
||||||
__asm__ volatile (
|
|
||||||
"vl %%v16,0(%2) \n\t"
|
|
||||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||||
"vleg %%v17,8(%2),0 \n\t"
|
"vleg %%v17,8(%[x]),0\n\t"
|
||||||
"wflcdb %%v17,%%v17\n\t"
|
"wflcdb %%v17,%%v17\n\t"
|
||||||
"vleg %%v17,0(%2),1 \n\t"
|
"vleg %%v17,0(%[x]),1\n\t"
|
||||||
#else
|
#else
|
||||||
"vleg %%v17,0(%2),1 \n\t"
|
"vleg %%v17,0(%[x]),1\n\t"
|
||||||
"vflcdb %%v17,%%v17\n\t"
|
"vflcdb %%v17,%%v17\n\t"
|
||||||
"vleg %%v17,8(%2),0 \n\t"
|
"vleg %%v17,8(%[x]),0\n\t"
|
||||||
#endif
|
#endif
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"srlg %%r0,%0,1 \n\t"
|
"srlg %[n],%[n],1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 1,1024(%%r1,%1) \n\t"
|
"pfd 1,1024(%%r1,%[ap])\n\t"
|
||||||
"pfd 2,1024(%%r1,%3) \n\t"
|
"pfd 2,1024(%%r1,%[y])\n\t"
|
||||||
|
"vl %%v0,0(%%r1,%[y])\n\t"
|
||||||
"vlrepg %%v18,0(%%r1,%1) \n\t"
|
"vl %%v1,16(%%r1,%[y])\n\t"
|
||||||
"vlrepg %%v19,8(%%r1,%1) \n\t"
|
"vlrepg %%v18,0(%%r1,%[ap])\n\t"
|
||||||
|
"vlrepg %%v19,8(%%r1,%[ap])\n\t"
|
||||||
"vl %%v0,0(%%r1,%3) \n\t"
|
"vlrepg %%v20,16(%%r1,%[ap])\n\t"
|
||||||
|
"vlrepg %%v21,24(%%r1,%[ap])\n\t"
|
||||||
"vfmadb %%v0,%%v18,%%v16,%%v0\n\t"
|
"vfmadb %%v0,%%v18,%%v16,%%v0\n\t"
|
||||||
|
"vfmadb %%v1,%%v20,%%v16,%%v1\n\t"
|
||||||
"vfmadb %%v0,%%v19,%%v17,%%v0\n\t"
|
"vfmadb %%v0,%%v19,%%v17,%%v0\n\t"
|
||||||
"vst %%v0,0(%%r1,%3) \n\t"
|
"vfmadb %%v1,%%v21,%%v17,%%v1\n\t"
|
||||||
|
"vst %%v0,0(%%r1,%[y])\n\t"
|
||||||
"vlrepg %%v18,16(%%r1,%1) \n\t"
|
"vst %%v1,16(%%r1,%[y])\n\t"
|
||||||
"vlrepg %%v19,24(%%r1,%1) \n\t"
|
|
||||||
|
|
||||||
"vl %%v0,16(%%r1,%3) \n\t"
|
|
||||||
"vfmadb %%v0,%%v18,%%v16,%%v0 \n\t"
|
|
||||||
"vfmadb %%v0,%%v19,%%v17,%%v0 \n\t"
|
|
||||||
"vst %%v0,16(%%r1,%3) \n\t"
|
|
||||||
|
|
||||||
"agfi %%r1,32\n\t"
|
"agfi %%r1,32\n\t"
|
||||||
"brctg %%r0,0b "
|
"brctg %[n],0b"
|
||||||
:
|
: "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n)
|
||||||
:"r"(n),"ZR"((const FLOAT (*)[n * 2])ap),"ZQ"((const FLOAT (*)[2])x),"ZR"((FLOAT (*)[n * 2])y)
|
: [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap),[ap] "a"(ap),
|
||||||
:"memory","cc","r0","r1","v0","v16","v17","v18","v19"
|
"m"(*(const struct { FLOAT x[2]; } *) x),[x] "a"(x)
|
||||||
);
|
: "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21");
|
||||||
}
|
}
|
||||||
|
|
||||||
static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, FLOAT alpha_i)
|
static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r,
|
||||||
{
|
FLOAT alpha_i) {
|
||||||
__asm__ volatile (
|
__asm__(
|
||||||
#if !defined(XCONJ)
|
#if !defined(XCONJ)
|
||||||
"vlrepg %%v0,%3 \n\t"
|
"vlrepg %%v0,%[alpha_r]\n\t"
|
||||||
"vleg %%v1,%4,0 \n\t"
|
"vleg %%v1,%[alpha_i],0\n\t"
|
||||||
"wflcdb %%v1,%%v1\n\t"
|
"wflcdb %%v1,%%v1\n\t"
|
||||||
"vleg %%v1,%4,1 \n\t"
|
"vleg %%v1,%[alpha_i],1\n\t"
|
||||||
#else
|
#else
|
||||||
"vleg %%v0,%3,1 \n\t"
|
"vleg %%v0,%[alpha_r],1\n\t"
|
||||||
"vflcdb %%v0,%%v0\n\t"
|
"vflcdb %%v0,%%v0\n\t"
|
||||||
"vleg %%v0,%3,0 \n\t"
|
"vleg %%v0,%[alpha_r],0\n\t"
|
||||||
"vlrepg %%v1,%4 \n\t"
|
"vlrepg %%v1,%[alpha_i]\n\t"
|
||||||
#endif
|
#endif
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"srlg %%r0,%0,2 \n\t"
|
"srlg %[n],%[n],2\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 1,1024(%%r1,%1) \n\t"
|
"pfd 1,1024(%%r1,%[src])\n\t"
|
||||||
"pfd 2,1024(%%r1,%2) \n\t"
|
"pfd 2,1024(%%r1,%[dest])\n\t"
|
||||||
|
"vl %%v16,0(%%r1,%[src])\n\t"
|
||||||
"vl %%v16,0(%%r1,%1) \n\t"
|
"vl %%v17,16(%%r1,%[src])\n\t"
|
||||||
"vl %%v17,16(%%r1,%1) \n\t"
|
"vl %%v18,32(%%r1,%[src])\n\t"
|
||||||
"vl %%v18,32(%%r1,%1) \n\t"
|
"vl %%v19,48(%%r1,%[src])\n\t"
|
||||||
"vl %%v19,48(%%r1,%1) \n\t"
|
"vl %%v20,0(%%r1,%[dest])\n\t"
|
||||||
"vl %%v20,0(%%r1,%2) \n\t"
|
"vl %%v21,16(%%r1,%[dest])\n\t"
|
||||||
"vl %%v21,16(%%r1,%2) \n\t"
|
"vl %%v22,32(%%r1,%[dest])\n\t"
|
||||||
"vl %%v22,32(%%r1,%2) \n\t"
|
"vl %%v23,48(%%r1,%[dest])\n\t"
|
||||||
"vl %%v23,48(%%r1,%2) \n\t"
|
|
||||||
"vpdi %%v24,%%v16,%%v16,4\n\t"
|
"vpdi %%v24,%%v16,%%v16,4\n\t"
|
||||||
"vpdi %%v25,%%v17,%%v17,4\n\t"
|
"vpdi %%v25,%%v17,%%v17,4\n\t"
|
||||||
"vpdi %%v26,%%v18,%%v18,4\n\t"
|
"vpdi %%v26,%%v18,%%v18,4\n\t"
|
||||||
"vpdi %%v27,%%v19,%%v19,4\n\t"
|
"vpdi %%v27,%%v19,%%v19,4\n\t"
|
||||||
|
|
||||||
"vfmadb %%v28,%%v16,%%v0,%%v20\n\t"
|
"vfmadb %%v28,%%v16,%%v0,%%v20\n\t"
|
||||||
"vfmadb %%v29,%%v17,%%v0,%%v21\n\t"
|
"vfmadb %%v29,%%v17,%%v0,%%v21\n\t"
|
||||||
"vfmadb %%v30,%%v18,%%v0,%%v22\n\t"
|
"vfmadb %%v30,%%v18,%%v0,%%v22\n\t"
|
||||||
"vfmadb %%v31,%%v19,%%v0,%%v23\n\t"
|
"vfmadb %%v31,%%v19,%%v0,%%v23\n\t"
|
||||||
|
|
||||||
"vfmadb %%v28,%%v24,%%v1,%%v28\n\t"
|
"vfmadb %%v28,%%v24,%%v1,%%v28\n\t"
|
||||||
"vfmadb %%v29,%%v25,%%v1,%%v29\n\t"
|
"vfmadb %%v29,%%v25,%%v1,%%v29\n\t"
|
||||||
"vfmadb %%v30,%%v26,%%v1,%%v30\n\t"
|
"vfmadb %%v30,%%v26,%%v1,%%v30\n\t"
|
||||||
"vfmadb %%v31,%%v27,%%v1,%%v31\n\t"
|
"vfmadb %%v31,%%v27,%%v1,%%v31\n\t"
|
||||||
|
"vst %%v28,0(%%r1,%[dest])\n\t"
|
||||||
"vst %%v28,0(%%r1,%2) \n\t"
|
"vst %%v29,16(%%r1,%[dest])\n\t"
|
||||||
"vst %%v29,16(%%r1,%2) \n\t"
|
"vst %%v30,32(%%r1,%[dest])\n\t"
|
||||||
"vst %%v30,32(%%r1,%2) \n\t"
|
"vst %%v31,48(%%r1,%[dest])\n\t"
|
||||||
"vst %%v31,48(%%r1,%2) \n\t"
|
|
||||||
|
|
||||||
"agfi %%r1,64\n\t"
|
"agfi %%r1,64\n\t"
|
||||||
"brctg %%r0,0b "
|
"brctg %[n],0b"
|
||||||
:
|
: "+m"(*(struct { FLOAT x[n * 2]; } *) dest),[n] "+&r"(n)
|
||||||
:"r"(n),"ZR"((const FLOAT (*)[n * 2])src),"ZR"((FLOAT (*)[n * 2])dest),"m"(alpha_r),"m"(alpha_i)
|
: [dest] "a"(dest), "m"(*(const struct { FLOAT x[n * 2]; } *) src),
|
||||||
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
[src] "a"(src),[alpha_r] "Q"(alpha_r),[alpha_i] "Q"(alpha_i)
|
||||||
);
|
: "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
|
||||||
|
"v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
|
||||||
|
"v31");
|
||||||
}
|
}
|
||||||
|
|
||||||
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i)
|
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,
|
||||||
{
|
FLOAT alpha_r, FLOAT alpha_i) {
|
||||||
BLASLONG i;
|
BLASLONG i;
|
||||||
|
|
||||||
if ( inc_dest != 2 )
|
if (inc_dest != 2) {
|
||||||
{
|
|
||||||
|
|
||||||
FLOAT temp_r;
|
FLOAT temp_r;
|
||||||
FLOAT temp_i;
|
FLOAT temp_i;
|
||||||
for ( i=0; i<n; i++ )
|
for (i = 0; i < n; i++) {
|
||||||
{
|
|
||||||
#if !defined(XCONJ)
|
#if !defined(XCONJ)
|
||||||
temp_r = alpha_r * src[0] - alpha_i * src[1];
|
temp_r = alpha_r * src[0] - alpha_i * src[1];
|
||||||
temp_i = alpha_r * src[1] + alpha_i * src[0];
|
temp_i = alpha_r * src[1] + alpha_i * src[0];
|
||||||
|
|
@ -315,8 +298,9 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT
|
||||||
add_y_4(n, src, dest, alpha_r, alpha_i);
|
add_y_4(n, src, dest, alpha_r, alpha_i);
|
||||||
}
|
}
|
||||||
|
|
||||||
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
|
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
|
||||||
{
|
FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y,
|
||||||
|
BLASLONG inc_y, FLOAT *buffer) {
|
||||||
BLASLONG i;
|
BLASLONG i;
|
||||||
FLOAT *a_ptr;
|
FLOAT *a_ptr;
|
||||||
FLOAT *x_ptr;
|
FLOAT *x_ptr;
|
||||||
|
|
@ -330,8 +314,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
|
||||||
BLASLONG lda4;
|
BLASLONG lda4;
|
||||||
FLOAT xbuffer[8], *ybuffer;
|
FLOAT xbuffer[8], *ybuffer;
|
||||||
|
|
||||||
if ( m < 1 ) return(0);
|
if (m < 1)
|
||||||
if ( n < 1 ) return(0);
|
return (0);
|
||||||
|
if (n < 1)
|
||||||
|
return (0);
|
||||||
|
|
||||||
ybuffer = buffer;
|
ybuffer = buffer;
|
||||||
|
|
||||||
|
|
@ -351,13 +337,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
|
||||||
|
|
||||||
BLASLONG NB = NBMAX;
|
BLASLONG NB = NBMAX;
|
||||||
|
|
||||||
while ( NB == NBMAX )
|
while (NB == NBMAX) {
|
||||||
{
|
|
||||||
|
|
||||||
m1 -= NB;
|
m1 -= NB;
|
||||||
if ( m1 < 0)
|
if (m1 < 0) {
|
||||||
{
|
if (m2 == 0)
|
||||||
if ( m2 == 0 ) break;
|
break;
|
||||||
NB = m2;
|
NB = m2;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -370,11 +355,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
|
||||||
//zero_y(NB,ybuffer);
|
//zero_y(NB,ybuffer);
|
||||||
memset(ybuffer, 0, NB * 16);
|
memset(ybuffer, 0, NB * 16);
|
||||||
|
|
||||||
if ( inc_x == 2 )
|
if (inc_x == 2) {
|
||||||
{
|
|
||||||
|
|
||||||
for( i = 0; i < n1 ; i++)
|
for (i = 0; i < n1; i++) {
|
||||||
{
|
|
||||||
zgemv_kernel_4x4(NB, ap, x_ptr, ybuffer);
|
zgemv_kernel_4x4(NB, ap, x_ptr, ybuffer);
|
||||||
ap[0] += lda4;
|
ap[0] += lda4;
|
||||||
ap[1] += lda4;
|
ap[1] += lda4;
|
||||||
|
|
@ -384,27 +367,22 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
|
||||||
x_ptr += 8;
|
x_ptr += 8;
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( n2 & 2 )
|
if (n2 & 2) {
|
||||||
{
|
|
||||||
zgemv_kernel_4x2(NB, ap, x_ptr, ybuffer);
|
zgemv_kernel_4x2(NB, ap, x_ptr, ybuffer);
|
||||||
x_ptr += 4;
|
x_ptr += 4;
|
||||||
a_ptr += 2 * lda;
|
a_ptr += 2 * lda;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( n2 & 1 )
|
if (n2 & 1) {
|
||||||
{
|
|
||||||
zgemv_kernel_4x1(NB, a_ptr, x_ptr, ybuffer);
|
zgemv_kernel_4x1(NB, a_ptr, x_ptr, ybuffer);
|
||||||
/* x_ptr += 2;
|
/* x_ptr += 2;
|
||||||
a_ptr += lda; */
|
a_ptr += lda; */
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
|
|
||||||
for( i = 0; i < n1 ; i++)
|
for (i = 0; i < n1; i++) {
|
||||||
{
|
|
||||||
|
|
||||||
xbuffer[0] = x_ptr[0];
|
xbuffer[0] = x_ptr[0];
|
||||||
xbuffer[1] = x_ptr[1];
|
xbuffer[1] = x_ptr[1];
|
||||||
|
|
@ -427,8 +405,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
|
||||||
a_ptr += lda4;
|
a_ptr += lda4;
|
||||||
}
|
}
|
||||||
|
|
||||||
for( i = 0; i < n2 ; i++)
|
for (i = 0; i < n2; i++) {
|
||||||
{
|
|
||||||
xbuffer[0] = x_ptr[0];
|
xbuffer[0] = x_ptr[0];
|
||||||
xbuffer[1] = x_ptr[1];
|
xbuffer[1] = x_ptr[1];
|
||||||
x_ptr += inc_x;
|
x_ptr += inc_x;
|
||||||
|
|
@ -444,21 +421,18 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
|
||||||
y_ptr += NB * inc_y;
|
y_ptr += NB * inc_y;
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( m3 == 0 ) return(0);
|
if (m3 == 0)
|
||||||
|
return (0);
|
||||||
|
|
||||||
if ( m3 == 1 )
|
if (m3 == 1) {
|
||||||
{
|
|
||||||
a_ptr = a;
|
a_ptr = a;
|
||||||
x_ptr = x;
|
x_ptr = x;
|
||||||
FLOAT temp_r = 0.0;
|
FLOAT temp_r = 0.0;
|
||||||
FLOAT temp_i = 0.0;
|
FLOAT temp_i = 0.0;
|
||||||
|
|
||||||
if ( lda == 2 && inc_x == 2 )
|
if (lda == 2 && inc_x == 2) {
|
||||||
{
|
|
||||||
|
|
||||||
|
for (i = 0; i < (n & -2); i += 2) {
|
||||||
for( i=0 ; i < (n & -2); i+=2 )
|
|
||||||
{
|
|
||||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||||
temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
|
temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
|
||||||
temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
|
temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
|
||||||
|
|
@ -475,10 +449,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
|
||||||
x_ptr += 4;
|
x_ptr += 4;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
for (; i < n; i++) {
|
||||||
|
|
||||||
for( ; i < n; i++ )
|
|
||||||
{
|
|
||||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||||
temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
|
temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
|
||||||
temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
|
temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
|
||||||
|
|
@ -491,13 +462,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
|
||||||
x_ptr += 2;
|
x_ptr += 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
} else {
|
||||||
|
|
||||||
}
|
for (i = 0; i < n; i++) {
|
||||||
else
|
|
||||||
{
|
|
||||||
|
|
||||||
for( i = 0; i < n; i++ )
|
|
||||||
{
|
|
||||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||||
temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
|
temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
|
||||||
temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
|
temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
|
||||||
|
|
@ -521,8 +488,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
|
||||||
return (0);
|
return (0);
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( m3 == 2 )
|
if (m3 == 2) {
|
||||||
{
|
|
||||||
a_ptr = a;
|
a_ptr = a;
|
||||||
x_ptr = x;
|
x_ptr = x;
|
||||||
FLOAT temp_r0 = 0.0;
|
FLOAT temp_r0 = 0.0;
|
||||||
|
|
@ -530,11 +496,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
|
||||||
FLOAT temp_r1 = 0.0;
|
FLOAT temp_r1 = 0.0;
|
||||||
FLOAT temp_i1 = 0.0;
|
FLOAT temp_i1 = 0.0;
|
||||||
|
|
||||||
if ( lda == 4 && inc_x == 2 )
|
if (lda == 4 && inc_x == 2) {
|
||||||
{
|
|
||||||
|
|
||||||
for( i = 0; i < (n & -2); i+=2 )
|
for (i = 0; i < (n & -2); i += 2) {
|
||||||
{
|
|
||||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||||
|
|
||||||
temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
|
temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
|
||||||
|
|
@ -564,9 +528,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
|
||||||
x_ptr += 4;
|
x_ptr += 4;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
for (; i < n; i++) {
|
||||||
for( ; i < n; i++ )
|
|
||||||
{
|
|
||||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||||
temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
|
temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
|
||||||
temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
|
temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
|
||||||
|
|
@ -583,13 +545,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
|
||||||
x_ptr += 2;
|
x_ptr += 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
} else {
|
||||||
|
|
||||||
}
|
for (i = 0; i < n; i++) {
|
||||||
else
|
|
||||||
{
|
|
||||||
|
|
||||||
for( i=0 ; i < n; i++ )
|
|
||||||
{
|
|
||||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||||
temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
|
temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
|
||||||
temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
|
temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
|
||||||
|
|
@ -606,7 +564,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
|
||||||
x_ptr += inc_x;
|
x_ptr += inc_x;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
#if !defined(XCONJ)
|
#if !defined(XCONJ)
|
||||||
y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
|
y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
|
||||||
|
|
@ -624,9 +581,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
|
||||||
return (0);
|
return (0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (m3 == 3) {
|
||||||
if ( m3 == 3 )
|
|
||||||
{
|
|
||||||
a_ptr = a;
|
a_ptr = a;
|
||||||
x_ptr = x;
|
x_ptr = x;
|
||||||
FLOAT temp_r0 = 0.0;
|
FLOAT temp_r0 = 0.0;
|
||||||
|
|
@ -636,11 +591,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
|
||||||
FLOAT temp_r2 = 0.0;
|
FLOAT temp_r2 = 0.0;
|
||||||
FLOAT temp_i2 = 0.0;
|
FLOAT temp_i2 = 0.0;
|
||||||
|
|
||||||
if ( lda == 6 && inc_x == 2 )
|
if (lda == 6 && inc_x == 2) {
|
||||||
{
|
|
||||||
|
|
||||||
for( i=0 ; i < n; i++ )
|
for (i = 0; i < n; i++) {
|
||||||
{
|
|
||||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||||
temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
|
temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
|
||||||
temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
|
temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
|
||||||
|
|
@ -661,13 +614,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
|
||||||
x_ptr += 2;
|
x_ptr += 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
} else {
|
||||||
|
|
||||||
}
|
for (i = 0; i < n; i++) {
|
||||||
else
|
|
||||||
{
|
|
||||||
|
|
||||||
for( i = 0; i < n; i++ )
|
|
||||||
{
|
|
||||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||||
temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
|
temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
|
||||||
temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
|
temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/***************************************************************************
|
/***************************************************************************
|
||||||
Copyright (c) 2014, The OpenBLAS Project
|
Copyright (c) 2019, The OpenBLAS Project
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
modification, are permitted provided that the following conditions are
|
modification, are permitted provided that the following conditions are
|
||||||
|
|
@ -29,106 +29,106 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#define NBMAX 1024
|
#define NBMAX 1024
|
||||||
|
|
||||||
static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y,
|
||||||
{
|
FLOAT *alpha) {
|
||||||
__asm__ volatile (
|
register FLOAT *ap0 = ap[0];
|
||||||
"vzero %%v16 \n\t"
|
register FLOAT *ap1 = ap[1];
|
||||||
|
register FLOAT *ap2 = ap[2];
|
||||||
|
register FLOAT *ap3 = ap[3];
|
||||||
|
|
||||||
|
__asm__("vzero %%v16\n\t"
|
||||||
"vzero %%v17\n\t"
|
"vzero %%v17\n\t"
|
||||||
"vzero %%v18\n\t"
|
"vzero %%v18\n\t"
|
||||||
"vzero %%v19\n\t"
|
"vzero %%v19\n\t"
|
||||||
|
"vzero %%v20\n\t"
|
||||||
|
"vzero %%v21\n\t"
|
||||||
|
"vzero %%v22\n\t"
|
||||||
|
"vzero %%v23\n\t"
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"srlg %%r0,%0,1 \n\t"
|
"srlg %[n],%[n],1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 1,1024(%%r1,%1) \n\t"
|
"pfd 1,1024(%%r1,%[ap0])\n\t"
|
||||||
"pfd 1,1024(%%r1,%2) \n\t"
|
"pfd 1,1024(%%r1,%[ap1])\n\t"
|
||||||
"pfd 1,1024(%%r1,%3) \n\t"
|
"pfd 1,1024(%%r1,%[ap2])\n\t"
|
||||||
"pfd 1,1024(%%r1,%4) \n\t"
|
"pfd 1,1024(%%r1,%[ap3])\n\t"
|
||||||
"pfd 1,1024(%%r1,%5) \n\t"
|
"pfd 1,1024(%%r1,%[x])\n\t"
|
||||||
|
"vl %%v0,0(%%r1,%[x])\n\t"
|
||||||
"vl %%v20,0(%%r1,%5) \n\t"
|
|
||||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||||
"vleg %%v21,8(%%r1,%5),0 \n\t"
|
"vleg %%v1,8(%%r1,%[x]),0\n\t"
|
||||||
"wflcdb %%v21,%%v21 \n\t"
|
"wflcdb %%v1,%%v1\n\t"
|
||||||
"vleg %%v21,0(%%r1,%5),1 \n\t"
|
"vleg %%v1,0(%%r1,%[x]),1\n\t"
|
||||||
#else
|
#else
|
||||||
"vleg %%v21,0(%%r1,%5),1 \n\t"
|
"vleg %%v1,0(%%r1,%[x]),1\n\t"
|
||||||
"vflcdb %%v21,%%v21 \n\t"
|
"vflcdb %%v1,%%v1\n\t"
|
||||||
"vleg %%v21,8(%%r1,%5),0 \n\t"
|
"vleg %%v1,8(%%r1,%[x]),0\n\t"
|
||||||
#endif
|
#endif
|
||||||
|
"vlrepg %%v24,0(%%r1,%[ap0])\n\t"
|
||||||
"vlrepg %%v24,0(%%r1,%1) \n\t"
|
"vlrepg %%v25,8(%%r1,%[ap0])\n\t"
|
||||||
"vlrepg %%v25,8(%%r1,%1) \n\t"
|
"vlrepg %%v26,0(%%r1,%[ap1])\n\t"
|
||||||
"vlrepg %%v26,0(%%r1,%2) \n\t"
|
"vlrepg %%v27,8(%%r1,%[ap1])\n\t"
|
||||||
"vlrepg %%v27,8(%%r1,%2) \n\t"
|
"vlrepg %%v28,0(%%r1,%[ap2])\n\t"
|
||||||
|
"vlrepg %%v29,8(%%r1,%[ap2])\n\t"
|
||||||
"vfmadb %%v16,%%v24,%%v20,%%v16 \n\t"
|
"vlrepg %%v30,0(%%r1,%[ap3])\n\t"
|
||||||
"vfmadb %%v16,%%v25,%%v21,%%v16 \n\t"
|
"vlrepg %%v31,8(%%r1,%[ap3])\n\t"
|
||||||
"vfmadb %%v17,%%v26,%%v20,%%v17 \n\t"
|
"vfmadb %%v16,%%v24,%%v0,%%v16\n\t"
|
||||||
"vfmadb %%v17,%%v27,%%v21,%%v17 \n\t"
|
"vfmadb %%v20,%%v25,%%v1,%%v20\n\t"
|
||||||
|
"vfmadb %%v17,%%v26,%%v0,%%v17\n\t"
|
||||||
"vlrepg %%v28,0(%%r1,%3) \n\t"
|
"vfmadb %%v21,%%v27,%%v1,%%v21\n\t"
|
||||||
"vlrepg %%v29,8(%%r1,%3) \n\t"
|
"vfmadb %%v18,%%v28,%%v0,%%v18\n\t"
|
||||||
"vlrepg %%v30,0(%%r1,%4) \n\t"
|
"vfmadb %%v22,%%v29,%%v1,%%v22\n\t"
|
||||||
"vlrepg %%v31,8(%%r1,%4) \n\t"
|
"vfmadb %%v19,%%v30,%%v0,%%v19\n\t"
|
||||||
|
"vfmadb %%v23,%%v31,%%v1,%%v23\n\t"
|
||||||
"vfmadb %%v18,%%v28,%%v20,%%v18 \n\t"
|
"vl %%v0,16(%%r1,%[x])\n\t"
|
||||||
"vfmadb %%v18,%%v29,%%v21,%%v18 \n\t"
|
|
||||||
"vfmadb %%v19,%%v30,%%v20,%%v19 \n\t"
|
|
||||||
"vfmadb %%v19,%%v31,%%v21,%%v19 \n\t"
|
|
||||||
|
|
||||||
"vl %%v22,16(%%r1,%5) \n\t"
|
|
||||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||||
"vleg %%v23,24(%%r1,%5),0 \n\t"
|
"vleg %%v1,24(%%r1,%[x]),0\n\t"
|
||||||
"wflcdb %%v23,%%v23 \n\t"
|
"wflcdb %%v1,%%v1\n\t"
|
||||||
"vleg %%v23,16(%%r1,%5),1 \n\t"
|
"vleg %%v1,16(%%r1,%[x]),1\n\t"
|
||||||
#else
|
#else
|
||||||
"vleg %%v23,16(%%r1,%5),1 \n\t"
|
"vleg %%v1,16(%%r1,%[x]),1\n\t"
|
||||||
"vflcdb %%v23,%%v23 \n\t"
|
"vflcdb %%v1,%%v1\n\t"
|
||||||
"vleg %%v23,24(%%r1,%5),0 \n\t"
|
"vleg %%v1,24(%%r1,%[x]),0\n\t"
|
||||||
#endif
|
#endif
|
||||||
|
"vlrepg %%v24,16(%%r1,%[ap0])\n\t"
|
||||||
"vlrepg %%v24,16(%%r1,%1) \n\t"
|
"vlrepg %%v25,24(%%r1,%[ap0])\n\t"
|
||||||
"vlrepg %%v25,24(%%r1,%1) \n\t"
|
"vlrepg %%v26,16(%%r1,%[ap1])\n\t"
|
||||||
"vlrepg %%v26,16(%%r1,%2) \n\t"
|
"vlrepg %%v27,24(%%r1,%[ap1])\n\t"
|
||||||
"vlrepg %%v27,24(%%r1,%2) \n\t"
|
"vlrepg %%v28,16(%%r1,%[ap2])\n\t"
|
||||||
|
"vlrepg %%v29,24(%%r1,%[ap2])\n\t"
|
||||||
"vfmadb %%v16,%%v24,%%v22,%%v16 \n\t"
|
"vlrepg %%v30,16(%%r1,%[ap3])\n\t"
|
||||||
"vfmadb %%v16,%%v25,%%v23,%%v16 \n\t"
|
"vlrepg %%v31,24(%%r1,%[ap3])\n\t"
|
||||||
"vfmadb %%v17,%%v26,%%v22,%%v17 \n\t"
|
"vfmadb %%v16,%%v24,%%v0,%%v16\n\t"
|
||||||
"vfmadb %%v17,%%v27,%%v23,%%v17 \n\t"
|
"vfmadb %%v20,%%v25,%%v1,%%v20\n\t"
|
||||||
|
"vfmadb %%v17,%%v26,%%v0,%%v17\n\t"
|
||||||
"vlrepg %%v28,16(%%r1,%3) \n\t"
|
"vfmadb %%v21,%%v27,%%v1,%%v21\n\t"
|
||||||
"vlrepg %%v29,24(%%r1,%3) \n\t"
|
"vfmadb %%v18,%%v28,%%v0,%%v18\n\t"
|
||||||
"vlrepg %%v30,16(%%r1,%4) \n\t"
|
"vfmadb %%v22,%%v29,%%v1,%%v22\n\t"
|
||||||
"vlrepg %%v31,24(%%r1,%4) \n\t"
|
"vfmadb %%v19,%%v30,%%v0,%%v19\n\t"
|
||||||
|
"vfmadb %%v23,%%v31,%%v1,%%v23\n\t"
|
||||||
"vfmadb %%v18,%%v28,%%v22,%%v18 \n\t"
|
|
||||||
"vfmadb %%v18,%%v29,%%v23,%%v18 \n\t"
|
|
||||||
"vfmadb %%v19,%%v30,%%v22,%%v19 \n\t"
|
|
||||||
"vfmadb %%v19,%%v31,%%v23,%%v19 \n\t"
|
|
||||||
|
|
||||||
"agfi %%r1,32\n\t"
|
"agfi %%r1,32\n\t"
|
||||||
"brctg %%r0,0b \n\t"
|
"brctg %[n],0b\n\t"
|
||||||
|
"vfadb %%v16,%%v16,%%v20\n\t"
|
||||||
|
"vfadb %%v17,%%v17,%%v21\n\t"
|
||||||
|
"vfadb %%v18,%%v18,%%v22\n\t"
|
||||||
|
"vfadb %%v19,%%v19,%%v23\n\t"
|
||||||
"vpdi %%v20,%%v16,%%v16,4\n\t"
|
"vpdi %%v20,%%v16,%%v16,4\n\t"
|
||||||
"vpdi %%v21,%%v17,%%v17,4\n\t"
|
"vpdi %%v21,%%v17,%%v17,4\n\t"
|
||||||
"vpdi %%v22,%%v18,%%v18,4\n\t"
|
"vpdi %%v22,%%v18,%%v18,4\n\t"
|
||||||
"vpdi %%v23,%%v19,%%v19,4\n\t"
|
"vpdi %%v23,%%v19,%%v19,4\n\t"
|
||||||
#if !defined(XCONJ)
|
#if !defined(XCONJ)
|
||||||
"vlrepg %%v24,0(%7) \n\t"
|
"vlrepg %%v24,0(%[alpha])\n\t"
|
||||||
"vleg %%v25,8(%7),0 \n\t"
|
"vleg %%v25,8(%[alpha]),0\n\t"
|
||||||
"wflcdb %%v25,%%v25\n\t"
|
"wflcdb %%v25,%%v25\n\t"
|
||||||
"vleg %%v25,8(%7),1 \n\t"
|
"vleg %%v25,8(%[alpha]),1\n\t"
|
||||||
#else
|
#else
|
||||||
"vleg %%v24,0(%7),1 \n\t"
|
"vleg %%v24,0(%[alpha]),1\n\t"
|
||||||
"vflcdb %%v24,%%v24\n\t"
|
"vflcdb %%v24,%%v24\n\t"
|
||||||
"vleg %%v24,0(%7),0 \n\t"
|
"vleg %%v24,0(%[alpha]),0\n\t"
|
||||||
"vlrepg %%v25,8(%7) \n\t"
|
"vlrepg %%v25,8(%[alpha])\n\t"
|
||||||
#endif
|
#endif
|
||||||
"vl %%v26,0(%6) \n\t"
|
"vl %%v26,0(%[y])\n\t"
|
||||||
"vl %%v27,16(%6) \n\t"
|
"vl %%v27,16(%[y])\n\t"
|
||||||
"vl %%v28,32(%6) \n\t"
|
"vl %%v28,32(%[y])\n\t"
|
||||||
"vl %%v29,48(%6) \n\t"
|
"vl %%v29,48(%[y])\n\t"
|
||||||
"vfmadb %%v26,%%v16,%%v24,%%v26\n\t"
|
"vfmadb %%v26,%%v16,%%v24,%%v26\n\t"
|
||||||
"vfmadb %%v26,%%v20,%%v25,%%v26\n\t"
|
"vfmadb %%v26,%%v20,%%v25,%%v26\n\t"
|
||||||
"vfmadb %%v27,%%v17,%%v24,%%v27\n\t"
|
"vfmadb %%v27,%%v17,%%v24,%%v27\n\t"
|
||||||
|
|
@ -137,174 +137,173 @@ static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *
|
||||||
"vfmadb %%v28,%%v22,%%v25,%%v28\n\t"
|
"vfmadb %%v28,%%v22,%%v25,%%v28\n\t"
|
||||||
"vfmadb %%v29,%%v19,%%v24,%%v29\n\t"
|
"vfmadb %%v29,%%v19,%%v24,%%v29\n\t"
|
||||||
"vfmadb %%v29,%%v23,%%v25,%%v29\n\t"
|
"vfmadb %%v29,%%v23,%%v25,%%v29\n\t"
|
||||||
"vst %%v26,0(%6) \n\t"
|
"vst %%v26,0(%[y])\n\t"
|
||||||
"vst %%v27,16(%6) \n\t"
|
"vst %%v27,16(%[y])\n\t"
|
||||||
"vst %%v28,32(%6) \n\t"
|
"vst %%v28,32(%[y])\n\t"
|
||||||
"vst %%v29,48(%6) "
|
"vst %%v29,48(%[y])"
|
||||||
:
|
: "+m"(*(struct { FLOAT x[8]; } *) y),[n] "+&r"(n)
|
||||||
:"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZR"((const FLOAT (*)[n * 2])ap[2]),"ZR"((const FLOAT (*)[n * 2])ap[3]),"ZR"((const FLOAT (*)[n * 2])x),"ZQ"((FLOAT (*)[8])y),"ZQ"((const FLOAT (*)[2])alpha)
|
: [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0),
|
||||||
:"memory","cc","r0","r1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
"m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1),
|
||||||
);
|
"m"(*(const struct { FLOAT x[n * 2]; } *) ap2),[ap2] "a"(ap2),
|
||||||
|
"m"(*(const struct { FLOAT x[n * 2]; } *) ap3),[ap3] "a"(ap3),
|
||||||
|
"m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x),
|
||||||
|
"m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha)
|
||||||
|
: "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
|
||||||
|
"v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
|
||||||
|
"v31");
|
||||||
}
|
}
|
||||||
|
|
||||||
static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y,
|
||||||
{
|
FLOAT *alpha) {
|
||||||
__asm__ volatile (
|
register FLOAT *ap0 = ap[0];
|
||||||
"vzero %%v16 \n\t"
|
register FLOAT *ap1 = ap[1];
|
||||||
|
|
||||||
|
__asm__("vzero %%v16\n\t"
|
||||||
"vzero %%v17\n\t"
|
"vzero %%v17\n\t"
|
||||||
|
"vzero %%v18\n\t"
|
||||||
|
"vzero %%v19\n\t"
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"srlg %%r0,%0,1 \n\t"
|
"srlg %[n],%[n],1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 1,1024(%%r1,%1) \n\t"
|
"pfd 1,1024(%%r1,%[ap0])\n\t"
|
||||||
"pfd 1,1024(%%r1,%2) \n\t"
|
"pfd 1,1024(%%r1,%[ap1])\n\t"
|
||||||
"pfd 1,1024(%%r1,%3) \n\t"
|
"pfd 1,1024(%%r1,%[x])\n\t"
|
||||||
|
"vl %%v0,0(%%r1,%[x])\n\t"
|
||||||
"vl %%v18,0(%%r1,%3) \n\t"
|
|
||||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||||
"vleg %%v19,8(%%r1,%3),0 \n\t"
|
"vleg %%v1,8(%%r1,%[x]),0\n\t"
|
||||||
"wflcdb %%v19,%%v19 \n\t"
|
"wflcdb %%v1,%%v1\n\t"
|
||||||
"vleg %%v19,0(%%r1,%3),1 \n\t"
|
"vleg %%v1,0(%%r1,%[x]),1\n\t"
|
||||||
#else
|
#else
|
||||||
"vleg %%v19,0(%%r1,%3),1 \n\t"
|
"vleg %%v1,0(%%r1,%[x]),1\n\t"
|
||||||
"vflcdb %%v19,%%v19 \n\t"
|
"vflcdb %%v1,%%v1\n\t"
|
||||||
"vleg %%v19,8(%%r1,%3),0 \n\t"
|
"vleg %%v1,8(%%r1,%[x]),0\n\t"
|
||||||
#endif
|
#endif
|
||||||
|
"vlrepg %%v20,0(%%r1,%[ap0])\n\t"
|
||||||
"vlrepg %%v20,0(%%r1,%1) \n\t"
|
"vlrepg %%v21,8(%%r1,%[ap0])\n\t"
|
||||||
"vlrepg %%v21,8(%%r1,%1) \n\t"
|
"vlrepg %%v22,0(%%r1,%[ap1])\n\t"
|
||||||
"vlrepg %%v22,0(%%r1,%2) \n\t"
|
"vlrepg %%v23,8(%%r1,%[ap1])\n\t"
|
||||||
"vlrepg %%v23,8(%%r1,%2) \n\t"
|
"vfmadb %%v16,%%v20,%%v0,%%v16\n\t"
|
||||||
|
"vfmadb %%v18,%%v21,%%v1,%%v18\n\t"
|
||||||
"vfmadb %%v16,%%v20,%%v18,%%v16 \n\t"
|
"vfmadb %%v17,%%v22,%%v0,%%v17\n\t"
|
||||||
"vfmadb %%v16,%%v21,%%v19,%%v16 \n\t"
|
"vfmadb %%v19,%%v23,%%v1,%%v19\n\t"
|
||||||
"vfmadb %%v17,%%v22,%%v18,%%v17 \n\t"
|
"vl %%v0,16(%%r1,%[x])\n\t"
|
||||||
"vfmadb %%v17,%%v23,%%v19,%%v17 \n\t"
|
|
||||||
|
|
||||||
"vl %%v18,16(%%r1,%3) \n\t"
|
|
||||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||||
"vleg %%v19,24(%%r1,%3),0 \n\t"
|
"vleg %%v1,24(%%r1,%[x]),0\n\t"
|
||||||
"wflcdb %%v19,%%v19 \n\t"
|
"wflcdb %%v1,%%v1\n\t"
|
||||||
"vleg %%v19,16(%%r1,%3),1 \n\t"
|
"vleg %%v1,16(%%r1,%[x]),1\n\t"
|
||||||
#else
|
#else
|
||||||
"vleg %%v19,16(%%r1,%3),1 \n\t"
|
"vleg %%v1,16(%%r1,%[x]),1\n\t"
|
||||||
"vflcdb %%v19,%%v19 \n\t"
|
"vflcdb %%v1,%%v1\n\t"
|
||||||
"vleg %%v19,24(%%r1,%3),0 \n\t"
|
"vleg %%v1,24(%%r1,%[x]),0\n\t"
|
||||||
#endif
|
#endif
|
||||||
|
"vlrepg %%v20,16(%%r1,%[ap0])\n\t"
|
||||||
"vlrepg %%v20,16(%%r1,%1) \n\t"
|
"vlrepg %%v21,24(%%r1,%[ap0])\n\t"
|
||||||
"vlrepg %%v21,24(%%r1,%1) \n\t"
|
"vlrepg %%v22,16(%%r1,%[ap1])\n\t"
|
||||||
"vlrepg %%v22,16(%%r1,%2) \n\t"
|
"vlrepg %%v23,24(%%r1,%[ap1])\n\t"
|
||||||
"vlrepg %%v23,24(%%r1,%2) \n\t"
|
"vfmadb %%v16,%%v20,%%v0,%%v16\n\t"
|
||||||
|
"vfmadb %%v18,%%v21,%%v1,%%v18\n\t"
|
||||||
"vfmadb %%v16,%%v20,%%v18,%%v16 \n\t"
|
"vfmadb %%v17,%%v22,%%v0,%%v17\n\t"
|
||||||
"vfmadb %%v16,%%v21,%%v19,%%v16 \n\t"
|
"vfmadb %%v19,%%v23,%%v1,%%v19\n\t"
|
||||||
"vfmadb %%v17,%%v22,%%v18,%%v17 \n\t"
|
|
||||||
"vfmadb %%v17,%%v23,%%v19,%%v17 \n\t"
|
|
||||||
|
|
||||||
"agfi %%r1,32\n\t"
|
"agfi %%r1,32\n\t"
|
||||||
"brctg %%r0,0b \n\t"
|
"brctg %[n],0b\n\t"
|
||||||
|
"vfadb %%v16,%%v16,%%v18\n\t"
|
||||||
|
"vfadb %%v17,%%v17,%%v19\n\t"
|
||||||
"vpdi %%v18,%%v16,%%v16,4\n\t"
|
"vpdi %%v18,%%v16,%%v16,4\n\t"
|
||||||
"vpdi %%v19,%%v17,%%v17,4\n\t"
|
"vpdi %%v19,%%v17,%%v17,4\n\t"
|
||||||
#if !defined(XCONJ)
|
#if !defined(XCONJ)
|
||||||
"vlrepg %%v20,0(%5) \n\t"
|
"vlrepg %%v20,0(%[alpha])\n\t"
|
||||||
"vleg %%v21,8(%5),0 \n\t"
|
"vleg %%v21,8(%[alpha]),0\n\t"
|
||||||
"wflcdb %%v21,%%v21\n\t"
|
"wflcdb %%v21,%%v21\n\t"
|
||||||
"vleg %%v21,8(%5),1 \n\t"
|
"vleg %%v21,8(%[alpha]),1\n\t"
|
||||||
#else
|
#else
|
||||||
"vleg %%v20,0(%5),1 \n\t"
|
"vleg %%v20,0(%[alpha]),1\n\t"
|
||||||
"vflcdb %%v20,%%v20\n\t"
|
"vflcdb %%v20,%%v20\n\t"
|
||||||
"vleg %%v20,0(%5),0 \n\t"
|
"vleg %%v20,0(%[alpha]),0\n\t"
|
||||||
"vlrepg %%v21,8(%5) \n\t"
|
"vlrepg %%v21,8(%[alpha])\n\t"
|
||||||
#endif
|
#endif
|
||||||
"vl %%v22,0(%4) \n\t"
|
"vl %%v22,0(%[y])\n\t"
|
||||||
"vl %%v23,16(%4) \n\t"
|
"vl %%v23,16(%[y])\n\t"
|
||||||
"vfmadb %%v22,%%v16,%%v20,%%v22\n\t"
|
"vfmadb %%v22,%%v16,%%v20,%%v22\n\t"
|
||||||
"vfmadb %%v22,%%v18,%%v21,%%v22\n\t"
|
"vfmadb %%v22,%%v18,%%v21,%%v22\n\t"
|
||||||
"vfmadb %%v23,%%v17,%%v20,%%v23\n\t"
|
"vfmadb %%v23,%%v17,%%v20,%%v23\n\t"
|
||||||
"vfmadb %%v23,%%v19,%%v21,%%v23\n\t"
|
"vfmadb %%v23,%%v19,%%v21,%%v23\n\t"
|
||||||
"vst %%v22,0(%4) \n\t"
|
"vst %%v22,0(%[y])\n\t"
|
||||||
"vst %%v23,16(%4) \n\t"
|
"vst %%v23,16(%[y])\n\t"
|
||||||
:
|
: "+m"(*(struct { FLOAT x[4]; } *) y),[n] "+&r"(n)
|
||||||
:"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZR"((const FLOAT (*)[n * 2])x),"ZQ"((FLOAT (*)[4])y),"ZQ"((const FLOAT (*)[2])alpha)
|
: [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0),
|
||||||
:"memory","cc","r0","r1","v16","v17","v18","v19","v20","v21","v22","v23"
|
"m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1),
|
||||||
);
|
"m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x),
|
||||||
|
"m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha)
|
||||||
|
: "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
|
||||||
|
"v22", "v23");
|
||||||
}
|
}
|
||||||
|
|
||||||
static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y,
|
||||||
{
|
FLOAT *alpha) {
|
||||||
__asm__ volatile (
|
__asm__("vzero %%v16\n\t"
|
||||||
"vzero %%v16 \n\t"
|
"vzero %%v17\n\t"
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"srlg %%r0,%0,1 \n\t"
|
"srlg %[n],%[n],1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 1,1024(%%r1,%1) \n\t"
|
"pfd 1,1024(%%r1,%[ap])\n\t"
|
||||||
"pfd 1,1024(%%r1,%2) \n\t"
|
"pfd 1,1024(%%r1,%[x])\n\t"
|
||||||
|
"vl %%v0,0(%%r1,%[x])\n\t"
|
||||||
"vl %%v17,0(%%r1,%2) \n\t"
|
|
||||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||||
"vleg %%v18,8(%%r1,%2),0 \n\t"
|
"vleg %%v1,8(%%r1,%[x]),0\n\t"
|
||||||
"wflcdb %%v18,%%v18 \n\t"
|
"wflcdb %%v1,%%v1\n\t"
|
||||||
"vleg %%v18,0(%%r1,%2),1 \n\t"
|
"vleg %%v1,0(%%r1,%[x]),1\n\t"
|
||||||
#else
|
#else
|
||||||
"vleg %%v18,0(%%r1,%2),1 \n\t"
|
"vleg %%v1,0(%%r1,%[x]),1\n\t"
|
||||||
"vflcdb %%v18,%%v18 \n\t"
|
"vflcdb %%v1,%%v1\n\t"
|
||||||
"vleg %%v18,8(%%r1,%2),0 \n\t"
|
"vleg %%v1,8(%%r1,%[x]),0\n\t"
|
||||||
#endif
|
#endif
|
||||||
|
"vlrepg %%v18,0(%%r1,%[ap])\n\t"
|
||||||
"vlrepg %%v19,0(%%r1,%1) \n\t"
|
"vlrepg %%v19,8(%%r1,%[ap])\n\t"
|
||||||
"vlrepg %%v20,8(%%r1,%1) \n\t"
|
"vfmadb %%v16,%%v18,%%v0,%%v16\n\t"
|
||||||
|
"vfmadb %%v17,%%v19,%%v1,%%v17\n\t"
|
||||||
"vfmadb %%v16,%%v19,%%v17,%%v16 \n\t"
|
"vl %%v0,16(%%r1,%[x])\n\t"
|
||||||
"vfmadb %%v16,%%v20,%%v18,%%v16 \n\t"
|
|
||||||
|
|
||||||
"vl %%v17,16(%%r1,%2) \n\t"
|
|
||||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||||
"vleg %%v18,24(%%r1,%2),0 \n\t"
|
"vleg %%v1,24(%%r1,%[x]),0\n\t"
|
||||||
"wflcdb %%v18,%%v18 \n\t"
|
"wflcdb %%v1,%%v1\n\t"
|
||||||
"vleg %%v18,16(%%r1,%2),1 \n\t"
|
"vleg %%v1,16(%%r1,%[x]),1\n\t"
|
||||||
#else
|
#else
|
||||||
"vleg %%v18,16(%%r1,%2),1 \n\t"
|
"vleg %%v1,16(%%r1,%[x]),1\n\t"
|
||||||
"vflcdb %%v18,%%v18 \n\t"
|
"vflcdb %%v1,%%v1\n\t"
|
||||||
"vleg %%v18,24(%%r1,%2),0 \n\t"
|
"vleg %%v1,24(%%r1,%[x]),0\n\t"
|
||||||
#endif
|
#endif
|
||||||
|
"vlrepg %%v18,16(%%r1,%[ap])\n\t"
|
||||||
"vlrepg %%v19,16(%%r1,%1) \n\t"
|
"vlrepg %%v19,24(%%r1,%[ap])\n\t"
|
||||||
"vlrepg %%v20,24(%%r1,%1) \n\t"
|
"vfmadb %%v16,%%v18,%%v0,%%v16\n\t"
|
||||||
|
"vfmadb %%v17,%%v19,%%v1,%%v17\n\t"
|
||||||
"vfmadb %%v16,%%v19,%%v17,%%v16 \n\t"
|
|
||||||
"vfmadb %%v16,%%v20,%%v18,%%v16 \n\t"
|
|
||||||
|
|
||||||
"agfi %%r1,32\n\t"
|
"agfi %%r1,32\n\t"
|
||||||
"brctg %%r0,0b \n\t"
|
"brctg %[n],0b\n\t"
|
||||||
|
"vfadb %%v16,%%v16,%%v17\n\t"
|
||||||
"vpdi %%v17,%%v16,%%v16,4\n\t"
|
"vpdi %%v17,%%v16,%%v16,4\n\t"
|
||||||
#if !defined(XCONJ)
|
#if !defined(XCONJ)
|
||||||
"vlrepg %%v18,0(%4) \n\t"
|
"vlrepg %%v18,0(%[alpha])\n\t"
|
||||||
"vleg %%v19,8(%4),0 \n\t"
|
"vleg %%v19,8(%[alpha]),0\n\t"
|
||||||
"wflcdb %%v19,%%v19\n\t"
|
"wflcdb %%v19,%%v19\n\t"
|
||||||
"vleg %%v19,8(%4),1 \n\t"
|
"vleg %%v19,8(%[alpha]),1\n\t"
|
||||||
#else
|
#else
|
||||||
"vleg %%v18,0(%4),1 \n\t"
|
"vleg %%v18,0(%[alpha]),1\n\t"
|
||||||
"vflcdb %%v18,%%v18\n\t"
|
"vflcdb %%v18,%%v18\n\t"
|
||||||
"vleg %%v18,0(%4),0 \n\t"
|
"vleg %%v18,0(%[alpha]),0\n\t"
|
||||||
"vlrepg %%v19,8(%4) \n\t"
|
"vlrepg %%v19,8(%[alpha])\n\t"
|
||||||
#endif
|
#endif
|
||||||
"vl %%v20,0(%3) \n\t"
|
"vl %%v0,0(%[y])\n\t"
|
||||||
"vfmadb %%v20,%%v16,%%v18,%%v20 \n\t"
|
"vfmadb %%v0,%%v16,%%v18,%%v0\n\t"
|
||||||
"vfmadb %%v20,%%v17,%%v19,%%v20 \n\t"
|
"vfmadb %%v0,%%v17,%%v19,%%v0\n\t"
|
||||||
"vst %%v20,0(%3) \n\t"
|
"vst %%v0,0(%[y])\n\t"
|
||||||
:
|
: "+m"(*(struct { FLOAT x[2]; } *) y),[n] "+&r"(n)
|
||||||
:"r"(n),"ZR"((const FLOAT (*)[n * 2])ap),"ZR"((const FLOAT (*)[n * 2])x),"ZQ"((FLOAT (*)[2])y),"ZQ"((const FLOAT (*)[2])alpha)
|
: [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap),[ap] "a"(ap),
|
||||||
:"memory","cc","r0","r1","v16","v17","v18","v19","v20"
|
"m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x),
|
||||||
);
|
"m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha)
|
||||||
|
: "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19");
|
||||||
}
|
}
|
||||||
|
|
||||||
static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src)
|
static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) {
|
||||||
{
|
|
||||||
BLASLONG i;
|
BLASLONG i;
|
||||||
for ( i=0; i<n; i++ )
|
for (i = 0; i < n; i++) {
|
||||||
{
|
|
||||||
*dest = *src;
|
*dest = *src;
|
||||||
*(dest + 1) = *(src + 1);
|
*(dest + 1) = *(src + 1);
|
||||||
dest += 2;
|
dest += 2;
|
||||||
|
|
@ -312,8 +311,9 @@ static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
|
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
|
||||||
{
|
FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y,
|
||||||
|
BLASLONG inc_y, FLOAT *buffer) {
|
||||||
BLASLONG i;
|
BLASLONG i;
|
||||||
BLASLONG j;
|
BLASLONG j;
|
||||||
FLOAT *a_ptr;
|
FLOAT *a_ptr;
|
||||||
|
|
@ -329,8 +329,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
|
||||||
FLOAT ybuffer[8], *xbuffer;
|
FLOAT ybuffer[8], *xbuffer;
|
||||||
FLOAT alpha[2];
|
FLOAT alpha[2];
|
||||||
|
|
||||||
if ( m < 1 ) return(0);
|
if (m < 1)
|
||||||
if ( n < 1 ) return(0);
|
return (0);
|
||||||
|
if (n < 1)
|
||||||
|
return (0);
|
||||||
|
|
||||||
inc_x <<= 1;
|
inc_x <<= 1;
|
||||||
inc_y <<= 1;
|
inc_y <<= 1;
|
||||||
|
|
@ -351,13 +353,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
|
||||||
|
|
||||||
BLASLONG NB = NBMAX;
|
BLASLONG NB = NBMAX;
|
||||||
|
|
||||||
while ( NB == NBMAX )
|
while (NB == NBMAX) {
|
||||||
{
|
|
||||||
|
|
||||||
m1 -= NB;
|
m1 -= NB;
|
||||||
if ( m1 < 0)
|
if (m1 < 0) {
|
||||||
{
|
if (m2 == 0)
|
||||||
if ( m2 == 0 ) break;
|
break;
|
||||||
NB = m2;
|
NB = m2;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -373,11 +374,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
|
||||||
else
|
else
|
||||||
xbuffer = x_ptr;
|
xbuffer = x_ptr;
|
||||||
|
|
||||||
if ( inc_y == 2 )
|
if (inc_y == 2) {
|
||||||
{
|
|
||||||
|
|
||||||
for( i = 0; i < n1 ; i++)
|
for (i = 0; i < n1; i++) {
|
||||||
{
|
|
||||||
zgemv_kernel_4x4(NB, ap, xbuffer, y_ptr, alpha);
|
zgemv_kernel_4x4(NB, ap, xbuffer, y_ptr, alpha);
|
||||||
ap[0] += lda4;
|
ap[0] += lda4;
|
||||||
ap[1] += lda4;
|
ap[1] += lda4;
|
||||||
|
|
@ -388,28 +387,23 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( n2 & 2 )
|
if (n2 & 2) {
|
||||||
{
|
|
||||||
zgemv_kernel_4x2(NB, ap, xbuffer, y_ptr, alpha);
|
zgemv_kernel_4x2(NB, ap, xbuffer, y_ptr, alpha);
|
||||||
a_ptr += lda * 2;
|
a_ptr += lda * 2;
|
||||||
y_ptr += 4;
|
y_ptr += 4;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( n2 & 1 )
|
if (n2 & 1) {
|
||||||
{
|
|
||||||
zgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha);
|
zgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha);
|
||||||
/* a_ptr += lda;
|
/* a_ptr += lda;
|
||||||
y_ptr += 2; */
|
y_ptr += 2; */
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
|
|
||||||
for( i = 0; i < n1 ; i++)
|
for (i = 0; i < n1; i++) {
|
||||||
{
|
|
||||||
memset(ybuffer, 0, sizeof(ybuffer));
|
memset(ybuffer, 0, sizeof(ybuffer));
|
||||||
zgemv_kernel_4x4(NB, ap, xbuffer, ybuffer, alpha);
|
zgemv_kernel_4x4(NB, ap, xbuffer, ybuffer, alpha);
|
||||||
ap[0] += lda4;
|
ap[0] += lda4;
|
||||||
|
|
@ -433,8 +427,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
for( i = 0; i < n2 ; i++)
|
for (i = 0; i < n2; i++) {
|
||||||
{
|
|
||||||
memset(ybuffer, 0, sizeof(ybuffer));
|
memset(ybuffer, 0, sizeof(ybuffer));
|
||||||
zgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, alpha);
|
zgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, alpha);
|
||||||
a_ptr += lda;
|
a_ptr += lda;
|
||||||
|
|
@ -449,17 +442,15 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
|
||||||
x += NB * inc_x;
|
x += NB * inc_x;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (m3 == 0)
|
||||||
|
return (0);
|
||||||
if ( m3 == 0 ) return(0);
|
|
||||||
|
|
||||||
x_ptr = x;
|
x_ptr = x;
|
||||||
j = 0;
|
j = 0;
|
||||||
a_ptr = a;
|
a_ptr = a;
|
||||||
y_ptr = y;
|
y_ptr = y;
|
||||||
|
|
||||||
if ( m3 == 3 )
|
if (m3 == 3) {
|
||||||
{
|
|
||||||
|
|
||||||
FLOAT temp_r;
|
FLOAT temp_r;
|
||||||
FLOAT temp_i;
|
FLOAT temp_i;
|
||||||
|
|
@ -471,8 +462,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
|
||||||
x_ptr += inc_x;
|
x_ptr += inc_x;
|
||||||
FLOAT x4 = x_ptr[0];
|
FLOAT x4 = x_ptr[0];
|
||||||
FLOAT x5 = x_ptr[1];
|
FLOAT x5 = x_ptr[1];
|
||||||
while ( j < n)
|
while (j < n) {
|
||||||
{
|
|
||||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||||
temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
|
temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
|
||||||
temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
|
temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
|
||||||
|
|
@ -505,9 +495,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
|
||||||
return (0);
|
return (0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (m3 == 2) {
|
||||||
if ( m3 == 2 )
|
|
||||||
{
|
|
||||||
|
|
||||||
FLOAT temp_r;
|
FLOAT temp_r;
|
||||||
FLOAT temp_i;
|
FLOAT temp_i;
|
||||||
|
|
@ -521,8 +509,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
|
||||||
FLOAT ar = alpha[0];
|
FLOAT ar = alpha[0];
|
||||||
FLOAT ai = alpha[1];
|
FLOAT ai = alpha[1];
|
||||||
|
|
||||||
while ( j < ( n & -2 ))
|
while (j < (n & -2)) {
|
||||||
{
|
|
||||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||||
temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
|
temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
|
||||||
temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
|
temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
|
||||||
|
|
@ -565,9 +552,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
|
||||||
j += 2;
|
j += 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
while (j < n) {
|
||||||
while ( j < n)
|
|
||||||
{
|
|
||||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||||
temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
|
temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
|
||||||
temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
|
temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
|
||||||
|
|
@ -597,9 +582,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
|
||||||
return (0);
|
return (0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (m3 == 1) {
|
||||||
if ( m3 == 1 )
|
|
||||||
{
|
|
||||||
|
|
||||||
FLOAT temp_r;
|
FLOAT temp_r;
|
||||||
FLOAT temp_i;
|
FLOAT temp_i;
|
||||||
|
|
@ -610,8 +593,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
|
||||||
FLOAT ar = alpha[0];
|
FLOAT ar = alpha[0];
|
||||||
FLOAT ai = alpha[1];
|
FLOAT ai = alpha[1];
|
||||||
|
|
||||||
while ( j < ( n & -2 ))
|
while (j < (n & -2)) {
|
||||||
{
|
|
||||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||||
temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
|
temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
|
||||||
temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
|
temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
|
||||||
|
|
@ -646,8 +628,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
|
||||||
j += 2;
|
j += 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
while ( j < n)
|
while (j < n) {
|
||||||
{
|
|
||||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||||
temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
|
temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
|
||||||
temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
|
temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/***************************************************************************
|
/***************************************************************************
|
||||||
Copyright (c) 2013-2018, The OpenBLAS Project
|
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
modification, are permitted provided that the following conditions are
|
modification, are permitted provided that the following conditions are
|
||||||
|
|
@ -27,25 +27,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
|
static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) {
|
||||||
{
|
__asm__("vlrepg %%v0,%[c]\n\t"
|
||||||
__asm__ (
|
"vlrepg %%v1,%[s]\n\t"
|
||||||
"vlrepg %%v0,%3 \n\t"
|
"srlg %[n],%[n],4\n\t"
|
||||||
"vlrepg %%v1,%4 \n\t"
|
|
||||||
"srlg %%r0,%0,4 \n\t"
|
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 2, 1024(%%r1,%1) \n\t"
|
"pfd 2, 1024(%%r1,%[x])\n\t"
|
||||||
"pfd 2, 1024(%%r1,%2) \n\t"
|
"pfd 2, 1024(%%r1,%[y])\n\t"
|
||||||
"vl %%v24, 0(%%r1,%1) \n\t"
|
"vl %%v24, 0(%%r1,%[x])\n\t"
|
||||||
"vl %%v25, 16(%%r1,%1) \n\t"
|
"vl %%v25, 16(%%r1,%[x])\n\t"
|
||||||
"vl %%v26, 32(%%r1,%1) \n\t"
|
"vl %%v26, 32(%%r1,%[x])\n\t"
|
||||||
"vl %%v27, 48(%%r1,%1) \n\t"
|
"vl %%v27, 48(%%r1,%[x])\n\t"
|
||||||
"vl %%v16, 0(%%r1,%2) \n\t"
|
"vl %%v16, 0(%%r1,%[y])\n\t"
|
||||||
"vl %%v17, 16(%%r1,%2) \n\t"
|
"vl %%v17, 16(%%r1,%[y])\n\t"
|
||||||
"vl %%v18, 32(%%r1,%2) \n\t"
|
"vl %%v18, 32(%%r1,%[y])\n\t"
|
||||||
"vl %%v19, 48(%%r1,%2) \n\t"
|
"vl %%v19, 48(%%r1,%[y])\n\t"
|
||||||
|
|
||||||
"vfmdb %%v28,%%v24,%%v0\n\t"
|
"vfmdb %%v28,%%v24,%%v0\n\t"
|
||||||
"vfmdb %%v29,%%v25,%%v0\n\t"
|
"vfmdb %%v29,%%v25,%%v0\n\t"
|
||||||
"vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
|
"vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
|
||||||
|
|
@ -63,25 +60,22 @@ static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
|
||||||
"vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
|
"vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
|
||||||
"vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
|
"vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
|
||||||
"vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
|
"vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
|
||||||
|
"vst %%v28, 0(%%r1,%[x])\n\t"
|
||||||
"vst %%v28, 0(%%r1,%1) \n\t"
|
"vst %%v29, 16(%%r1,%[x])\n\t"
|
||||||
"vst %%v29, 16(%%r1,%1) \n\t"
|
"vst %%v30, 32(%%r1,%[x])\n\t"
|
||||||
"vst %%v30, 32(%%r1,%1) \n\t"
|
"vst %%v31, 48(%%r1,%[x])\n\t"
|
||||||
"vst %%v31, 48(%%r1,%1) \n\t"
|
"vst %%v20, 0(%%r1,%[y])\n\t"
|
||||||
"vst %%v20, 0(%%r1,%2) \n\t"
|
"vst %%v21, 16(%%r1,%[y])\n\t"
|
||||||
"vst %%v21, 16(%%r1,%2) \n\t"
|
"vst %%v22, 32(%%r1,%[y])\n\t"
|
||||||
"vst %%v22, 32(%%r1,%2) \n\t"
|
"vst %%v23, 48(%%r1,%[y])\n\t"
|
||||||
"vst %%v23, 48(%%r1,%2) \n\t"
|
"vl %%v24, 64(%%r1,%[x])\n\t"
|
||||||
|
"vl %%v25, 80(%%r1,%[x])\n\t"
|
||||||
"vl %%v24, 64(%%r1,%1) \n\t"
|
"vl %%v26, 96(%%r1,%[x])\n\t"
|
||||||
"vl %%v25, 80(%%r1,%1) \n\t"
|
"vl %%v27, 112(%%r1,%[x])\n\t"
|
||||||
"vl %%v26, 96(%%r1,%1) \n\t"
|
"vl %%v16, 64(%%r1,%[y])\n\t"
|
||||||
"vl %%v27, 112(%%r1,%1) \n\t"
|
"vl %%v17, 80(%%r1,%[y])\n\t"
|
||||||
"vl %%v16, 64(%%r1,%2) \n\t"
|
"vl %%v18, 96(%%r1,%[y])\n\t"
|
||||||
"vl %%v17, 80(%%r1,%2) \n\t"
|
"vl %%v19, 112(%%r1,%[y])\n\t"
|
||||||
"vl %%v18, 96(%%r1,%2) \n\t"
|
|
||||||
"vl %%v19, 112(%%r1,%2) \n\t"
|
|
||||||
|
|
||||||
"vfmdb %%v28,%%v24,%%v0\n\t"
|
"vfmdb %%v28,%%v24,%%v0\n\t"
|
||||||
"vfmdb %%v29,%%v25,%%v0\n\t"
|
"vfmdb %%v29,%%v25,%%v0\n\t"
|
||||||
"vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
|
"vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
|
||||||
|
|
@ -99,25 +93,22 @@ static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
|
||||||
"vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
|
"vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
|
||||||
"vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
|
"vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
|
||||||
"vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
|
"vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
|
||||||
|
"vst %%v28, 64(%%r1,%[x])\n\t"
|
||||||
"vst %%v28, 64(%%r1,%1) \n\t"
|
"vst %%v29, 80(%%r1,%[x])\n\t"
|
||||||
"vst %%v29, 80(%%r1,%1) \n\t"
|
"vst %%v30, 96(%%r1,%[x])\n\t"
|
||||||
"vst %%v30, 96(%%r1,%1) \n\t"
|
"vst %%v31, 112(%%r1,%[x])\n\t"
|
||||||
"vst %%v31, 112(%%r1,%1) \n\t"
|
"vst %%v20, 64(%%r1,%[y])\n\t"
|
||||||
"vst %%v20, 64(%%r1,%2) \n\t"
|
"vst %%v21, 80(%%r1,%[y])\n\t"
|
||||||
"vst %%v21, 80(%%r1,%2) \n\t"
|
"vst %%v22, 96(%%r1,%[y])\n\t"
|
||||||
"vst %%v22, 96(%%r1,%2) \n\t"
|
"vst %%v23, 112(%%r1,%[y])\n\t"
|
||||||
"vst %%v23, 112(%%r1,%2) \n\t"
|
"vl %%v24, 128(%%r1,%[x])\n\t"
|
||||||
|
"vl %%v25, 144(%%r1,%[x])\n\t"
|
||||||
"vl %%v24, 128(%%r1,%1) \n\t"
|
"vl %%v26, 160(%%r1,%[x])\n\t"
|
||||||
"vl %%v25, 144(%%r1,%1) \n\t"
|
"vl %%v27, 176(%%r1,%[x])\n\t"
|
||||||
"vl %%v26, 160(%%r1,%1) \n\t"
|
"vl %%v16, 128(%%r1,%[y])\n\t"
|
||||||
"vl %%v27, 176(%%r1,%1) \n\t"
|
"vl %%v17, 144(%%r1,%[y])\n\t"
|
||||||
"vl %%v16, 128(%%r1,%2) \n\t"
|
"vl %%v18, 160(%%r1,%[y])\n\t"
|
||||||
"vl %%v17, 144(%%r1,%2) \n\t"
|
"vl %%v19, 176(%%r1,%[y])\n\t"
|
||||||
"vl %%v18, 160(%%r1,%2) \n\t"
|
|
||||||
"vl %%v19, 176(%%r1,%2) \n\t"
|
|
||||||
|
|
||||||
"vfmdb %%v28,%%v24,%%v0\n\t"
|
"vfmdb %%v28,%%v24,%%v0\n\t"
|
||||||
"vfmdb %%v29,%%v25,%%v0\n\t"
|
"vfmdb %%v29,%%v25,%%v0\n\t"
|
||||||
"vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
|
"vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
|
||||||
|
|
@ -135,25 +126,22 @@ static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
|
||||||
"vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
|
"vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
|
||||||
"vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
|
"vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
|
||||||
"vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
|
"vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
|
||||||
|
"vst %%v28, 128(%%r1,%[x])\n\t"
|
||||||
"vst %%v28, 128(%%r1,%1) \n\t"
|
"vst %%v29, 144(%%r1,%[x])\n\t"
|
||||||
"vst %%v29, 144(%%r1,%1) \n\t"
|
"vst %%v30, 160(%%r1,%[x])\n\t"
|
||||||
"vst %%v30, 160(%%r1,%1) \n\t"
|
"vst %%v31, 176(%%r1,%[x])\n\t"
|
||||||
"vst %%v31, 176(%%r1,%1) \n\t"
|
"vst %%v20, 128(%%r1,%[y])\n\t"
|
||||||
"vst %%v20, 128(%%r1,%2) \n\t"
|
"vst %%v21, 144(%%r1,%[y])\n\t"
|
||||||
"vst %%v21, 144(%%r1,%2) \n\t"
|
"vst %%v22, 160(%%r1,%[y])\n\t"
|
||||||
"vst %%v22, 160(%%r1,%2) \n\t"
|
"vst %%v23, 176(%%r1,%[y])\n\t"
|
||||||
"vst %%v23, 176(%%r1,%2) \n\t"
|
"vl %%v24, 192(%%r1,%[x])\n\t"
|
||||||
|
"vl %%v25, 208(%%r1,%[x])\n\t"
|
||||||
"vl %%v24, 192(%%r1,%1) \n\t"
|
"vl %%v26, 224(%%r1,%[x])\n\t"
|
||||||
"vl %%v25, 208(%%r1,%1) \n\t"
|
"vl %%v27, 240(%%r1,%[x])\n\t"
|
||||||
"vl %%v26, 224(%%r1,%1) \n\t"
|
"vl %%v16, 192(%%r1,%[y])\n\t"
|
||||||
"vl %%v27, 240(%%r1,%1) \n\t"
|
"vl %%v17, 208(%%r1,%[y])\n\t"
|
||||||
"vl %%v16, 192(%%r1,%2) \n\t"
|
"vl %%v18, 224(%%r1,%[y])\n\t"
|
||||||
"vl %%v17, 208(%%r1,%2) \n\t"
|
"vl %%v19, 240(%%r1,%[y])\n\t"
|
||||||
"vl %%v18, 224(%%r1,%2) \n\t"
|
|
||||||
"vl %%v19, 240(%%r1,%2) \n\t"
|
|
||||||
|
|
||||||
"vfmdb %%v28,%%v24,%%v0\n\t"
|
"vfmdb %%v28,%%v24,%%v0\n\t"
|
||||||
"vfmdb %%v29,%%v25,%%v0\n\t"
|
"vfmdb %%v29,%%v25,%%v0\n\t"
|
||||||
"vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
|
"vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */
|
||||||
|
|
@ -171,40 +159,39 @@ static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
|
||||||
"vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
|
"vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */
|
||||||
"vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
|
"vfmadb %%v31,%%v19,%%v1,%%v31\n\t"
|
||||||
"vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
|
"vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */
|
||||||
|
"vst %%v28, 192(%%r1,%[x])\n\t"
|
||||||
"vst %%v28, 192(%%r1,%1) \n\t"
|
"vst %%v29, 208(%%r1,%[x])\n\t"
|
||||||
"vst %%v29, 208(%%r1,%1) \n\t"
|
"vst %%v30, 224(%%r1,%[x])\n\t"
|
||||||
"vst %%v30, 224(%%r1,%1) \n\t"
|
"vst %%v31, 240(%%r1,%[x])\n\t"
|
||||||
"vst %%v31, 240(%%r1,%1) \n\t"
|
"vst %%v20, 192(%%r1,%[y])\n\t"
|
||||||
"vst %%v20, 192(%%r1,%2) \n\t"
|
"vst %%v21, 208(%%r1,%[y])\n\t"
|
||||||
"vst %%v21, 208(%%r1,%2) \n\t"
|
"vst %%v22, 224(%%r1,%[y])\n\t"
|
||||||
"vst %%v22, 224(%%r1,%2) \n\t"
|
"vst %%v23, 240(%%r1,%[y])\n\t"
|
||||||
"vst %%v23, 240(%%r1,%2) \n\t"
|
|
||||||
|
|
||||||
"agfi %%r1,256\n\t"
|
"agfi %%r1,256\n\t"
|
||||||
"brctg %%r0,0b "
|
"brctg %[n],0b"
|
||||||
:
|
: "+m"(*(struct { FLOAT x[n * 2]; } *) x),
|
||||||
:"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"m"(*c),"m"(*s)
|
"+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n)
|
||||||
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
: [x] "a"(x),[y] "a"(y),[c] "Q"(*c),[s] "Q"(*s)
|
||||||
);
|
: "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
|
||||||
|
"v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
|
||||||
|
"v31");
|
||||||
}
|
}
|
||||||
|
|
||||||
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
|
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
|
||||||
{
|
FLOAT c, FLOAT s) {
|
||||||
BLASLONG i = 0;
|
BLASLONG i = 0;
|
||||||
BLASLONG ix = 0, iy = 0;
|
BLASLONG ix = 0, iy = 0;
|
||||||
FLOAT temp[2];
|
FLOAT temp[2];
|
||||||
BLASLONG inc_x2;
|
BLASLONG inc_x2;
|
||||||
BLASLONG inc_y2;
|
BLASLONG inc_y2;
|
||||||
|
|
||||||
if ( n <= 0 ) return(0);
|
if (n <= 0)
|
||||||
|
return (0);
|
||||||
|
|
||||||
if ( (inc_x == 1) && (inc_y == 1) )
|
if ((inc_x == 1) && (inc_y == 1)) {
|
||||||
{
|
|
||||||
|
|
||||||
BLASLONG n1 = n & -16;
|
BLASLONG n1 = n & -16;
|
||||||
if ( n1 > 0 )
|
if (n1 > 0) {
|
||||||
{
|
|
||||||
FLOAT cosa, sina;
|
FLOAT cosa, sina;
|
||||||
cosa = c;
|
cosa = c;
|
||||||
sina = s;
|
sina = s;
|
||||||
|
|
@ -213,8 +200,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
|
||||||
ix = 2 * n1;
|
ix = 2 * n1;
|
||||||
}
|
}
|
||||||
|
|
||||||
while(i < n)
|
while (i < n) {
|
||||||
{
|
|
||||||
temp[0] = c * x[ix] + s * y[ix];
|
temp[0] = c * x[ix] + s * y[ix];
|
||||||
temp[1] = c * x[ix + 1] + s * y[ix + 1];
|
temp[1] = c * x[ix + 1] + s * y[ix + 1];
|
||||||
y[ix] = c * y[ix] - s * x[ix];
|
y[ix] = c * y[ix] - s * x[ix];
|
||||||
|
|
@ -227,14 +213,10 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
} else {
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
inc_x2 = 2 * inc_x;
|
inc_x2 = 2 * inc_x;
|
||||||
inc_y2 = 2 * inc_y;
|
inc_y2 = 2 * inc_y;
|
||||||
while(i < n)
|
while (i < n) {
|
||||||
{
|
|
||||||
temp[0] = c * x[ix] + s * y[iy];
|
temp[0] = c * x[ix] + s * y[iy];
|
||||||
temp[1] = c * x[ix + 1] + s * y[iy + 1];
|
temp[1] = c * x[ix + 1] + s * y[iy + 1];
|
||||||
y[iy] = c * y[iy] - s * x[ix];
|
y[iy] = c * y[iy] - s * x[ix];
|
||||||
|
|
@ -252,5 +234,3 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
|
||||||
return (0);
|
return (0);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/***************************************************************************
|
/***************************************************************************
|
||||||
Copyright (c) 2013 - 2017, The OpenBLAS Project
|
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
modification, are permitted provided that the following conditions are
|
modification, are permitted provided that the following conditions are
|
||||||
|
|
@ -27,26 +27,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
static void zscal_kernel_8(BLASLONG n, FLOAT *alpha, FLOAT *x)
|
static void zscal_kernel_8(BLASLONG n, FLOAT *alpha, FLOAT *x) {
|
||||||
{
|
__asm__("vlrepg %%v0,0(%[alpha])\n\t"
|
||||||
__asm__ volatile(
|
"vleg %%v1,8(%[alpha]),0\n\t"
|
||||||
"vlrepg %%v0,0(%1) \n\t"
|
|
||||||
"vleg %%v1,8(%1),0 \n\t"
|
|
||||||
"wflcdb %%v1,%%v1\n\t"
|
"wflcdb %%v1,%%v1\n\t"
|
||||||
"vleg %%v1,8(%1),1 \n\t"
|
"vleg %%v1,8(%[alpha]),1\n\t"
|
||||||
"srlg %%r0,%0,3 \n\t"
|
"srlg %[n],%[n],3\n\t"
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 2, 1024(%%r1,%2) \n\t"
|
"pfd 2, 1024(%%r1,%[x])\n\t"
|
||||||
|
"vl %%v16,0(%%r1,%[x])\n\t"
|
||||||
"vl %%v16,0(%%r1,%2) \n\t"
|
"vl %%v17,16(%%r1,%[x])\n\t"
|
||||||
"vl %%v17,16(%%r1,%2) \n\t"
|
"vl %%v18,32(%%r1,%[x])\n\t"
|
||||||
"vl %%v18,32(%%r1,%2) \n\t"
|
"vl %%v19,48(%%r1,%[x])\n\t"
|
||||||
"vl %%v19,48(%%r1,%2) \n\t"
|
"vl %%v20,64(%%r1,%[x])\n\t"
|
||||||
"vl %%v20,64(%%r1,%2) \n\t"
|
"vl %%v21,80(%%r1,%[x])\n\t"
|
||||||
"vl %%v21,80(%%r1,%2) \n\t"
|
"vl %%v22,96(%%r1,%[x])\n\t"
|
||||||
"vl %%v22,96(%%r1,%2) \n\t"
|
"vl %%v23,112(%%r1,%[x])\n\t"
|
||||||
"vl %%v23,112(%%r1,%2) \n\t"
|
|
||||||
"vpdi %%v24,%%v16,%%v16,4\n\t"
|
"vpdi %%v24,%%v16,%%v16,4\n\t"
|
||||||
"vpdi %%v25,%%v17,%%v17,4\n\t"
|
"vpdi %%v25,%%v17,%%v17,4\n\t"
|
||||||
"vpdi %%v26,%%v18,%%v18,4\n\t"
|
"vpdi %%v26,%%v18,%%v18,4\n\t"
|
||||||
|
|
@ -55,7 +52,6 @@ static void zscal_kernel_8(BLASLONG n, FLOAT *alpha, FLOAT *x)
|
||||||
"vpdi %%v29,%%v21,%%v21,4\n\t"
|
"vpdi %%v29,%%v21,%%v21,4\n\t"
|
||||||
"vpdi %%v30,%%v22,%%v22,4\n\t"
|
"vpdi %%v30,%%v22,%%v22,4\n\t"
|
||||||
"vpdi %%v31,%%v23,%%v23,4\n\t"
|
"vpdi %%v31,%%v23,%%v23,4\n\t"
|
||||||
|
|
||||||
"vfmdb %%v16,%%v16,%%v0\n\t"
|
"vfmdb %%v16,%%v16,%%v0\n\t"
|
||||||
"vfmdb %%v17,%%v17,%%v0\n\t"
|
"vfmdb %%v17,%%v17,%%v0\n\t"
|
||||||
"vfmdb %%v18,%%v18,%%v0\n\t"
|
"vfmdb %%v18,%%v18,%%v0\n\t"
|
||||||
|
|
@ -72,43 +68,40 @@ static void zscal_kernel_8(BLASLONG n, FLOAT *alpha, FLOAT *x)
|
||||||
"vfmadb %%v21,%%v29,%%v1,%%v21\n\t"
|
"vfmadb %%v21,%%v29,%%v1,%%v21\n\t"
|
||||||
"vfmadb %%v22,%%v30,%%v1,%%v22\n\t"
|
"vfmadb %%v22,%%v30,%%v1,%%v22\n\t"
|
||||||
"vfmadb %%v23,%%v31,%%v1,%%v23\n\t"
|
"vfmadb %%v23,%%v31,%%v1,%%v23\n\t"
|
||||||
|
"vst %%v16,0(%%r1,%[x])\n\t"
|
||||||
"vst %%v16,0(%%r1,%2) \n\t"
|
"vst %%v17,16(%%r1,%[x])\n\t"
|
||||||
"vst %%v17,16(%%r1,%2) \n\t"
|
"vst %%v18,32(%%r1,%[x])\n\t"
|
||||||
"vst %%v18,32(%%r1,%2) \n\t"
|
"vst %%v19,48(%%r1,%[x])\n\t"
|
||||||
"vst %%v19,48(%%r1,%2) \n\t"
|
"vst %%v20,64(%%r1,%[x])\n\t"
|
||||||
"vst %%v20,64(%%r1,%2) \n\t"
|
"vst %%v21,80(%%r1,%[x])\n\t"
|
||||||
"vst %%v21,80(%%r1,%2) \n\t"
|
"vst %%v22,96(%%r1,%[x])\n\t"
|
||||||
"vst %%v22,96(%%r1,%2) \n\t"
|
"vst %%v23,112(%%r1,%[x])\n\t"
|
||||||
"vst %%v23,112(%%r1,%2) \n\t"
|
|
||||||
|
|
||||||
"agfi %%r1,128\n\t"
|
"agfi %%r1,128\n\t"
|
||||||
"brctg %%r0,0b "
|
"brctg %[n],0b"
|
||||||
:
|
: "+m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n)
|
||||||
:"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x)
|
: [x] "a"(x), "m"(*(const struct { FLOAT x[2]; } *) alpha),
|
||||||
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
[alpha] "a"(alpha)
|
||||||
);
|
: "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
|
||||||
|
"v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
|
||||||
|
"v31");
|
||||||
}
|
}
|
||||||
|
|
||||||
static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x)
|
static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) {
|
||||||
{
|
__asm__("vleg %%v0,8(%[alpha]),0\n\t"
|
||||||
__asm__ volatile(
|
|
||||||
"vleg %%v0,8(%1),0 \n\t"
|
|
||||||
"wflcdb %%v0,%%v0\n\t"
|
"wflcdb %%v0,%%v0\n\t"
|
||||||
"vleg %%v0,8(%1),1 \n\t"
|
"vleg %%v0,8(%[alpha]),1\n\t"
|
||||||
"srlg %%r0,%0,3 \n\t"
|
"srlg %[n],%[n],3\n\t"
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 2, 1024(%%r1,%2) \n\t"
|
"pfd 2, 1024(%%r1,%[x])\n\t"
|
||||||
|
"vl %%v16,0(%%r1,%[x])\n\t"
|
||||||
"vl %%v16,0(%%r1,%2) \n\t"
|
"vl %%v17,16(%%r1,%[x])\n\t"
|
||||||
"vl %%v17,16(%%r1,%2) \n\t"
|
"vl %%v18,32(%%r1,%[x])\n\t"
|
||||||
"vl %%v18,32(%%r1,%2) \n\t"
|
"vl %%v19,48(%%r1,%[x])\n\t"
|
||||||
"vl %%v19,48(%%r1,%2) \n\t"
|
"vl %%v20,64(%%r1,%[x])\n\t"
|
||||||
"vl %%v20,64(%%r1,%2) \n\t"
|
"vl %%v21,80(%%r1,%[x])\n\t"
|
||||||
"vl %%v21,80(%%r1,%2) \n\t"
|
"vl %%v22,96(%%r1,%[x])\n\t"
|
||||||
"vl %%v22,96(%%r1,%2) \n\t"
|
"vl %%v23,112(%%r1,%[x])\n\t"
|
||||||
"vl %%v23,112(%%r1,%2) \n\t"
|
|
||||||
"vpdi %%v16,%%v16,%%v16,4\n\t"
|
"vpdi %%v16,%%v16,%%v16,4\n\t"
|
||||||
"vpdi %%v17,%%v17,%%v17,4\n\t"
|
"vpdi %%v17,%%v17,%%v17,4\n\t"
|
||||||
"vpdi %%v18,%%v18,%%v18,4\n\t"
|
"vpdi %%v18,%%v18,%%v18,4\n\t"
|
||||||
|
|
@ -117,7 +110,6 @@ static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x)
|
||||||
"vpdi %%v21,%%v21,%%v21,4\n\t"
|
"vpdi %%v21,%%v21,%%v21,4\n\t"
|
||||||
"vpdi %%v22,%%v22,%%v22,4\n\t"
|
"vpdi %%v22,%%v22,%%v22,4\n\t"
|
||||||
"vpdi %%v23,%%v23,%%v23,4\n\t"
|
"vpdi %%v23,%%v23,%%v23,4\n\t"
|
||||||
|
|
||||||
"vfmdb %%v16,%%v16,%%v0\n\t"
|
"vfmdb %%v16,%%v16,%%v0\n\t"
|
||||||
"vfmdb %%v17,%%v17,%%v0\n\t"
|
"vfmdb %%v17,%%v17,%%v0\n\t"
|
||||||
"vfmdb %%v18,%%v18,%%v0\n\t"
|
"vfmdb %%v18,%%v18,%%v0\n\t"
|
||||||
|
|
@ -126,42 +118,37 @@ static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x)
|
||||||
"vfmdb %%v21,%%v21,%%v0\n\t"
|
"vfmdb %%v21,%%v21,%%v0\n\t"
|
||||||
"vfmdb %%v22,%%v22,%%v0\n\t"
|
"vfmdb %%v22,%%v22,%%v0\n\t"
|
||||||
"vfmdb %%v23,%%v23,%%v0\n\t"
|
"vfmdb %%v23,%%v23,%%v0\n\t"
|
||||||
|
"vst %%v16,0(%%r1,%[x])\n\t"
|
||||||
"vst %%v16,0(%%r1,%2) \n\t"
|
"vst %%v17,16(%%r1,%[x])\n\t"
|
||||||
"vst %%v17,16(%%r1,%2) \n\t"
|
"vst %%v18,32(%%r1,%[x])\n\t"
|
||||||
"vst %%v18,32(%%r1,%2) \n\t"
|
"vst %%v19,48(%%r1,%[x])\n\t"
|
||||||
"vst %%v19,48(%%r1,%2) \n\t"
|
"vst %%v20,64(%%r1,%[x])\n\t"
|
||||||
"vst %%v20,64(%%r1,%2) \n\t"
|
"vst %%v21,80(%%r1,%[x])\n\t"
|
||||||
"vst %%v21,80(%%r1,%2) \n\t"
|
"vst %%v22,96(%%r1,%[x])\n\t"
|
||||||
"vst %%v22,96(%%r1,%2) \n\t"
|
"vst %%v23,112(%%r1,%[x])\n\t"
|
||||||
"vst %%v23,112(%%r1,%2) \n\t"
|
|
||||||
|
|
||||||
"agfi %%r1,128\n\t"
|
"agfi %%r1,128\n\t"
|
||||||
"brctg %%r0,0b "
|
"brctg %[n],0b"
|
||||||
:
|
: "+m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n)
|
||||||
:"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x)
|
: [x] "a"(x), "m"(*(const struct { FLOAT x[2]; } *) alpha),
|
||||||
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23"
|
[alpha] "a"(alpha)
|
||||||
);
|
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
|
||||||
|
"v23");
|
||||||
}
|
}
|
||||||
|
|
||||||
static void zscal_kernel_8_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x)
|
static void zscal_kernel_8_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) {
|
||||||
{
|
__asm__("vlrepg %%v0,0(%[alpha])\n\t"
|
||||||
__asm__ volatile(
|
"srlg %[n],%[n],3\n\t"
|
||||||
"vlrepg %%v0,0(%1) \n\t"
|
|
||||||
"srlg %%r0,%0,3 \n\t"
|
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 2, 1024(%%r1,%2) \n\t"
|
"pfd 2, 1024(%%r1,%[x])\n\t"
|
||||||
|
"vl %%v16,0(%%r1,%[x])\n\t"
|
||||||
"vl %%v16,0(%%r1,%2) \n\t"
|
"vl %%v17,16(%%r1,%[x])\n\t"
|
||||||
"vl %%v17,16(%%r1,%2) \n\t"
|
"vl %%v18,32(%%r1,%[x])\n\t"
|
||||||
"vl %%v18,32(%%r1,%2) \n\t"
|
"vl %%v19,48(%%r1,%[x])\n\t"
|
||||||
"vl %%v19,48(%%r1,%2) \n\t"
|
"vl %%v20,64(%%r1,%[x])\n\t"
|
||||||
"vl %%v20,64(%%r1,%2) \n\t"
|
"vl %%v21,80(%%r1,%[x])\n\t"
|
||||||
"vl %%v21,80(%%r1,%2) \n\t"
|
"vl %%v22,96(%%r1,%[x])\n\t"
|
||||||
"vl %%v22,96(%%r1,%2) \n\t"
|
"vl %%v23,112(%%r1,%[x])\n\t"
|
||||||
"vl %%v23,112(%%r1,%2) \n\t"
|
|
||||||
|
|
||||||
"vfmdb %%v16,%%v16,%%v0\n\t"
|
"vfmdb %%v16,%%v16,%%v0\n\t"
|
||||||
"vfmdb %%v17,%%v17,%%v0\n\t"
|
"vfmdb %%v17,%%v17,%%v0\n\t"
|
||||||
"vfmdb %%v18,%%v18,%%v0\n\t"
|
"vfmdb %%v18,%%v18,%%v0\n\t"
|
||||||
|
|
@ -170,55 +157,46 @@ static void zscal_kernel_8_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x)
|
||||||
"vfmdb %%v21,%%v21,%%v0\n\t"
|
"vfmdb %%v21,%%v21,%%v0\n\t"
|
||||||
"vfmdb %%v22,%%v22,%%v0\n\t"
|
"vfmdb %%v22,%%v22,%%v0\n\t"
|
||||||
"vfmdb %%v23,%%v23,%%v0\n\t"
|
"vfmdb %%v23,%%v23,%%v0\n\t"
|
||||||
|
"vst %%v16,0(%%r1,%[x])\n\t"
|
||||||
"vst %%v16,0(%%r1,%2) \n\t"
|
"vst %%v17,16(%%r1,%[x])\n\t"
|
||||||
"vst %%v17,16(%%r1,%2) \n\t"
|
"vst %%v18,32(%%r1,%[x])\n\t"
|
||||||
"vst %%v18,32(%%r1,%2) \n\t"
|
"vst %%v19,48(%%r1,%[x])\n\t"
|
||||||
"vst %%v19,48(%%r1,%2) \n\t"
|
"vst %%v20,64(%%r1,%[x])\n\t"
|
||||||
"vst %%v20,64(%%r1,%2) \n\t"
|
"vst %%v21,80(%%r1,%[x])\n\t"
|
||||||
"vst %%v21,80(%%r1,%2) \n\t"
|
"vst %%v22,96(%%r1,%[x])\n\t"
|
||||||
"vst %%v22,96(%%r1,%2) \n\t"
|
"vst %%v23,112(%%r1,%[x])\n\t"
|
||||||
"vst %%v23,112(%%r1,%2) \n\t"
|
|
||||||
|
|
||||||
"agfi %%r1,128\n\t"
|
"agfi %%r1,128\n\t"
|
||||||
"brctg %%r0,0b "
|
"brctg %[n],0b"
|
||||||
:
|
: "+m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n)
|
||||||
:"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x)
|
: [x] "a"(x), "m"(*(const struct { FLOAT x[2]; } *) alpha),
|
||||||
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23"
|
[alpha] "a"(alpha)
|
||||||
);
|
: "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22",
|
||||||
|
"v23");
|
||||||
}
|
}
|
||||||
|
|
||||||
static void zscal_kernel_8_zero(BLASLONG n, FLOAT *x)
|
static void zscal_kernel_8_zero(BLASLONG n, FLOAT *x) {
|
||||||
{
|
__asm__("vzero %%v0\n\t"
|
||||||
__asm__ volatile(
|
"srlg %[n],%[n],3\n\t"
|
||||||
"vzero %%v24 \n\t"
|
|
||||||
"vzero %%v25 \n\t"
|
|
||||||
"vzero %%v26 \n\t"
|
|
||||||
"vzero %%v27 \n\t"
|
|
||||||
"srlg %%r0,%0,3 \n\t"
|
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 2, 1024(%%r1,%1) \n\t"
|
"pfd 2, 1024(%%r1,%[x])\n\t"
|
||||||
|
"vst %%v0,0(%%r1,%[x])\n\t"
|
||||||
"vst %%v24,0(%%r1,%1) \n\t"
|
"vst %%v0,16(%%r1,%[x])\n\t"
|
||||||
"vst %%v25,16(%%r1,%1) \n\t"
|
"vst %%v0,32(%%r1,%[x])\n\t"
|
||||||
"vst %%v26,32(%%r1,%1) \n\t"
|
"vst %%v0,48(%%r1,%[x])\n\t"
|
||||||
"vst %%v27,48(%%r1,%1) \n\t"
|
"vst %%v0,64(%%r1,%[x])\n\t"
|
||||||
"vst %%v24,64(%%r1,%1) \n\t"
|
"vst %%v0,80(%%r1,%[x])\n\t"
|
||||||
"vst %%v25,80(%%r1,%1) \n\t"
|
"vst %%v0,96(%%r1,%[x])\n\t"
|
||||||
"vst %%v26,96(%%r1,%1) \n\t"
|
"vst %%v0,112(%%r1,%[x])\n\t"
|
||||||
"vst %%v27,112(%%r1,%1) \n\t"
|
|
||||||
|
|
||||||
"agfi %%r1,128\n\t"
|
"agfi %%r1,128\n\t"
|
||||||
"brctg %%r0,0b "
|
"brctg %[n],0b"
|
||||||
:
|
: "=m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n)
|
||||||
:"r"(n),"ZR"((FLOAT (*)[n * 2])x)
|
: [x] "a"(x)
|
||||||
:"memory","cc","r0","r1","v24","v25","v26","v27"
|
: "cc", "r1", "v0");
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static void zscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_x)
|
static void zscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x,
|
||||||
{
|
BLASLONG inc_x) {
|
||||||
BLASLONG i;
|
BLASLONG i;
|
||||||
BLASLONG inc_x2 = 2 * inc_x;
|
BLASLONG inc_x2 = 2 * inc_x;
|
||||||
BLASLONG inc_x3 = inc_x2 + inc_x;
|
BLASLONG inc_x3 = inc_x2 + inc_x;
|
||||||
|
|
@ -226,8 +204,7 @@ static void zscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_
|
||||||
FLOAT da_r = alpha[0];
|
FLOAT da_r = alpha[0];
|
||||||
FLOAT da_i = alpha[1];
|
FLOAT da_i = alpha[1];
|
||||||
|
|
||||||
for (i = 0; i < n; i += 4)
|
for (i = 0; i < n; i += 4) {
|
||||||
{
|
|
||||||
t0 = da_r * x[0] - da_i * x[1];
|
t0 = da_r * x[0] - da_i * x[1];
|
||||||
t1 = da_r * x[inc_x] - da_i * x[inc_x + 1];
|
t1 = da_r * x[inc_x] - da_i * x[inc_x + 1];
|
||||||
t2 = da_r * x[inc_x2] - da_i * x[inc_x2 + 1];
|
t2 = da_r * x[inc_x2] - da_i * x[inc_x2 + 1];
|
||||||
|
|
@ -247,7 +224,9 @@ static void zscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) {
|
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
|
||||||
|
FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
|
||||||
|
BLASLONG dummy2) {
|
||||||
BLASLONG i = 0, j = 0;
|
BLASLONG i = 0, j = 0;
|
||||||
FLOAT temp0;
|
FLOAT temp0;
|
||||||
FLOAT temp1;
|
FLOAT temp1;
|
||||||
|
|
@ -307,13 +286,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
|
|
||||||
|
|
||||||
if (da_i == 0.0) {
|
if (da_i == 0.0) {
|
||||||
BLASLONG n1 = n & -2;
|
BLASLONG n1 = n & -2;
|
||||||
|
|
||||||
|
|
@ -368,7 +344,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
|
||||||
return (0);
|
return (0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
BLASLONG n1 = n & -8;
|
BLASLONG n1 = n & -8;
|
||||||
if (n1 > 0) {
|
if (n1 > 0) {
|
||||||
|
|
||||||
|
|
@ -380,8 +355,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
|
||||||
zscal_kernel_8_zero(n1, x);
|
zscal_kernel_8_zero(n1, x);
|
||||||
else
|
else
|
||||||
zscal_kernel_8_zero_r(n1, alpha, x);
|
zscal_kernel_8_zero_r(n1, alpha, x);
|
||||||
else
|
else if (da_i == 0)
|
||||||
if (da_i == 0)
|
|
||||||
zscal_kernel_8_zero_i(n1, alpha, x);
|
zscal_kernel_8_zero_i(n1, alpha, x);
|
||||||
else
|
else
|
||||||
zscal_kernel_8(n1, alpha, x);
|
zscal_kernel_8(n1, alpha, x);
|
||||||
|
|
@ -390,7 +364,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
|
||||||
j = n1;
|
j = n1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
if (da_r == 0.0) {
|
if (da_r == 0.0) {
|
||||||
|
|
||||||
if (da_i == 0.0) {
|
if (da_i == 0.0) {
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
/***************************************************************************
|
/***************************************************************************
|
||||||
Copyright (c) 2013-2017, The OpenBLAS Project
|
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
modification, are permitted provided that the following conditions are
|
modification, are permitted provided that the following conditions are
|
||||||
|
|
@ -27,114 +27,108 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
|
static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) {
|
||||||
{
|
__asm__("srlg %[n],%[n],4\n\t"
|
||||||
__asm__ volatile(
|
|
||||||
"srlg %%r0,%0,4 \n\t"
|
|
||||||
"xgr %%r1,%%r1\n\t"
|
"xgr %%r1,%%r1\n\t"
|
||||||
"0:\n\t"
|
"0:\n\t"
|
||||||
"pfd 2, 1024(%%r1,%1) \n\t"
|
"pfd 2, 1024(%%r1,%[x])\n\t"
|
||||||
"pfd 2, 1024(%%r1,%2) \n\t"
|
"pfd 2, 1024(%%r1,%[y])\n\t"
|
||||||
|
"vl %%v16, 0(%%r1,%[x])\n\t"
|
||||||
"vl %%v16, 0(%%r1,%1) \n\t"
|
"vl %%v17, 16(%%r1,%[x])\n\t"
|
||||||
"vl %%v17, 16(%%r1,%1) \n\t"
|
"vl %%v18, 32(%%r1,%[x])\n\t"
|
||||||
"vl %%v18, 32(%%r1,%1) \n\t"
|
"vl %%v19, 48(%%r1,%[x])\n\t"
|
||||||
"vl %%v19, 48(%%r1,%1) \n\t"
|
"vl %%v20, 64(%%r1,%[x])\n\t"
|
||||||
"vl %%v20, 64(%%r1,%1) \n\t"
|
"vl %%v21, 80(%%r1,%[x])\n\t"
|
||||||
"vl %%v21, 80(%%r1,%1) \n\t"
|
"vl %%v22, 96(%%r1,%[x])\n\t"
|
||||||
"vl %%v22, 96(%%r1,%1) \n\t"
|
"vl %%v23, 112(%%r1,%[x])\n\t"
|
||||||
"vl %%v23, 112(%%r1,%1) \n\t"
|
"vl %%v24, 128(%%r1,%[x])\n\t"
|
||||||
"vl %%v24, 128(%%r1,%1) \n\t"
|
"vl %%v25, 144(%%r1,%[x])\n\t"
|
||||||
"vl %%v25, 144(%%r1,%1) \n\t"
|
"vl %%v26, 160(%%r1,%[x])\n\t"
|
||||||
"vl %%v26, 160(%%r1,%1) \n\t"
|
"vl %%v27, 176(%%r1,%[x])\n\t"
|
||||||
"vl %%v27, 176(%%r1,%1) \n\t"
|
"vl %%v28, 192(%%r1,%[x])\n\t"
|
||||||
"vl %%v28, 192(%%r1,%1) \n\t"
|
"vl %%v29, 208(%%r1,%[x])\n\t"
|
||||||
"vl %%v29, 208(%%r1,%1) \n\t"
|
"vl %%v30, 224(%%r1,%[x])\n\t"
|
||||||
"vl %%v30, 224(%%r1,%1) \n\t"
|
"vl %%v31, 240(%%r1,%[x])\n\t"
|
||||||
"vl %%v31, 240(%%r1,%1) \n\t"
|
"vl %%v0, 0(%%r1,%[y])\n\t"
|
||||||
|
"vl %%v1, 16(%%r1,%[y])\n\t"
|
||||||
"vl %%v0, 0(%%r1,%2) \n\t"
|
"vl %%v2, 32(%%r1,%[y])\n\t"
|
||||||
"vl %%v1, 16(%%r1,%2) \n\t"
|
"vl %%v3, 48(%%r1,%[y])\n\t"
|
||||||
"vl %%v2, 32(%%r1,%2) \n\t"
|
"vl %%v4, 64(%%r1,%[y])\n\t"
|
||||||
"vl %%v3, 48(%%r1,%2) \n\t"
|
"vl %%v5, 80(%%r1,%[y])\n\t"
|
||||||
"vl %%v4, 64(%%r1,%2) \n\t"
|
"vl %%v6, 96(%%r1,%[y])\n\t"
|
||||||
"vl %%v5, 80(%%r1,%2) \n\t"
|
"vl %%v7, 112(%%r1,%[y])\n\t"
|
||||||
"vl %%v6, 96(%%r1,%2) \n\t"
|
"vst %%v0, 0(%%r1,%[x])\n\t"
|
||||||
"vl %%v7, 112(%%r1,%2) \n\t"
|
"vst %%v1, 16(%%r1,%[x])\n\t"
|
||||||
"vst %%v0, 0(%%r1,%1) \n\t"
|
"vst %%v2, 32(%%r1,%[x])\n\t"
|
||||||
"vst %%v1, 16(%%r1,%1) \n\t"
|
"vst %%v3, 48(%%r1,%[x])\n\t"
|
||||||
"vst %%v2, 32(%%r1,%1) \n\t"
|
"vst %%v4, 64(%%r1,%[x])\n\t"
|
||||||
"vst %%v3, 48(%%r1,%1) \n\t"
|
"vst %%v5, 80(%%r1,%[x])\n\t"
|
||||||
"vst %%v4, 64(%%r1,%1) \n\t"
|
"vst %%v6, 96(%%r1,%[x])\n\t"
|
||||||
"vst %%v5, 80(%%r1,%1) \n\t"
|
"vst %%v7, 112(%%r1,%[x])\n\t"
|
||||||
"vst %%v6, 96(%%r1,%1) \n\t"
|
"vl %%v0, 128(%%r1,%[y])\n\t"
|
||||||
"vst %%v7, 112(%%r1,%1) \n\t"
|
"vl %%v1, 144(%%r1,%[y])\n\t"
|
||||||
|
"vl %%v2, 160(%%r1,%[y])\n\t"
|
||||||
"vl %%v0, 128(%%r1,%2) \n\t"
|
"vl %%v3, 176(%%r1,%[y])\n\t"
|
||||||
"vl %%v1, 144(%%r1,%2) \n\t"
|
"vl %%v4, 192(%%r1,%[y])\n\t"
|
||||||
"vl %%v2, 160(%%r1,%2) \n\t"
|
"vl %%v5, 208(%%r1,%[y])\n\t"
|
||||||
"vl %%v3, 176(%%r1,%2) \n\t"
|
"vl %%v6, 224(%%r1,%[y])\n\t"
|
||||||
"vl %%v4, 192(%%r1,%2) \n\t"
|
"vl %%v7, 240(%%r1,%[y])\n\t"
|
||||||
"vl %%v5, 208(%%r1,%2) \n\t"
|
"vst %%v0, 128(%%r1,%[x])\n\t"
|
||||||
"vl %%v6, 224(%%r1,%2) \n\t"
|
"vst %%v1, 144(%%r1,%[x])\n\t"
|
||||||
"vl %%v7, 240(%%r1,%2) \n\t"
|
"vst %%v2, 160(%%r1,%[x])\n\t"
|
||||||
"vst %%v0, 128(%%r1,%1) \n\t"
|
"vst %%v3, 176(%%r1,%[x])\n\t"
|
||||||
"vst %%v1, 144(%%r1,%1) \n\t"
|
"vst %%v4, 192(%%r1,%[x])\n\t"
|
||||||
"vst %%v2, 160(%%r1,%1) \n\t"
|
"vst %%v5, 208(%%r1,%[x])\n\t"
|
||||||
"vst %%v3, 176(%%r1,%1) \n\t"
|
"vst %%v6, 224(%%r1,%[x])\n\t"
|
||||||
"vst %%v4, 192(%%r1,%1) \n\t"
|
"vst %%v7, 240(%%r1,%[x])\n\t"
|
||||||
"vst %%v5, 208(%%r1,%1) \n\t"
|
"vst %%v16, 0(%%r1,%[y])\n\t"
|
||||||
"vst %%v6, 224(%%r1,%1) \n\t"
|
"vst %%v17, 16(%%r1,%[y])\n\t"
|
||||||
"vst %%v7, 240(%%r1,%1) \n\t"
|
"vst %%v18, 32(%%r1,%[y])\n\t"
|
||||||
|
"vst %%v19, 48(%%r1,%[y])\n\t"
|
||||||
"vst %%v16, 0(%%r1,%2) \n\t"
|
"vst %%v20, 64(%%r1,%[y])\n\t"
|
||||||
"vst %%v17, 16(%%r1,%2) \n\t"
|
"vst %%v21, 80(%%r1,%[y])\n\t"
|
||||||
"vst %%v18, 32(%%r1,%2) \n\t"
|
"vst %%v22, 96(%%r1,%[y])\n\t"
|
||||||
"vst %%v19, 48(%%r1,%2) \n\t"
|
"vst %%v23, 112(%%r1,%[y])\n\t"
|
||||||
"vst %%v20, 64(%%r1,%2) \n\t"
|
"vst %%v24, 128(%%r1,%[y])\n\t"
|
||||||
"vst %%v21, 80(%%r1,%2) \n\t"
|
"vst %%v25, 144(%%r1,%[y])\n\t"
|
||||||
"vst %%v22, 96(%%r1,%2) \n\t"
|
"vst %%v26, 160(%%r1,%[y])\n\t"
|
||||||
"vst %%v23, 112(%%r1,%2) \n\t"
|
"vst %%v27, 176(%%r1,%[y])\n\t"
|
||||||
"vst %%v24, 128(%%r1,%2) \n\t"
|
"vst %%v28, 192(%%r1,%[y])\n\t"
|
||||||
"vst %%v25, 144(%%r1,%2) \n\t"
|
"vst %%v29, 208(%%r1,%[y])\n\t"
|
||||||
"vst %%v26, 160(%%r1,%2) \n\t"
|
"vst %%v30, 224(%%r1,%[y])\n\t"
|
||||||
"vst %%v27, 176(%%r1,%2) \n\t"
|
"vst %%v31, 240(%%r1,%[y])\n\t"
|
||||||
"vst %%v28, 192(%%r1,%2) \n\t"
|
|
||||||
"vst %%v29, 208(%%r1,%2) \n\t"
|
|
||||||
"vst %%v30, 224(%%r1,%2) \n\t"
|
|
||||||
"vst %%v31, 240(%%r1,%2) \n\t"
|
|
||||||
|
|
||||||
"agfi %%r1,256\n\t"
|
"agfi %%r1,256\n\t"
|
||||||
"brctg %%r0,0b "
|
"brctg %[n],0b"
|
||||||
:
|
: "+m"(*(struct { FLOAT x[n * 2]; } *) x),
|
||||||
:"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y)
|
"+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n)
|
||||||
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
: [x] "a"(x),[y] "a"(y)
|
||||||
);
|
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
|
||||||
|
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
|
||||||
|
"v27", "v28", "v29", "v30", "v31");
|
||||||
}
|
}
|
||||||
|
|
||||||
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
|
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3,
|
||||||
{
|
FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
|
||||||
|
FLOAT *dummy, BLASLONG dummy2) {
|
||||||
BLASLONG i = 0;
|
BLASLONG i = 0;
|
||||||
BLASLONG ix = 0, iy = 0;
|
BLASLONG ix = 0, iy = 0;
|
||||||
FLOAT temp[2];
|
FLOAT temp[2];
|
||||||
BLASLONG inc_x2, inc_y2;
|
BLASLONG inc_x2, inc_y2;
|
||||||
|
|
||||||
if ( n <= 0 ) return(0);
|
if (n <= 0)
|
||||||
|
return (0);
|
||||||
|
|
||||||
if ( (inc_x == 1) && (inc_y == 1 ))
|
if ((inc_x == 1) && (inc_y == 1)) {
|
||||||
{
|
|
||||||
|
|
||||||
BLASLONG n1 = n & -16;
|
BLASLONG n1 = n & -16;
|
||||||
if ( n1 > 0 )
|
if (n1 > 0) {
|
||||||
{
|
|
||||||
zswap_kernel_16(n1, x, y);
|
zswap_kernel_16(n1, x, y);
|
||||||
i = n1;
|
i = n1;
|
||||||
ix = 2 * n1;
|
ix = 2 * n1;
|
||||||
iy = 2 * n1;
|
iy = 2 * n1;
|
||||||
}
|
}
|
||||||
|
|
||||||
while(i < n)
|
while (i < n) {
|
||||||
{
|
|
||||||
|
|
||||||
temp[0] = x[ix];
|
temp[0] = x[ix];
|
||||||
temp[1] = x[ix + 1];
|
temp[1] = x[ix + 1];
|
||||||
|
|
@ -147,19 +141,14 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm
|
||||||
iy += 2;
|
iy += 2;
|
||||||
i++;
|
i++;
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
} else {
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
|
|
||||||
inc_x2 = 2 * inc_x;
|
inc_x2 = 2 * inc_x;
|
||||||
inc_y2 = 2 * inc_y;
|
inc_y2 = 2 * inc_y;
|
||||||
|
|
||||||
while(i < n)
|
while (i < n) {
|
||||||
{
|
|
||||||
|
|
||||||
temp[0] = x[ix];
|
temp[0] = x[ix];
|
||||||
temp[1] = x[ix + 1];
|
temp[1] = x[ix + 1];
|
||||||
|
|
@ -177,7 +166,4 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm
|
||||||
}
|
}
|
||||||
return (0);
|
return (0);
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue