Small fixes; some microkernels (i(dz)amin, i(dz)amax, (dz)dot, (dz)asum) can now be inlined

This commit is contained in:
Abdelrauf 2017-10-13 19:29:27 +04:00 committed by QWR QWR
parent def146efed
commit 87669d1c0a
9 changed files with 402 additions and 354 deletions

View File

@ -36,12 +36,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif
static FLOAT __attribute__ ((noinline)) dasum_kernel_32(BLASLONG n, FLOAT *x) {
__asm__ ( static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) {
"pfd 1, 0(%1) \n\t" FLOAT asum ;
"sllg %%r0,%0,3 \n\t" __asm__ (
"agr %%r0,%1 \n\t" "pfd 1, 0(%3) \n\t"
"sllg %%r0,%2,3 \n\t"
"agr %%r0,%3 \n\t"
"vzero %%v0 \n\t" "vzero %%v0 \n\t"
"vzero %%v1 \n\t" "vzero %%v1 \n\t"
"vzero %%v2 \n\t" "vzero %%v2 \n\t"
@ -95,15 +97,18 @@ static FLOAT __attribute__ ((noinline)) dasum_kernel_32(BLASLONG n, FLOAT *x) {
"vfadb %%v0,%%v25,%%v24 \n\t" "vfadb %%v0,%%v25,%%v24 \n\t"
"vrepg %%v1,%%v0,1 \n\t" "vrepg %%v1,%%v0,1 \n\t"
"adbr %%f0,%%f1 \n\t" "adbr %%f0,%%f1 \n\t"
: "ldr %0,%%f0 \n\t"
: "r"(n), "a"(x) : "=f"(asum),"+&a"(x)
: "cc", "memory","r0","f0","f1","v0","v1","v2","v3","v24","v25","v26","v27","v28","v29","v30","v31" : "r"(n), "1"(x)
: "cc", "r0" ,"f0","f1","v0","v1","v2","v3","v24","v25","v26","v27","v28","v29","v30","v31"
); );
return asum;
} }
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0; BLASLONG i = 0;
BLASLONG j = 0; BLASLONG j = 0;

View File

@ -30,75 +30,76 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(Z13) #if defined(Z13)
static void __attribute__ ((noinline)) ddot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) static FLOAT ddot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y)
{ {
FLOAT dot;
__asm__ volatile( __asm__ volatile(
"pfd 1, 0(%1) \n\t" "pfd 1, 0(%2) \n\t"
"pfd 1, 0(%2) \n\t" "pfd 1, 0(%3) \n\t"
"vzero %%v24 \n\t" "vzero %%v24 \n\t"
"vzero %%v25 \n\t" "vzero %%v25 \n\t"
"vzero %%v26 \n\t" "vzero %%v26 \n\t"
"vzero %%v27 \n\t" "vzero %%v27 \n\t"
"srlg %%r0,%0,4 \n\t" "srlg %1,%1,4 \n\t"
"xgr %%r1,%%r1 \n\t" "xgr %%r1,%%r1 \n\t"
".align 16 \n\t" ".align 16 \n\t"
"1: \n\t" "1: \n\t"
"pfd 1, 256(%%r1,%1) \n\t" "pfd 1, 256(%%r1,%2) \n\t"
"pfd 1, 256(%%r1,%2) \n\t" "pfd 1, 256(%%r1,%3) \n\t"
"vl %%v16, 0(%%r1,%1) \n\t" "vl %%v16, 0(%%r1,%2) \n\t"
"vl %%v17, 16(%%r1,%1) \n\t" "vl %%v17, 16(%%r1,%2) \n\t"
"vl %%v18, 32(%%r1,%1) \n\t" "vl %%v18, 32(%%r1,%2) \n\t"
"vl %%v19, 48(%%r1,%1) \n\t" "vl %%v19, 48(%%r1,%2) \n\t"
"vl %%v28, 0(%%r1,%2) \n\t" "vl %%v28, 0(%%r1,%3) \n\t"
"vfmadb %%v24,%%v16,%%v28,%%v24 \n\t" "vfmadb %%v24,%%v16,%%v28,%%v24 \n\t"
"vl %%v29, 16(%%r1,%2) \n\t" "vl %%v29, 16(%%r1,%3) \n\t"
"vfmadb %%v25,%%v17,%%v29,%%v25 \n\t" "vfmadb %%v25,%%v17,%%v29,%%v25 \n\t"
"vl %%v30, 32(%%r1,%3) \n\t"
"vfmadb %%v26,%%v18,%%v30,%%v26 \n\t"
"vl %%v31, 48(%%r1,%3) \n\t"
"vfmadb %%v27,%%v19,%%v31,%%v27 \n\t"
"vl %%v16, 64(%%r1,%2) \n\t"
"vl %%v17, 80(%%r1,%2) \n\t"
"vl %%v18, 96(%%r1,%2) \n\t"
"vl %%v19, 112(%%r1,%2) \n\t"
"vl %%v28, 64(%%r1,%3) \n\t"
"vfmadb %%v24,%%v16,%%v28,%%v24 \n\t"
"vl %%v29, 80(%%r1,%3) \n\t"
"vfmadb %%v25,%%v17,%%v29,%%v25 \n\t"
"vl %%v30, 96(%%r1,%3) \n\t"
"vfmadb %%v26,%%v18,%%v30,%%v26 \n\t"
"vl %%v31, 112(%%r1,%3) \n\t"
"vfmadb %%v27,%%v19,%%v31,%%v27 \n\t"
"vl %%v30, 32(%%r1,%2) \n\t"
"vfmadb %%v26,%%v18,%%v30,%%v26 \n\t" "la %%r1,128(%%r1) \n\t"
"vl %%v31, 48(%%r1,%2) \n\t" "brctg %1,1b \n\t"
"vfmadb %%v27,%%v19,%%v31,%%v27 \n\t" "vfadb %%v24,%%v25,%%v24 \n\t"
"vfadb %%v24,%%v26,%%v24 \n\t"
"vl %%v16, 64(%%r1,%1) \n\t" "vfadb %%v24,%%v27,%%v24 \n\t"
"vl %%v17, 80(%%r1,%1) \n\t" "vrepg %%v1,%%v24,1 \n\t"
"vl %%v18, 96(%%r1,%1) \n\t" "vfadb %%v1,%%v24,%%v1 \n\t"
"vl %%v19, 112(%%r1,%1) \n\t" "ldr %0, %%f1 \n\t"
: "=f"(dot) ,"+&r"(n)
: "a"(x),"a"(y)
:"cc" , "r1","v16", "v17","v18","v19","v20","v21","v22","v23",
"v24","v25","v26","v27","v28","v29","v30","v31"
"vl %%v28, 64(%%r1,%2) \n\t" );
"vfmadb %%v24,%%v16,%%v28,%%v24 \n\t" return dot;
"vl %%v29, 80(%%r1,%2) \n\t"
"vfmadb %%v25,%%v17,%%v29,%%v25 \n\t"
"vl %%v30, 96(%%r1,%2) \n\t"
"vfmadb %%v26,%%v18,%%v30,%%v26 \n\t"
"vl %%v31, 112(%%r1,%2) \n\t"
"vfmadb %%v27,%%v19,%%v31,%%v27 \n\t"
"la %%r1,128(%%r1) \n\t"
"brctg %%r0,1b \n\t"
"vfadb %%v24,%%v25,%%v24 \n\t"
"vfadb %%v24,%%v26,%%v24 \n\t"
"vfadb %%v24,%%v27,%%v24 \n\t"
"vrepg %%v1,%%v24,1 \n\t"
"vfadb %%v1,%%v24,%%v1 \n\t"
" std %%f1,0(%3) \n\t"
:
:"r"(n),"a"(x),"a"(y),"a"(d)
:"cc" , "memory" ,"r0","r1","v16", "v17","v18","v19","v20","v21","v22","v23",
"v24","v25","v26","v27","v28","v29","v30","v31"
);
} }
#else #else
static void ddot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) static FLOAT ddot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y )
{ {
BLASLONG register i = 0; BLASLONG register i = 0;
FLOAT dot = 0.0; FLOAT dot = 0.0;
@ -117,8 +118,8 @@ static void ddot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
i+=8 ; i+=8 ;
} }
*d += dot; return dot;
} }
#endif #endif
@ -136,9 +137,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{ {
BLASLONG n1 = n & -16; BLASLONG n1 = n & -16;
if ( n1 ) if ( n1 )
ddot_kernel_8(n1, x, y , &dot ); dot = ddot_kernel_8(n1, x, y );
i = n1; i = n1;
while(i < n) while(i < n)

View File

@ -186,9 +186,7 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *xo, FLOAT *y, FLOAT *
#endif #endif
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) __attribute__ ((noinline));
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
{ {

View File

@ -37,29 +37,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif
static BLASLONG __attribute__((noinline)) diamax_kernel_32_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) {
/**
__asm__( * Find maximum index
"pfd 1, 0(%1) \n\t" * Warning: requirements n>0 and n % 32 == 0
"sllg %%r0,%0,3 \n\t" * @param n
"agr %%r0,%1 \n\t" * @param x pointer to the vector
"VLEIG %%v20,0,0 \n\t" * @param minf (out) maximum absolute value .( only for output )
"VLEIG %%v20,1,1 \n\t" * @return index
"VLEIG %%v21,2,0 \n\t" */
"VLEIG %%v21,3,1 \n\t" static BLASLONG diamax_kernel_32_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) {
"VLEIG %%v22,4,0 \n\t" BLASLONG index;
"VLEIG %%v22,5,1 \n\t" __asm__(
"VLEIG %%v23,6,0 \n\t" "pfd 1, 0(%4) \n\t"
"VLEIG %%v23,7,1 \n\t" "sllg %%r0,%3,3 \n\t"
"VREPIG %%v4,8 \n\t" "agr %%r0,%4 \n\t"
"vleig %%v20,0,0 \n\t"
"vleig %%v20,1,1 \n\t"
"vleig %%v21,2,0 \n\t"
"vleig %%v21,3,1 \n\t"
"vleig %%v22,4,0 \n\t"
"vleig %%v22,5,1 \n\t"
"vleig %%v23,6,0 \n\t"
"vleig %%v23,7,1 \n\t"
"vrepig %%v4,8 \n\t"
"vzero %%v5 \n\t" "vzero %%v5 \n\t"
"vzero %%v18 \n\t" "vzero %%v18 \n\t"
"vzero %%v19 \n\t" "vzero %%v19 \n\t"
".align 16 \n\t" ".align 16 \n\t"
"1: \n\t" "1: \n\t"
"pfd 1, 256(%1 ) \n\t" "pfd 1, 256(%2 ) \n\t"
"vlm %%v24,%%v31, 0(%1 ) \n\t" "vlm %%v24,%%v31, 0(%2 ) \n\t"
"vflpdb %%v24, %%v24 \n\t" "vflpdb %%v24, %%v24 \n\t"
"vflpdb %%v25, %%v25 \n\t" "vflpdb %%v25, %%v25 \n\t"
"vflpdb %%v26, %%v26 \n\t" "vflpdb %%v26, %%v26 \n\t"
@ -68,7 +76,6 @@ static BLASLONG __attribute__((noinline)) diamax_kernel_32_TUNED(BLASLONG n, FLO
"vflpdb %%v29, %%v29 \n\t" "vflpdb %%v29, %%v29 \n\t"
"vflpdb %%v30, %%v30 \n\t" "vflpdb %%v30, %%v30 \n\t"
"vflpdb %%v31, %%v31 \n\t" "vflpdb %%v31, %%v31 \n\t"
"vfchdb %%v16,%%v25,%%v24 \n\t " "vfchdb %%v16,%%v25,%%v24 \n\t "
"vfchdb %%v17,%%v27,%%v26 \n\t " "vfchdb %%v17,%%v27,%%v26 \n\t "
"vsel %%v1,%%v21,%%v20,%%v16 \n\t" "vsel %%v1,%%v21,%%v20,%%v16 \n\t"
@ -82,30 +89,24 @@ static BLASLONG __attribute__((noinline)) diamax_kernel_32_TUNED(BLASLONG n, FLO
"vsel %%v26,%%v23,%%v22,%%v17 \n\t" "vsel %%v26,%%v23,%%v22,%%v17 \n\t"
"vsel %%v27,%%v31,%%v30,%%v17 \n\t" "vsel %%v27,%%v31,%%v30,%%v17 \n\t"
"vfchdb %%v28, %%v3,%%v0 \n\t" "vfchdb %%v28, %%v3,%%v0 \n\t"
"vfchdb %%v29,%%v27, %%v25 \n\t" "vfchdb %%v29,%%v27, %%v25 \n\t"
"vsel %%v1,%%v2,%%v1,%%v28 \n\t" "vsel %%v1,%%v2,%%v1,%%v28 \n\t"
"vsel %%v0,%%v3,%%v0,%%v28 \n\t" "vsel %%v0,%%v3,%%v0,%%v28 \n\t"
"vsel %%v24,%%v26,%%v24,%%v29 \n\t" "vsel %%v24,%%v26,%%v24,%%v29 \n\t"
"vsel %%v25,%%v27,%%v25,%%v29 \n\t" "vsel %%v25,%%v27,%%v25,%%v29 \n\t"
"vag %%v1,%%v1,%%v5 \n\t"
"VAG %%v1,%%v1,%%v5 \n\t" "vag %%v24,%%v24,%%v5 \n\t"
"VAG %%v24,%%v24,%%v5 \n\t" "vag %%v24,%%v24,%%v4 \n\t"
"VAG %%v24,%%v24,%%v4 \n\t"
"vfchdb %%v16,%%v25 , %%v0 \n\t" "vfchdb %%v16,%%v25 , %%v0 \n\t"
"VAG %%v5,%%v5,%%v4 \n\t" "vag %%v5,%%v5,%%v4 \n\t"
"vsel %%v29,%%v25,%%v0,%%v16 \n\t" "vsel %%v29,%%v25,%%v0,%%v16 \n\t"
"vsel %%v28,%%v24,%%v1,%%v16 \n\t" "vsel %%v28,%%v24,%%v1,%%v16 \n\t"
"vfchdb %%v17, %%v29,%%v18 \n\t" "vfchdb %%v17, %%v29,%%v18 \n\t"
"vsel %%v19,%%v28,%%v19,%%v17 \n\t" "vsel %%v19,%%v28,%%v19,%%v17 \n\t"
"vsel %%v18,%%v29,%%v18,%%v17 \n\t" "vsel %%v18,%%v29,%%v18,%%v17 \n\t"
"vag %%v5,%%v5,%%v4 \n\t"
"VAG %%v5,%%v5,%%v4 \n\t" "vlm %%v24,%%v31,128(%2 ) \n\t"
"vlm %%v24,%%v31,128(%1 ) \n\t"
"vflpdb %%v24, %%v24 \n\t" "vflpdb %%v24, %%v24 \n\t"
"vflpdb %%v25, %%v25 \n\t" "vflpdb %%v25, %%v25 \n\t"
"vflpdb %%v26, %%v26 \n\t" "vflpdb %%v26, %%v26 \n\t"
@ -114,7 +115,6 @@ static BLASLONG __attribute__((noinline)) diamax_kernel_32_TUNED(BLASLONG n, FLO
"vflpdb %%v29, %%v29 \n\t" "vflpdb %%v29, %%v29 \n\t"
"vflpdb %%v30, %%v30 \n\t" "vflpdb %%v30, %%v30 \n\t"
"vflpdb %%v31, %%v31 \n\t" "vflpdb %%v31, %%v31 \n\t"
"vfchdb %%v16,%%v25,%%v24 \n\t " "vfchdb %%v16,%%v25,%%v24 \n\t "
"vfchdb %%v17,%%v27,%%v26 \n\t " "vfchdb %%v17,%%v27,%%v26 \n\t "
"vsel %%v1,%%v21,%%v20,%%v16 \n\t" "vsel %%v1,%%v21,%%v20,%%v16 \n\t"
@ -128,56 +128,53 @@ static BLASLONG __attribute__((noinline)) diamax_kernel_32_TUNED(BLASLONG n, FLO
"vsel %%v26,%%v23,%%v22,%%v17 \n\t" "vsel %%v26,%%v23,%%v22,%%v17 \n\t"
"vsel %%v27,%%v31,%%v30,%%v17 \n\t" "vsel %%v27,%%v31,%%v30,%%v17 \n\t"
"vfchdb %%v28, %%v3,%%v0 \n\t" "vfchdb %%v28, %%v3,%%v0 \n\t"
"vfchdb %%v29,%%v27, %%v25 \n\t" "vfchdb %%v29,%%v27, %%v25 \n\t"
"vsel %%v1,%%v2,%%v1,%%v28 \n\t" "vsel %%v1,%%v2,%%v1,%%v28 \n\t"
"vsel %%v0,%%v3,%%v0,%%v28 \n\t" "vsel %%v0,%%v3,%%v0,%%v28 \n\t"
"vsel %%v24,%%v26,%%v24,%%v29 \n\t" "vsel %%v24,%%v26,%%v24,%%v29 \n\t"
"vsel %%v25,%%v27,%%v25,%%v29 \n\t" "vsel %%v25,%%v27,%%v25,%%v29 \n\t"
"vag %%v1,%%v1,%%v5 \n\t"
"VAG %%v1,%%v1,%%v5 \n\t" "vag %%v24,%%v24,%%v5 \n\t"
"VAG %%v24,%%v24,%%v5 \n\t" "la %2,256(%2) \n\t"
"la %1,256(%1) \n\t" "vag %%v24,%%v24,%%v4 \n\t"
"VAG %%v24,%%v24,%%v4 \n\t"
"vfchdb %%v16,%%v25 , %%v0 \n\t" "vfchdb %%v16,%%v25 , %%v0 \n\t"
"VAG %%v5,%%v5,%%v4 \n\t" "vag %%v5,%%v5,%%v4 \n\t"
"vsel %%v29,%%v25,%%v0,%%v16 \n\t" "vsel %%v29,%%v25,%%v0,%%v16 \n\t"
"vsel %%v28,%%v24,%%v1,%%v16 \n\t" "vsel %%v28,%%v24,%%v1,%%v16 \n\t"
"vfchdb %%v17, %%v29,%%v18 \n\t" "vfchdb %%v17, %%v29,%%v18 \n\t"
"vsel %%v19,%%v28,%%v19,%%v17 \n\t" "vsel %%v19,%%v28,%%v19,%%v17 \n\t"
"vsel %%v18,%%v29,%%v18,%%v17 \n\t" "vsel %%v18,%%v29,%%v18,%%v17 \n\t"
"vag %%v5,%%v5,%%v4 \n\t"
"VAG %%v5,%%v5,%%v4 \n\t" "clgrjl %2,%%r0,1b \n\t"
"clgrjl %1,%%r0,1b \n\t"
"vrepg %%v26,%%v18,1 \n\t" "vrepg %%v26,%%v18,1 \n\t"
"vrepg %%v5,%%v19,1 \n\t" "vrepg %%v5,%%v19,1 \n\t"
"wfcdb %%v26,%%v18 \n\t" "wfcdb %%v26,%%v18 \n\t"
"jne 2f \n\t" "jne 2f \n\t"
"VSTEG %%v18,0(%2),0 \n\t" "vsteg %%v18,%1,0 \n\t"
"VMNLG %%v1,%%v5,%%v19 \n\t" "vmnlg %%v1,%%v5,%%v19 \n\t"
"VLGVG %%r2,%%v1,0 \n\t" "vlgvg %0,%%v1,0 \n\t"
"br %%r14 \n\t" "j 3f \n\t"
"2: \n\t" "2: \n\t"
"wfchdb %%v16,%%v26,%%v18 \n\t" "wfchdb %%v16,%%v26,%%v18 \n\t"
"vsel %%v1,%%v5,%%v19,%%v16 \n\t" "vsel %%v1,%%v5,%%v19,%%v16 \n\t"
"vsel %%v0,%%v26,%%v18,%%v16 \n\t" "vsel %%v0,%%v26,%%v18,%%v16 \n\t"
"VLGVG %%r2,%%v1,0 \n\t" "vlgvg %0,%%v1,0 \n\t"
"std %%f0,0(%2) \n\t" "std %%f0,%1 \n\t"
"3: "
: : "=r"(index) ,"=m"(*maxf) , "+&a"(x)
: "r"(n), "a"(x), "a"(maxf) : "r"(n), "2"(x)
: "cc", "memory","r0","r1","r2","f0","v0","v1","v2","v3","v4","v5","v6","v7","v16", : "cc", "r0", "f0","v0","v1","v2","v3","v4","v5","v6","v7","v16",
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
); );
return index;
} }
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0; BLASLONG i = 0;

View File

@ -37,28 +37,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif
static BLASLONG __attribute__((noinline)) diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { /**
* Find minimum index
__asm__( * Warning: requirements n>0 and n % 32 == 0
"pfd 1, 0(%1) \n\t" * @param n
"sllg %%r0,%0,3 \n\t" * @param x pointer to the vector
"agr %%r0,%1 \n\t" * @param minf (out) minimum absolute value .( only for output )
"VLEIG %%v20,0,0 \n\t" * @return minimum index
"VLEIG %%v20,1,1 \n\t" */
"VLEIG %%v21,2,0 \n\t" static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
"VLEIG %%v21,3,1 \n\t" BLASLONG index;
"VLEIG %%v22,4,0 \n\t" __asm__(
"VLEIG %%v22,5,1 \n\t" "pfd 1, 0(%4) \n\t"
"VLEIG %%v23,6,0 \n\t" "sllg %%r0,%3,3 \n\t"
"VLEIG %%v23,7,1 \n\t" "agr %%r0,%4 \n\t"
"VREPIG %%v4,8 \n\t" "vleig %%v20,0,0 \n\t"
"vzero %%v5 \n\t" "vleig %%v20,1,1 \n\t"
"vlrepg %%v18,0(%1) \n\t" "vleig %%v21,2,0 \n\t"
"vleig %%v21,3,1 \n\t"
"vleig %%v22,4,0 \n\t"
"vleig %%v22,5,1 \n\t"
"vleig %%v23,6,0 \n\t"
"vleig %%v23,7,1 \n\t"
"vrepig %%v4,8 \n\t"
"vlrepg %%v18,0(%4) \n\t"
"vzero %%v5 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vzero %%v19 \n\t" "vzero %%v19 \n\t"
".align 16 \n\t" ".align 16 \n\t"
"1: \n\t" "1: \n\t"
"pfd 1, 256(%1 ) \n\t" "pfd 1, 256(%2 ) \n\t"
"vlm %%v24,%%v31, 0(%1 ) \n\t" "vlm %%v24,%%v31, 0(%2 ) \n\t"
"vflpdb %%v24, %%v24 \n\t" "vflpdb %%v24, %%v24 \n\t"
"vflpdb %%v25, %%v25 \n\t" "vflpdb %%v25, %%v25 \n\t"
@ -90,12 +99,12 @@ static BLASLONG __attribute__((noinline)) diamin_kernel_32(BLASLONG n, FLOAT *x,
"vsel %%v24,%%v26,%%v24,%%v29 \n\t" "vsel %%v24,%%v26,%%v24,%%v29 \n\t"
"vsel %%v25,%%v27,%%v25,%%v29 \n\t" "vsel %%v25,%%v27,%%v25,%%v29 \n\t"
"VAG %%v1,%%v1,%%v5 \n\t" "vag %%v1,%%v1,%%v5 \n\t"
"VAG %%v24,%%v24,%%v5 \n\t" "vag %%v24,%%v24,%%v5 \n\t"
"VAG %%v24,%%v24,%%v4 \n\t" "vag %%v24,%%v24,%%v4 \n\t"
"vfchdb %%v16, %%v0,%%v25 \n\t" "vfchdb %%v16, %%v0,%%v25 \n\t"
"VAG %%v5,%%v5,%%v4 \n\t" "vag %%v5,%%v5,%%v4 \n\t"
"vsel %%v29,%%v25,%%v0,%%v16 \n\t" "vsel %%v29,%%v25,%%v0,%%v16 \n\t"
"vsel %%v28,%%v24,%%v1,%%v16 \n\t" "vsel %%v28,%%v24,%%v1,%%v16 \n\t"
@ -103,9 +112,9 @@ static BLASLONG __attribute__((noinline)) diamin_kernel_32(BLASLONG n, FLOAT *x,
"vsel %%v19,%%v28,%%v19,%%v17 \n\t" "vsel %%v19,%%v28,%%v19,%%v17 \n\t"
"vsel %%v18,%%v29,%%v18,%%v17 \n\t" "vsel %%v18,%%v29,%%v18,%%v17 \n\t"
"VAG %%v5,%%v5,%%v4 \n\t" "vag %%v5,%%v5,%%v4 \n\t"
"vlm %%v24,%%v31,128(%1 ) \n\t" "vlm %%v24,%%v31,128(%2 ) \n\t"
"vflpdb %%v24, %%v24 \n\t" "vflpdb %%v24, %%v24 \n\t"
"vflpdb %%v25, %%v25 \n\t" "vflpdb %%v25, %%v25 \n\t"
"vflpdb %%v26, %%v26 \n\t" "vflpdb %%v26, %%v26 \n\t"
@ -136,13 +145,13 @@ static BLASLONG __attribute__((noinline)) diamin_kernel_32(BLASLONG n, FLOAT *x,
"vsel %%v24,%%v26,%%v24,%%v29 \n\t" "vsel %%v24,%%v26,%%v24,%%v29 \n\t"
"vsel %%v25,%%v27,%%v25,%%v29 \n\t" "vsel %%v25,%%v27,%%v25,%%v29 \n\t"
"VAG %%v1,%%v1,%%v5 \n\t" "vag %%v1,%%v1,%%v5 \n\t"
"VAG %%v24,%%v24,%%v5 \n\t" "vag %%v24,%%v24,%%v5 \n\t"
"la %1,256(%1) \n\t" "la %2,256(%2) \n\t"
"VAG %%v24,%%v24,%%v4 \n\t" "vag %%v24,%%v24,%%v4 \n\t"
"vfchdb %%v16, %%v0,%%v25 \n\t" "vfchdb %%v16, %%v0,%%v25 \n\t"
"VAG %%v5,%%v5,%%v4 \n\t" "vag %%v5,%%v5,%%v4 \n\t"
"vsel %%v29,%%v25,%%v0,%%v16 \n\t" "vsel %%v29,%%v25,%%v0,%%v16 \n\t"
"vsel %%v28,%%v24,%%v1,%%v16 \n\t" "vsel %%v28,%%v24,%%v1,%%v16 \n\t"
@ -150,51 +159,55 @@ static BLASLONG __attribute__((noinline)) diamin_kernel_32(BLASLONG n, FLOAT *x,
"vsel %%v19,%%v28,%%v19,%%v17 \n\t" "vsel %%v19,%%v28,%%v19,%%v17 \n\t"
"vsel %%v18,%%v29,%%v18,%%v17 \n\t" "vsel %%v18,%%v29,%%v18,%%v17 \n\t"
"VAG %%v5,%%v5,%%v4 \n\t" "vag %%v5,%%v5,%%v4 \n\t"
"clgrjl %1,%%r0,1b \n\t" "clgrjl %2,%%r0,1b \n\t"
"vrepg %%v26,%%v18,1 \n\t" "vrepg %%v26,%%v18,1 \n\t"
"vrepg %%v5,%%v19,1 \n\t" "vrepg %%v5,%%v19,1 \n\t"
"wfcdb %%v26,%%v18 \n\t" "wfcdb %%v26,%%v18 \n\t"
"jne 2f \n\t" "jne 2f \n\t"
"VSTEG %%v18,0(%2),0 \n\t" "vsteg %%v18,%1,0 \n\t"
"VMNLG %%v1,%%v5,%%v19 \n\t" "vmnlg %%v1,%%v5,%%v19 \n\t"
"VLGVG %%r2,%%v1,0 \n\t" "vlgvg %0,%%v1,0 \n\t"
"br %%r14 \n\t" "j 3f \n\t"
"2: \n\t" "2: \n\t"
"wfchdb %%v16,%%v18 ,%%v26 \n\t " "wfchdb %%v16,%%v18 ,%%v26 \n\t "
"vsel %%v1,%%v5,%%v19,%%v16 \n\t" "vsel %%v1,%%v5,%%v19,%%v16 \n\t"
"vsel %%v0,%%v26,%%v18,%%v16 \n\t" "vsel %%v0,%%v26,%%v18,%%v16 \n\t"
"VLGVG %%r2,%%v1,0 \n\t" "vlgvg %0,%%v1,0 \n\t"
"std %%f0,0(%2) \n\t" "std %%f0,%1 \n\t"
"3:"
: : "+r"(index) ,"=m"(*minf),"+&a"(x)
: "r"(n), "a"(x), "a"(maxf) : "r"(n), "2"(x)
: "cc", "memory","r0","r1","r2","f0","v0","v1","v2","v3","v4","v5","v6","v7","v16", : "cc","r0", "f0","v0","v1","v2","v3","v4","v5","v6","v7","v16",
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
); );
return index;
} }
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0; BLASLONG i = 0;
BLASLONG j = 0; BLASLONG j = 0;
BLASLONG ix = 0; BLASLONG ix = 0;
FLOAT minf = 0.0;
BLASLONG min = 0; BLASLONG min = 0;
FLOAT minf = 0.0;
if (n <= 0 || inc_x <= 0) return (min); if (n <= 0 || inc_x <= 0) return (min);
minf = ABS(x[0]); // the index is not incremented, though this makes the first comparison redundant
if (inc_x == 1) { if (inc_x == 1) {
BLASLONG n1 = n & -32; BLASLONG n1 = n & -32;
if (n1 > 0) { if (n1 > 0) {
min = diamin_kernel_32(n1, x, &minf); min = diamin_kernel_32(n1, x, &minf);
i = n1; i = n1;
} }

View File

@ -32,56 +32,65 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ABS fabs #define ABS fabs
#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) #define CABS1(x,i) ABS(x[i])+ABS(x[i+1])
/**
static BLASLONG __attribute__((noinline)) ziamax_kernel_8_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) { * Find maximum index
* Warning: requirements n>0 and n % 8 == 0
* @param n
* @param x pointer to the vector
* @param maxf (out) maximum absolute value (output only)
* @return index
*/
static BLASLONG ziamax_kernel_8_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) {
BLASLONG index;
__asm__( __asm__(
"pfd 1, 0(%1) \n\t" "pfd 1, 0(%4) \n\t"
"VLEIG %%v16,0,0 \n\t" "vleig %%v16,0,0 \n\t"
"VLEIG %%v16,1,1 \n\t" "vleig %%v16,1,1 \n\t"
"VLEIG %%v17,2,0 \n\t" "vleig %%v17,2,0 \n\t"
"VLEIG %%v17,3,1 \n\t" "vleig %%v17,3,1 \n\t"
"VLEIG %%v18,4,0 \n\t" "vleig %%v18,4,0 \n\t"
"VLEIG %%v18,5,1 \n\t" "vleig %%v18,5,1 \n\t"
"VLEIG %%v19,6,0 \n\t" "vleig %%v19,6,0 \n\t"
"VLEIG %%v19,7,1 \n\t" "vleig %%v19,7,1 \n\t"
"VLEIG %%v20,8,0 \n\t" "vleig %%v20,8,0 \n\t"
"VLEIG %%v20,9,1 \n\t" "vleig %%v20,9,1 \n\t"
"VLEIG %%v21,10,0 \n\t" "vleig %%v21,10,0 \n\t"
"VLEIG %%v21,11,1 \n\t" "vleig %%v21,11,1 \n\t"
"VLEIG %%v22,12,0 \n\t" "vleig %%v22,12,0 \n\t"
"VLEIG %%v22,13,1 \n\t" "vleig %%v22,13,1 \n\t"
"VLEIG %%v23,14,0 \n\t" "vleig %%v23,14,0 \n\t"
"VLEIG %%v23,15,1 \n\t" "vleig %%v23,15,1 \n\t"
"sllg %%r0,%0,4 \n\t" "sllg %%r0,%3,4 \n\t"
"agr %%r0,%1 \n\t" "agr %%r0,%4 \n\t"
"vzero %%v6 \n\t" "vzero %%v6 \n\t"
"vzero %%v7 \n\t" "vzero %%v7 \n\t"
"VREPIG %%v4,16 \n\t" "vrepig %%v4,16 \n\t"
"vzero %%v5 \n\t" "vzero %%v5 \n\t"
".align 16 \n\t" ".align 16 \n\t"
"1: \n\t" "1: \n\t"
"pfd 1, 256(%1 ) \n\t" "pfd 1, 256(%2 ) \n\t"
"vleg %%v24 , 0( %1),0 \n\t" "vleg %%v24 , 0( %2),0 \n\t"
"vleg %%v25 , 8( %1),0 \n\t" "vleg %%v25 , 8( %2),0 \n\t"
"vleg %%v24 , 16( %1),1 \n\t" "vleg %%v24 , 16( %2),1 \n\t"
"vleg %%v25 , 24( %1),1 \n\t" "vleg %%v25 , 24( %2),1 \n\t"
"vleg %%v26 , 32( %1),0 \n\t" "vleg %%v26 , 32( %2),0 \n\t"
"vleg %%v27 , 40( %1),0 \n\t" "vleg %%v27 , 40( %2),0 \n\t"
"vleg %%v26 , 48( %1),1 \n\t" "vleg %%v26 , 48( %2),1 \n\t"
"vleg %%v27 , 56( %1),1 \n\t" "vleg %%v27 , 56( %2),1 \n\t"
"vleg %%v28 , 64( %1),0 \n\t" "vleg %%v28 , 64( %2),0 \n\t"
"vleg %%v29 , 72( %1),0 \n\t" "vleg %%v29 , 72( %2),0 \n\t"
"vleg %%v28 , 80( %1),1 \n\t" "vleg %%v28 , 80( %2),1 \n\t"
"vleg %%v29 , 88( %1),1 \n\t" "vleg %%v29 , 88( %2),1 \n\t"
"vleg %%v30 , 96( %1),0 \n\t" "vleg %%v30 , 96( %2),0 \n\t"
"vleg %%v31 ,104( %1),0 \n\t" "vleg %%v31 ,104( %2),0 \n\t"
"vleg %%v30 ,112( %1),1 \n\t" "vleg %%v30 ,112( %2),1 \n\t"
"vleg %%v31 ,120( %1),1 \n\t" "vleg %%v31 ,120( %2),1 \n\t"
"vflpdb %%v24, %%v24 \n\t" "vflpdb %%v24, %%v24 \n\t"
"vflpdb %%v25, %%v25 \n\t" "vflpdb %%v25, %%v25 \n\t"
"vflpdb %%v26, %%v26 \n\t" "vflpdb %%v26, %%v26 \n\t"
@ -97,22 +106,22 @@ static BLASLONG __attribute__((noinline)) ziamax_kernel_8_TUNED(BLASLONG n, FLOA
"vfadb %%v3,%%v30,%%v31 \n\t" "vfadb %%v3,%%v30,%%v31 \n\t"
"vleg %%v24 , 128( %1),0 \n\t" "vleg %%v24 , 128( %2),0 \n\t"
"vleg %%v25 , 136( %1),0 \n\t" "vleg %%v25 , 136( %2),0 \n\t"
"vleg %%v24 , 144( %1),1 \n\t" "vleg %%v24 , 144( %2),1 \n\t"
"vleg %%v25 , 152( %1),1 \n\t" "vleg %%v25 , 152( %2),1 \n\t"
"vleg %%v26 , 160( %1),0 \n\t" "vleg %%v26 , 160( %2),0 \n\t"
"vleg %%v27 , 168( %1),0 \n\t" "vleg %%v27 , 168( %2),0 \n\t"
"vleg %%v26 , 176( %1),1 \n\t" "vleg %%v26 , 176( %2),1 \n\t"
"vleg %%v27 , 184( %1),1 \n\t" "vleg %%v27 , 184( %2),1 \n\t"
"vleg %%v28 , 192( %1),0 \n\t" "vleg %%v28 , 192( %2),0 \n\t"
"vleg %%v29 , 200( %1),0 \n\t" "vleg %%v29 , 200( %2),0 \n\t"
"vleg %%v28 , 208( %1),1 \n\t" "vleg %%v28 , 208( %2),1 \n\t"
"vleg %%v29 , 216( %1),1 \n\t" "vleg %%v29 , 216( %2),1 \n\t"
"vleg %%v30 , 224( %1),0 \n\t" "vleg %%v30 , 224( %2),0 \n\t"
"vleg %%v31 , 232( %1),0 \n\t" "vleg %%v31 , 232( %2),0 \n\t"
"vleg %%v30 , 240( %1),1 \n\t" "vleg %%v30 , 240( %2),1 \n\t"
"vleg %%v31 , 248( %1),1 \n\t" "vleg %%v31 , 248( %2),1 \n\t"
"vflpdb %%v24, %%v24 \n\t" "vflpdb %%v24, %%v24 \n\t"
"vflpdb %%v25, %%v25 \n\t" "vflpdb %%v25, %%v25 \n\t"
"vflpdb %%v26, %%v26 \n\t" "vflpdb %%v26, %%v26 \n\t"
@ -151,49 +160,49 @@ static BLASLONG __attribute__((noinline)) ziamax_kernel_8_TUNED(BLASLONG n, FLOA
"vsel %%v29,%%v25,%%v2,%%v30 \n\t" "vsel %%v29,%%v25,%%v2,%%v30 \n\t"
"vsel %%v31,%%v27,%%v3 ,%%v30 \n\t" "vsel %%v31,%%v27,%%v3 ,%%v30 \n\t"
"la %1,256(%1) \n\t" "la %2,256(%2) \n\t"
"vfchdb %%v0, %%v31,%%v28 \n\t" "vfchdb %%v0, %%v31,%%v28 \n\t"
"vsel %%v25,%%v29,%%v26,%%v0 \n\t" "vsel %%v25,%%v29,%%v26,%%v0 \n\t"
"vsel %%v27,%%v31,%%v28,%%v0 \n\t" "vsel %%v27,%%v31,%%v28,%%v0 \n\t"
"VAG %%v25,%%v25,%%v5 \n\t" "vag %%v25,%%v25,%%v5 \n\t"
//cmp with previous //cmp with previous
"vfchdb %%v30, %%v27,%%v6 \n\t" "vfchdb %%v30, %%v27,%%v6 \n\t"
"vsel %%v7,%%v25,%%v7,%%v30 \n\t" "vsel %%v7,%%v25,%%v7,%%v30 \n\t"
"vsel %%v6,%%v27,%%v6,%%v30 \n\t" "vsel %%v6,%%v27,%%v6,%%v30 \n\t"
"VAG %%v5,%%v5,%%v4 \n\t" "vag %%v5,%%v5,%%v4 \n\t"
"clgrjl %1,%%r0,1b \n\t" "clgrjl %2,%%r0,1b \n\t"
//xtract index //xtract index
"vrepg %%v26,%%v6,1 \n\t" "vrepg %%v26,%%v6,1 \n\t"
"vrepg %%v5,%%v7,1 \n\t" "vrepg %%v5,%%v7,1 \n\t"
"wfcdb %%v26,%%v6 \n\t" "wfcdb %%v26,%%v6 \n\t"
"jne 2f \n\t" "jne 2f \n\t"
"VSTEG %%v6,0(%2),0 \n\t" "vsteg %%v6,%1,0 \n\t"
"VMNLG %%v1,%%v5,%%v7 \n\t" "vmnlg %%v1,%%v5,%%v7 \n\t"
"VLGVG %%r2,%%v1,0 \n\t" "vlgvg %0,%%v1,0 \n\t"
"br %%r14 \n\t" "j 3 \n\t"
"2: \n\t" "2: \n\t"
"wfchdb %%v16,%%v26,%%v6 \n\t" "wfchdb %%v16,%%v26,%%v6 \n\t"
"vsel %%v1,%%v5,%%v7,%%v16 \n\t" "vsel %%v1,%%v5,%%v7,%%v16 \n\t"
"vsel %%v0,%%v26,%%v6,%%v16 \n\t" "vsel %%v0,%%v26,%%v6,%%v16 \n\t"
"VLGVG %%r2,%%v1,0 \n\t" "vlgvg %0,%%v1,0 \n\t"
"std %%f0,0(%2) \n\t" "std %%f0,%1 \n\t"
"3: \n\t"
: : "=r"(index),"=m"(*maxf),"+&a"(x)
: "r"(n), "a"(x), "a"(maxf) : "r"(n), "2"(x)
: "cc", "memory","r0","r1","r2","f0","v0","v1","v2","v3","v4","v5","v6","v7","v16", : "cc","r0", "f0","v0","v1","v2","v3","v4","v5","v6","v7","v16",
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
); );
return index;
} }

View File

@ -33,58 +33,65 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) #define CABS1(x,i) ABS(x[i])+ABS(x[i+1])
/**
static BLASLONG __attribute__((noinline)) ziamin_kernel_8_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { * Find minimum index
* Warning: requirements n>0 and n % 8 == 0
* @param n
* @param x pointer to the vector
* @param minf (out) minimum absolute value .( only for output )
* @return minimum index
*/
static BLASLONG ziamin_kernel_8_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
BLASLONG index ;
__asm__( __asm__(
"pfd 1, 0(%1) \n\t" "pfd 1, 0(%4) \n\t"
"VLEIG %%v16,0,0 \n\t" "vleig %%v16,0,0 \n\t"
"VLEIG %%v16,1,1 \n\t" "vleig %%v16,1,1 \n\t"
"VLEIG %%v17,2,0 \n\t" "vleig %%v17,2,0 \n\t"
"VLEIG %%v17,3,1 \n\t" "vleig %%v17,3,1 \n\t"
"VLEIG %%v18,4,0 \n\t" "vleig %%v18,4,0 \n\t"
"VLEIG %%v18,5,1 \n\t" "vleig %%v18,5,1 \n\t"
"VLEIG %%v19,6,0 \n\t" "vleig %%v19,6,0 \n\t"
"VLEIG %%v19,7,1 \n\t" "vleig %%v19,7,1 \n\t"
"VLEIG %%v20,8,0 \n\t" "vleig %%v20,8,0 \n\t"
"VLEIG %%v20,9,1 \n\t" "vleig %%v20,9,1 \n\t"
"VLEIG %%v21,10,0 \n\t" "vleig %%v21,10,0 \n\t"
"VLEIG %%v21,11,1 \n\t" "vleig %%v21,11,1 \n\t"
"VLEIG %%v22,12,0 \n\t" "vleig %%v22,12,0 \n\t"
"VLEIG %%v22,13,1 \n\t" "vleig %%v22,13,1 \n\t"
"VLEIG %%v23,14,0 \n\t" "vleig %%v23,14,0 \n\t"
"VLEIG %%v23,15,1 \n\t" "vleig %%v23,15,1 \n\t"
"ld %%f6,0(%1) \n\t" "ld %%f6,0(%4) \n\t"
"lpdbr %%f6,%%f6 \n\t" "lpdbr %%f6,%%f6 \n\t"
"ld %%f7,8(%1) \n\t" "ld %%f7,8(%4) \n\t"
"lpdbr %%f7,%%f7 \n\t" "lpdbr %%f7,%%f7 \n\t"
"adbr %%f6,%%f7 \n\t" "adbr %%f6,%%f7 \n\t"
"sllg %%r0,%0,4 \n\t" "sllg %%r0,%3,4 \n\t"
"agr %%r0,%1 \n\t" "agr %%r0,%4 \n\t"
"vrepg %%v6,%%v6,0 \n\t" "vrepg %%v6,%%v6,0 \n\t"
"vzero %%v7 \n\t" "vzero %%v7 \n\t"
"VREPIG %%v4,16 \n\t" "vrepig %%v4,16 \n\t"
"vzero %%v5 \n\t" "vzero %%v5 \n\t"
".align 16 \n\t" ".align 16 \n\t"
"1: \n\t" "1: \n\t"
"pfd 1, 256(%1 ) \n\t" "pfd 1, 256(%2 ) \n\t"
"vleg %%v24 , 0( %1),0 \n\t" "vleg %%v24 , 0( %2),0 \n\t"
"vleg %%v25 , 8( %1),0 \n\t" "vleg %%v25 , 8( %2),0 \n\t"
"vleg %%v24 , 16( %1),1 \n\t" "vleg %%v24 , 16( %2),1 \n\t"
"vleg %%v25 , 24( %1),1 \n\t" "vleg %%v25 , 24( %2),1 \n\t"
"vleg %%v26 , 32( %1),0 \n\t" "vleg %%v26 , 32( %2),0 \n\t"
"vleg %%v27 , 40( %1),0 \n\t" "vleg %%v27 , 40( %2),0 \n\t"
"vleg %%v26 , 48( %1),1 \n\t" "vleg %%v26 , 48( %2),1 \n\t"
"vleg %%v27 , 56( %1),1 \n\t" "vleg %%v27 , 56( %2),1 \n\t"
"vleg %%v28 , 64( %1),0 \n\t" "vleg %%v28 , 64( %2),0 \n\t"
"vleg %%v29 , 72( %1),0 \n\t" "vleg %%v29 , 72( %2),0 \n\t"
"vleg %%v28 , 80( %1),1 \n\t" "vleg %%v28 , 80( %2),1 \n\t"
"vleg %%v29 , 88( %1),1 \n\t" "vleg %%v29 , 88( %2),1 \n\t"
"vleg %%v30 , 96( %1),0 \n\t" "vleg %%v30 , 96( %2),0 \n\t"
"vleg %%v31 ,104( %1),0 \n\t" "vleg %%v31 ,104( %2),0 \n\t"
"vleg %%v30 ,112( %1),1 \n\t" "vleg %%v30 ,112( %2),1 \n\t"
"vleg %%v31 ,120( %1),1 \n\t" "vleg %%v31 ,120( %2),1 \n\t"
"vflpdb %%v24, %%v24 \n\t" "vflpdb %%v24, %%v24 \n\t"
"vflpdb %%v25, %%v25 \n\t" "vflpdb %%v25, %%v25 \n\t"
"vflpdb %%v26, %%v26 \n\t" "vflpdb %%v26, %%v26 \n\t"
@ -100,22 +107,22 @@ static BLASLONG __attribute__((noinline)) ziamin_kernel_8_TUNED(BLASLONG n, FLOA
"vfadb %%v3,%%v30,%%v31 \n\t" "vfadb %%v3,%%v30,%%v31 \n\t"
"vleg %%v24 ,128( %1),0 \n\t" "vleg %%v24 ,128( %2),0 \n\t"
"vleg %%v25 ,136( %1),0 \n\t" "vleg %%v25 ,136( %2),0 \n\t"
"vleg %%v24 ,144( %1),1 \n\t" "vleg %%v24 ,144( %2),1 \n\t"
"vleg %%v25 ,152( %1),1 \n\t" "vleg %%v25 ,152( %2),1 \n\t"
"vleg %%v26 ,160( %1),0 \n\t" "vleg %%v26 ,160( %2),0 \n\t"
"vleg %%v27 ,168( %1),0 \n\t" "vleg %%v27 ,168( %2),0 \n\t"
"vleg %%v26 ,176( %1),1 \n\t" "vleg %%v26 ,176( %2),1 \n\t"
"vleg %%v27 ,184( %1),1 \n\t" "vleg %%v27 ,184( %2),1 \n\t"
"vleg %%v28 ,192( %1),0 \n\t" "vleg %%v28 ,192( %2),0 \n\t"
"vleg %%v29 ,200( %1),0 \n\t" "vleg %%v29 ,200( %2),0 \n\t"
"vleg %%v28 ,208( %1),1 \n\t" "vleg %%v28 ,208( %2),1 \n\t"
"vleg %%v29 ,216( %1),1 \n\t" "vleg %%v29 ,216( %2),1 \n\t"
"vleg %%v30 ,224( %1),0 \n\t" "vleg %%v30 ,224( %2),0 \n\t"
"vleg %%v31 ,232( %1),0 \n\t" "vleg %%v31 ,232( %2),0 \n\t"
"vleg %%v30 ,240( %1),1 \n\t" "vleg %%v30 ,240( %2),1 \n\t"
"vleg %%v31 ,248( %1),1 \n\t" "vleg %%v31 ,248( %2),1 \n\t"
"vflpdb %%v24, %%v24 \n\t" "vflpdb %%v24, %%v24 \n\t"
"vflpdb %%v25, %%v25 \n\t" "vflpdb %%v25, %%v25 \n\t"
"vflpdb %%v26, %%v26 \n\t" "vflpdb %%v26, %%v26 \n\t"
@ -155,49 +162,53 @@ static BLASLONG __attribute__((noinline)) ziamin_kernel_8_TUNED(BLASLONG n, FLOA
"vsel %%v29,%%v25,%%v2,%%v30 \n\t" "vsel %%v29,%%v25,%%v2,%%v30 \n\t"
"vsel %%v31,%%v27,%%v3 ,%%v30 \n\t" "vsel %%v31,%%v27,%%v3 ,%%v30 \n\t"
"la %1,256(%1) \n\t" "la %2,256(%2) \n\t"
"vfchdb %%v0,%%v28, %%v31 \n\t" "vfchdb %%v0,%%v28, %%v31 \n\t"
"vsel %%v25,%%v29,%%v26,%%v0 \n\t" "vsel %%v25,%%v29,%%v26,%%v0 \n\t"
"vsel %%v27,%%v31,%%v28,%%v0 \n\t" "vsel %%v27,%%v31,%%v28,%%v0 \n\t"
"VAG %%v25,%%v25,%%v5 \n\t" "vag %%v25,%%v25,%%v5 \n\t"
//cmp with previous //cmp with previous
"vfchdb %%v30,%%v6 , %%v27 \n\t" "vfchdb %%v30,%%v6 , %%v27 \n\t"
"vsel %%v7,%%v25,%%v7,%%v30 \n\t" "vsel %%v7,%%v25,%%v7,%%v30 \n\t"
"vsel %%v6,%%v27,%%v6,%%v30 \n\t" "vsel %%v6,%%v27,%%v6,%%v30 \n\t"
"VAG %%v5,%%v5,%%v4 \n\t" "vag %%v5,%%v5,%%v4 \n\t"
"clgrjl %1,%%r0,1b \n\t" "clgrjl %2,%%r0,1b \n\t"
//xtract index //xtract index
"vrepg %%v26,%%v6,1 \n\t" "vrepg %%v26,%%v6,1 \n\t"
"vrepg %%v5,%%v7,1 \n\t" "vrepg %%v5,%%v7,1 \n\t"
"wfcdb %%v26,%%v6 \n\t" "wfcdb %%v26,%%v6 \n\t"
"jne 2f \n\t" "jne 2f \n\t"
"VSTEG %%v6,0(%2),0 \n\t" "vsteg %%v6,%1,0 \n\t"
"VMNLG %%v1,%%v5,%%v7 \n\t" "vmnlg %%v1,%%v5,%%v7 \n\t"
"VLGVG %%r2,%%v1,0 \n\t" "vlgvg %0,%%v1,0 \n\t"
"br %%r14 \n\t" "j 3f \n\t"
"2: \n\t" "2: \n\t"
"wfchdb %%v16,%%v6 ,%%v26 \n\t" "wfchdb %%v16,%%v6 ,%%v26 \n\t"
"vsel %%v1,%%v5,%%v7,%%v16 \n\t" "vsel %%v1,%%v5,%%v7,%%v16 \n\t"
"vsel %%v0,%%v26,%%v6,%%v16 \n\t" "vsel %%v0,%%v26,%%v6,%%v16 \n\t"
"VLGVG %%r2,%%v1,0 \n\t" "vlgvg %0,%%v1,0 \n\t"
"std %%f0,0(%2) \n\t" "std %%f0,%1 \n\t"
"3: \n\t"
: : "+r"(index) ,"=m"(*minf), "+&a"(x)
: "r"(n), "a"(x), "a"(minf) : "r"(n), "2"(x)
: "cc", "memory","r0","r1","r2","f0","v0","v1","v2","v3","v4","v5","v6","v7","v16", : "cc","r0","f0","v0","v1","v2","v3","v4","v5","v6","v7","v16",
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
); );
return index;
} }
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
@ -220,6 +231,12 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
i = n1; i = n1;
} }
else {
//assign minf
minf = CABS1(x,0);
ix += 2;
i++;
}
while(i < n) while(i < n)
{ {

View File

@ -40,12 +40,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif
static FLOAT __attribute__ ((noinline)) zasum_kernel_16(BLASLONG n, FLOAT *x) { static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x) {
FLOAT asum;
__asm__ ( __asm__ (
"pfd 1, 0(%1) \n\t" "pfd 1, 0(%3) \n\t"
"sllg %%r0,%0,4 \n\t" "sllg %%r0,%2,4 \n\t"
"agr %%r0,%1 \n\t" "agr %%r0,%3 \n\t"
"vzero %%v0 \n\t" "vzero %%v0 \n\t"
"vzero %%v1 \n\t" "vzero %%v1 \n\t"
"vzero %%v22 \n\t" "vzero %%v22 \n\t"
@ -99,10 +100,12 @@ static FLOAT __attribute__ ((noinline)) zasum_kernel_16(BLASLONG n, FLOAT *x) {
"vfadb %%v0,%%v25,%%v24 \n\t" "vfadb %%v0,%%v25,%%v24 \n\t"
"vrepg %%v1,%%v0,1 \n\t" "vrepg %%v1,%%v0,1 \n\t"
"adbr %%f0,%%f1 \n\t" "adbr %%f0,%%f1 \n\t"
: "ldr %0 ,%%f0"
: "r"(n), "a"(x) : "=f"(asum),"+&a"(x)
: "cc", "memory","r0","f0","f1","v0","v1","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" : "r"(n), "1"(x)
: "cc", "r0","f0","f1","v0","v1","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
); );
return asum;
} }

View File

@ -27,30 +27,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#if defined(Z13)
static void __attribute__ ((noinline)) zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) { static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) {
__asm__ volatile( __asm__ volatile(
"pfd 1, 0(%1) \n\t"
"pfd 1, 0(%2) \n\t" "pfd 1, 0(%2) \n\t"
"pfd 1, 0(%3) \n\t"
"vzero %%v24 \n\t" "vzero %%v24 \n\t"
"vzero %%v25 \n\t" "vzero %%v25 \n\t"
"vzero %%v26 \n\t" "vzero %%v26 \n\t"
"vzero %%v27 \n\t" "vzero %%v27 \n\t"
"srlg %%r0,%0,3 \n\t" "srlg %1,%1,3 \n\t"
"xgr %%r1,%%r1 \n\t" "xgr %%r1,%%r1 \n\t"
".align 16 \n\t" ".align 16 \n\t"
"1: \n\t" "1: \n\t"
"pfd 1, 256(%%r1,%1) \n\t"
"pfd 1, 256(%%r1,%2) \n\t" "pfd 1, 256(%%r1,%2) \n\t"
"vl %%v16, 0(%%r1,%1) \n\t" "pfd 1, 256(%%r1,%3) \n\t"
"vl %%v17, 16(%%r1,%1) \n\t" "vl %%v16, 0(%%r1,%2) \n\t"
"vl %%v18, 32(%%r1,%1) \n\t" "vl %%v17, 16(%%r1,%2) \n\t"
"vl %%v19, 48(%%r1,%1) \n\t" "vl %%v18, 32(%%r1,%2) \n\t"
"vl %%v28, 0(%%r1,%2) \n\t" "vl %%v19, 48(%%r1,%2) \n\t"
"vl %%v29, 16(%%r1,%2) \n\t" "vl %%v28, 0(%%r1,%3) \n\t"
"vl %%v30, 32(%%r1,%2) \n\t" "vl %%v29, 16(%%r1,%3) \n\t"
"vl %%v31, 48(%%r1,%2) \n\t" "vl %%v30, 32(%%r1,%3) \n\t"
"vl %%v31, 48(%%r1,%3) \n\t"
"vpdi %%v20,%%v16,%%v16,4 \n\t" "vpdi %%v20,%%v16,%%v16,4 \n\t"
"vpdi %%v21,%%v17,%%v17,4 \n\t" "vpdi %%v21,%%v17,%%v17,4 \n\t"
"vpdi %%v22,%%v18,%%v18,4 \n\t" "vpdi %%v22,%%v18,%%v18,4 \n\t"
@ -68,14 +69,14 @@ static void __attribute__ ((noinline)) zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT
"vl %%v16, 64(%%r1,%1) \n\t" "vl %%v16, 64(%%r1,%2) \n\t"
"vl %%v17, 80(%%r1,%1) \n\t" "vl %%v17, 80(%%r1,%2) \n\t"
"vl %%v18, 96(%%r1,%1) \n\t" "vl %%v18, 96(%%r1,%2) \n\t"
"vl %%v19,112(%%r1,%1) \n\t" "vl %%v19,112(%%r1,%2) \n\t"
"vl %%v28, 64(%%r1,%2) \n\t" "vl %%v28, 64(%%r1,%3) \n\t"
"vl %%v29, 80(%%r1,%2) \n\t" "vl %%v29, 80(%%r1,%3) \n\t"
"vl %%v30, 96(%%r1,%2) \n\t" "vl %%v30, 96(%%r1,%3) \n\t"
"vl %%v31,112(%%r1,%2) \n\t" "vl %%v31,112(%%r1,%3) \n\t"
"vpdi %%v20,%%v16,%%v16,4 \n\t" "vpdi %%v20,%%v16,%%v16,4 \n\t"
"vpdi %%v21,%%v17,%%v17,4 \n\t" "vpdi %%v21,%%v17,%%v17,4 \n\t"
"vpdi %%v22,%%v18,%%v18,4 \n\t" "vpdi %%v22,%%v18,%%v18,4 \n\t"
@ -91,22 +92,24 @@ static void __attribute__ ((noinline)) zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT
"la %%r1,128(%%r1) \n\t" "la %%r1,128(%%r1) \n\t"
"brctg %%r0,1b \n\t" "brctg %1,1b \n\t"
"vfadb %%v24,%%v26,%%v24 \n\t" "vfadb %%v24,%%v26,%%v24 \n\t"
"vfadb %%v25,%%v25,%%v27 \n\t" "vfadb %%v25,%%v25,%%v27 \n\t"
"vsteg %%v24,0(%3),0 \n\t" "vsteg %%v24,0(%4),0 \n\t"
"vsteg %%v24,8(%3),1 \n\t" "vsteg %%v24,8(%4),1 \n\t"
"vsteg %%v25,16(%3),1 \n\t" "vsteg %%v25,16(%4),1 \n\t"
"vsteg %%v25,24(%3),0 \n\t" "vsteg %%v25,24(%4),0 \n\t"
: : "=m"(*d) ,"+&r"(n)
: "r"(n), "a"(x), "a"(y), "a"(d) : "a"(x), "a"(y), "a"(d)
: "cc", "memory","r0","r1","v16", : "cc", "r1","v16",
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
); );
} }
static __attribute__ ((noinline)) void zdot_kernel_8n(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) { #else
static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) {
BLASLONG register i = 0; BLASLONG register i = 0;
FLOAT dot[4] = {0.0, 0.0, 0.0, 0.0}; FLOAT dot[4] = {0.0, 0.0, 0.0, 0.0};
BLASLONG j = 0; BLASLONG j = 0;
@ -144,6 +147,8 @@ static __attribute__ ((noinline)) void zdot_kernel_8n(BLASLONG n, FLOAT *x, FLOA
} }
#endif
OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
BLASLONG i; BLASLONG i;
BLASLONG ix, iy; BLASLONG ix, iy;