Merge pull request #1426 from quickwritereader/develop
(Z13) Blas1 microkernels can be inlined by gcc. Refactoring, fixes, tunings
This commit is contained in:
commit
e4c71a799a
|
@ -39,19 +39,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
|
||||
static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) {
|
||||
FLOAT asum ;
|
||||
FLOAT asum ;
|
||||
__asm__ (
|
||||
"pfd 1, 0(%3) \n\t"
|
||||
"sllg %%r0,%2,3 \n\t"
|
||||
"agr %%r0,%3 \n\t"
|
||||
"pfd 1, 0(%[ptr_x]) \n\t"
|
||||
"sllg %%r0,%[n],3 \n\t"
|
||||
"agr %%r0,%[ptr_x] \n\t"
|
||||
"vzero %%v0 \n\t"
|
||||
"vzero %%v1 \n\t"
|
||||
"vzero %%v2 \n\t"
|
||||
"vzero %%v3 \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"pfd 1, 256(%1 ) \n\t"
|
||||
"vlm %%v24,%%v31, 0(%1 ) \n\t"
|
||||
"1: \n\t"
|
||||
"pfd 1, 256(%[ptr_temp] ) \n\t"
|
||||
"vlm %%v24,%%v31, 0(%[ptr_temp] ) \n\t"
|
||||
|
||||
"vflpdb %%v24, %%v24 \n\t"
|
||||
"vflpdb %%v25, %%v25 \n\t"
|
||||
|
@ -71,7 +71,7 @@ static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) {
|
|||
"vfadb %%v2,%%v2,%%v30 \n\t"
|
||||
"vfadb %%v3,%%v3,%%v31 \n\t"
|
||||
|
||||
"vlm %%v24,%%v31, 128(%1) \n\t"
|
||||
"vlm %%v24,%%v31, 128(%[ptr_temp]) \n\t"
|
||||
|
||||
"vflpdb %%v24, %%v24 \n\t"
|
||||
"vflpdb %%v25, %%v25 \n\t"
|
||||
|
@ -81,7 +81,7 @@ static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) {
|
|||
"vflpdb %%v29, %%v29 \n\t"
|
||||
"vflpdb %%v30, %%v30 \n\t"
|
||||
"vflpdb %%v31, %%v31 \n\t"
|
||||
"la %1,256(%1) \n\t"
|
||||
"la %[ptr_temp],256(%[ptr_temp]) \n\t"
|
||||
"vfadb %%v0,%%v0,%%v24 \n\t"
|
||||
"vfadb %%v1,%%v1,%%v25 \n\t"
|
||||
"vfadb %%v2,%%v2,%%v26 \n\t"
|
||||
|
@ -91,16 +91,16 @@ static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) {
|
|||
"vfadb %%v2,%%v2,%%v30 \n\t"
|
||||
"vfadb %%v3,%%v3,%%v31 \n\t"
|
||||
|
||||
"clgrjl %1,%%r0,1b \n\t"
|
||||
"clgrjl %[ptr_temp],%%r0,1b \n\t"
|
||||
"vfadb %%v24,%%v0,%%v1 \n\t"
|
||||
"vfadb %%v25,%%v2,%%v3 \n\t"
|
||||
"vfadb %%v0,%%v25,%%v24 \n\t"
|
||||
"vrepg %%v1,%%v0,1 \n\t"
|
||||
"adbr %%f0,%%f1 \n\t"
|
||||
"ldr %0,%%f0 \n\t"
|
||||
: "=f"(asum),"+&a"(x)
|
||||
: "r"(n), "1"(x)
|
||||
: "cc", "r0" ,"f0","f1","v0","v1","v2","v3","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
"ldr %[asum],%%f0 \n\t"
|
||||
: [asum] "=f"(asum),[ptr_temp] "+&a"(x)
|
||||
: [mem] "m"( *(const double (*)[n])x ), [n] "r"(n), [ptr_x] "a"(x)
|
||||
: "cc", "r0" ,"f0","f1","v0","v1","v2","v3","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
return asum;
|
||||
|
||||
|
|
|
@ -27,15 +27,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
|
||||
#include "common.h"
|
||||
#define Z13_D 1
|
||||
|
||||
#define PREFETCH_INS 1
|
||||
#if defined(Z13_A)
|
||||
#include <vecintrin.h>
|
||||
|
||||
static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||
static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha)
|
||||
{
|
||||
BLASLONG i = 0;
|
||||
__vector double v_a = {*alpha,*alpha};
|
||||
__vector double v_a = {alpha,alpha};
|
||||
__vector double * v_y=(__vector double *)y;
|
||||
__vector double * v_x=(__vector double *)x;
|
||||
|
||||
|
@ -60,256 +60,53 @@ static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
|||
}
|
||||
|
||||
}
|
||||
#elif defined(Z13_B)
|
||||
static void __attribute__ ((noinline)) daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||
{
|
||||
|
||||
|
||||
__asm__ volatile(
|
||||
#if defined(PREFETCH_INS)
|
||||
"pfd 1, 0(%1) \n\t"
|
||||
"pfd 2, 0(%2) \n\t"
|
||||
#endif
|
||||
"vlrepg %%v0 , 0(%3) \n\t"
|
||||
"srlg %3,%0,5 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
"vlr %%v1,%%v0 \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
#if defined(PREFETCH_INS)
|
||||
"pfd 1, 256(%%r1,%1) \n\t"
|
||||
"pfd 2, 256(%%r1,%2) \n\t"
|
||||
#endif
|
||||
|
||||
"vl %%v24, 0(%%r1,%2) \n\t"
|
||||
"vl %%v16, 0(%%r1,%1) \n\t"
|
||||
"vfmadb %%v16,%%v0,%%v16,%%v24 \n\t"
|
||||
"vst %%v16, 0(%%r1,%2) \n\t"
|
||||
"vl %%v25, 16(%%r1,%2) \n\t"
|
||||
"vl %%v17, 16(%%r1,%1) \n\t"
|
||||
"vfmadb %%v17,%%v0,%%v17,%%v25 \n\t"
|
||||
"vst %%v17, 16(%%r1,%2) \n\t"
|
||||
"vl %%v26, 32(%%r1,%2) \n\t"
|
||||
"vl %%v18, 32(%%r1,%1) \n\t"
|
||||
"vfmadb %%v18,%%v0,%%v18,%%v26 \n\t"
|
||||
"vst %%v18, 32(%%r1,%2) \n\t"
|
||||
"vl %%v27, 48(%%r1,%2) \n\t"
|
||||
"vl %%v19, 48(%%r1,%1) \n\t"
|
||||
"vfmadb %%v19,%%v0,%%v19,%%v27 \n\t"
|
||||
"vst %%v19, 48(%%r1,%2) \n\t"
|
||||
|
||||
"vl %%v24,( 0+64)(%%r1,%2) \n\t"
|
||||
"vl %%v16,( 0+64)(%%r1,%1) \n\t"
|
||||
"vfmadb %%v16,%%v0,%%v16,%%v24 \n\t"
|
||||
"vst %%v16,( 0+64)(%%r1,%2) \n\t"
|
||||
"vl %%v25, (16+64)(%%r1,%2) \n\t"
|
||||
"vl %%v17, (16+64)(%%r1,%1) \n\t"
|
||||
"vfmadb %%v17,%%v0,%%v17,%%v25 \n\t"
|
||||
"vst %%v17, (16+64)(%%r1,%2) \n\t"
|
||||
"vl %%v26, (32+64)(%%r1,%2) \n\t"
|
||||
"vl %%v18, (32+64)(%%r1,%1) \n\t"
|
||||
"vfmadb %%v18,%%v0,%%v18,%%v26 \n\t"
|
||||
"vst %%v18, (32+64)(%%r1,%2) \n\t"
|
||||
"vl %%v27, (48+64)(%%r1,%2) \n\t"
|
||||
"vl %%v19, (48+64)(%%r1,%1) \n\t"
|
||||
"vfmadb %%v19,%%v0,%%v19,%%v27 \n\t"
|
||||
"vst %%v19, (48+64)(%%r1,%2) \n\t"
|
||||
|
||||
"vl %%v24,( 0+128)(%%r1,%2) \n\t"
|
||||
"vl %%v16,( 0+128)(%%r1,%1) \n\t"
|
||||
"vfmadb %%v16,%%v0,%%v16,%%v24 \n\t"
|
||||
"vst %%v16,( 0+128)(%%r1,%2) \n\t"
|
||||
"vl %%v25, (16+128)(%%r1,%2) \n\t"
|
||||
"vl %%v17, (16+128)(%%r1,%1) \n\t"
|
||||
"vfmadb %%v17,%%v0,%%v17,%%v25 \n\t"
|
||||
"vst %%v17, (16+128)(%%r1,%2) \n\t"
|
||||
"vl %%v26, (32+128)(%%r1,%2) \n\t"
|
||||
"vl %%v18, (32+128)(%%r1,%1) \n\t"
|
||||
"vfmadb %%v18,%%v0,%%v18,%%v26 \n\t"
|
||||
"vst %%v18, (32+128)(%%r1,%2) \n\t"
|
||||
"vl %%v27, (48+128)(%%r1,%2) \n\t"
|
||||
"vl %%v19, (48+128)(%%r1,%1) \n\t"
|
||||
"vfmadb %%v19,%%v0,%%v19,%%v27 \n\t"
|
||||
"vst %%v19, (48+128)(%%r1,%2) \n\t"
|
||||
|
||||
"vl %%v24,( 0+192)(%%r1,%2) \n\t"
|
||||
"vl %%v16,( 0+192)(%%r1,%1) \n\t"
|
||||
"vfmadb %%v16,%%v0,%%v16,%%v24 \n\t"
|
||||
"vst %%v16,( 0+192)(%%r1,%2) \n\t"
|
||||
"vl %%v25, (16+192)(%%r1,%2) \n\t"
|
||||
"vl %%v17, (16+192)(%%r1,%1) \n\t"
|
||||
"vfmadb %%v17,%%v0,%%v17,%%v25 \n\t"
|
||||
"vst %%v17, (16+192)(%%r1,%2) \n\t"
|
||||
"vl %%v26, (32+192)(%%r1,%2) \n\t"
|
||||
"vl %%v18, (32+192)(%%r1,%1) \n\t"
|
||||
"vfmadb %%v18,%%v0,%%v18,%%v26 \n\t"
|
||||
"vst %%v18, (32+192)(%%r1,%2) \n\t"
|
||||
"vl %%v27, (48+192)(%%r1,%2) \n\t"
|
||||
"vl %%v19, (48+192)(%%r1,%1) \n\t"
|
||||
"vfmadb %%v19,%%v0,%%v19,%%v27 \n\t"
|
||||
"vst %%v19, (48+192)(%%r1,%2) \n\t"
|
||||
|
||||
|
||||
"la %%r1,256(%%r1) \n\t"
|
||||
"brctg %3,1b"
|
||||
:
|
||||
:"r"(n),"a"(x),"a"(y),"a"(alpha)
|
||||
:"cc", "memory", "r1" ,"v0" ,"v16","v17","v18","v19", "v24","v25","v26","v27"
|
||||
);
|
||||
}
|
||||
|
||||
#elif defined(Z13_C)
|
||||
static void __attribute__ ((noinline)) daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||
{
|
||||
|
||||
__asm__ volatile(
|
||||
#if defined(PREFETCH_INS)
|
||||
"pfd 1, 0(%1) \n\t"
|
||||
"pfd 2, 0(%2) \n\t"
|
||||
#endif
|
||||
"vlrepg %%v0 , 0(%3) \n\t"
|
||||
"srlg %3,%0,5 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
"vlr %%v1,%%v0 \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
#if defined(PREFETCH_INS)
|
||||
"pfd 1, 256(%%r1,%1) \n\t"
|
||||
"pfd 2, 256(%%r1,%2) \n\t"
|
||||
#endif
|
||||
"vl %%v16, 0(%%r1,%1) \n\t"
|
||||
"vl %%v17, 16(%%r1,%1) \n\t"
|
||||
"vl %%v18, 32(%%r1,%1) \n\t"
|
||||
"vl %%v19, 48(%%r1,%1) \n\t"
|
||||
|
||||
"vl %%v24, 0(%%r1,%2) \n\t"
|
||||
"vl %%v25, 16(%%r1,%2) \n\t"
|
||||
"vl %%v26, 32(%%r1,%2) \n\t"
|
||||
"vl %%v27, 48(%%r1,%2) \n\t"
|
||||
"vfmadb %%v16,%%v0,%%v16,%%v24 \n\t"
|
||||
"vfmadb %%v17,%%v1,%%v17,%%v25 \n\t"
|
||||
"vfmadb %%v18,%%v0,%%v18,%%v26 \n\t"
|
||||
"vfmadb %%v19,%%v1,%%v19,%%v27 \n\t"
|
||||
"vst %%v16, 0(%%r1,%2) \n\t"
|
||||
"vst %%v17, 16(%%r1,%2) \n\t"
|
||||
"vst %%v18, 32(%%r1,%2) \n\t"
|
||||
"vst %%v19, 48(%%r1,%2) \n\t"
|
||||
|
||||
"vl %%v24, 64(%%r1,%1) \n\t"
|
||||
"vl %%v25, 80(%%r1,%1) \n\t"
|
||||
"vl %%v26, 96(%%r1,%1) \n\t"
|
||||
"vl %%v27, 112(%%r1,%1) \n\t"
|
||||
|
||||
"vl %%v16, 64(%%r1,%2) \n\t"
|
||||
"vl %%v17, 80(%%r1,%2) \n\t"
|
||||
"vl %%v18, 96(%%r1,%2) \n\t"
|
||||
"vl %%v19, 112(%%r1,%2) \n\t"
|
||||
|
||||
|
||||
"vfmadb %%v24,%%v0,%%v24,%%v16 \n\t"
|
||||
"vfmadb %%v25,%%v1,%%v25,%%v17 \n\t"
|
||||
"vfmadb %%v26,%%v0,%%v26,%%v18 \n\t"
|
||||
"vfmadb %%v27,%%v1,%%v27,%%v19 \n\t"
|
||||
|
||||
"vst %%v24, 64(%%r1,%2) \n\t"
|
||||
"vst %%v25, 80(%%r1,%2) \n\t"
|
||||
"vst %%v26, 96(%%r1,%2) \n\t"
|
||||
"vst %%v27, 112(%%r1,%2) \n\t"
|
||||
|
||||
"vl %%v16, (0+128)(%%r1,%1) \n\t"
|
||||
"vl %%v17, (16+128)(%%r1,%1) \n\t"
|
||||
"vl %%v18, (32+128)(%%r1,%1) \n\t"
|
||||
"vl %%v19, (48+128)(%%r1,%1) \n\t"
|
||||
|
||||
"vl %%v24, (0+128)(%%r1,%2) \n\t"
|
||||
"vl %%v25, (16+128)(%%r1,%2) \n\t"
|
||||
"vl %%v26, (32+128)(%%r1,%2) \n\t"
|
||||
"vl %%v27, (48+128)(%%r1,%2) \n\t"
|
||||
|
||||
"vfmadb %%v16,%%v0,%%v16,%%v24 \n\t"
|
||||
"vfmadb %%v17,%%v1,%%v17,%%v25 \n\t"
|
||||
"vfmadb %%v18,%%v0,%%v18,%%v26 \n\t"
|
||||
"vfmadb %%v19,%%v1,%%v19,%%v27 \n\t"
|
||||
"vst %%v16, (0+128)(%%r1,%2) \n\t"
|
||||
"vst %%v17, (16+128)(%%r1,%2) \n\t"
|
||||
"vst %%v18, (32+128)(%%r1,%2) \n\t"
|
||||
"vst %%v19, (48+128)(%%r1,%2) \n\t"
|
||||
|
||||
"vl %%v24, (64+128)(%%r1,%1) \n\t"
|
||||
"vl %%v25, (80+128)(%%r1,%1) \n\t"
|
||||
"vl %%v26, (96+128)(%%r1,%1) \n\t"
|
||||
"vl %%v27, (112+128)(%%r1,%1) \n\t"
|
||||
|
||||
"vl %%v16, (64+128)(%%r1,%2) \n\t"
|
||||
"vl %%v17, (80+128)(%%r1,%2) \n\t"
|
||||
"vl %%v18, (96+128)(%%r1,%2) \n\t"
|
||||
"vl %%v19, (112+128)(%%r1,%2) \n\t"
|
||||
|
||||
"vfmadb %%v24,%%v0,%%v24,%%v16 \n\t"
|
||||
"vfmadb %%v25,%%v1,%%v25,%%v17 \n\t"
|
||||
"vfmadb %%v26,%%v0,%%v26,%%v18 \n\t"
|
||||
"vfmadb %%v27,%%v1,%%v27,%%v19 \n\t"
|
||||
|
||||
"vst %%v24, (64+128)(%%r1,%2) \n\t"
|
||||
"vst %%v25, (80+128)(%%r1,%2) \n\t"
|
||||
"vst %%v26, (96+128)(%%r1,%2) \n\t"
|
||||
"vst %%v27, (112+128)(%%r1,%2) \n\t"
|
||||
|
||||
"la %%r1,256(%%r1) \n\t"
|
||||
"brctg %3,1b"
|
||||
:
|
||||
:"r"(n),"a"(x),"a"(y),"a"(alpha)
|
||||
:"cc", "memory", "r1" ,"v0","v1","v16","v17","v18","v19", "v24","v25","v26","v27"
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
#elif defined(Z13_D)
|
||||
static void __attribute__ ((noinline)) daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||
#else
|
||||
static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha)
|
||||
{
|
||||
|
||||
__asm__ volatile(
|
||||
#if defined(PREFETCH_INS)
|
||||
"pfd 1, 0(%1) \n\t"
|
||||
"pfd 2, 0(%2) \n\t"
|
||||
"pfd 1, 0(%[x_tmp]) \n\t"
|
||||
"pfd 2, 0(%[y_tmp]) \n\t"
|
||||
#endif
|
||||
"vlrepg %%v0 , 0(%3) \n\t"
|
||||
"srlg %3,%0,5 \n\t"
|
||||
"vlr %%v1,%%v0 \n\t"
|
||||
"lgdr %%r0,%[alpha] \n\t"
|
||||
"vlvgp %%v0,%%r0,%%r0 \n\t"
|
||||
"srlg %%r0,%[n],5 \n\t"
|
||||
"vlr %%v1,%%v0 \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
#if defined(PREFETCH_INS)
|
||||
"pfd 1, 256(%1) \n\t"
|
||||
"pfd 2, 256(%2) \n\t"
|
||||
"pfd 1, 256(%[x_tmp]) \n\t"
|
||||
"pfd 2, 256(%[y_tmp]) \n\t"
|
||||
#endif
|
||||
"vlm %%v16,%%v23, 0(%1) \n\t"
|
||||
"vlm %%v24, %%v31, 0(%2) \n\t"
|
||||
"vfmadb %%v16,%%v0,%%v16,%%v24 \n\t"
|
||||
"vfmadb %%v17,%%v1,%%v17,%%v25 \n\t"
|
||||
"vfmadb %%v18,%%v0,%%v18,%%v26 \n\t"
|
||||
"vfmadb %%v19,%%v1,%%v19,%%v27 \n\t"
|
||||
"vfmadb %%v20,%%v0,%%v20,%%v28 \n\t"
|
||||
"vfmadb %%v21,%%v1,%%v21,%%v29 \n\t"
|
||||
"vfmadb %%v22,%%v0,%%v22,%%v30 \n\t"
|
||||
"vfmadb %%v23,%%v1,%%v23,%%v31 \n\t"
|
||||
"vstm %%v16,%%v23, 0(%2) \n\t"
|
||||
"vlm %%v24,%%v31, 128(%1) \n\t"
|
||||
"vlm %%v16,%%v23, 128(%2) \n\t"
|
||||
"vfmadb %%v24,%%v0,%%v24,%%v16 \n\t"
|
||||
"vfmadb %%v25,%%v1,%%v25,%%v17 \n\t"
|
||||
"vfmadb %%v26,%%v0,%%v26,%%v18 \n\t"
|
||||
"vfmadb %%v27,%%v1,%%v27,%%v19 \n\t"
|
||||
"vfmadb %%v28,%%v0,%%v28,%%v20 \n\t"
|
||||
"vfmadb %%v29,%%v1,%%v29,%%v21 \n\t"
|
||||
"vfmadb %%v30,%%v0,%%v30,%%v22 \n\t"
|
||||
"vfmadb %%v31,%%v1,%%v31,%%v23 \n\t"
|
||||
"la %1,256(%1) \n\t"
|
||||
"vstm %%v24, %%v31, 128(%2) \n\t"
|
||||
"la %2,256(%2) \n\t"
|
||||
"brctg %3,1b"
|
||||
:
|
||||
:"r"(n),"a"(x),"a"(y),"a"(alpha)
|
||||
:"cc", "memory", "v0","v1","v16","v17","v18","v19","v20","v21",
|
||||
"vlm %%v16,%%v23, 0(%[x_tmp]) \n\t"
|
||||
"vlm %%v24, %%v31, 0(%[y_tmp]) \n\t"
|
||||
"vfmadb %%v16,%%v0,%%v16,%%v24 \n\t"
|
||||
"vfmadb %%v17,%%v1,%%v17,%%v25 \n\t"
|
||||
"vfmadb %%v18,%%v0,%%v18,%%v26 \n\t"
|
||||
"vfmadb %%v19,%%v1,%%v19,%%v27 \n\t"
|
||||
"vfmadb %%v20,%%v0,%%v20,%%v28 \n\t"
|
||||
"vfmadb %%v21,%%v1,%%v21,%%v29 \n\t"
|
||||
"vfmadb %%v22,%%v0,%%v22,%%v30 \n\t"
|
||||
"vfmadb %%v23,%%v1,%%v23,%%v31 \n\t"
|
||||
"vstm %%v16,%%v23, 0(%[y_tmp]) \n\t"
|
||||
"vlm %%v24,%%v31, 128(%[x_tmp]) \n\t"
|
||||
"vlm %%v16,%%v23, 128(%[y_tmp]) \n\t"
|
||||
"vfmadb %%v24,%%v0,%%v24,%%v16 \n\t"
|
||||
"vfmadb %%v25,%%v1,%%v25,%%v17 \n\t"
|
||||
"vfmadb %%v26,%%v0,%%v26,%%v18 \n\t"
|
||||
"vfmadb %%v27,%%v1,%%v27,%%v19 \n\t"
|
||||
"vfmadb %%v28,%%v0,%%v28,%%v20 \n\t"
|
||||
"vfmadb %%v29,%%v1,%%v29,%%v21 \n\t"
|
||||
"vfmadb %%v30,%%v0,%%v30,%%v22 \n\t"
|
||||
"vfmadb %%v31,%%v1,%%v31,%%v23 \n\t"
|
||||
"la %[x_tmp],256(%[x_tmp]) \n\t"
|
||||
"vstm %%v24, %%v31, 128(%[y_tmp]) \n\t"
|
||||
"la %[y_tmp],256(%[y_tmp]) \n\t"
|
||||
"brctg %%r0,1b"
|
||||
: [mem_y] "+m" (*(double (*)[n])y), [x_tmp] "+&a"(x), [y_tmp] "+&a"(y)
|
||||
: [mem_x] "m" (*(const double (*)[n])x), [n] "r"(n), [alpha] "f"(alpha)
|
||||
:"cc", "r0", "v0","v1","v16","v17","v18","v19","v20","v21",
|
||||
"v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
|
||||
|
@ -334,7 +131,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
|
|||
BLASLONG n1 = n & -32;
|
||||
|
||||
if ( n1 )
|
||||
daxpy_kernel_32(n1, x, y , &da );
|
||||
daxpy_kernel_32(n1, x, y , da );
|
||||
|
||||
i = n1;
|
||||
while(i < n)
|
||||
|
|
|
@ -30,83 +30,84 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#if defined(Z13mvc)
|
||||
|
||||
static void __attribute__ ((noinline)) dcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) {
|
||||
static void dcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) {
|
||||
|
||||
__asm__ volatile(
|
||||
"pfd 1, 0(%1) \n\t"
|
||||
"pfd 2, 0(%2) \n\t"
|
||||
"srlg %%r0,%0,5 \n\t"
|
||||
"pfd 1, 0(%[ptr_x]) \n\t"
|
||||
"pfd 2, 0(%[ptr_y]) \n\t"
|
||||
"srlg %[n_tmp],%[n_tmp],5 \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"mvc 0(256,%2),0(%1) \n\t"
|
||||
"la %1,256(%1) \n\t"
|
||||
"la %2,256(%2) \n\t"
|
||||
"brctg %%r0,1b"
|
||||
:
|
||||
: "r"(n), "a"(x), "a"(y)
|
||||
: "cc", "memory","r0"
|
||||
"mvc 0(256,%[ptr_y]),0(%[ptr_x]) \n\t"
|
||||
"la %[ptr_x],256(%[ptr_x]) \n\t"
|
||||
"la %[ptr_y],256(%[ptr_y]) \n\t"
|
||||
"brctg %[n_tmp],1b"
|
||||
: [mem_y] "=m" (*(double (*)[n])y), [n_tmp] "+&r"(n)
|
||||
: [mem_x] "m" (*(const double (*)[n])x),
|
||||
[ptr_x] "a"(x), [ptr_y] "a"(y)
|
||||
: "cc"
|
||||
);
|
||||
return;
|
||||
|
||||
}
|
||||
#else
|
||||
|
||||
static void __attribute__ ((noinline)) dcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) {
|
||||
static void dcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) {
|
||||
|
||||
__asm__ volatile(
|
||||
"pfd 1, 0(%1) \n\t"
|
||||
"pfd 2, 0(%2) \n\t"
|
||||
"srlg %%r0,%0,5 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
"pfd 1, 0(%[ptr_x]) \n\t"
|
||||
"pfd 2, 0(%[ptr_y]) \n\t"
|
||||
"srlg %[n_tmp],%[n_tmp],5 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"pfd 1, 256(%%r1,%1) \n\t"
|
||||
"pfd 2, 256(%%r1,%2) \n\t"
|
||||
"1: \n\t"
|
||||
"pfd 1, 256(%%r1,%[ptr_x]) \n\t"
|
||||
"pfd 2, 256(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vl %%v24, 0(%%r1,%1) \n\t"
|
||||
"vst %%v24, 0(%%r1,%2) \n\t"
|
||||
"vl %%v25, 16(%%r1,%1) \n\t"
|
||||
"vst %%v25, 16(%%r1,%2) \n\t"
|
||||
"vl %%v26, 32(%%r1,%1) \n\t"
|
||||
"vst %%v26, 32(%%r1,%2) \n\t"
|
||||
"vl %%v27, 48(%%r1,%1) \n\t"
|
||||
"vst %%v27, 48(%%r1,%2) \n\t"
|
||||
"vl %%v24, 0(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v24, 0(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v25, 16(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v25, 16(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v26, 32(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v26, 32(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v27, 48(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v27, 48(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vl %%v24, 64(%%r1,%1) \n\t"
|
||||
"vst %%v24, 64(%%r1,%2) \n\t"
|
||||
"vl %%v25, 80(%%r1,%1) \n\t"
|
||||
"vst %%v25, 80(%%r1,%2) \n\t"
|
||||
"vl %%v26, 96(%%r1,%1) \n\t"
|
||||
"vst %%v26, 96(%%r1,%2) \n\t"
|
||||
"vl %%v27, 112(%%r1,%1) \n\t"
|
||||
"vst %%v27, 112(%%r1,%2) \n\t"
|
||||
"vl %%v24, 64(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v24, 64(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v25, 80(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v25, 80(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v26, 96(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v26, 96(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v27, 112(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v27, 112(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
|
||||
"vl %%v24, 128(%%r1,%1) \n\t"
|
||||
"vst %%v24, 128(%%r1,%2) \n\t"
|
||||
"vl %%v24, 128(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v24, 128(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vl %%v25, 144(%%r1,%1) \n\t"
|
||||
"vst %%v25, 144(%%r1,%2) \n\t"
|
||||
"vl %%v25, 144(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v25, 144(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vl %%v26, 160(%%r1,%1) \n\t"
|
||||
"vst %%v26, 160(%%r1,%2) \n\t"
|
||||
"vl %%v26, 160(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v26, 160(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vl %%v27, 176(%%r1,%1) \n\t"
|
||||
"vst %%v27, 176(%%r1,%2) \n\t"
|
||||
"vl %%v27, 176(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v27, 176(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vl %%v24, 192(%%r1,%1) \n\t"
|
||||
"vst %%v24, 192(%%r1,%2) \n\t"
|
||||
"vl %%v25, 208(%%r1,%1) \n\t"
|
||||
"vst %%v25, 208(%%r1,%2) \n\t"
|
||||
"vl %%v26, 224(%%r1,%1) \n\t"
|
||||
"vst %%v26, 224(%%r1,%2) \n\t"
|
||||
"vl %%v27, 240(%%r1,%1) \n\t"
|
||||
"vst %%v27, 240(%%r1,%2) \n\t"
|
||||
"la %%r1,256(%%r1) \n\t"
|
||||
"brctg %%r0,1b"
|
||||
:
|
||||
: "r"(n), "a"(x), "a"(y)
|
||||
: "cc", "memory","r0","r1", "v24","v25","v26","v27"
|
||||
"vl %%v24, 192(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v24, 192(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v25, 208(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v25, 208(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v26, 224(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v26, 224(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v27, 240(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v27, 240(%%r1,%[ptr_y]) \n\t"
|
||||
"la %%r1,256(%%r1) \n\t"
|
||||
"brctg %[n_tmp],1b"
|
||||
: [mem_y] "=m" (*(double (*)[n])y), [n_tmp] "+&r"(n)
|
||||
: [mem_x] "m" (*(const double (*)[n])x), [ptr_x] "a"(x), [ptr_y] "a"(y)
|
||||
: "cc", "r1", "v24","v25","v26","v27"
|
||||
);
|
||||
return;
|
||||
|
||||
|
|
|
@ -30,65 +30,67 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
|
||||
#if defined(Z13)
|
||||
static FLOAT ddot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y)
|
||||
static FLOAT ddot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
|
||||
{
|
||||
FLOAT dot;
|
||||
__asm__ volatile(
|
||||
"pfd 1, 0(%2) \n\t"
|
||||
"pfd 1, 0(%3) \n\t"
|
||||
"pfd 1, 0(%[ptr_x_tmp]) \n\t"
|
||||
"pfd 1, 0(%[ptr_y_tmp]) \n\t"
|
||||
"vzero %%v24 \n\t"
|
||||
"vzero %%v25 \n\t"
|
||||
"vzero %%v26 \n\t"
|
||||
"vzero %%v27 \n\t"
|
||||
"srlg %1,%1,4 \n\t"
|
||||
"srlg %[n_tmp],%[n_tmp],4 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"pfd 1, 256(%%r1,%2) \n\t"
|
||||
"pfd 1, 256(%%r1,%3) \n\t"
|
||||
"vl %%v16, 0(%%r1,%2) \n\t"
|
||||
"vl %%v17, 16(%%r1,%2) \n\t"
|
||||
"vl %%v18, 32(%%r1,%2) \n\t"
|
||||
"vl %%v19, 48(%%r1,%2) \n\t"
|
||||
"1: \n\t"
|
||||
"pfd 1, 256(%%r1,%[ptr_x_tmp]) \n\t"
|
||||
"pfd 1, 256(%%r1,%[ptr_y_tmp]) \n\t"
|
||||
"vl %%v16, 0(%%r1,%[ptr_x_tmp]) \n\t"
|
||||
"vl %%v17, 16(%%r1,%[ptr_x_tmp]) \n\t"
|
||||
"vl %%v18, 32(%%r1,%[ptr_x_tmp]) \n\t"
|
||||
"vl %%v19, 48(%%r1,%[ptr_x_tmp]) \n\t"
|
||||
|
||||
"vl %%v28, 0(%%r1,%3) \n\t"
|
||||
"vfmadb %%v24,%%v16,%%v28,%%v24 \n\t"
|
||||
"vl %%v29, 16(%%r1,%3) \n\t"
|
||||
"vfmadb %%v25,%%v17,%%v29,%%v25 \n\t"
|
||||
"vl %%v28, 0(%%r1,%[ptr_y_tmp]) \n\t"
|
||||
"vfmadb %%v24,%%v16,%%v28,%%v24 \n\t"
|
||||
"vl %%v29, 16(%%r1,%[ptr_y_tmp]) \n\t"
|
||||
"vfmadb %%v25,%%v17,%%v29,%%v25 \n\t"
|
||||
|
||||
"vl %%v30, 32(%%r1,%3) \n\t"
|
||||
"vfmadb %%v26,%%v18,%%v30,%%v26 \n\t"
|
||||
"vl %%v31, 48(%%r1,%3) \n\t"
|
||||
"vfmadb %%v27,%%v19,%%v31,%%v27 \n\t"
|
||||
"vl %%v30, 32(%%r1,%[ptr_y_tmp]) \n\t"
|
||||
"vfmadb %%v26,%%v18,%%v30,%%v26 \n\t"
|
||||
"vl %%v31, 48(%%r1,%[ptr_y_tmp]) \n\t"
|
||||
"vfmadb %%v27,%%v19,%%v31,%%v27 \n\t"
|
||||
|
||||
"vl %%v16, 64(%%r1,%2) \n\t"
|
||||
"vl %%v17, 80(%%r1,%2) \n\t"
|
||||
"vl %%v18, 96(%%r1,%2) \n\t"
|
||||
"vl %%v19, 112(%%r1,%2) \n\t"
|
||||
"vl %%v16, 64(%%r1 ,%[ptr_x_tmp]) \n\t"
|
||||
"vl %%v17, 80(%%r1,%[ptr_x_tmp]) \n\t"
|
||||
"vl %%v18, 96(%%r1,%[ptr_x_tmp]) \n\t"
|
||||
"vl %%v19, 112(%%r1,%[ptr_x_tmp]) \n\t"
|
||||
|
||||
"vl %%v28, 64(%%r1,%3) \n\t"
|
||||
"vfmadb %%v24,%%v16,%%v28,%%v24 \n\t"
|
||||
"vl %%v29, 80(%%r1,%3) \n\t"
|
||||
"vfmadb %%v25,%%v17,%%v29,%%v25 \n\t"
|
||||
"vl %%v28, 64(%%r1,%[ptr_y_tmp]) \n\t"
|
||||
"vfmadb %%v24,%%v16,%%v28,%%v24 \n\t"
|
||||
"vl %%v29, 80(%%r1,%[ptr_y_tmp]) \n\t"
|
||||
"vfmadb %%v25,%%v17,%%v29,%%v25 \n\t"
|
||||
|
||||
|
||||
"vl %%v30, 96(%%r1,%3) \n\t"
|
||||
"vfmadb %%v26,%%v18,%%v30,%%v26 \n\t"
|
||||
"vl %%v31, 112(%%r1,%3) \n\t"
|
||||
"vfmadb %%v27,%%v19,%%v31,%%v27 \n\t"
|
||||
"vl %%v30, 96(%%r1,%[ptr_y_tmp]) \n\t"
|
||||
"vfmadb %%v26,%%v18,%%v30,%%v26 \n\t"
|
||||
"vl %%v31, 112(%%r1,%[ptr_y_tmp]) \n\t"
|
||||
"vfmadb %%v27,%%v19,%%v31,%%v27 \n\t"
|
||||
|
||||
|
||||
"la %%r1,128(%%r1) \n\t"
|
||||
"brctg %1,1b \n\t"
|
||||
"vfadb %%v24,%%v25,%%v24 \n\t"
|
||||
"vfadb %%v24,%%v26,%%v24 \n\t"
|
||||
"vfadb %%v24,%%v27,%%v24 \n\t"
|
||||
"vrepg %%v1,%%v24,1 \n\t"
|
||||
"vfadb %%v1,%%v24,%%v1 \n\t"
|
||||
"ldr %0, %%f1 \n\t"
|
||||
: "=f"(dot) ,"+&r"(n)
|
||||
: "a"(x),"a"(y)
|
||||
:"cc" , "r1","v16", "v17","v18","v19","v20","v21","v22","v23",
|
||||
"la %%r1,128(%%r1) \n\t"
|
||||
"brctg %[n_tmp],1b \n\t"
|
||||
"vfadb %%v24,%%v25,%%v24 \n\t"
|
||||
"vfadb %%v24,%%v26,%%v24 \n\t"
|
||||
"vfadb %%v24,%%v27,%%v24 \n\t"
|
||||
"vrepg %%v1,%%v24,1 \n\t"
|
||||
"vfadb %%v1,%%v24,%%v1 \n\t"
|
||||
"ldr %[dot], %%f1 \n\t"
|
||||
: [dot] "=f"(dot) ,[n_tmp] "+&r"(n)
|
||||
: [mem_x] "m"( *(const double (*)[n])x),
|
||||
[mem_y] "m"( *(const double (*)[n])y),
|
||||
[ptr_x_tmp]"a"(x), [ptr_y_tmp] "a"(y)
|
||||
:"cc" , "r1","f1","v16", "v17","v18","v19","v20","v21","v22","v23",
|
||||
"v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
|
||||
);
|
||||
|
@ -99,14 +101,14 @@ static FLOAT ddot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y)
|
|||
|
||||
#else
|
||||
|
||||
static FLOAT ddot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y )
|
||||
static FLOAT ddot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y )
|
||||
{
|
||||
BLASLONG register i = 0;
|
||||
FLOAT dot = 0.0;
|
||||
|
||||
while(i < n)
|
||||
{
|
||||
dot += y[i] * x[i]
|
||||
dot += y[i] * x[i]
|
||||
+ y[i+1] * x[i+1]
|
||||
+ y[i+2] * x[i+2]
|
||||
+ y[i+3] * x[i+3]
|
||||
|
@ -114,8 +116,17 @@ static FLOAT ddot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y )
|
|||
+ y[i+5] * x[i+5]
|
||||
+ y[i+6] * x[i+6]
|
||||
+ y[i+7] * x[i+7] ;
|
||||
dot += y[i+8] * x[i+8]
|
||||
+ y[i+9] * x[i+9]
|
||||
+ y[i+10] * x[i+10]
|
||||
+ y[i+11] * x[i+11]
|
||||
+ y[i+12] * x[i+12]
|
||||
+ y[i+13] * x[i+13]
|
||||
+ y[i+14] * x[i+14]
|
||||
+ y[i+15] * x[i+15] ;
|
||||
|
||||
i+=8 ;
|
||||
|
||||
i+=16 ;
|
||||
|
||||
}
|
||||
return dot;
|
||||
|
@ -138,10 +149,12 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
|||
|
||||
BLASLONG n1 = n & -16;
|
||||
|
||||
if ( n1 )
|
||||
dot = ddot_kernel_8(n1, x, y );
|
||||
if ( n1 ){
|
||||
dot = ddot_kernel_16(n1, x, y );
|
||||
i = n1;
|
||||
}
|
||||
|
||||
|
||||
i = n1;
|
||||
while(i < n)
|
||||
{
|
||||
|
||||
|
|
|
@ -25,43 +25,40 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
|||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include "common.h"
|
||||
|
||||
|
||||
|
||||
|
||||
static void __attribute__ ((noinline)) drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
|
||||
static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT cosA, FLOAT sinA)
|
||||
{
|
||||
__asm__ (
|
||||
"pfd 2, 0(%1) \n\t"
|
||||
"pfd 2, 0(%2) \n\t"
|
||||
|
||||
"vlrepg %%v0,0(%3) \n\t"
|
||||
"vlrepg %%v1,0(%4) \n\t"
|
||||
"srlg %%r0,%0,5 \n\t"
|
||||
"pfd 2, 0(%[ptr_x]) \n\t"
|
||||
"pfd 2, 0(%[ptr_y]) \n\t"
|
||||
"lgdr %%r1,%[cos] \n\t"
|
||||
"vlvgp %%v0,%%r1,%%r1 \n\t"
|
||||
"lgdr %%r1,%[sin] \n\t"
|
||||
"vlvgp %%v1,%%r1,%%r1 \n\t"
|
||||
"srlg %[n_tmp],%[n_tmp],5 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"pfd 2, 256(%%r1,%1) \n\t"
|
||||
"pfd 2, 256(%%r1,%2) \n\t"
|
||||
"vl %%v24, 0(%%r1,%1) \n\t"
|
||||
"vl %%v25, 16(%%r1,%1) \n\t"
|
||||
"vl %%v26, 32(%%r1,%1) \n\t"
|
||||
"vl %%v27, 48(%%r1,%1) \n\t"
|
||||
"vl %%v16, 0(%%r1,%2) \n\t"
|
||||
"vl %%v17, 16(%%r1,%2) \n\t"
|
||||
"vl %%v18, 32(%%r1,%2) \n\t"
|
||||
"vl %%v19, 48(%%r1,%2) \n\t"
|
||||
"1: \n\t"
|
||||
"pfd 2, 256(%%r1,%[ptr_x]) \n\t"
|
||||
"pfd 2, 256(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v24, 0(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v25, 16(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v26, 32(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v27, 48(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v16, 0(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v17, 16(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v18, 32(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v19, 48(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vfmdb %%v28,%%v24,%%v0 \n\t"
|
||||
"vfmdb %%v29,%%v25,%%v0 \n\t"
|
||||
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v30,%%v26,%%v0 \n\t"
|
||||
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v31,%%v27,%%v0 \n\t"
|
||||
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v28,%%v24,%%v0 \n\t"
|
||||
"vfmdb %%v29,%%v25,%%v0 \n\t"
|
||||
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v30,%%v26,%%v0 \n\t"
|
||||
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v31,%%v27,%%v0 \n\t"
|
||||
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
|
||||
/* 2nd parts*/
|
||||
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
|
||||
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
|
||||
|
@ -72,34 +69,32 @@ static void __attribute__ ((noinline)) drot_kernel_32(BLASLONG n, FLOAT *x, FLO
|
|||
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
|
||||
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
|
||||
|
||||
"vst %%v28, 0(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v29, 16(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v30, 32(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v31, 48(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v20, 0(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v21, 16(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v22, 32(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v23, 48(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vl %%v24, 64(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v25, 80(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v26, 96(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v27, 112(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v16, 64(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v17, 80(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v18, 96(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v19, 112(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vst %%v28, 0(%%r1,%1) \n\t"
|
||||
"vst %%v29, 16(%%r1,%1) \n\t"
|
||||
"vst %%v30, 32(%%r1,%1) \n\t"
|
||||
"vst %%v31, 48(%%r1,%1) \n\t"
|
||||
"vst %%v20, 0(%%r1,%2) \n\t"
|
||||
"vst %%v21, 16(%%r1,%2) \n\t"
|
||||
"vst %%v22, 32(%%r1,%2) \n\t"
|
||||
"vst %%v23, 48(%%r1,%2) \n\t"
|
||||
|
||||
"vl %%v24, 64(%%r1,%1) \n\t"
|
||||
"vl %%v25, 80(%%r1,%1) \n\t"
|
||||
"vl %%v26, 96(%%r1,%1) \n\t"
|
||||
"vl %%v27, 112(%%r1,%1) \n\t"
|
||||
"vl %%v16, 64(%%r1,%2) \n\t"
|
||||
"vl %%v17, 80(%%r1,%2) \n\t"
|
||||
"vl %%v18, 96(%%r1,%2) \n\t"
|
||||
"vl %%v19, 112(%%r1,%2) \n\t"
|
||||
|
||||
"vfmdb %%v28,%%v24,%%v0 \n\t"
|
||||
"vfmdb %%v29,%%v25,%%v0 \n\t"
|
||||
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v30,%%v26,%%v0 \n\t"
|
||||
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v31,%%v27,%%v0 \n\t"
|
||||
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v28,%%v24,%%v0 \n\t"
|
||||
"vfmdb %%v29,%%v25,%%v0 \n\t"
|
||||
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v30,%%v26,%%v0 \n\t"
|
||||
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v31,%%v27,%%v0 \n\t"
|
||||
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
|
||||
/* 2nd parts*/
|
||||
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
|
||||
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
|
||||
|
@ -110,34 +105,32 @@ static void __attribute__ ((noinline)) drot_kernel_32(BLASLONG n, FLOAT *x, FLO
|
|||
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
|
||||
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
|
||||
|
||||
"vst %%v28, 64(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v29, 80(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v30, 96(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v31, 112(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v20, 64(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v21, 80(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v22, 96(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v23, 112(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vl %%v24, 128(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v25, 144(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v26, 160(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v27, 176(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v16, 128(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v17, 144(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v18, 160(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v19, 176(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vst %%v28, 64(%%r1,%1) \n\t"
|
||||
"vst %%v29, 80(%%r1,%1) \n\t"
|
||||
"vst %%v30, 96(%%r1,%1) \n\t"
|
||||
"vst %%v31, 112(%%r1,%1) \n\t"
|
||||
"vst %%v20, 64(%%r1,%2) \n\t"
|
||||
"vst %%v21, 80(%%r1,%2) \n\t"
|
||||
"vst %%v22, 96(%%r1,%2) \n\t"
|
||||
"vst %%v23, 112(%%r1,%2) \n\t"
|
||||
|
||||
"vl %%v24, 128(%%r1,%1) \n\t"
|
||||
"vl %%v25, 144(%%r1,%1) \n\t"
|
||||
"vl %%v26, 160(%%r1,%1) \n\t"
|
||||
"vl %%v27, 176(%%r1,%1) \n\t"
|
||||
"vl %%v16, 128(%%r1,%2) \n\t"
|
||||
"vl %%v17, 144(%%r1,%2) \n\t"
|
||||
"vl %%v18, 160(%%r1,%2) \n\t"
|
||||
"vl %%v19, 176(%%r1,%2) \n\t"
|
||||
|
||||
"vfmdb %%v28,%%v24,%%v0 \n\t"
|
||||
"vfmdb %%v29,%%v25,%%v0 \n\t"
|
||||
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v30,%%v26,%%v0 \n\t"
|
||||
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v31,%%v27,%%v0 \n\t"
|
||||
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v28,%%v24,%%v0 \n\t"
|
||||
"vfmdb %%v29,%%v25,%%v0 \n\t"
|
||||
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v30,%%v26,%%v0 \n\t"
|
||||
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v31,%%v27,%%v0 \n\t"
|
||||
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
|
||||
/* 2nd parts*/
|
||||
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
|
||||
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
|
||||
|
@ -148,34 +141,32 @@ static void __attribute__ ((noinline)) drot_kernel_32(BLASLONG n, FLOAT *x, FLO
|
|||
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
|
||||
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
|
||||
|
||||
"vst %%v28, 128(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v29, 144(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v30, 160(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v31, 176(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v20, 128(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v21, 144(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v22, 160(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v23, 176(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vl %%v24, 192(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v25, 208(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v26, 224(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v27, 240(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v16, 192(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v17, 208(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v18, 224(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v19, 240(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vst %%v28, 128(%%r1,%1) \n\t"
|
||||
"vst %%v29, 144(%%r1,%1) \n\t"
|
||||
"vst %%v30, 160(%%r1,%1) \n\t"
|
||||
"vst %%v31, 176(%%r1,%1) \n\t"
|
||||
"vst %%v20, 128(%%r1,%2) \n\t"
|
||||
"vst %%v21, 144(%%r1,%2) \n\t"
|
||||
"vst %%v22, 160(%%r1,%2) \n\t"
|
||||
"vst %%v23, 176(%%r1,%2) \n\t"
|
||||
|
||||
"vl %%v24, 192(%%r1,%1) \n\t"
|
||||
"vl %%v25, 208(%%r1,%1) \n\t"
|
||||
"vl %%v26, 224(%%r1,%1) \n\t"
|
||||
"vl %%v27, 240(%%r1,%1) \n\t"
|
||||
"vl %%v16, 192(%%r1,%2) \n\t"
|
||||
"vl %%v17, 208(%%r1,%2) \n\t"
|
||||
"vl %%v18, 224(%%r1,%2) \n\t"
|
||||
"vl %%v19, 240(%%r1,%2) \n\t"
|
||||
|
||||
"vfmdb %%v28,%%v24,%%v0 \n\t"
|
||||
"vfmdb %%v29,%%v25,%%v0 \n\t"
|
||||
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v30,%%v26,%%v0 \n\t"
|
||||
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v31,%%v27,%%v0 \n\t"
|
||||
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v28,%%v24,%%v0 \n\t"
|
||||
"vfmdb %%v29,%%v25,%%v0 \n\t"
|
||||
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v30,%%v26,%%v0 \n\t"
|
||||
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v31,%%v27,%%v0 \n\t"
|
||||
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
|
||||
/* 2nd parts*/
|
||||
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
|
||||
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
|
||||
|
@ -186,33 +177,28 @@ static void __attribute__ ((noinline)) drot_kernel_32(BLASLONG n, FLOAT *x, FLO
|
|||
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
|
||||
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
|
||||
|
||||
"vst %%v28, 192(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v29, 208(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v30, 224(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v31, 240(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v20, 192(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v21, 208(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v22, 224(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v23, 240(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
|
||||
"vst %%v28, 192(%%r1,%1) \n\t"
|
||||
"vst %%v29, 208(%%r1,%1) \n\t"
|
||||
"vst %%v30, 224(%%r1,%1) \n\t"
|
||||
"vst %%v31, 240(%%r1,%1) \n\t"
|
||||
"vst %%v20, 192(%%r1,%2) \n\t"
|
||||
"vst %%v21, 208(%%r1,%2) \n\t"
|
||||
"vst %%v22, 224(%%r1,%2) \n\t"
|
||||
"vst %%v23, 240(%%r1,%2) \n\t"
|
||||
|
||||
|
||||
|
||||
"la %%r1,256(%%r1) \n\t"
|
||||
"brctg %%r0,1b"
|
||||
:
|
||||
: "r"(n), "a"(x), "a"(y),"a"(c),"a"(s)
|
||||
: "cc", "memory","r0","r1" ,"v0","v1","v16",
|
||||
"la %%r1,256(%%r1) \n\t"
|
||||
"brctg %[n_tmp],1b"
|
||||
: [mem_x] "+m" (*(double (*)[n])x),
|
||||
[mem_y] "+m" (*(double (*)[n])y),
|
||||
[n_tmp] "+&r"(n)
|
||||
: [ptr_x] "a"(x), [ptr_y] "a"(y),[cos] "f"(cosA),[sin] "f"(sinA)
|
||||
: "cc", "r1" ,"v0","v1","v16",
|
||||
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
return;
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
|
||||
{
|
||||
BLASLONG i=0;
|
||||
|
@ -228,10 +214,8 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
|
|||
BLASLONG n1 = n & -32;
|
||||
if ( n1 > 0 )
|
||||
{
|
||||
FLOAT cosa,sina;
|
||||
cosa=c;
|
||||
sina=s;
|
||||
drot_kernel_32(n1, x, y, &cosa, &sina);
|
||||
|
||||
drot_kernel_32(n1, x, y, c, s);
|
||||
i=n1;
|
||||
}
|
||||
|
||||
|
@ -245,7 +229,6 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
|
|||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -267,4 +250,3 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
|
|||
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -28,78 +28,87 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#include "common.h"
|
||||
|
||||
|
||||
#if defined(Z13)
|
||||
static void __attribute__ ((noinline)) dscal_kernel_8( BLASLONG n, FLOAT da , FLOAT *x )
|
||||
|
||||
static void dscal_kernel_32( BLASLONG n, FLOAT da , FLOAT *x )
|
||||
{
|
||||
|
||||
__asm__ ("pfd 2, 0(%1) \n\t"
|
||||
"vrepg %%v0 , %%v0,0 \n\t"
|
||||
"sllg %%r0,%0,3 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"pfd 2, 256(%%r1,%1) \n\t"
|
||||
"vl %%v24, 0(%%r1,%1) \n\t"
|
||||
"vfmdb %%v24,%%v24,%%v0 \n\t"
|
||||
"vst %%v24, 0(%%r1,%1) \n\t"
|
||||
"vl %%v25, 16(%%r1,%1) \n\t"
|
||||
"vfmdb %%v25,%%v25,%%v0 \n\t"
|
||||
"vst %%v25, 16(%%r1,%1) \n\t"
|
||||
"vl %%v26, 32(%%r1,%1) \n\t"
|
||||
"vfmdb %%v26,%%v26,%%v0 \n\t"
|
||||
"vst %%v26, 32(%%r1,%1) \n\t"
|
||||
"vl %%v27, 48(%%r1,%1) \n\t"
|
||||
"vfmdb %%v27,%%v27,%%v0 \n\t"
|
||||
"vst %%v27, 48(%%r1,%1) \n\t"
|
||||
"vl %%v24, 64(%%r1,%1) \n\t"
|
||||
"vfmdb %%v24,%%v24,%%v0 \n\t"
|
||||
"vst %%v24, 64(%%r1,%1) \n\t"
|
||||
"vl %%v25, 80(%%r1,%1) \n\t"
|
||||
"vfmdb %%v25,%%v25,%%v0 \n\t"
|
||||
"vst %%v25, 80(%%r1,%1) \n\t"
|
||||
"vl %%v26, 96(%%r1,%1) \n\t"
|
||||
"vfmdb %%v26,%%v26,%%v0 \n\t"
|
||||
"vst %%v26, 96(%%r1,%1) \n\t"
|
||||
"vl %%v27, 112(%%r1,%1) \n\t"
|
||||
"vfmdb %%v27,%%v27,%%v0 \n\t"
|
||||
"vst %%v27, 112(%%r1,%1) \n\t"
|
||||
"la %%r1,128(%%r1) \n\t"
|
||||
"clgrjl %%r1,%%r0,1b \n\t"
|
||||
:
|
||||
:"r"(n),"a"(x),"f"(da)
|
||||
:"cc" , "memory" ,"r0","r1","v0","v24","v25","v26","v27"
|
||||
/* Faster than a sequence of (vl, vfmdb, vst) triples (tested with OPENBLAS_LOOPS=10000). */
|
||||
__asm__ ("pfd 2, 0(%[x_ptr]) \n\t"
|
||||
"lgdr %%r0,%[alpha] \n\t"
|
||||
"vlvgp %%v0,%%r0,%%r0 \n\t"
|
||||
"vlr %%v1,%%v0 \n\t"
|
||||
"sllg %%r0,%[n],3 \n\t"
|
||||
"agr %%r0,%[x_ptr] \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"pfd 2, 256(%[x_ptr]) \n\t"
|
||||
"vlm %%v16,%%v23, 0(%[x_ptr]) \n\t"
|
||||
"vfmdb %%v16,%%v16,%%v0 \n\t"
|
||||
"vfmdb %%v17,%%v17,%%v1 \n\t"
|
||||
"vfmdb %%v18,%%v18,%%v0 \n\t"
|
||||
"vfmdb %%v19,%%v19,%%v1 \n\t"
|
||||
"vfmdb %%v20,%%v20,%%v0 \n\t"
|
||||
"vfmdb %%v21,%%v21,%%v1 \n\t"
|
||||
"vfmdb %%v22,%%v22,%%v0 \n\t"
|
||||
"vfmdb %%v23,%%v23,%%v1 \n\t"
|
||||
"vstm %%v16,%%v23, 0(%[x_ptr]) \n\t"
|
||||
"vlm %%v24,%%v31,128(%[x_ptr]) \n\t"
|
||||
"vfmdb %%v24,%%v24,%%v0 \n\t"
|
||||
"vfmdb %%v25,%%v25,%%v1 \n\t"
|
||||
"vfmdb %%v26,%%v26,%%v0 \n\t"
|
||||
"vfmdb %%v27,%%v27,%%v1 \n\t"
|
||||
"vfmdb %%v28,%%v28,%%v0 \n\t"
|
||||
"vfmdb %%v29,%%v29,%%v1 \n\t"
|
||||
"vfmdb %%v30,%%v30,%%v0 \n\t"
|
||||
"vfmdb %%v31,%%v31,%%v1 \n\t"
|
||||
"vstm %%v24,%%v31,128(%[x_ptr]) \n\t"
|
||||
"la %[x_ptr], 256(%[x_ptr]) \n\t"
|
||||
"clgrjl %[x_ptr],%%r0,1b \n\t"
|
||||
: [mem] "+m" (*(double (*)[n])x) ,[x_ptr] "+&a"(x)
|
||||
: [n] "r"(n),[alpha] "f"(da)
|
||||
:"cc" , "r0","v0","v1","v16","v17","v18","v19","v20","v21",
|
||||
"v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
static void __attribute__ ((noinline)) dscal_kernel_8_zero( BLASLONG n, FLOAT da , FLOAT *x )
|
||||
static void dscal_kernel_32_zero( BLASLONG n, FLOAT *x )
|
||||
{
|
||||
|
||||
__asm__ ("pfd 2, 0(%1) \n\t"
|
||||
"vzero %%v0 \n\t"
|
||||
"sllg %%r0,%0,3 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"pfd 2, 256(%%r1,%1) \n\t"
|
||||
"vst %%v0, 0(%%r1,%1) \n\t"
|
||||
"vst %%v0, 16(%%r1,%1) \n\t"
|
||||
"vst %%v0, 32(%%r1,%1) \n\t"
|
||||
"vst %%v0, 48(%%r1,%1) \n\t"
|
||||
"vst %%v0, 64(%%r1,%1) \n\t"
|
||||
"vst %%v0, 80(%%r1,%1) \n\t"
|
||||
"vst %%v0, 96(%%r1,%1) \n\t"
|
||||
"vst %%v0, 112(%%r1,%1) \n\t"
|
||||
"la %%r1,128(%%r1) \n\t"
|
||||
"clgrjl %%r1,%%r0,1b \n\t"
|
||||
:
|
||||
:"r"(n),"a"(x),"f"(da)
|
||||
:"cc" , "memory" ,"r0","r1","v0"
|
||||
__asm__ ("pfd 2, 0(%[x_ptr]) \n\t"
|
||||
"vzero %%v24 \n\t"
|
||||
"sllg %%r0,%[n],3 \n\t"
|
||||
"vzero %%v25 \n\t"
|
||||
"agr %%r0,%[x_ptr] \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"pfd 2, 256(%[x_ptr]) \n\t"
|
||||
"vst %%v24, 0(%[x_ptr]) \n\t"
|
||||
"vst %%v25, 16(%[x_ptr]) \n\t"
|
||||
"vst %%v24, 32(%[x_ptr]) \n\t"
|
||||
"vst %%v25, 48(%[x_ptr]) \n\t"
|
||||
"vst %%v24, 64(%[x_ptr]) \n\t"
|
||||
"vst %%v25, 80(%[x_ptr]) \n\t"
|
||||
"vst %%v24, 96(%[x_ptr]) \n\t"
|
||||
"vst %%v25, 112(%[x_ptr]) \n\t"
|
||||
"vst %%v24, 128(%[x_ptr]) \n\t"
|
||||
"vst %%v25, 144(%[x_ptr]) \n\t"
|
||||
"vst %%v24, 160(%[x_ptr]) \n\t"
|
||||
"vst %%v25, 176(%[x_ptr]) \n\t"
|
||||
"vst %%v24, 192(%[x_ptr]) \n\t"
|
||||
"vst %%v25, 208(%[x_ptr]) \n\t"
|
||||
"vst %%v24, 224(%[x_ptr]) \n\t"
|
||||
"vst %%v25, 240(%[x_ptr]) \n\t"
|
||||
"la %[x_ptr],256(%[x_ptr]) \n\t"
|
||||
"clgrjl %[x_ptr],%%r0,1b \n\t"
|
||||
: [mem] "=m" (*(double (*)[n])x) ,[x_ptr] "+&a"(x)
|
||||
: [n] "r"(n)
|
||||
:"cc" , "r0", "v24" ,"v25"
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
|
||||
{
|
||||
|
@ -114,11 +123,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
|
|||
if ( da == 0.0 )
|
||||
{
|
||||
|
||||
BLASLONG n1 = n & -16;
|
||||
BLASLONG n1 = n & -32;
|
||||
if ( n1 > 0 )
|
||||
{
|
||||
|
||||
dscal_kernel_8_zero(n1 , da , x);
|
||||
dscal_kernel_32_zero(n1 , x);
|
||||
j=n1;
|
||||
}
|
||||
|
||||
|
@ -133,10 +142,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
|
|||
else
|
||||
{
|
||||
|
||||
BLASLONG n1 = n & -16;
|
||||
BLASLONG n1 = n & -32;
|
||||
if ( n1 > 0 )
|
||||
{
|
||||
dscal_kernel_8(n1 , da , x);
|
||||
dscal_kernel_32(n1 , da , x);
|
||||
j=n1;
|
||||
}
|
||||
while(j < n)
|
||||
|
|
|
@ -29,299 +29,205 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#include "common.h"
|
||||
|
||||
#define Z13_SWAP_C 1
|
||||
|
||||
|
||||
#if defined(Z13_SWAP_A)
|
||||
static void __attribute__ ((noinline)) dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
|
||||
static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
|
||||
{
|
||||
__asm__ volatile(
|
||||
"pfd 1, 0(%1) \n\t"
|
||||
"pfd 2, 0(%2) \n\t"
|
||||
"srlg %%r0,%0,5 \n\t"
|
||||
"pfd 1, 0(%[ptr_x]) \n\t"
|
||||
"pfd 2, 0(%[ptr_y]) \n\t"
|
||||
"srlg %[n_tmp],%[n_tmp],5 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"pfd 2, 256(%%r1,%1) \n\t"
|
||||
"pfd 2, 256(%%r1,%2) \n\t"
|
||||
"1: \n\t"
|
||||
"pfd 2, 256(%%r1,%[ptr_x]) \n\t"
|
||||
"pfd 2, 256(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vl %%v24, 0(%%r1,%1) \n\t"
|
||||
"vl %%v16, 0(%%r1,%2) \n\t"
|
||||
"vst %%v24, 0(%%r1,%2) \n\t"
|
||||
"vst %%v16, 0(%%r1,%1) \n\t"
|
||||
"vl %%v24, 0(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v16, 0(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v24, 0(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v16, 0(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v25, 16(%%r1,%1) \n\t"
|
||||
"vl %%v17, 16(%%r1,%2) \n\t"
|
||||
"vst %%v25, 16(%%r1,%2) \n\t"
|
||||
"vst %%v17, 16(%%r1,%1) \n\t"
|
||||
"vl %%v25, 16(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v17, 16(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v25, 16(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v17, 16(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v26, 32(%%r1,%1) \n\t"
|
||||
"vl %%v18, 32(%%r1,%2) \n\t"
|
||||
"vst %%v26, 32(%%r1,%2) \n\t"
|
||||
"vst %%v18, 32(%%r1,%1) \n\t"
|
||||
"vl %%v26, 32(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v18, 32(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v26, 32(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v18, 32(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v27, 48(%%r1,%1) \n\t"
|
||||
"vl %%v19, 48(%%r1,%2) \n\t"
|
||||
"vst %%v27, 48(%%r1,%2) \n\t"
|
||||
"vst %%v19, 48(%%r1,%1) \n\t"
|
||||
"vl %%v27, 48(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v19, 48(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v27, 48(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v19, 48(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v28, 64(%%r1,%1) \n\t"
|
||||
"vl %%v20, 64(%%r1,%2) \n\t"
|
||||
"vst %%v28, 64(%%r1,%2) \n\t"
|
||||
"vst %%v20, 64(%%r1,%1) \n\t"
|
||||
"vl %%v28, 64(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v20, 64(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v28, 64(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v20, 64(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v29, 80(%%r1,%1) \n\t"
|
||||
"vl %%v21, 80(%%r1,%2) \n\t"
|
||||
"vst %%v29, 80(%%r1,%2) \n\t"
|
||||
"vst %%v21, 80(%%r1,%1) \n\t"
|
||||
"vl %%v29, 80(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v21, 80(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v29, 80(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v21, 80(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v30, 96(%%r1,%1) \n\t"
|
||||
"vl %%v22, 96(%%r1,%2) \n\t"
|
||||
"vst %%v30, 96(%%r1,%2) \n\t"
|
||||
"vst %%v22, 96(%%r1,%1) \n\t"
|
||||
"vl %%v30, 96(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v22, 96(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v30, 96(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v22, 96(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v31, 112(%%r1,%1) \n\t"
|
||||
"vl %%v23, 112(%%r1,%2) \n\t"
|
||||
"vst %%v31, 112(%%r1,%2) \n\t"
|
||||
"vst %%v23, 112(%%r1,%1) \n\t"
|
||||
"vl %%v31, 112(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v23, 112(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v31, 112(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v23, 112(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v24, 128(%%r1,%1) \n\t"
|
||||
"vl %%v16, 128(%%r1,%2) \n\t"
|
||||
"vst %%v24, 128(%%r1,%2) \n\t"
|
||||
"vst %%v16, 128(%%r1,%1) \n\t"
|
||||
"vl %%v24, 128(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v16, 128(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v24, 128(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v16, 128(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v25, 144(%%r1,%1) \n\t"
|
||||
"vl %%v17, 144(%%r1,%2) \n\t"
|
||||
"vst %%v25, 144(%%r1,%2) \n\t"
|
||||
"vst %%v17, 144(%%r1,%1) \n\t"
|
||||
"vl %%v25, 144(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v17, 144(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v25, 144(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v17, 144(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v26, 160(%%r1,%1) \n\t"
|
||||
"vl %%v18, 160(%%r1,%2) \n\t"
|
||||
"vst %%v26, 160(%%r1,%2) \n\t"
|
||||
"vst %%v18, 160(%%r1,%1) \n\t"
|
||||
"vl %%v26, 160(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v18, 160(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v26, 160(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v18, 160(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v27, 176(%%r1,%1) \n\t"
|
||||
"vl %%v19, 176(%%r1,%2) \n\t"
|
||||
"vst %%v27, 176(%%r1,%2) \n\t"
|
||||
"vst %%v19, 176(%%r1,%1) \n\t"
|
||||
"vl %%v27, 176(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v19, 176(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v27, 176(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v19, 176(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v28, 192(%%r1,%1) \n\t"
|
||||
"vl %%v20, 192(%%r1,%2) \n\t"
|
||||
"vst %%v28, 192(%%r1,%2) \n\t"
|
||||
"vst %%v20, 192(%%r1,%1) \n\t"
|
||||
"vl %%v28, 192(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v20, 192(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v28, 192(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v20, 192(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v29, 208(%%r1,%1) \n\t"
|
||||
"vl %%v21, 208(%%r1,%2) \n\t"
|
||||
"vst %%v29, 208(%%r1,%2) \n\t"
|
||||
"vst %%v21, 208(%%r1,%1) \n\t"
|
||||
"vl %%v29, 208(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v21, 208(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v29, 208(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v21, 208(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v30, 224(%%r1,%1) \n\t"
|
||||
"vl %%v22, 224(%%r1,%2) \n\t"
|
||||
"vst %%v30, 224(%%r1,%2) \n\t"
|
||||
"vst %%v22, 224(%%r1,%1) \n\t"
|
||||
"vl %%v30, 224(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v22, 224(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v30, 224(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v22, 224(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v31, 240(%%r1,%1) \n\t"
|
||||
"vl %%v23, 240(%%r1,%2) \n\t"
|
||||
"vst %%v31, 240(%%r1,%2) \n\t"
|
||||
"vst %%v23, 240(%%r1,%1) \n\t"
|
||||
"vl %%v31, 240(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v23, 240(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v31, 240(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v23, 240(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"la %%r1,256(%%r1) \n\t"
|
||||
"brctg %%r0,1b"
|
||||
:
|
||||
: "r"(n), "a"(x), "a"(y)
|
||||
: "cc", "memory" ,"r0","r1", "v16","v17","v18","v19","v20","v21","v22","v23"
|
||||
"la %%r1,256(%%r1) \n\t"
|
||||
"brctg %[n_tmp],1b"
|
||||
: [mem_x] "+m" (*(double (*)[n])x),
|
||||
[mem_y] "+m" (*(double (*)[n])y),
|
||||
[n_tmp] "+&r"(n)
|
||||
: [ptr_x] "a"(x), [ptr_y] "a"(y)
|
||||
: "cc", "r1", "v16","v17","v18","v19","v20","v21","v22","v23"
|
||||
,"v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
return;
|
||||
|
||||
}
|
||||
|
||||
#elif defined(Z13_SWAP_B)
|
||||
#else
|
||||
|
||||
static void __attribute__ ((noinline)) dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
|
||||
{
|
||||
__asm__ volatile(
|
||||
"pfd 2, 0(%1) \n\t"
|
||||
"pfd 2, 0(%2) \n\t"
|
||||
"srlg %%r0,%0,5 \n\t"
|
||||
"pfd 2, 0(%[ptr_x]) \n\t"
|
||||
"pfd 2, 0(%[ptr_y]) \n\t"
|
||||
"srlg %[n_tmp],%[n_tmp],5 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"pfd 2, 256(%%r1,%1) \n\t"
|
||||
"pfd 2, 256(%%r1,%2) \n\t"
|
||||
"vl %%v24, 0(%%r1,%1) \n\t"
|
||||
"vl %%v25, 16(%%r1,%1) \n\t"
|
||||
"vl %%v26, 32(%%r1,%1) \n\t"
|
||||
"vl %%v27, 48(%%r1,%1) \n\t"
|
||||
"vl %%v28, 64(%%r1,%1) \n\t"
|
||||
"vl %%v29, 80(%%r1,%1) \n\t"
|
||||
"vl %%v30, 96(%%r1,%1) \n\t"
|
||||
"vl %%v31, 112(%%r1,%1) \n\t"
|
||||
"vl %%v16, 0(%%r1,%2) \n\t"
|
||||
"vl %%v17, 16(%%r1,%2) \n\t"
|
||||
"vl %%v18, 32(%%r1,%2) \n\t"
|
||||
"vl %%v19, 48(%%r1,%2) \n\t"
|
||||
"vl %%v20, 64(%%r1,%2) \n\t"
|
||||
"vl %%v21, 80(%%r1,%2) \n\t"
|
||||
"vl %%v22, 96(%%r1,%2) \n\t"
|
||||
"vl %%v23, 112(%%r1,%2) \n\t"
|
||||
"pfd 2, 256(%%r1,%[ptr_x]) \n\t"
|
||||
"pfd 2, 256(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vst %%v24, 0(%%r1,%2) \n\t"
|
||||
"vst %%v25, 16(%%r1,%2) \n\t"
|
||||
"vst %%v26, 32(%%r1,%2) \n\t"
|
||||
"vst %%v27, 48(%%r1,%2) \n\t"
|
||||
"vst %%v28, 64(%%r1,%2) \n\t"
|
||||
"vst %%v29, 80(%%r1,%2) \n\t"
|
||||
"vst %%v30, 96(%%r1,%2) \n\t"
|
||||
"vst %%v31, 112(%%r1,%2)\n\t"
|
||||
"vst %%v16, 0(%%r1,%1) \n\t"
|
||||
"vst %%v17, 16(%%r1,%1) \n\t"
|
||||
"vst %%v18, 32(%%r1,%1) \n\t"
|
||||
"vst %%v19, 48(%%r1,%1) \n\t"
|
||||
"vst %%v20, 64(%%r1,%1) \n\t"
|
||||
"vst %%v21, 80(%%r1,%1) \n\t"
|
||||
"vst %%v22, 96(%%r1,%1) \n\t"
|
||||
"vst %%v23, 112(%%r1,%1)\n\t"
|
||||
"vl %%v16, 0(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v17, 16(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v18, 32(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v19, 48(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v20, 64(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v21, 80(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v22, 96(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v23, 112(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v24, 128(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v25, 144(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v26, 160(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v27, 176(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v28, 192(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v29, 208(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v30, 224(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v31, 240(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
|
||||
"vl %%v0, 0(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v1, 16(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v2, 32(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v3, 48(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v4, 64(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v5, 80(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v6, 96(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v7, 112(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v0, 0(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v1, 16(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v2, 32(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v3, 48(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v4, 64(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v5, 80(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v6, 96(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v7, 112(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v24, 128(%%r1,%1) \n\t"
|
||||
"vl %%v25, 144(%%r1,%1) \n\t"
|
||||
"vl %%v26, 160(%%r1,%1) \n\t"
|
||||
"vl %%v27, 176(%%r1,%1) \n\t"
|
||||
"vl %%v28, 192(%%r1,%1) \n\t"
|
||||
"vl %%v29, 208(%%r1,%1) \n\t"
|
||||
"vl %%v30, 224(%%r1,%1) \n\t"
|
||||
"vl %%v31, 240(%%r1,%1) \n\t"
|
||||
"vl %%v0, 128(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v1, 144(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v2, 160(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v3, 176(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v4, 192(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v5, 208(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v6, 224(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v7, 240(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v0, 128(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v1, 144(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v2, 160(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v3, 176(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v4, 192(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v5, 208(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v6, 224(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v7, 240(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v16, 128(%%r1,%2) \n\t"
|
||||
"vl %%v17, 144(%%r1,%2) \n\t"
|
||||
"vl %%v18, 160(%%r1,%2) \n\t"
|
||||
"vl %%v19, 176(%%r1,%2) \n\t"
|
||||
"vl %%v20, 192(%%r1,%2) \n\t"
|
||||
"vl %%v21, 208(%%r1,%2) \n\t"
|
||||
"vl %%v22, 224(%%r1,%2) \n\t"
|
||||
"vl %%v23, 240(%%r1,%2) \n\t"
|
||||
"vst %%v16, 0(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v17, 16(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v18, 32(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v19, 48(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v20, 64(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v21, 80(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v22, 96(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v23, 112(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v24, 128(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v25, 144(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v26, 160(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v27, 176(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v28, 192(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v29, 208(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v30, 224(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v31, 240(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
|
||||
"vst %%v24, 128(%%r1,%2) \n\t"
|
||||
"vst %%v25, 144(%%r1,%2) \n\t"
|
||||
"vst %%v26, 160(%%r1,%2) \n\t"
|
||||
"vst %%v27, 176(%%r1,%2) \n\t"
|
||||
"vst %%v28, 192(%%r1,%2) \n\t"
|
||||
"vst %%v29, 208(%%r1,%2) \n\t"
|
||||
"vst %%v30, 224(%%r1,%2) \n\t"
|
||||
"vst %%v31, 240(%%r1,%2) \n\t"
|
||||
|
||||
|
||||
"vst %%v16, 128(%%r1,%1) \n\t"
|
||||
"vst %%v17, 144(%%r1,%1) \n\t"
|
||||
"vst %%v18, 160(%%r1,%1) \n\t"
|
||||
"vst %%v19, 176(%%r1,%1) \n\t"
|
||||
"vst %%v20, 192(%%r1,%1) \n\t"
|
||||
"vst %%v21, 208(%%r1,%1) \n\t"
|
||||
"vst %%v22, 224(%%r1,%1) \n\t"
|
||||
"vst %%v23, 240(%%r1,%1) \n\t"
|
||||
|
||||
|
||||
"la %%r1,256(%%r1) \n\t"
|
||||
"brctg %%r0,1b"
|
||||
:
|
||||
: "r"(n), "a"(x), "a"(y)
|
||||
: "cc", "memory","r0","r1", "v16",
|
||||
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
|
||||
);
|
||||
return;
|
||||
|
||||
}
|
||||
|
||||
#elif defined(Z13_SWAP_C)
|
||||
static void __attribute__ ((noinline)) dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
|
||||
{
|
||||
__asm__ volatile(
|
||||
"pfd 2, 0(%1) \n\t"
|
||||
"pfd 2, 0(%2) \n\t"
|
||||
"srlg %%r0,%0,5 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"pfd 2, 256(%%r1,%1) \n\t"
|
||||
"pfd 2, 256(%%r1,%2) \n\t"
|
||||
|
||||
"vl %%v16, 0(%%r1,%1) \n\t"
|
||||
"vl %%v17, 16(%%r1,%1) \n\t"
|
||||
"vl %%v18, 32(%%r1,%1) \n\t"
|
||||
"vl %%v19, 48(%%r1,%1) \n\t"
|
||||
"vl %%v20, 64(%%r1,%1) \n\t"
|
||||
"vl %%v21, 80(%%r1,%1) \n\t"
|
||||
"vl %%v22, 96(%%r1,%1) \n\t"
|
||||
"vl %%v23, 112(%%r1,%1) \n\t"
|
||||
"vl %%v24, 128(%%r1,%1) \n\t"
|
||||
"vl %%v25, 144(%%r1,%1) \n\t"
|
||||
"vl %%v26, 160(%%r1,%1) \n\t"
|
||||
"vl %%v27, 176(%%r1,%1) \n\t"
|
||||
"vl %%v28, 192(%%r1,%1) \n\t"
|
||||
"vl %%v29, 208(%%r1,%1) \n\t"
|
||||
"vl %%v30, 224(%%r1,%1) \n\t"
|
||||
"vl %%v31, 240(%%r1,%1) \n\t"
|
||||
|
||||
|
||||
"vl %%v0, 0(%%r1,%2) \n\t"
|
||||
"vl %%v1, 16(%%r1,%2) \n\t"
|
||||
"vl %%v2, 32(%%r1,%2) \n\t"
|
||||
"vl %%v3, 48(%%r1,%2) \n\t"
|
||||
"vl %%v4, 64(%%r1,%2) \n\t"
|
||||
"vl %%v5, 80(%%r1,%2) \n\t"
|
||||
"vl %%v6, 96(%%r1,%2) \n\t"
|
||||
"vl %%v7, 112(%%r1,%2) \n\t"
|
||||
"vst %%v0, 0(%%r1,%1) \n\t"
|
||||
"vst %%v1, 16(%%r1,%1) \n\t"
|
||||
"vst %%v2, 32(%%r1,%1) \n\t"
|
||||
"vst %%v3, 48(%%r1,%1) \n\t"
|
||||
"vst %%v4, 64(%%r1,%1) \n\t"
|
||||
"vst %%v5, 80(%%r1,%1) \n\t"
|
||||
"vst %%v6, 96(%%r1,%1) \n\t"
|
||||
"vst %%v7, 112(%%r1,%1) \n\t"
|
||||
|
||||
"vl %%v0, 128(%%r1,%2) \n\t"
|
||||
"vl %%v1, 144(%%r1,%2) \n\t"
|
||||
"vl %%v2, 160(%%r1,%2) \n\t"
|
||||
"vl %%v3, 176(%%r1,%2) \n\t"
|
||||
"vl %%v4, 192(%%r1,%2) \n\t"
|
||||
"vl %%v5, 208(%%r1,%2) \n\t"
|
||||
"vl %%v6, 224(%%r1,%2) \n\t"
|
||||
"vl %%v7, 240(%%r1,%2) \n\t"
|
||||
"vst %%v0, 128(%%r1,%1) \n\t"
|
||||
"vst %%v1, 144(%%r1,%1) \n\t"
|
||||
"vst %%v2, 160(%%r1,%1) \n\t"
|
||||
"vst %%v3, 176(%%r1,%1) \n\t"
|
||||
"vst %%v4, 192(%%r1,%1) \n\t"
|
||||
"vst %%v5, 208(%%r1,%1) \n\t"
|
||||
"vst %%v6, 224(%%r1,%1) \n\t"
|
||||
"vst %%v7, 240(%%r1,%1) \n\t"
|
||||
|
||||
"vst %%v16, 0(%%r1,%2) \n\t"
|
||||
"vst %%v17, 16(%%r1,%2) \n\t"
|
||||
"vst %%v18, 32(%%r1,%2) \n\t"
|
||||
"vst %%v19, 48(%%r1,%2) \n\t"
|
||||
"vst %%v20, 64(%%r1,%2) \n\t"
|
||||
"vst %%v21, 80(%%r1,%2) \n\t"
|
||||
"vst %%v22, 96(%%r1,%2) \n\t"
|
||||
"vst %%v23, 112(%%r1,%2) \n\t"
|
||||
"vst %%v24, 128(%%r1,%2) \n\t"
|
||||
"vst %%v25, 144(%%r1,%2) \n\t"
|
||||
"vst %%v26, 160(%%r1,%2) \n\t"
|
||||
"vst %%v27, 176(%%r1,%2) \n\t"
|
||||
"vst %%v28, 192(%%r1,%2) \n\t"
|
||||
"vst %%v29, 208(%%r1,%2) \n\t"
|
||||
"vst %%v30, 224(%%r1,%2) \n\t"
|
||||
"vst %%v31, 240(%%r1,%2) \n\t"
|
||||
|
||||
|
||||
"la %%r1,256(%%r1) \n\t"
|
||||
"brctg %%r0,1b"
|
||||
:
|
||||
: "r"(n), "a"(x), "a"(y)
|
||||
: "cc", "memory","r0","r1", "v0","v1","v2","v3","v4","v5","v6","v7","v16",
|
||||
"la %%r1,256(%%r1) \n\t"
|
||||
"brctg %[n_tmp],1b"
|
||||
: [mem_x] "+m" (*(double (*)[n])x),
|
||||
[mem_y] "+m" (*(double (*)[n])y),
|
||||
[n_tmp] "+&r"(n)
|
||||
: [ptr_x] "a"(x), [ptr_y] "a"(y)
|
||||
: "cc", "memory","r1", "v0","v1","v2","v3","v4","v5","v6","v7","v16",
|
||||
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
return;
|
||||
|
|
|
@ -43,15 +43,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
* Warning: requirements n>0 and n % 32 == 0
|
||||
* @param n
|
||||
* @param x pointer to the vector
|
||||
* @param minf (out) maximum absolute value .( only for output )
|
||||
* @param maxf (out) maximum absolute value (output only)
|
||||
* @return index
|
||||
*/
|
||||
static BLASLONG diamax_kernel_32_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
||||
BLASLONG index;
|
||||
__asm__(
|
||||
"pfd 1, 0(%4) \n\t"
|
||||
"sllg %%r0,%3,3 \n\t"
|
||||
"agr %%r0,%4 \n\t"
|
||||
"pfd 1, 0(%[ptr_x]) \n\t"
|
||||
"sllg %%r0,%[n],3 \n\t"
|
||||
"agr %%r0,%[ptr_x] \n\t"
|
||||
"vleig %%v20,0,0 \n\t"
|
||||
"vleig %%v20,1,1 \n\t"
|
||||
"vleig %%v21,2,0 \n\t"
|
||||
|
@ -61,13 +61,13 @@ static BLASLONG diamax_kernel_32_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
|||
"vleig %%v23,6,0 \n\t"
|
||||
"vleig %%v23,7,1 \n\t"
|
||||
"vrepig %%v4,8 \n\t"
|
||||
"vzero %%v5 \n\t"
|
||||
"vzero %%v18 \n\t"
|
||||
"vzero %%v19 \n\t"
|
||||
"vzero %%v5 \n\t"
|
||||
"vzero %%v18 \n\t"
|
||||
"vzero %%v19 \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"pfd 1, 256(%2 ) \n\t"
|
||||
"vlm %%v24,%%v31, 0(%2 ) \n\t"
|
||||
"pfd 1, 256(%[ptr_tmp] ) \n\t"
|
||||
"vlm %%v24,%%v31, 0(%[ptr_tmp] ) \n\t"
|
||||
"vflpdb %%v24, %%v24 \n\t"
|
||||
"vflpdb %%v25, %%v25 \n\t"
|
||||
"vflpdb %%v26, %%v26 \n\t"
|
||||
|
@ -89,24 +89,24 @@ static BLASLONG diamax_kernel_32_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
|||
"vsel %%v26,%%v23,%%v22,%%v17 \n\t"
|
||||
"vsel %%v27,%%v31,%%v30,%%v17 \n\t"
|
||||
|
||||
"vfchdb %%v28, %%v3,%%v0 \n\t"
|
||||
"vfchdb %%v29,%%v27, %%v25 \n\t"
|
||||
"vsel %%v1,%%v2,%%v1,%%v28 \n\t"
|
||||
"vsel %%v0,%%v3,%%v0,%%v28 \n\t"
|
||||
"vsel %%v24,%%v26,%%v24,%%v29 \n\t"
|
||||
"vsel %%v25,%%v27,%%v25,%%v29 \n\t"
|
||||
"vag %%v1,%%v1,%%v5 \n\t"
|
||||
"vag %%v24,%%v24,%%v5 \n\t"
|
||||
"vag %%v24,%%v24,%%v4 \n\t"
|
||||
"vfchdb %%v16,%%v25 , %%v0 \n\t"
|
||||
"vag %%v5,%%v5,%%v4 \n\t"
|
||||
"vsel %%v29,%%v25,%%v0,%%v16 \n\t"
|
||||
"vsel %%v28,%%v24,%%v1,%%v16 \n\t"
|
||||
"vfchdb %%v17, %%v29,%%v18 \n\t"
|
||||
"vsel %%v19,%%v28,%%v19,%%v17 \n\t"
|
||||
"vfchdb %%v28, %%v3,%%v0 \n\t"
|
||||
"vfchdb %%v29,%%v27, %%v25 \n\t"
|
||||
"vsel %%v1,%%v2,%%v1,%%v28 \n\t"
|
||||
"vsel %%v0,%%v3,%%v0,%%v28 \n\t"
|
||||
"vsel %%v24,%%v26,%%v24,%%v29 \n\t"
|
||||
"vsel %%v25,%%v27,%%v25,%%v29 \n\t"
|
||||
"vag %%v1,%%v1,%%v5 \n\t"
|
||||
"vag %%v24,%%v24,%%v5 \n\t"
|
||||
"vag %%v24,%%v24,%%v4 \n\t"
|
||||
"vfchdb %%v16,%%v25 , %%v0 \n\t"
|
||||
"vag %%v5,%%v5,%%v4 \n\t"
|
||||
"vsel %%v29,%%v25,%%v0,%%v16 \n\t"
|
||||
"vsel %%v28,%%v24,%%v1,%%v16 \n\t"
|
||||
"vfchdb %%v17, %%v29,%%v18 \n\t"
|
||||
"vsel %%v19,%%v28,%%v19,%%v17 \n\t"
|
||||
"vsel %%v18,%%v29,%%v18,%%v17 \n\t"
|
||||
"vag %%v5,%%v5,%%v4 \n\t"
|
||||
"vlm %%v24,%%v31,128(%2 ) \n\t"
|
||||
"vag %%v5,%%v5,%%v4 \n\t"
|
||||
"vlm %%v24,%%v31,128(%[ptr_tmp] ) \n\t"
|
||||
"vflpdb %%v24, %%v24 \n\t"
|
||||
"vflpdb %%v25, %%v25 \n\t"
|
||||
"vflpdb %%v26, %%v26 \n\t"
|
||||
|
@ -134,37 +134,38 @@ static BLASLONG diamax_kernel_32_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
|||
"vsel %%v0,%%v3,%%v0,%%v28 \n\t"
|
||||
"vsel %%v24,%%v26,%%v24,%%v29 \n\t"
|
||||
"vsel %%v25,%%v27,%%v25,%%v29 \n\t"
|
||||
"vag %%v1,%%v1,%%v5 \n\t"
|
||||
"vag %%v24,%%v24,%%v5 \n\t"
|
||||
"la %2,256(%2) \n\t"
|
||||
"vag %%v24,%%v24,%%v4 \n\t"
|
||||
"vag %%v1,%%v1,%%v5 \n\t"
|
||||
"vag %%v24,%%v24,%%v5 \n\t"
|
||||
"la %[ptr_tmp],256(%[ptr_tmp]) \n\t"
|
||||
"vag %%v24,%%v24,%%v4 \n\t"
|
||||
"vfchdb %%v16,%%v25 , %%v0 \n\t"
|
||||
"vag %%v5,%%v5,%%v4 \n\t"
|
||||
"vsel %%v29,%%v25,%%v0,%%v16 \n\t"
|
||||
"vsel %%v28,%%v24,%%v1,%%v16 \n\t"
|
||||
"vsel %%v29,%%v25,%%v0,%%v16 \n\t"
|
||||
"vsel %%v28,%%v24,%%v1,%%v16 \n\t"
|
||||
"vfchdb %%v17, %%v29,%%v18 \n\t"
|
||||
"vsel %%v19,%%v28,%%v19,%%v17 \n\t"
|
||||
"vsel %%v18,%%v29,%%v18,%%v17 \n\t"
|
||||
"vag %%v5,%%v5,%%v4 \n\t"
|
||||
"clgrjl %2,%%r0,1b \n\t"
|
||||
"vag %%v5,%%v5,%%v4 \n\t"
|
||||
"clgrjl %[ptr_tmp],%%r0,1b \n\t"
|
||||
|
||||
"vrepg %%v26,%%v18,1 \n\t"
|
||||
"vrepg %%v5,%%v19,1 \n\t"
|
||||
"wfcdb %%v26,%%v18 \n\t"
|
||||
"vrepg %%v26,%%v18,1 \n\t"
|
||||
"vrepg %%v5,%%v19,1 \n\t"
|
||||
"wfcdb %%v26,%%v18 \n\t"
|
||||
"jne 2f \n\t"
|
||||
"vsteg %%v18,%1,0 \n\t"
|
||||
"vmnlg %%v1,%%v5,%%v19 \n\t"
|
||||
"vlgvg %0,%%v1,0 \n\t"
|
||||
"j 3f \n\t"
|
||||
"2: \n\t"
|
||||
"vsteg %%v18,%[maxf],0 \n\t"
|
||||
"vmnlg %%v1,%%v5,%%v19 \n\t"
|
||||
"j 3f \n\t"
|
||||
|
||||
"2: \n\t"
|
||||
"wfchdb %%v16,%%v26,%%v18 \n\t"
|
||||
"vsel %%v1,%%v5,%%v19,%%v16 \n\t"
|
||||
"vsel %%v0,%%v26,%%v18,%%v16 \n\t"
|
||||
"vlgvg %0,%%v1,0 \n\t"
|
||||
"std %%f0,%1 \n\t"
|
||||
"3: "
|
||||
: "=r"(index) ,"=m"(*maxf) , "+&a"(x)
|
||||
: "r"(n), "2"(x)
|
||||
"std %%f0,%[maxf] \n\t"
|
||||
|
||||
"3: \n\t"
|
||||
"vlgvg %[index],%%v1,0 \n\t"
|
||||
: [index] "+r"(index) ,[maxf] "=m"(*maxf), [ptr_tmp] "+&a"(x)
|
||||
: [mem] "m"( *(const double (*)[n])x), [n] "r"(n), [ptr_x] "r"(x)
|
||||
: "cc", "r0", "f0","v0","v1","v2","v3","v4","v5","v6","v7","v16",
|
||||
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
|
|
|
@ -48,9 +48,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
|
||||
BLASLONG index;
|
||||
__asm__(
|
||||
"pfd 1, 0(%4) \n\t"
|
||||
"sllg %%r0,%3,3 \n\t"
|
||||
"agr %%r0,%4 \n\t"
|
||||
"pfd 1, 0(%[ptr_x]) \n\t"
|
||||
"sllg %%r0,%[n],3 \n\t"
|
||||
"agr %%r0,%[ptr_x] \n\t"
|
||||
"vleig %%v20,0,0 \n\t"
|
||||
"vleig %%v20,1,1 \n\t"
|
||||
"vleig %%v21,2,0 \n\t"
|
||||
|
@ -60,14 +60,14 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
|
|||
"vleig %%v23,6,0 \n\t"
|
||||
"vleig %%v23,7,1 \n\t"
|
||||
"vrepig %%v4,8 \n\t"
|
||||
"vlrepg %%v18,0(%4) \n\t"
|
||||
"vzero %%v5 \n\t"
|
||||
"vflpdb %%v18, %%v18 \n\t"
|
||||
"vzero %%v19 \n\t"
|
||||
"vlrepg %%v18,0(%[ptr_x]) \n\t"
|
||||
"vzero %%v5 \n\t"
|
||||
"vflpdb %%v18, %%v18 \n\t"
|
||||
"vzero %%v19 \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"pfd 1, 256(%2 ) \n\t"
|
||||
"vlm %%v24,%%v31, 0(%2 ) \n\t"
|
||||
"pfd 1, 256(%[ptr_tmp] ) \n\t"
|
||||
"vlm %%v24,%%v31, 0(%[ptr_tmp] ) \n\t"
|
||||
|
||||
"vflpdb %%v24, %%v24 \n\t"
|
||||
"vflpdb %%v25, %%v25 \n\t"
|
||||
|
@ -99,22 +99,22 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
|
|||
"vsel %%v24,%%v26,%%v24,%%v29 \n\t"
|
||||
"vsel %%v25,%%v27,%%v25,%%v29 \n\t"
|
||||
|
||||
"vag %%v1,%%v1,%%v5 \n\t"
|
||||
"vag %%v24,%%v24,%%v5 \n\t"
|
||||
"vag %%v24,%%v24,%%v4 \n\t"
|
||||
"vag %%v1,%%v1,%%v5 \n\t"
|
||||
"vag %%v24,%%v24,%%v5 \n\t"
|
||||
"vag %%v24,%%v24,%%v4 \n\t"
|
||||
|
||||
"vfchdb %%v16, %%v0,%%v25 \n\t"
|
||||
"vag %%v5,%%v5,%%v4 \n\t"
|
||||
"vfchdb %%v16, %%v0,%%v25 \n\t"
|
||||
"vag %%v5,%%v5,%%v4 \n\t"
|
||||
"vsel %%v29,%%v25,%%v0,%%v16 \n\t"
|
||||
"vsel %%v28,%%v24,%%v1,%%v16 \n\t"
|
||||
"vsel %%v28,%%v24,%%v1,%%v16 \n\t"
|
||||
|
||||
"vfchdb %%v17,%%v18, %%v29 \n\t"
|
||||
"vsel %%v19,%%v28,%%v19,%%v17 \n\t"
|
||||
"vfchdb %%v17,%%v18, %%v29 \n\t"
|
||||
"vsel %%v19,%%v28,%%v19,%%v17 \n\t"
|
||||
"vsel %%v18,%%v29,%%v18,%%v17 \n\t"
|
||||
|
||||
"vag %%v5,%%v5,%%v4 \n\t"
|
||||
"vag %%v5,%%v5,%%v4 \n\t"
|
||||
|
||||
"vlm %%v24,%%v31,128(%2 ) \n\t"
|
||||
"vlm %%v24,%%v31,128(%[ptr_tmp] ) \n\t"
|
||||
"vflpdb %%v24, %%v24 \n\t"
|
||||
"vflpdb %%v25, %%v25 \n\t"
|
||||
"vflpdb %%v26, %%v26 \n\t"
|
||||
|
@ -147,7 +147,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
|
|||
|
||||
"vag %%v1,%%v1,%%v5 \n\t"
|
||||
"vag %%v24,%%v24,%%v5 \n\t"
|
||||
"la %2,256(%2) \n\t"
|
||||
"la %[ptr_tmp],256(%[ptr_tmp]) \n\t"
|
||||
"vag %%v24,%%v24,%%v4 \n\t"
|
||||
|
||||
"vfchdb %%v16, %%v0,%%v25 \n\t"
|
||||
|
@ -161,27 +161,28 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
|
|||
|
||||
"vag %%v5,%%v5,%%v4 \n\t"
|
||||
|
||||
"clgrjl %2,%%r0,1b \n\t"
|
||||
"clgrjl %[ptr_tmp],%%r0,1b \n\t"
|
||||
|
||||
|
||||
"vrepg %%v26,%%v18,1 \n\t"
|
||||
"vrepg %%v5,%%v19,1 \n\t"
|
||||
"wfcdb %%v26,%%v18 \n\t"
|
||||
"jne 2f \n\t"
|
||||
"vsteg %%v18,%1,0 \n\t"
|
||||
"vmnlg %%v1,%%v5,%%v19 \n\t"
|
||||
"vlgvg %0,%%v1,0 \n\t"
|
||||
"j 3f \n\t"
|
||||
"2: \n\t"
|
||||
"vsteg %%v18,%[minf],0 \n\t"
|
||||
"vmnlg %%v1,%%v5,%%v19 \n\t"
|
||||
"j 3f \n\t"
|
||||
|
||||
"2: \n\t"
|
||||
"wfchdb %%v16,%%v18 ,%%v26 \n\t "
|
||||
"vsel %%v1,%%v5,%%v19,%%v16 \n\t"
|
||||
"vsel %%v0,%%v26,%%v18,%%v16 \n\t"
|
||||
"vlgvg %0,%%v1,0 \n\t"
|
||||
"std %%f0,%1 \n\t"
|
||||
"3:"
|
||||
"std %%f0,%[minf] \n\t"
|
||||
|
||||
: "+r"(index) ,"=m"(*minf),"+&a"(x)
|
||||
: "r"(n), "2"(x)
|
||||
"3: \n\t"
|
||||
"vlgvg %[index],%%v1,0 \n\t"
|
||||
|
||||
: [index] "+r"(index) ,[minf] "=m"(*minf), [ptr_tmp] "+&a"(x)
|
||||
: [mem] "m"( *(const double (*)[n])x), [n] "r"(n), [ptr_x] "r"(x)
|
||||
: "cc","r0", "f0","v0","v1","v2","v3","v4","v5","v6","v7","v16",
|
||||
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
|
||||
|
|
|
@ -37,16 +37,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
/**
|
||||
* Find maximum index
|
||||
* Warning: requirements n>0 and n % 8 == 0
|
||||
* Warning: requirements n>0 and n % 16 == 0
|
||||
* @param n
|
||||
* @param x pointer to the vector
|
||||
* @param minf (out) maximum absolute value .( only for output )
|
||||
* @param maxf (out) maximum absolute value (output only)
|
||||
* @return index
|
||||
*/
|
||||
static BLASLONG ziamax_kernel_8_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
||||
static BLASLONG ziamax_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
||||
BLASLONG index;
|
||||
__asm__(
|
||||
"pfd 1, 0(%4) \n\t"
|
||||
"pfd 1, 0(%[ptr_x]) \n\t"
|
||||
"vleig %%v16,0,0 \n\t"
|
||||
"vleig %%v16,1,1 \n\t"
|
||||
"vleig %%v17,2,0 \n\t"
|
||||
|
@ -65,32 +65,32 @@ static BLASLONG ziamax_kernel_8_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
|||
"vleig %%v23,15,1 \n\t"
|
||||
|
||||
|
||||
"sllg %%r0,%3,4 \n\t"
|
||||
"agr %%r0,%4 \n\t"
|
||||
"vzero %%v6 \n\t"
|
||||
"vzero %%v7 \n\t"
|
||||
"vrepig %%v4,16 \n\t"
|
||||
"vzero %%v5 \n\t"
|
||||
"sllg %%r0,%[n],4 \n\t"
|
||||
"agr %%r0,%[ptr_x] \n\t"
|
||||
"vzero %%v6 \n\t"
|
||||
"vzero %%v7 \n\t"
|
||||
"vrepig %%v4,16 \n\t"
|
||||
"vzero %%v5 \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"pfd 1, 256(%2 ) \n\t"
|
||||
"1: \n\t"
|
||||
"pfd 1, 256(%[ptr_tmp] ) \n\t"
|
||||
|
||||
"vleg %%v24 , 0( %2),0 \n\t"
|
||||
"vleg %%v25 , 8( %2),0 \n\t"
|
||||
"vleg %%v24 , 16( %2),1 \n\t"
|
||||
"vleg %%v25 , 24( %2),1 \n\t"
|
||||
"vleg %%v26 , 32( %2),0 \n\t"
|
||||
"vleg %%v27 , 40( %2),0 \n\t"
|
||||
"vleg %%v26 , 48( %2),1 \n\t"
|
||||
"vleg %%v27 , 56( %2),1 \n\t"
|
||||
"vleg %%v28 , 64( %2),0 \n\t"
|
||||
"vleg %%v29 , 72( %2),0 \n\t"
|
||||
"vleg %%v28 , 80( %2),1 \n\t"
|
||||
"vleg %%v29 , 88( %2),1 \n\t"
|
||||
"vleg %%v30 , 96( %2),0 \n\t"
|
||||
"vleg %%v31 ,104( %2),0 \n\t"
|
||||
"vleg %%v30 ,112( %2),1 \n\t"
|
||||
"vleg %%v31 ,120( %2),1 \n\t"
|
||||
"vleg %%v24 , 0(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v25 , 8(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v24 , 16(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v25 , 24(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v26 , 32(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v27 , 40(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v26 , 48(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v27 , 56(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v28 , 64(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v29 , 72(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v28 , 80(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v29 , 88(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v30 , 96(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v31 ,104(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v30 ,112(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v31 ,120(%[ptr_tmp]),1 \n\t"
|
||||
"vflpdb %%v24, %%v24 \n\t"
|
||||
"vflpdb %%v25, %%v25 \n\t"
|
||||
"vflpdb %%v26, %%v26 \n\t"
|
||||
|
@ -100,28 +100,28 @@ static BLASLONG ziamax_kernel_8_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
|||
"vflpdb %%v30, %%v30 \n\t"
|
||||
"vflpdb %%v31, %%v31 \n\t"
|
||||
|
||||
"vfadb %%v0,%%v24,%%v25 \n\t"
|
||||
"vfadb %%v1,%%v26,%%v27 \n\t"
|
||||
"vfadb %%v2,%%v28,%%v29 \n\t"
|
||||
"vfadb %%v3,%%v30,%%v31 \n\t"
|
||||
"vfadb %%v0,%%v24,%%v25 \n\t"
|
||||
"vfadb %%v1,%%v26,%%v27 \n\t"
|
||||
"vfadb %%v2,%%v28,%%v29 \n\t"
|
||||
"vfadb %%v3,%%v30,%%v31 \n\t"
|
||||
|
||||
|
||||
"vleg %%v24 , 128( %2),0 \n\t"
|
||||
"vleg %%v25 , 136( %2),0 \n\t"
|
||||
"vleg %%v24 , 144( %2),1 \n\t"
|
||||
"vleg %%v25 , 152( %2),1 \n\t"
|
||||
"vleg %%v26 , 160( %2),0 \n\t"
|
||||
"vleg %%v27 , 168( %2),0 \n\t"
|
||||
"vleg %%v26 , 176( %2),1 \n\t"
|
||||
"vleg %%v27 , 184( %2),1 \n\t"
|
||||
"vleg %%v28 , 192( %2),0 \n\t"
|
||||
"vleg %%v29 , 200( %2),0 \n\t"
|
||||
"vleg %%v28 , 208( %2),1 \n\t"
|
||||
"vleg %%v29 , 216( %2),1 \n\t"
|
||||
"vleg %%v30 , 224( %2),0 \n\t"
|
||||
"vleg %%v31 , 232( %2),0 \n\t"
|
||||
"vleg %%v30 , 240( %2),1 \n\t"
|
||||
"vleg %%v31 , 248( %2),1 \n\t"
|
||||
"vleg %%v24 , 128(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v25 , 136(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v24 , 144(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v25 , 152(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v26 , 160(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v27 , 168(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v26 , 176(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v27 , 184(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v28 , 192(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v29 , 200(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v28 , 208(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v29 , 216(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v30 , 224(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v31 , 232(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v30 , 240(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v31 , 248(%[ptr_tmp]),1 \n\t"
|
||||
"vflpdb %%v24, %%v24 \n\t"
|
||||
"vflpdb %%v25, %%v25 \n\t"
|
||||
"vflpdb %%v26, %%v26 \n\t"
|
||||
|
@ -131,70 +131,70 @@ static BLASLONG ziamax_kernel_8_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
|||
"vflpdb %%v30, %%v30 \n\t"
|
||||
"vflpdb %%v31, %%v31 \n\t"
|
||||
|
||||
"vfadb %%v24,%%v24,%%v25 \n\t"
|
||||
"vfadb %%v26,%%v26,%%v27 \n\t"
|
||||
"vfadb %%v28,%%v28,%%v29 \n\t"
|
||||
"vfadb %%v30,%%v30,%%v31 \n\t"
|
||||
"vfadb %%v24,%%v24,%%v25 \n\t"
|
||||
"vfadb %%v26,%%v26,%%v27 \n\t"
|
||||
"vfadb %%v28,%%v28,%%v29 \n\t"
|
||||
"vfadb %%v30,%%v30,%%v31 \n\t"
|
||||
|
||||
"vfchdb %%v25,%%v1,%%v0 \n\t"
|
||||
"vsel %%v29,%%v17,%%v16,%%v25 \n\t"
|
||||
"vsel %%v31,%%v1,%%v0,%%v25 \n\t"
|
||||
"vsel %%v29,%%v17,%%v16,%%v25 \n\t"
|
||||
"vsel %%v31,%%v1,%%v0,%%v25 \n\t"
|
||||
|
||||
"vfchdb %%v27,%%v3,%%v2 \n\t "
|
||||
"vsel %%v0,%%v19,%%v18,%%v27 \n\t"
|
||||
"vsel %%v1,%%v3,%%v2,%%v27 \n\t"
|
||||
"vsel %%v0,%%v19,%%v18,%%v27 \n\t"
|
||||
"vsel %%v1,%%v3,%%v2,%%v27 \n\t"
|
||||
|
||||
"vfchdb %%v25,%%v26,%%v24 \n\t "
|
||||
"vsel %%v2,%%v21,%%v20,%%v25 \n\t"
|
||||
"vsel %%v3,%%v26,%%v24,%%v25 \n\t"
|
||||
"vfchdb %%v25,%%v26,%%v24 \n\t"
|
||||
"vsel %%v2,%%v21,%%v20,%%v25 \n\t"
|
||||
"vsel %%v3,%%v26,%%v24,%%v25 \n\t"
|
||||
|
||||
"vfchdb %%v27,%%v30,%%v28 \n\t "
|
||||
"vsel %%v25,%%v23,%%v22,%%v27 \n\t"
|
||||
"vsel %%v27,%%v30,%%v28,%%v27 \n\t"
|
||||
"vfchdb %%v27,%%v30,%%v28 \n\t"
|
||||
"vsel %%v25,%%v23,%%v22,%%v27 \n\t"
|
||||
"vsel %%v27,%%v30,%%v28,%%v27 \n\t"
|
||||
|
||||
"vfchdb %%v24, %%v1,%%v31 \n\t"
|
||||
"vsel %%v26,%%v0,%%v29,%%v24 \n\t"
|
||||
"vsel %%v28,%%v1,%%v31,%%v24 \n\t"
|
||||
"vfchdb %%v24, %%v1,%%v31 \n\t"
|
||||
"vsel %%v26,%%v0,%%v29,%%v24 \n\t"
|
||||
"vsel %%v28,%%v1,%%v31,%%v24 \n\t"
|
||||
|
||||
"vfchdb %%v30, %%v27,%%v3 \n\t"
|
||||
"vsel %%v29,%%v25,%%v2,%%v30 \n\t"
|
||||
"vsel %%v31,%%v27,%%v3 ,%%v30 \n\t"
|
||||
"vfchdb %%v30, %%v27,%%v3 \n\t"
|
||||
"vsel %%v29,%%v25,%%v2,%%v30 \n\t"
|
||||
"vsel %%v31,%%v27,%%v3 ,%%v30 \n\t"
|
||||
|
||||
"la %2,256(%2) \n\t"
|
||||
"la %[ptr_tmp],256(%[ptr_tmp]) \n\t"
|
||||
|
||||
"vfchdb %%v0, %%v31,%%v28 \n\t"
|
||||
"vsel %%v25,%%v29,%%v26,%%v0 \n\t"
|
||||
"vsel %%v27,%%v31,%%v28,%%v0 \n\t"
|
||||
"vfchdb %%v0, %%v31,%%v28 \n\t"
|
||||
"vsel %%v25,%%v29,%%v26,%%v0 \n\t"
|
||||
"vsel %%v27,%%v31,%%v28,%%v0 \n\t"
|
||||
|
||||
"vag %%v25,%%v25,%%v5 \n\t"
|
||||
"vag %%v25,%%v25,%%v5 \n\t"
|
||||
|
||||
//compare with the running maximum from previous iterations
|
||||
"vfchdb %%v30, %%v27,%%v6 \n\t"
|
||||
"vsel %%v7,%%v25,%%v7,%%v30 \n\t"
|
||||
"vsel %%v6,%%v27,%%v6,%%v30 \n\t"
|
||||
"vfchdb %%v30, %%v27,%%v6 \n\t"
|
||||
"vsel %%v7,%%v25,%%v7,%%v30 \n\t"
|
||||
"vsel %%v6,%%v27,%%v6,%%v30 \n\t"
|
||||
|
||||
"vag %%v5,%%v5,%%v4 \n\t"
|
||||
"vag %%v5,%%v5,%%v4 \n\t"
|
||||
|
||||
"clgrjl %2,%%r0,1b \n\t"
|
||||
"clgrjl %[ptr_tmp],%%r0,1b \n\t"
|
||||
|
||||
//extract index
|
||||
"vrepg %%v26,%%v6,1 \n\t"
|
||||
"vrepg %%v5,%%v7,1 \n\t"
|
||||
"vrepg %%v26,%%v6,1 \n\t"
|
||||
"vrepg %%v5,%%v7,1 \n\t"
|
||||
"wfcdb %%v26,%%v6 \n\t"
|
||||
"jne 2f \n\t"
|
||||
"vsteg %%v6,%1,0 \n\t"
|
||||
"jne 2f \n\t"
|
||||
"vsteg %%v6,%[maxf],0 \n\t"
|
||||
"vmnlg %%v1,%%v5,%%v7 \n\t"
|
||||
"vlgvg %0,%%v1,0 \n\t"
|
||||
"j 3 \n\t"
|
||||
"2: \n\t"
|
||||
"wfchdb %%v16,%%v26,%%v6 \n\t"
|
||||
"vsel %%v1,%%v5,%%v7,%%v16 \n\t"
|
||||
"vsel %%v0,%%v26,%%v6,%%v16 \n\t"
|
||||
"vlgvg %0,%%v1,0 \n\t"
|
||||
"std %%f0,%1 \n\t"
|
||||
"3: \n\t"
|
||||
: "=r"(index),"=m"(*maxf),"+&a"(x)
|
||||
: "r"(n), "2"(x)
|
||||
"vlgvg %[index],%%v1,0 \n\t"
|
||||
"j 3 \n\t"
|
||||
"2: \n\t"
|
||||
"wfchdb %%v16,%%v26,%%v6 \n\t"
|
||||
"vsel %%v1,%%v5,%%v7,%%v16 \n\t"
|
||||
"vsel %%v0,%%v26,%%v6,%%v16 \n\t"
|
||||
"vlgvg %[index],%%v1,0 \n\t"
|
||||
"std %%f0,%[maxf] \n\t"
|
||||
"3: \n\t"
|
||||
: [index] "+r"(index) ,[maxf] "=m"(*maxf), [ptr_tmp] "+&a"(x)
|
||||
: [mem] "m"( *(const double (*)[2*n])x), [n] "r"(n), [ptr_x] "r"(x)
|
||||
: "cc","r0", "f0","v0","v1","v2","v3","v4","v5","v6","v7","v16",
|
||||
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
|
||||
|
@ -220,12 +220,12 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
|||
|
||||
if (inc_x == 1) {
|
||||
|
||||
BLASLONG n1 = n & -8;
|
||||
BLASLONG n1 = n & -16;
|
||||
if (n1 > 0) {
|
||||
|
||||
max = ziamax_kernel_8_TUNED(n1, x, &maxf);
|
||||
|
||||
max = ziamax_kernel_16_TUNED(n1, x, &maxf);
|
||||
i = n1;
|
||||
ix = n1 << 1;
|
||||
}
|
||||
|
||||
while(i < n)
|
||||
|
|
|
@ -35,16 +35,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
/**
|
||||
* Find minimum index
|
||||
* Warning: requirements n>0 and n % 8 == 0
|
||||
* Warning: requirements n>0 and n % 16 == 0
|
||||
* @param n
|
||||
* @param x pointer to the vector
|
||||
* @param minf (out) minimum absolute value (output only)
|
||||
* @return minimum index
|
||||
*/
|
||||
static BLASLONG ziamin_kernel_8_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
|
||||
static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
|
||||
BLASLONG index ;
|
||||
__asm__(
|
||||
"pfd 1, 0(%4) \n\t"
|
||||
"pfd 1, 0(%[ptr_x]) \n\t"
|
||||
"vleig %%v16,0,0 \n\t"
|
||||
"vleig %%v16,1,1 \n\t"
|
||||
"vleig %%v17,2,0 \n\t"
|
||||
|
@ -61,143 +61,143 @@ static BLASLONG ziamin_kernel_8_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
|
|||
"vleig %%v22,13,1 \n\t"
|
||||
"vleig %%v23,14,0 \n\t"
|
||||
"vleig %%v23,15,1 \n\t"
|
||||
"ld %%f6,0(%4) \n\t"
|
||||
"lpdbr %%f6,%%f6 \n\t"
|
||||
"ld %%f7,8(%4) \n\t"
|
||||
"lpdbr %%f7,%%f7 \n\t"
|
||||
"adbr %%f6,%%f7 \n\t"
|
||||
"sllg %%r0,%3,4 \n\t"
|
||||
"agr %%r0,%4 \n\t"
|
||||
"vrepg %%v6,%%v6,0 \n\t"
|
||||
"vzero %%v7 \n\t"
|
||||
"vrepig %%v4,16 \n\t"
|
||||
"vzero %%v5 \n\t"
|
||||
"ld %%f6,0(%[ptr_x]) \n\t"
|
||||
"lpdbr %%f6,%%f6 \n\t"
|
||||
"ld %%f7,8(%[ptr_x]) \n\t"
|
||||
"lpdbr %%f7,%%f7 \n\t"
|
||||
"adbr %%f6,%%f7 \n\t"
|
||||
"sllg %%r0,%[n],4 \n\t"
|
||||
"agr %%r0,%[ptr_x] \n\t"
|
||||
"vrepg %%v6,%%v6,0 \n\t"
|
||||
"vzero %%v7 \n\t"
|
||||
"vrepig %%v4,16 \n\t"
|
||||
"vzero %%v5 \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"pfd 1, 256(%2 ) \n\t"
|
||||
"1: \n\t"
|
||||
"pfd 1, 256(%[ptr_tmp] ) \n\t"
|
||||
|
||||
"vleg %%v24 , 0( %2),0 \n\t"
|
||||
"vleg %%v25 , 8( %2),0 \n\t"
|
||||
"vleg %%v24 , 16( %2),1 \n\t"
|
||||
"vleg %%v25 , 24( %2),1 \n\t"
|
||||
"vleg %%v26 , 32( %2),0 \n\t"
|
||||
"vleg %%v27 , 40( %2),0 \n\t"
|
||||
"vleg %%v26 , 48( %2),1 \n\t"
|
||||
"vleg %%v27 , 56( %2),1 \n\t"
|
||||
"vleg %%v28 , 64( %2),0 \n\t"
|
||||
"vleg %%v29 , 72( %2),0 \n\t"
|
||||
"vleg %%v28 , 80( %2),1 \n\t"
|
||||
"vleg %%v29 , 88( %2),1 \n\t"
|
||||
"vleg %%v30 , 96( %2),0 \n\t"
|
||||
"vleg %%v31 ,104( %2),0 \n\t"
|
||||
"vleg %%v30 ,112( %2),1 \n\t"
|
||||
"vleg %%v31 ,120( %2),1 \n\t"
|
||||
"vflpdb %%v24, %%v24 \n\t"
|
||||
"vflpdb %%v25, %%v25 \n\t"
|
||||
"vflpdb %%v26, %%v26 \n\t"
|
||||
"vflpdb %%v27, %%v27 \n\t"
|
||||
"vflpdb %%v28, %%v28 \n\t"
|
||||
"vflpdb %%v29, %%v29 \n\t"
|
||||
"vflpdb %%v30, %%v30 \n\t"
|
||||
"vflpdb %%v31, %%v31 \n\t"
|
||||
"vleg %%v24 , 0(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v25 , 8(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v24 , 16(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v25 , 24(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v26 , 32(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v27 , 40(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v26 , 48(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v27 , 56(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v28 , 64(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v29 , 72(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v28 , 80(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v29 , 88(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v30 , 96(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v31 ,104(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v30 ,112(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v31 ,120(%[ptr_tmp]),1 \n\t"
|
||||
"vflpdb %%v24, %%v24 \n\t"
|
||||
"vflpdb %%v25, %%v25 \n\t"
|
||||
"vflpdb %%v26, %%v26 \n\t"
|
||||
"vflpdb %%v27, %%v27 \n\t"
|
||||
"vflpdb %%v28, %%v28 \n\t"
|
||||
"vflpdb %%v29, %%v29 \n\t"
|
||||
"vflpdb %%v30, %%v30 \n\t"
|
||||
"vflpdb %%v31, %%v31 \n\t"
|
||||
|
||||
"vfadb %%v0,%%v24,%%v25 \n\t"
|
||||
"vfadb %%v1,%%v26,%%v27 \n\t"
|
||||
"vfadb %%v2,%%v28,%%v29 \n\t"
|
||||
"vfadb %%v3,%%v30,%%v31 \n\t"
|
||||
"vfadb %%v0,%%v24,%%v25 \n\t"
|
||||
"vfadb %%v1,%%v26,%%v27 \n\t"
|
||||
"vfadb %%v2,%%v28,%%v29 \n\t"
|
||||
"vfadb %%v3,%%v30,%%v31 \n\t"
|
||||
|
||||
|
||||
"vleg %%v24 ,128( %2),0 \n\t"
|
||||
"vleg %%v25 ,136( %2),0 \n\t"
|
||||
"vleg %%v24 ,144( %2),1 \n\t"
|
||||
"vleg %%v25 ,152( %2),1 \n\t"
|
||||
"vleg %%v26 ,160( %2),0 \n\t"
|
||||
"vleg %%v27 ,168( %2),0 \n\t"
|
||||
"vleg %%v26 ,176( %2),1 \n\t"
|
||||
"vleg %%v27 ,184( %2),1 \n\t"
|
||||
"vleg %%v28 ,192( %2),0 \n\t"
|
||||
"vleg %%v29 ,200( %2),0 \n\t"
|
||||
"vleg %%v28 ,208( %2),1 \n\t"
|
||||
"vleg %%v29 ,216( %2),1 \n\t"
|
||||
"vleg %%v30 ,224( %2),0 \n\t"
|
||||
"vleg %%v31 ,232( %2),0 \n\t"
|
||||
"vleg %%v30 ,240( %2),1 \n\t"
|
||||
"vleg %%v31 ,248( %2),1 \n\t"
|
||||
"vflpdb %%v24, %%v24 \n\t"
|
||||
"vflpdb %%v25, %%v25 \n\t"
|
||||
"vflpdb %%v26, %%v26 \n\t"
|
||||
"vflpdb %%v27, %%v27 \n\t"
|
||||
"vflpdb %%v28, %%v28 \n\t"
|
||||
"vflpdb %%v29, %%v29 \n\t"
|
||||
"vflpdb %%v30, %%v30 \n\t"
|
||||
"vflpdb %%v31, %%v31 \n\t"
|
||||
"vleg %%v24 ,128(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v25 ,136(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v24 ,144(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v25 ,152(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v26 ,160(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v27 ,168(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v26 ,176(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v27 ,184(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v28 ,192(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v29 ,200(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v28 ,208(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v29 ,216(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v30 ,224(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v31 ,232(%[ptr_tmp]),0 \n\t"
|
||||
"vleg %%v30 ,240(%[ptr_tmp]),1 \n\t"
|
||||
"vleg %%v31 ,248(%[ptr_tmp]),1 \n\t"
|
||||
"vflpdb %%v24, %%v24 \n\t"
|
||||
"vflpdb %%v25, %%v25 \n\t"
|
||||
"vflpdb %%v26, %%v26 \n\t"
|
||||
"vflpdb %%v27, %%v27 \n\t"
|
||||
"vflpdb %%v28, %%v28 \n\t"
|
||||
"vflpdb %%v29, %%v29 \n\t"
|
||||
"vflpdb %%v30, %%v30 \n\t"
|
||||
"vflpdb %%v31, %%v31 \n\t"
|
||||
|
||||
"vfadb %%v24,%%v24,%%v25 \n\t"
|
||||
"vfadb %%v26,%%v26,%%v27 \n\t"
|
||||
"vfadb %%v28,%%v28,%%v29 \n\t"
|
||||
"vfadb %%v30,%%v30,%%v31 \n\t"
|
||||
"vfadb %%v24,%%v24,%%v25 \n\t"
|
||||
"vfadb %%v26,%%v26,%%v27 \n\t"
|
||||
"vfadb %%v28,%%v28,%%v29 \n\t"
|
||||
"vfadb %%v30,%%v30,%%v31 \n\t"
|
||||
|
||||
|
||||
"vfchdb %%v25,%%v0 ,%%v1 \n\t"
|
||||
"vsel %%v29,%%v17,%%v16,%%v25 \n\t"
|
||||
"vsel %%v31,%%v1,%%v0,%%v25 \n\t"
|
||||
"vfchdb %%v25,%%v0 ,%%v1 \n\t"
|
||||
"vsel %%v29,%%v17,%%v16,%%v25 \n\t"
|
||||
"vsel %%v31,%%v1,%%v0,%%v25 \n\t"
|
||||
|
||||
"vfchdb %%v27,%%v2,%%v3 \n\t"
|
||||
"vsel %%v0,%%v19,%%v18,%%v27 \n\t"
|
||||
"vsel %%v1,%%v3,%%v2,%%v27 \n\t"
|
||||
"vfchdb %%v27,%%v2,%%v3 \n\t"
|
||||
"vsel %%v0,%%v19,%%v18,%%v27 \n\t"
|
||||
"vsel %%v1,%%v3,%%v2,%%v27 \n\t"
|
||||
|
||||
"vfchdb %%v25,%%v24,%%v26 \n\t"
|
||||
"vsel %%v2,%%v21,%%v20,%%v25 \n\t"
|
||||
"vsel %%v3,%%v26,%%v24,%%v25 \n\t"
|
||||
"vfchdb %%v25,%%v24,%%v26 \n\t"
|
||||
"vsel %%v2,%%v21,%%v20,%%v25 \n\t"
|
||||
"vsel %%v3,%%v26,%%v24,%%v25 \n\t"
|
||||
|
||||
"vfchdb %%v27,%%v28,%%v30 \n\t"
|
||||
"vsel %%v25,%%v23,%%v22,%%v27 \n\t"
|
||||
"vsel %%v27,%%v30,%%v28,%%v27 \n\t"
|
||||
"vfchdb %%v27,%%v28,%%v30 \n\t"
|
||||
"vsel %%v25,%%v23,%%v22,%%v27 \n\t"
|
||||
"vsel %%v27,%%v30,%%v28,%%v27 \n\t"
|
||||
|
||||
"vfchdb %%v24,%%v31, %%v1 \n\t"
|
||||
"vsel %%v26,%%v0,%%v29,%%v24 \n\t"
|
||||
"vsel %%v28,%%v1,%%v31,%%v24 \n\t"
|
||||
"vfchdb %%v24,%%v31, %%v1 \n\t"
|
||||
"vsel %%v26,%%v0,%%v29,%%v24 \n\t"
|
||||
"vsel %%v28,%%v1,%%v31,%%v24 \n\t"
|
||||
|
||||
"vfchdb %%v30,%%v3, %%v27 \n\t"
|
||||
"vsel %%v29,%%v25,%%v2,%%v30 \n\t"
|
||||
"vsel %%v31,%%v27,%%v3 ,%%v30 \n\t"
|
||||
"vfchdb %%v30,%%v3, %%v27 \n\t"
|
||||
"vsel %%v29,%%v25,%%v2,%%v30 \n\t"
|
||||
"vsel %%v31,%%v27,%%v3 ,%%v30 \n\t"
|
||||
|
||||
"la %2,256(%2) \n\t"
|
||||
"la %[ptr_tmp],256(%[ptr_tmp]) \n\t"
|
||||
|
||||
"vfchdb %%v0,%%v28, %%v31 \n\t"
|
||||
"vsel %%v25,%%v29,%%v26,%%v0 \n\t"
|
||||
"vsel %%v27,%%v31,%%v28,%%v0 \n\t"
|
||||
"vfchdb %%v0,%%v28, %%v31 \n\t"
|
||||
"vsel %%v25,%%v29,%%v26,%%v0 \n\t"
|
||||
"vsel %%v27,%%v31,%%v28,%%v0 \n\t"
|
||||
|
||||
"vag %%v25,%%v25,%%v5 \n\t"
|
||||
"vag %%v25,%%v25,%%v5 \n\t"
|
||||
|
||||
//compare with the running minimum from previous iterations
|
||||
"vfchdb %%v30,%%v6 , %%v27 \n\t"
|
||||
"vsel %%v7,%%v25,%%v7,%%v30 \n\t"
|
||||
"vsel %%v6,%%v27,%%v6,%%v30 \n\t"
|
||||
"vfchdb %%v30,%%v6 , %%v27 \n\t"
|
||||
"vsel %%v7,%%v25,%%v7,%%v30 \n\t"
|
||||
"vsel %%v6,%%v27,%%v6,%%v30 \n\t"
|
||||
|
||||
"vag %%v5,%%v5,%%v4 \n\t"
|
||||
"vag %%v5,%%v5,%%v4 \n\t"
|
||||
|
||||
"clgrjl %2,%%r0,1b \n\t"
|
||||
"clgrjl %[ptr_tmp],%%r0,1b \n\t"
|
||||
|
||||
//extract index
|
||||
"vrepg %%v26,%%v6,1 \n\t"
|
||||
"vrepg %%v5,%%v7,1 \n\t"
|
||||
"wfcdb %%v26,%%v6 \n\t"
|
||||
"jne 2f \n\t"
|
||||
"vsteg %%v6,%1,0 \n\t"
|
||||
"vrepg %%v26,%%v6,1 \n\t"
|
||||
"vrepg %%v5,%%v7,1 \n\t"
|
||||
"wfcdb %%v26,%%v6 \n\t"
|
||||
"jne 2f \n\t"
|
||||
"vsteg %%v6,%[minf],0 \n\t"
|
||||
"vmnlg %%v1,%%v5,%%v7 \n\t"
|
||||
"vlgvg %0,%%v1,0 \n\t"
|
||||
"j 3f \n\t"
|
||||
"vlgvg %[index],%%v1,0 \n\t"
|
||||
"j 3f \n\t"
|
||||
"2: \n\t"
|
||||
"wfchdb %%v16,%%v6 ,%%v26 \n\t"
|
||||
"vsel %%v1,%%v5,%%v7,%%v16 \n\t"
|
||||
"vsel %%v0,%%v26,%%v6,%%v16 \n\t"
|
||||
"vlgvg %0,%%v1,0 \n\t"
|
||||
"std %%f0,%1 \n\t"
|
||||
"wfchdb %%v16,%%v6 ,%%v26 \n\t"
|
||||
"vsel %%v1,%%v5,%%v7,%%v16 \n\t"
|
||||
"vsel %%v0,%%v26,%%v6,%%v16 \n\t"
|
||||
"vlgvg %[index],%%v1,0 \n\t"
|
||||
"std %%f0,%[minf] \n\t"
|
||||
"3: \n\t"
|
||||
|
||||
: "+r"(index) ,"=m"(*minf), "+&a"(x)
|
||||
: "r"(n), "2"(x)
|
||||
: [index] "+r"(index) ,[minf] "=m"(*minf), [ptr_tmp] "+&a"(x)
|
||||
: [mem] "m"( *(const double (*)[2*n])x), [n] "r"(n), [ptr_x] "r"(x)
|
||||
: "cc","r0","f0","v0","v1","v2","v3","v4","v5","v6","v7","v16",
|
||||
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
|
||||
|
@ -224,12 +224,12 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
|||
|
||||
if (inc_x == 1) {
|
||||
|
||||
BLASLONG n1 = n & -8;
|
||||
BLASLONG n1 = n & -16;
|
||||
if (n1 > 0) {
|
||||
|
||||
min = ziamin_kernel_8_TUNED(n1, x, &minf);
|
||||
|
||||
min = ziamin_kernel_16_TUNED(n1, x, &minf);
|
||||
i = n1;
|
||||
ix = n1 << 1;
|
||||
}
|
||||
else {
|
||||
//assign minf
|
||||
|
|
|
@ -44,65 +44,65 @@ static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x) {
|
|||
|
||||
FLOAT asum;
|
||||
__asm__ (
|
||||
"pfd 1, 0(%3) \n\t"
|
||||
"sllg %%r0,%2,4 \n\t"
|
||||
"agr %%r0,%3 \n\t"
|
||||
"vzero %%v0 \n\t"
|
||||
"vzero %%v1 \n\t"
|
||||
"vzero %%v22 \n\t"
|
||||
"vzero %%v23 \n\t"
|
||||
"pfd 1, 0(%[ptr_x]) \n\t"
|
||||
"sllg %%r0,%[n],4 \n\t"
|
||||
"agr %%r0,%[ptr_x] \n\t"
|
||||
"vzero %%v0 \n\t"
|
||||
"vzero %%v1 \n\t"
|
||||
"vzero %%v22 \n\t"
|
||||
"vzero %%v23 \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"pfd 1, 256(%1 ) \n\t"
|
||||
"vlm %%v24,%%v31,0(%1) \n\t"
|
||||
"1: \n\t"
|
||||
"pfd 1, 256(%[ptr_tmp] ) \n\t"
|
||||
"vlm %%v24,%%v31,0(%[ptr_tmp]) \n\t"
|
||||
|
||||
"vflpdb %%v24, %%v24 \n\t"
|
||||
"vflpdb %%v25, %%v25 \n\t"
|
||||
"vflpdb %%v26, %%v26 \n\t"
|
||||
"vflpdb %%v27, %%v27 \n\t"
|
||||
"vflpdb %%v28, %%v28 \n\t"
|
||||
"vflpdb %%v29, %%v29 \n\t"
|
||||
"vflpdb %%v30, %%v30 \n\t"
|
||||
"vflpdb %%v31, %%v31 \n\t"
|
||||
"vflpdb %%v24, %%v24 \n\t"
|
||||
"vflpdb %%v25, %%v25 \n\t"
|
||||
"vflpdb %%v26, %%v26 \n\t"
|
||||
"vflpdb %%v27, %%v27 \n\t"
|
||||
"vflpdb %%v28, %%v28 \n\t"
|
||||
"vflpdb %%v29, %%v29 \n\t"
|
||||
"vflpdb %%v30, %%v30 \n\t"
|
||||
"vflpdb %%v31, %%v31 \n\t"
|
||||
|
||||
"vfadb %%v0,%%v0,%%v24 \n\t"
|
||||
"vfadb %%v1,%%v1,%%v25 \n\t"
|
||||
"vfadb %%v23,%%v23,%%v26 \n\t"
|
||||
"vfadb %%v22,%%v22,%%v27 \n\t"
|
||||
"vfadb %%v0,%%v0,%%v28 \n\t"
|
||||
"vfadb %%v1,%%v1,%%v29 \n\t"
|
||||
"vfadb %%v23,%%v23,%%v30 \n\t"
|
||||
"vfadb %%v22,%%v22,%%v31 \n\t"
|
||||
"vfadb %%v0,%%v0,%%v24 \n\t"
|
||||
"vfadb %%v1,%%v1,%%v25 \n\t"
|
||||
"vfadb %%v23,%%v23,%%v26 \n\t"
|
||||
"vfadb %%v22,%%v22,%%v27 \n\t"
|
||||
"vfadb %%v0,%%v0,%%v28 \n\t"
|
||||
"vfadb %%v1,%%v1,%%v29 \n\t"
|
||||
"vfadb %%v23,%%v23,%%v30 \n\t"
|
||||
"vfadb %%v22,%%v22,%%v31 \n\t"
|
||||
|
||||
"vlm %%v24,%%v31, 128(%1 ) \n\t"
|
||||
"vlm %%v24,%%v31, 128(%[ptr_tmp]) \n\t"
|
||||
|
||||
"vflpdb %%v24, %%v24 \n\t"
|
||||
"vflpdb %%v25, %%v25 \n\t"
|
||||
"vflpdb %%v26, %%v26 \n\t"
|
||||
"vflpdb %%v27, %%v27 \n\t"
|
||||
"vflpdb %%v28, %%v28 \n\t"
|
||||
"vflpdb %%v29, %%v29 \n\t"
|
||||
"vflpdb %%v30, %%v30 \n\t"
|
||||
"vflpdb %%v31, %%v31 \n\t"
|
||||
"la %1,256(%1) \n\t"
|
||||
"vfadb %%v0,%%v0,%%v24 \n\t"
|
||||
"vfadb %%v1,%%v1,%%v25 \n\t"
|
||||
"vfadb %%v23,%%v23,%%v26 \n\t"
|
||||
"vfadb %%v22,%%v22,%%v27 \n\t"
|
||||
"vfadb %%v0,%%v0,%%v28 \n\t"
|
||||
"vfadb %%v1,%%v1,%%v29 \n\t"
|
||||
"vfadb %%v23,%%v23,%%v30 \n\t"
|
||||
"vfadb %%v22,%%v22,%%v31 \n\t"
|
||||
"vflpdb %%v24, %%v24 \n\t"
|
||||
"vflpdb %%v25, %%v25 \n\t"
|
||||
"vflpdb %%v26, %%v26 \n\t"
|
||||
"vflpdb %%v27, %%v27 \n\t"
|
||||
"vflpdb %%v28, %%v28 \n\t"
|
||||
"vflpdb %%v29, %%v29 \n\t"
|
||||
"vflpdb %%v30, %%v30 \n\t"
|
||||
"vflpdb %%v31, %%v31 \n\t"
|
||||
"la %[ptr_tmp],256(%[ptr_tmp]) \n\t"
|
||||
"vfadb %%v0,%%v0,%%v24 \n\t"
|
||||
"vfadb %%v1,%%v1,%%v25 \n\t"
|
||||
"vfadb %%v23,%%v23,%%v26 \n\t"
|
||||
"vfadb %%v22,%%v22,%%v27 \n\t"
|
||||
"vfadb %%v0,%%v0,%%v28 \n\t"
|
||||
"vfadb %%v1,%%v1,%%v29 \n\t"
|
||||
"vfadb %%v23,%%v23,%%v30 \n\t"
|
||||
"vfadb %%v22,%%v22,%%v31 \n\t"
|
||||
|
||||
"clgrjl %1,%%r0,1b \n\t"
|
||||
"vfadb %%v24,%%v0,%%v1 \n\t"
|
||||
"vfadb %%v25,%%v23,%%v22 \n\t"
|
||||
"vfadb %%v0,%%v25,%%v24 \n\t"
|
||||
"vrepg %%v1,%%v0,1 \n\t"
|
||||
"adbr %%f0,%%f1 \n\t"
|
||||
"ldr %0 ,%%f0"
|
||||
: "=f"(asum),"+&a"(x)
|
||||
: "r"(n), "1"(x)
|
||||
"clgrjl %[ptr_tmp],%%r0,1b \n\t"
|
||||
"vfadb %%v24,%%v0,%%v1 \n\t"
|
||||
"vfadb %%v25,%%v23,%%v22 \n\t"
|
||||
"vfadb %%v0,%%v25,%%v24 \n\t"
|
||||
"vrepg %%v1,%%v0,1 \n\t"
|
||||
"adbr %%f0,%%f1 \n\t"
|
||||
"ldr %[asum] ,%%f0"
|
||||
: [asum] "=f"(asum),[ptr_tmp] "+&a"(x)
|
||||
: [mem] "m"( *(const double (*)[2*n])x ), [n] "r"(n), [ptr_x] "a"(x)
|
||||
: "cc", "r0","f0","f1","v0","v1","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
return asum;
|
||||
|
|
|
@ -28,36 +28,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#include "common.h"
|
||||
|
||||
static void __attribute__ ((noinline)) zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) {
|
||||
__asm__ ("pfd 1, 0(%1) \n\t"
|
||||
"pfd 2, 0(%2) \n\t"
|
||||
"vlrepg %%v28 , 0(%3) \n\t"
|
||||
"vlrepg %%v29, 8(%3) \n\t"
|
||||
"srlg %3,%0,3 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT da_r,FLOAT da_i) {
|
||||
__asm__ ("pfd 1, 0(%[x_tmp]) \n\t"
|
||||
"pfd 2, 0(%[y_tmp]) \n\t"
|
||||
"lgdr %%r1,%[alpha_r] \n\t"
|
||||
"vlvgp %%v28,%%r1,%%r1 \n\t"
|
||||
"lgdr %%r1,%[alpha_i] \n\t"
|
||||
"vlvgp %%v29,%%r1,%%r1 \n\t"
|
||||
"sllg %[tmp],%[tmp],4 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"pfd 1, 256(%%r1,%1) \n\t"
|
||||
"pfd 2, 256(%%r1,%2) \n\t"
|
||||
"vleg %%v16 , 0(%%r1,%2),0 \n\t"
|
||||
"vleg %%v17 , 8(%%r1,%2),0 \n\t"
|
||||
"vleg %%v16 , 16(%%r1,%2),1 \n\t"
|
||||
"vleg %%v17 , 24(%%r1,%2),1 \n\t"
|
||||
|
||||
"vleg %%v18 , 32(%%r1,%2),0 \n\t"
|
||||
"vleg %%v19 , 40(%%r1,%2),0 \n\t"
|
||||
"vleg %%v18 , 48(%%r1,%2),1 \n\t"
|
||||
"vleg %%v19 , 56(%%r1,%2),1 \n\t"
|
||||
|
||||
"vleg %%v24 , 0(%%r1,%1),0 \n\t"
|
||||
"vleg %%v25 , 8(%%r1,%1),0 \n\t"
|
||||
"vleg %%v24 , 16(%%r1,%1),1 \n\t"
|
||||
"vleg %%v25 , 24(%%r1,%1),1 \n\t"
|
||||
|
||||
"vleg %%v26 , 32(%%r1,%1),0 \n\t"
|
||||
"vleg %%v27 , 40(%%r1,%1),0 \n\t"
|
||||
"vleg %%v26 , 48(%%r1,%1),1 \n\t"
|
||||
"vleg %%v27 , 56(%%r1,%1),1 \n\t"
|
||||
"1: \n\t"
|
||||
"pfd 1, 256(%%r1,%[x_tmp]) \n\t"
|
||||
"pfd 2, 256(%%r1,%[y_tmp]) \n\t"
|
||||
"vleg %%v16 , 0(%%r1,%[y_tmp]),0 \n\t"
|
||||
"vleg %%v17 , 8(%%r1,%[y_tmp]),0 \n\t"
|
||||
"vleg %%v16 , 16(%%r1,%[y_tmp]),1 \n\t"
|
||||
"vleg %%v17 , 24(%%r1,%[y_tmp]),1 \n\t"
|
||||
"vleg %%v18 , 32(%%r1,%[y_tmp]),0 \n\t"
|
||||
"vleg %%v19 , 40(%%r1,%[y_tmp]),0 \n\t"
|
||||
"vleg %%v18 , 48(%%r1,%[y_tmp]),1 \n\t"
|
||||
"vleg %%v19 , 56(%%r1,%[y_tmp]),1 \n\t"
|
||||
"vleg %%v24 , 0(%%r1,%[x_tmp]),0 \n\t"
|
||||
"vleg %%v25 , 8(%%r1,%[x_tmp]),0 \n\t"
|
||||
"vleg %%v24 , 16(%%r1,%[x_tmp]),1 \n\t"
|
||||
"vleg %%v25 , 24(%%r1,%[x_tmp]),1 \n\t"
|
||||
"vleg %%v26 , 32(%%r1,%[x_tmp]),0 \n\t"
|
||||
"vleg %%v27 , 40(%%r1,%[x_tmp]),0 \n\t"
|
||||
"vleg %%v26 , 48(%%r1,%[x_tmp]),1 \n\t"
|
||||
"vleg %%v27 , 56(%%r1,%[x_tmp]),1 \n\t"
|
||||
#if !defined(CONJ)
|
||||
"vfmsdb %%v16, %%v25, %%v29,%%v16 \n\t"
|
||||
"vfmadb %%v17, %%v24, %%v29, %%v17 \n\t"
|
||||
|
@ -79,35 +79,35 @@ static void __attribute__ ((noinline)) zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOA
|
|||
"vfmsdb %%v19, %%v26, %%v29, %%v19 \n\t"
|
||||
|
||||
#endif
|
||||
"vsteg %%v16 , 0(%%r1,%2),0 \n\t"
|
||||
"vsteg %%v17 , 8(%%r1,%2),0 \n\t"
|
||||
"vsteg %%v16 , 16(%%r1,%2),1 \n\t"
|
||||
"vsteg %%v17 , 24(%%r1,%2),1 \n\t"
|
||||
"vsteg %%v16 , 0(%%r1,%[y_tmp]),0 \n\t"
|
||||
"vsteg %%v17 , 8(%%r1,%[y_tmp]),0 \n\t"
|
||||
"vsteg %%v16 , 16(%%r1,%[y_tmp]),1 \n\t"
|
||||
"vsteg %%v17 , 24(%%r1,%[y_tmp]),1 \n\t"
|
||||
|
||||
"vsteg %%v18 , 32(%%r1,%2),0 \n\t"
|
||||
"vsteg %%v19 , 40(%%r1,%2),0 \n\t"
|
||||
"vsteg %%v18 , 48(%%r1,%2),1 \n\t"
|
||||
"vsteg %%v19 , 56(%%r1,%2),1 \n\t"
|
||||
"vsteg %%v18 , 32(%%r1,%[y_tmp]),0 \n\t"
|
||||
"vsteg %%v19 , 40(%%r1,%[y_tmp]),0 \n\t"
|
||||
"vsteg %%v18 , 48(%%r1,%[y_tmp]),1 \n\t"
|
||||
"vsteg %%v19 , 56(%%r1,%[y_tmp]),1 \n\t"
|
||||
|
||||
"vleg %%v20 , 64(%%r1,%2),0 \n\t"
|
||||
"vleg %%v21 , 72(%%r1,%2),0 \n\t"
|
||||
"vleg %%v20 , 80(%%r1,%2),1 \n\t"
|
||||
"vleg %%v21 , 88(%%r1,%2),1 \n\t"
|
||||
"vleg %%v20 , 64(%%r1,%[y_tmp]),0 \n\t"
|
||||
"vleg %%v21 , 72(%%r1,%[y_tmp]),0 \n\t"
|
||||
"vleg %%v20 , 80(%%r1,%[y_tmp]),1 \n\t"
|
||||
"vleg %%v21 , 88(%%r1,%[y_tmp]),1 \n\t"
|
||||
|
||||
"vleg %%v22 , 96(%%r1,%2),0 \n\t"
|
||||
"vleg %%v23 , 104(%%r1,%2),0 \n\t"
|
||||
"vleg %%v22 , 112(%%r1,%2),1 \n\t"
|
||||
"vleg %%v23 , 120(%%r1,%2),1 \n\t"
|
||||
"vleg %%v22 , 96(%%r1,%[y_tmp]),0 \n\t"
|
||||
"vleg %%v23 , 104(%%r1,%[y_tmp]),0 \n\t"
|
||||
"vleg %%v22 , 112(%%r1,%[y_tmp]),1 \n\t"
|
||||
"vleg %%v23 , 120(%%r1,%[y_tmp]),1 \n\t"
|
||||
|
||||
"vleg %%v24 , 64(%%r1,%1),0 \n\t"
|
||||
"vleg %%v25 , 72(%%r1,%1),0 \n\t"
|
||||
"vleg %%v24 , 80(%%r1,%1),1 \n\t"
|
||||
"vleg %%v25 , 88(%%r1,%1),1 \n\t"
|
||||
"vleg %%v24 , 64(%%r1,%[x_tmp]),0 \n\t"
|
||||
"vleg %%v25 , 72(%%r1,%[x_tmp]),0 \n\t"
|
||||
"vleg %%v24 , 80(%%r1,%[x_tmp]),1 \n\t"
|
||||
"vleg %%v25 , 88(%%r1,%[x_tmp]),1 \n\t"
|
||||
|
||||
"vleg %%v26 , 96(%%r1,%1),0 \n\t"
|
||||
"vleg %%v27 , 104(%%r1,%1),0 \n\t"
|
||||
"vleg %%v26 , 112(%%r1,%1),1 \n\t"
|
||||
"vleg %%v27 , 120(%%r1,%1),1 \n\t"
|
||||
"vleg %%v26 , 96(%%r1,%[x_tmp]),0 \n\t"
|
||||
"vleg %%v27 , 104(%%r1,%[x_tmp]),0 \n\t"
|
||||
"vleg %%v26 , 112(%%r1,%[x_tmp]),1 \n\t"
|
||||
"vleg %%v27 , 120(%%r1,%[x_tmp]),1 \n\t"
|
||||
#if !defined(CONJ)
|
||||
"vfmsdb %%v20, %%v25, %%v29,%%v20 \n\t"
|
||||
"vfmadb %%v21, %%v24, %%v29, %%v21 \n\t"
|
||||
|
@ -128,21 +128,21 @@ static void __attribute__ ((noinline)) zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOA
|
|||
"vfmadb %%v22, %%v26, %%v28, %%v22 \n\t"
|
||||
"vfmsdb %%v23, %%v26, %%v29, %%v23 \n\t"
|
||||
#endif
|
||||
"vsteg %%v20 , 64(%%r1,%2),0 \n\t"
|
||||
"vsteg %%v21 , 72(%%r1,%2),0 \n\t"
|
||||
"vsteg %%v20 , 80(%%r1,%2),1 \n\t"
|
||||
"vsteg %%v21 , 88(%%r1,%2),1 \n\t"
|
||||
"vsteg %%v20 , 64(%%r1,%[y_tmp]),0 \n\t"
|
||||
"vsteg %%v21 , 72(%%r1,%[y_tmp]),0 \n\t"
|
||||
"vsteg %%v20 , 80(%%r1,%[y_tmp]),1 \n\t"
|
||||
"vsteg %%v21 , 88(%%r1,%[y_tmp]),1 \n\t"
|
||||
|
||||
"vsteg %%v22 , 96(%%r1,%2),0 \n\t"
|
||||
"vsteg %%v23 , 104(%%r1,%2),0 \n\t"
|
||||
"vsteg %%v22 , 112(%%r1,%2),1 \n\t"
|
||||
"vsteg %%v23 , 120(%%r1,%2),1 \n\t"
|
||||
"vsteg %%v22 , 96(%%r1,%[y_tmp]),0 \n\t"
|
||||
"vsteg %%v23 , 104(%%r1,%[y_tmp]),0 \n\t"
|
||||
"vsteg %%v22 , 112(%%r1,%[y_tmp]),1 \n\t"
|
||||
"vsteg %%v23 , 120(%%r1,%[y_tmp]),1 \n\t"
|
||||
|
||||
"la %%r1,128(%%r1) \n\t"
|
||||
"brctg %3,1b"
|
||||
:
|
||||
: "r"(n), "a"(x), "a"(y), "a"(alpha)
|
||||
: "cc", "memory", "r1","v16",
|
||||
"la %%r1,128(%%r1) \n\t"
|
||||
"clgrjl %%r1,%[tmp],1b \n\t"
|
||||
: [mem_y] "+m" (*(double (*)[2*n])y),[tmp]"+&r"(n)
|
||||
: [mem_x] "m" (*(const double (*)[2*n])x), [x_tmp] "a"(x), [y_tmp] "a"(y), [alpha_r] "f"(da_r),[alpha_i] "f"(da_i)
|
||||
: "cc", "r1","v16",
|
||||
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29"
|
||||
);
|
||||
|
||||
|
@ -151,7 +151,6 @@ static void __attribute__ ((noinline)) zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOA
|
|||
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) {
|
||||
BLASLONG i = 0;
|
||||
BLASLONG ix = 0, iy = 0;
|
||||
FLOAT da[2];
|
||||
|
||||
if (n <= 0) return (0);
|
||||
|
||||
|
@ -160,9 +159,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
|
|||
BLASLONG n1 = n & -8;
|
||||
|
||||
if (n1) {
|
||||
da[0] = da_r;
|
||||
da[1] = da_i;
|
||||
zaxpy_kernel_8(n1, x, y, da);
|
||||
zaxpy_kernel_8(n1, x, y, da_r,da_i);
|
||||
ix = 2 * n1;
|
||||
}
|
||||
i = n1;
|
||||
|
|
|
@ -27,62 +27,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#include "common.h"
|
||||
|
||||
static void __attribute__ ((noinline)) zcopy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) {
|
||||
static void zcopy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) {
|
||||
|
||||
__asm__ volatile(
|
||||
"pfd 1, 0(%1) \n\t"
|
||||
"pfd 2, 0(%2) \n\t"
|
||||
"srlg %%r0,%0,4 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
"pfd 1, 0(%[ptr_x]) \n\t"
|
||||
"pfd 2, 0(%[ptr_y]) \n\t"
|
||||
"srlg %[n_tmp],%[n_tmp],4 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"pfd 1, 256(%%r1,%1) \n\t"
|
||||
"pfd 2, 256(%%r1,%2) \n\t"
|
||||
"1: \n\t"
|
||||
"pfd 1, 256(%%r1,%[ptr_x]) \n\t"
|
||||
"pfd 2, 256(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vl %%v24, 0(%%r1,%1) \n\t"
|
||||
"vst %%v24, 0(%%r1,%2) \n\t"
|
||||
"vl %%v25, 16(%%r1,%1) \n\t"
|
||||
"vst %%v25, 16(%%r1,%2) \n\t"
|
||||
"vl %%v26, 32(%%r1,%1) \n\t"
|
||||
"vst %%v26, 32(%%r1,%2) \n\t"
|
||||
"vl %%v27, 48(%%r1,%1) \n\t"
|
||||
"vst %%v27, 48(%%r1,%2) \n\t"
|
||||
"vl %%v24, 0(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v24, 0(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v25, 16(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v25, 16(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v26, 32(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v26, 32(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v27, 48(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v27, 48(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vl %%v28, 64(%%r1,%1) \n\t"
|
||||
"vst %%v28, 64(%%r1,%2) \n\t"
|
||||
"vl %%v29, 80(%%r1,%1) \n\t"
|
||||
"vst %%v29, 80(%%r1,%2) \n\t"
|
||||
"vl %%v30, 96(%%r1,%1) \n\t"
|
||||
"vst %%v30, 96(%%r1,%2) \n\t"
|
||||
"vl %%v31,112(%%r1,%1) \n\t"
|
||||
"vst %%v31,112(%%r1,%2) \n\t"
|
||||
"vl %%v28, 64(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v28, 64(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v29, 80(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v29, 80(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v30, 96(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v30, 96(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v31, 112(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v31, 112(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
|
||||
"vl %%v24,128(%%r1,%1) \n\t"
|
||||
"vst %%v24,128(%%r1,%2) \n\t"
|
||||
"vl %%v24, 128(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v24, 128(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vl %%v25,144(%%r1,%1) \n\t"
|
||||
"vst %%v25,144(%%r1,%2) \n\t"
|
||||
"vl %%v25, 144(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v25, 144(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vl %%v26,160(%%r1,%1) \n\t"
|
||||
"vst %%v26,160(%%r1,%2) \n\t"
|
||||
"vl %%v26, 160(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v26, 160(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vl %%v27,176(%%r1,%1) \n\t"
|
||||
"vst %%v27,176(%%r1,%2) \n\t"
|
||||
"vl %%v27, 176(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v27, 176(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vl %%v28, 192(%%r1,%1) \n\t"
|
||||
"vst %%v28, 192(%%r1,%2) \n\t"
|
||||
"vl %%v29, 208(%%r1,%1) \n\t"
|
||||
"vst %%v29, 208(%%r1,%2) \n\t"
|
||||
"vl %%v30, 224(%%r1,%1) \n\t"
|
||||
"vst %%v30, 224(%%r1,%2) \n\t"
|
||||
"vl %%v31, 240(%%r1,%1) \n\t"
|
||||
"vst %%v31, 240(%%r1,%2) \n\t"
|
||||
"la %%r1,256(%%r1) \n\t"
|
||||
"brctg %%r0,1b"
|
||||
:
|
||||
: "r"(n), "a"(x), "a"(y)
|
||||
: "cc", "memory","r0","r1","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
"vl %%v28, 192(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v28, 192(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v29, 208(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v29, 208(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v30, 224(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v30, 224(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v31, 240(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v31, 240(%%r1,%[ptr_y]) \n\t"
|
||||
"la %%r1,256(%%r1) \n\t"
|
||||
"brctg %[n_tmp],1b"
|
||||
: [mem_y] "=m" (*(double (*)[2*n])y), [n_tmp] "+&r"(n)
|
||||
: [mem_x] "m" (*(const double (*)[2*n])x), [ptr_x] "a"(x), [ptr_y] "a"(y)
|
||||
: "cc", "r1", "v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
return;
|
||||
|
||||
|
|
|
@ -32,75 +32,77 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) {
|
||||
|
||||
__asm__ volatile(
|
||||
"pfd 1, 0(%2) \n\t"
|
||||
"pfd 1, 0(%3) \n\t"
|
||||
"pfd 1, 0(%[ptr_x_tmp]) \n\t"
|
||||
"pfd 1, 0(%[ptr_y_tmp]) \n\t"
|
||||
"vzero %%v24 \n\t"
|
||||
"vzero %%v25 \n\t"
|
||||
"vzero %%v26 \n\t"
|
||||
"vzero %%v27 \n\t"
|
||||
"srlg %1,%1,3 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
"srlg %[n_tmp],%[n_tmp],3 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"pfd 1, 256(%%r1,%2) \n\t"
|
||||
"pfd 1, 256(%%r1,%3) \n\t"
|
||||
"vl %%v16, 0(%%r1,%2) \n\t"
|
||||
"vl %%v17, 16(%%r1,%2) \n\t"
|
||||
"vl %%v18, 32(%%r1,%2) \n\t"
|
||||
"vl %%v19, 48(%%r1,%2) \n\t"
|
||||
"vl %%v28, 0(%%r1,%3) \n\t"
|
||||
"vl %%v29, 16(%%r1,%3) \n\t"
|
||||
"vl %%v30, 32(%%r1,%3) \n\t"
|
||||
"vl %%v31, 48(%%r1,%3) \n\t"
|
||||
"vpdi %%v20,%%v16,%%v16,4 \n\t"
|
||||
"vpdi %%v21,%%v17,%%v17,4 \n\t"
|
||||
"vpdi %%v22,%%v18,%%v18,4 \n\t"
|
||||
"vpdi %%v23,%%v19,%%v19,4 \n\t"
|
||||
"1: \n\t"
|
||||
"pfd 1, 256(%%r1,%[ptr_x_tmp]) \n\t"
|
||||
"pfd 1, 256(%%r1,%[ptr_y_tmp]) \n\t"
|
||||
"vl %%v16, 0(%%r1,%[ptr_x_tmp]) \n\t"
|
||||
"vl %%v17, 16(%%r1,%[ptr_x_tmp]) \n\t"
|
||||
"vl %%v18, 32(%%r1,%[ptr_x_tmp]) \n\t"
|
||||
"vl %%v19, 48(%%r1,%[ptr_x_tmp]) \n\t"
|
||||
"vl %%v28, 0(%%r1,%[ptr_y_tmp]) \n\t"
|
||||
"vl %%v29, 16(%%r1,%[ptr_y_tmp]) \n\t"
|
||||
"vl %%v30, 32(%%r1,%[ptr_y_tmp]) \n\t"
|
||||
"vl %%v31, 48(%%r1,%[ptr_y_tmp]) \n\t"
|
||||
"vpdi %%v20,%%v16,%%v16,4 \n\t"
|
||||
"vpdi %%v21,%%v17,%%v17,4 \n\t"
|
||||
"vpdi %%v22,%%v18,%%v18,4 \n\t"
|
||||
"vpdi %%v23,%%v19,%%v19,4 \n\t"
|
||||
|
||||
|
||||
"vfmadb %%v24,%%v16,%%v28,%%v24 \n\t"
|
||||
"vfmadb %%v25,%%v20,%%v28,%%v25 \n\t"
|
||||
"vfmadb %%v26,%%v17,%%v29,%%v26 \n\t"
|
||||
"vfmadb %%v27,%%v21,%%v29,%%v27 \n\t"
|
||||
"vfmadb %%v24,%%v18,%%v30,%%v24 \n\t"
|
||||
"vfmadb %%v25,%%v22,%%v30,%%v25 \n\t"
|
||||
"vfmadb %%v26,%%v19,%%v31,%%v26 \n\t"
|
||||
"vfmadb %%v27,%%v23,%%v31,%%v27 \n\t"
|
||||
"vfmadb %%v24,%%v16,%%v28,%%v24 \n\t"
|
||||
"vfmadb %%v25,%%v20,%%v28,%%v25 \n\t"
|
||||
"vfmadb %%v26,%%v17,%%v29,%%v26 \n\t"
|
||||
"vfmadb %%v27,%%v21,%%v29,%%v27 \n\t"
|
||||
"vfmadb %%v24,%%v18,%%v30,%%v24 \n\t"
|
||||
"vfmadb %%v25,%%v22,%%v30,%%v25 \n\t"
|
||||
"vfmadb %%v26,%%v19,%%v31,%%v26 \n\t"
|
||||
"vfmadb %%v27,%%v23,%%v31,%%v27 \n\t"
|
||||
|
||||
|
||||
|
||||
"vl %%v16, 64(%%r1,%2) \n\t"
|
||||
"vl %%v17, 80(%%r1,%2) \n\t"
|
||||
"vl %%v18, 96(%%r1,%2) \n\t"
|
||||
"vl %%v19,112(%%r1,%2) \n\t"
|
||||
"vl %%v28, 64(%%r1,%3) \n\t"
|
||||
"vl %%v29, 80(%%r1,%3) \n\t"
|
||||
"vl %%v30, 96(%%r1,%3) \n\t"
|
||||
"vl %%v31,112(%%r1,%3) \n\t"
|
||||
"vpdi %%v20,%%v16,%%v16,4 \n\t"
|
||||
"vpdi %%v21,%%v17,%%v17,4 \n\t"
|
||||
"vpdi %%v22,%%v18,%%v18,4 \n\t"
|
||||
"vpdi %%v23,%%v19,%%v19,4 \n\t"
|
||||
"vfmadb %%v24,%%v16,%%v28,%%v24 \n\t"
|
||||
"vfmadb %%v25,%%v20,%%v28,%%v25 \n\t"
|
||||
"vfmadb %%v26,%%v17,%%v29,%%v26 \n\t"
|
||||
"vfmadb %%v27,%%v21,%%v29,%%v27 \n\t"
|
||||
"vfmadb %%v24,%%v18,%%v30,%%v24 \n\t"
|
||||
"vfmadb %%v25,%%v22,%%v30,%%v25 \n\t"
|
||||
"vfmadb %%v26,%%v19,%%v31,%%v26 \n\t"
|
||||
"vfmadb %%v27,%%v23,%%v31,%%v27 \n\t"
|
||||
"vl %%v16, 64(%%r1,%[ptr_x_tmp]) \n\t"
|
||||
"vl %%v17, 80(%%r1,%[ptr_x_tmp]) \n\t"
|
||||
"vl %%v18, 96(%%r1,%[ptr_x_tmp]) \n\t"
|
||||
"vl %%v19,112(%%r1,%[ptr_x_tmp]) \n\t"
|
||||
"vl %%v28, 64(%%r1,%[ptr_y_tmp]) \n\t"
|
||||
"vl %%v29, 80(%%r1,%[ptr_y_tmp]) \n\t"
|
||||
"vl %%v30, 96(%%r1,%[ptr_y_tmp]) \n\t"
|
||||
"vl %%v31,112(%%r1,%[ptr_y_tmp]) \n\t"
|
||||
"vpdi %%v20,%%v16,%%v16,4 \n\t"
|
||||
"vpdi %%v21,%%v17,%%v17,4 \n\t"
|
||||
"vpdi %%v22,%%v18,%%v18,4 \n\t"
|
||||
"vpdi %%v23,%%v19,%%v19,4 \n\t"
|
||||
"vfmadb %%v24,%%v16,%%v28,%%v24 \n\t"
|
||||
"vfmadb %%v25,%%v20,%%v28,%%v25 \n\t"
|
||||
"vfmadb %%v26,%%v17,%%v29,%%v26 \n\t"
|
||||
"vfmadb %%v27,%%v21,%%v29,%%v27 \n\t"
|
||||
"vfmadb %%v24,%%v18,%%v30,%%v24 \n\t"
|
||||
"vfmadb %%v25,%%v22,%%v30,%%v25 \n\t"
|
||||
"vfmadb %%v26,%%v19,%%v31,%%v26 \n\t"
|
||||
"vfmadb %%v27,%%v23,%%v31,%%v27 \n\t"
|
||||
|
||||
|
||||
"la %%r1,128(%%r1) \n\t"
|
||||
"brctg %1,1b \n\t"
|
||||
"vfadb %%v24,%%v26,%%v24 \n\t"
|
||||
"vfadb %%v25,%%v25,%%v27 \n\t"
|
||||
"vsteg %%v24,0(%4),0 \n\t"
|
||||
"vsteg %%v24,8(%4),1 \n\t"
|
||||
"vsteg %%v25,16(%4),1 \n\t"
|
||||
"vsteg %%v25,24(%4),0 \n\t"
|
||||
: "=m"(*d) ,"+&r"(n)
|
||||
: "a"(x), "a"(y), "a"(d)
|
||||
"la %%r1,128(%%r1) \n\t"
|
||||
"brctg %[n_tmp],1b \n\t"
|
||||
"vfadb %%v24,%%v26,%%v24 \n\t"
|
||||
"vfadb %%v25,%%v25,%%v27 \n\t"
|
||||
"vsteg %%v24, 0(%[ptr_d]),0 \n\t"
|
||||
"vsteg %%v24, 8(%[ptr_d]),1 \n\t"
|
||||
"vsteg %%v25,16(%[ptr_d]),1 \n\t"
|
||||
"vsteg %%v25,24(%[ptr_d]),0 \n\t"
|
||||
: [mem_out] "=m"(*(double (*)[4])d ) ,[n_tmp] "+&r"(n)
|
||||
: [mem_x] "m"( *(const double (*)[2*n])x),
|
||||
[mem_y] "m"( *(const double (*)[2*n])y),
|
||||
[ptr_x_tmp] "a"(x), [ptr_y_tmp] "a"(y), [ptr_d] "a"(d)
|
||||
: "cc", "r1","v16",
|
||||
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
|
@ -150,8 +152,8 @@ static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) {
|
|||
#endif
|
||||
|
||||
OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
|
||||
BLASLONG i;
|
||||
BLASLONG ix, iy;
|
||||
BLASLONG i = 0;
|
||||
BLASLONG ix=0, iy=0;
|
||||
OPENBLAS_COMPLEX_FLOAT result;
|
||||
FLOAT dot[4] __attribute__ ((aligned(16))) = {0.0, 0.0, 0.0, 0.0};
|
||||
|
||||
|
@ -164,13 +166,15 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
|
|||
|
||||
if ((inc_x == 1) && (inc_y == 1)) {
|
||||
|
||||
BLASLONG n1 = n & -16;
|
||||
BLASLONG n1 = n & -8;
|
||||
BLASLONG j=0;
|
||||
|
||||
if (n1)
|
||||
if (n1){
|
||||
zdot_kernel_8(n1, x, y, dot);
|
||||
i = n1;
|
||||
j = n1 <<1;
|
||||
}
|
||||
|
||||
i = n1;
|
||||
BLASLONG j = i * 2;
|
||||
|
||||
while (i < n) {
|
||||
|
||||
|
|
|
@ -25,40 +25,40 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
|||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include "common.h"
|
||||
|
||||
static void __attribute__ ((noinline)) zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
|
||||
static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT cosA, FLOAT sinA)
|
||||
{
|
||||
__asm__ (
|
||||
"pfd 2, 0(%1) \n\t"
|
||||
"pfd 2, 0(%2) \n\t"
|
||||
|
||||
"vlrepg %%v0,0(%3) \n\t"
|
||||
"vlrepg %%v1,0(%4) \n\t"
|
||||
"srlg %%r0,%0,4 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
"pfd 2, 0(%[ptr_x]) \n\t"
|
||||
"pfd 2, 0(%[ptr_y]) \n\t"
|
||||
"lgdr %%r1,%[cos] \n\t"
|
||||
"vlvgp %%v0,%%r1,%%r1 \n\t"
|
||||
"lgdr %%r1,%[sin] \n\t"
|
||||
"vlvgp %%v1,%%r1,%%r1 \n\t"
|
||||
"sllg %[tmp],%[tmp],4 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"pfd 2, 256(%%r1,%1) \n\t"
|
||||
"pfd 2, 256(%%r1,%2) \n\t"
|
||||
"vl %%v24, 0(%%r1,%1) \n\t"
|
||||
"vl %%v25, 16(%%r1,%1) \n\t"
|
||||
"vl %%v26, 32(%%r1,%1) \n\t"
|
||||
"vl %%v27, 48(%%r1,%1) \n\t"
|
||||
"vl %%v16, 0(%%r1,%2) \n\t"
|
||||
"vl %%v17, 16(%%r1,%2) \n\t"
|
||||
"vl %%v18, 32(%%r1,%2) \n\t"
|
||||
"vl %%v19, 48(%%r1,%2) \n\t"
|
||||
"1: \n\t"
|
||||
"pfd 2, 256(%%r1,%[ptr_x]) \n\t"
|
||||
"pfd 2, 256(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v24, 0(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v25, 16(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v26, 32(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v27, 48(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v16, 0(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v17, 16(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v18, 32(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v19, 48(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vfmdb %%v28,%%v24,%%v0 \n\t"
|
||||
"vfmdb %%v29,%%v25,%%v0 \n\t"
|
||||
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v30,%%v26,%%v0 \n\t"
|
||||
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v31,%%v27,%%v0 \n\t"
|
||||
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v28,%%v24,%%v0 \n\t"
|
||||
"vfmdb %%v29,%%v25,%%v0 \n\t"
|
||||
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v30,%%v26,%%v0 \n\t"
|
||||
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v31,%%v27,%%v0 \n\t"
|
||||
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
|
||||
/* 2nd parts*/
|
||||
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
|
||||
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
|
||||
|
@ -69,34 +69,32 @@ static void __attribute__ ((noinline)) zrot_kernel_16(BLASLONG n, FLOAT *x, FLO
|
|||
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
|
||||
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
|
||||
|
||||
"vst %%v28, 0(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v29, 16(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v30, 32(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v31, 48(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v20, 0(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v21, 16(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v22, 32(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v23, 48(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vl %%v24, 64(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v25, 80(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v26, 96(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v27,112(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v16, 64(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v17, 80(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v18, 96(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v19,112(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vst %%v28, 0(%%r1,%1) \n\t"
|
||||
"vst %%v29, 16(%%r1,%1) \n\t"
|
||||
"vst %%v30, 32(%%r1,%1) \n\t"
|
||||
"vst %%v31, 48(%%r1,%1) \n\t"
|
||||
"vst %%v20, 0(%%r1,%2) \n\t"
|
||||
"vst %%v21, 16(%%r1,%2) \n\t"
|
||||
"vst %%v22, 32(%%r1,%2) \n\t"
|
||||
"vst %%v23, 48(%%r1,%2) \n\t"
|
||||
|
||||
"vl %%v24, 64(%%r1,%1) \n\t"
|
||||
"vl %%v25, 80(%%r1,%1) \n\t"
|
||||
"vl %%v26, 96(%%r1,%1) \n\t"
|
||||
"vl %%v27,112(%%r1,%1) \n\t"
|
||||
"vl %%v16, 64(%%r1,%2) \n\t"
|
||||
"vl %%v17, 80(%%r1,%2) \n\t"
|
||||
"vl %%v18, 96(%%r1,%2) \n\t"
|
||||
"vl %%v19,112(%%r1,%2) \n\t"
|
||||
|
||||
"vfmdb %%v28,%%v24,%%v0 \n\t"
|
||||
"vfmdb %%v29,%%v25,%%v0 \n\t"
|
||||
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v30,%%v26,%%v0 \n\t"
|
||||
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v31,%%v27,%%v0 \n\t"
|
||||
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v28,%%v24,%%v0 \n\t"
|
||||
"vfmdb %%v29,%%v25,%%v0 \n\t"
|
||||
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v30,%%v26,%%v0 \n\t"
|
||||
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v31,%%v27,%%v0 \n\t"
|
||||
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
|
||||
/* 2nd parts*/
|
||||
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
|
||||
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
|
||||
|
@ -107,34 +105,32 @@ static void __attribute__ ((noinline)) zrot_kernel_16(BLASLONG n, FLOAT *x, FLO
|
|||
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
|
||||
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
|
||||
|
||||
"vst %%v28, 64(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v29, 80(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v30, 96(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v31, 112(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v20, 64(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v21, 80(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v22, 96(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v23, 112(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vl %%v24, 128(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v25, 144(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v26, 160(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v27, 176(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v16, 128(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v17, 144(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v18, 160(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v19, 176(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vst %%v28, 64(%%r1,%1) \n\t"
|
||||
"vst %%v29, 80(%%r1,%1) \n\t"
|
||||
"vst %%v30, 96(%%r1,%1) \n\t"
|
||||
"vst %%v31, 112(%%r1,%1) \n\t"
|
||||
"vst %%v20, 64(%%r1,%2) \n\t"
|
||||
"vst %%v21, 80(%%r1,%2) \n\t"
|
||||
"vst %%v22, 96(%%r1,%2) \n\t"
|
||||
"vst %%v23, 112(%%r1,%2) \n\t"
|
||||
|
||||
"vl %%v24, 128(%%r1,%1) \n\t"
|
||||
"vl %%v25, 144(%%r1,%1) \n\t"
|
||||
"vl %%v26, 160(%%r1,%1) \n\t"
|
||||
"vl %%v27, 176(%%r1,%1) \n\t"
|
||||
"vl %%v16, 128(%%r1,%2) \n\t"
|
||||
"vl %%v17, 144(%%r1,%2) \n\t"
|
||||
"vl %%v18, 160(%%r1,%2) \n\t"
|
||||
"vl %%v19, 176(%%r1,%2) \n\t"
|
||||
|
||||
"vfmdb %%v28,%%v24,%%v0 \n\t"
|
||||
"vfmdb %%v29,%%v25,%%v0 \n\t"
|
||||
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v30,%%v26,%%v0 \n\t"
|
||||
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v31,%%v27,%%v0 \n\t"
|
||||
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v28,%%v24,%%v0 \n\t"
|
||||
"vfmdb %%v29,%%v25,%%v0 \n\t"
|
||||
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v30,%%v26,%%v0 \n\t"
|
||||
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v31,%%v27,%%v0 \n\t"
|
||||
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
|
||||
/* 2nd parts*/
|
||||
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
|
||||
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
|
||||
|
@ -145,34 +141,32 @@ static void __attribute__ ((noinline)) zrot_kernel_16(BLASLONG n, FLOAT *x, FLO
|
|||
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
|
||||
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
|
||||
|
||||
"vst %%v28, 128(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v29, 144(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v30, 160(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v31, 176(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v20, 128(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v21, 144(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v22, 160(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v23, 176(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vl %%v24, 192(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v25, 208(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v26, 224(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v27, 240(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v16, 192(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v17, 208(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v18, 224(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v19, 240(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vst %%v28, 128(%%r1,%1) \n\t"
|
||||
"vst %%v29, 144(%%r1,%1) \n\t"
|
||||
"vst %%v30, 160(%%r1,%1) \n\t"
|
||||
"vst %%v31, 176(%%r1,%1) \n\t"
|
||||
"vst %%v20, 128(%%r1,%2) \n\t"
|
||||
"vst %%v21, 144(%%r1,%2) \n\t"
|
||||
"vst %%v22, 160(%%r1,%2) \n\t"
|
||||
"vst %%v23, 176(%%r1,%2) \n\t"
|
||||
|
||||
"vl %%v24, 192(%%r1,%1) \n\t"
|
||||
"vl %%v25, 208(%%r1,%1) \n\t"
|
||||
"vl %%v26, 224(%%r1,%1) \n\t"
|
||||
"vl %%v27, 240(%%r1,%1) \n\t"
|
||||
"vl %%v16, 192(%%r1,%2) \n\t"
|
||||
"vl %%v17, 208(%%r1,%2) \n\t"
|
||||
"vl %%v18, 224(%%r1,%2) \n\t"
|
||||
"vl %%v19, 240(%%r1,%2) \n\t"
|
||||
|
||||
"vfmdb %%v28,%%v24,%%v0 \n\t"
|
||||
"vfmdb %%v29,%%v25,%%v0 \n\t"
|
||||
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v30,%%v26,%%v0 \n\t"
|
||||
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v31,%%v27,%%v0 \n\t"
|
||||
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v28,%%v24,%%v0 \n\t"
|
||||
"vfmdb %%v29,%%v25,%%v0 \n\t"
|
||||
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v30,%%v26,%%v0 \n\t"
|
||||
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
|
||||
"vfmdb %%v31,%%v27,%%v0 \n\t"
|
||||
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
|
||||
/* 2nd parts*/
|
||||
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
|
||||
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
|
||||
|
@ -183,32 +177,28 @@ static void __attribute__ ((noinline)) zrot_kernel_16(BLASLONG n, FLOAT *x, FLO
|
|||
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
|
||||
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
|
||||
|
||||
"vst %%v28, 192(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v29, 208(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v30, 224(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v31, 240(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v20, 192(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v21, 208(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v22, 224(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v23, 240(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
|
||||
"vst %%v28, 192(%%r1,%1) \n\t"
|
||||
"vst %%v29, 208(%%r1,%1) \n\t"
|
||||
"vst %%v30, 224(%%r1,%1) \n\t"
|
||||
"vst %%v31, 240(%%r1,%1) \n\t"
|
||||
"vst %%v20, 192(%%r1,%2) \n\t"
|
||||
"vst %%v21, 208(%%r1,%2) \n\t"
|
||||
"vst %%v22, 224(%%r1,%2) \n\t"
|
||||
"vst %%v23, 240(%%r1,%2) \n\t"
|
||||
|
||||
|
||||
|
||||
"la %%r1,256(%%r1) \n\t"
|
||||
"brctg %%r0,1b"
|
||||
:
|
||||
: "r"(n), "a"(x), "a"(y),"a"(c),"a"(s)
|
||||
: "cc", "memory","r0","r1" ,"v0","v1","v16",
|
||||
"la %%r1,256(%%r1) \n\t"
|
||||
"clgrjl %%r1,%[tmp],1b \n\t"
|
||||
: [mem_x] "+m" (*(double (*)[2*n])x),
|
||||
[mem_y] "+m" (*(double (*)[2*n])y),
|
||||
[tmp] "+&r"(n)
|
||||
: [ptr_x] "a"(x), [ptr_y] "a"(y),[cos] "f"(cosA),[sin] "f"(sinA)
|
||||
: "cc","r1" ,"v0","v1","v16",
|
||||
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
return;
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
|
||||
{
|
||||
BLASLONG i=0;
|
||||
|
@ -225,10 +215,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
|
|||
BLASLONG n1 = n & -16;
|
||||
if ( n1 > 0 )
|
||||
{
|
||||
FLOAT cosa,sina;
|
||||
cosa=c;
|
||||
sina=s;
|
||||
zrot_kernel_16(n1, x, y, &cosa, &sina);
|
||||
zrot_kernel_16(n1, x, y, c, s);
|
||||
i=n1;
|
||||
ix=2*n1;
|
||||
}
|
||||
|
@ -247,7 +234,6 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
|
|||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -273,4 +259,3 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
|
|||
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -29,229 +29,218 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
|
||||
|
||||
static void __attribute__ ((noinline)) zscal_kernel_8(BLASLONG n, FLOAT *alpha, FLOAT *x) {
|
||||
static void zscal_kernel_8(BLASLONG n, FLOAT da_r,FLOAT da_i, FLOAT *x) {
|
||||
__asm__(
|
||||
|
||||
"pfd 1, 0(%1) \n\t"
|
||||
"sllg %%r0,%0,4 \n\t"
|
||||
"agr %%r0,%2 \n\t"
|
||||
"vlrepg %%v24,0(%1) \n\t"
|
||||
"vlrepg %%v25,8(%1) \n\t"
|
||||
"pfd 1, 0(%[x_ptr]) \n\t"
|
||||
"lgdr %%r0,%[alpha_r] \n\t"
|
||||
"vlvgp %%v24,%%r0,%%r0 \n\t"
|
||||
"lgdr %%r0,%[alpha_i] \n\t"
|
||||
"vlvgp %%v25,%%r0,%%r0 \n\t"
|
||||
"sllg %%r0,%[n],4 \n\t"
|
||||
"agr %%r0,%[x_ptr] \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"pfd 2, 256(%2 ) \n\t"
|
||||
|
||||
"vleg %%v20 , 0(%2),0 \n\t"
|
||||
"vleg %%v21 , 8(%2),0 \n\t"
|
||||
"vleg %%v20 , 16(%2),1 \n\t"
|
||||
"vleg %%v21 , 24(%2),1 \n\t"
|
||||
|
||||
"vleg %%v22 , 32(%2),0 \n\t"
|
||||
"vleg %%v23 , 40(%2),0 \n\t"
|
||||
"vleg %%v22 , 48(%2),1 \n\t"
|
||||
"vleg %%v23 , 56(%2),1 \n\t"
|
||||
|
||||
"vfmdb %%v16, %%v21, %%v25 \n\t"
|
||||
"vfmdb %%v17, %%v20, %%v25 \n\t"
|
||||
"vfmdb %%v18, %%v23, %%v25 \n\t"
|
||||
"vfmdb %%v19, %%v22, %%v25 \n\t"
|
||||
"1: \n\t"
|
||||
"pfd 2, 256(%[x_ptr] ) \n\t"
|
||||
|
||||
"vleg %%v20 , 0(%[x_ptr]),0 \n\t"
|
||||
"vleg %%v21 , 8(%[x_ptr]),0 \n\t"
|
||||
"vleg %%v20 , 16(%[x_ptr]),1 \n\t"
|
||||
"vleg %%v21 , 24(%[x_ptr]),1 \n\t"
|
||||
"vleg %%v22 , 32(%[x_ptr]),0 \n\t"
|
||||
"vleg %%v23 , 40(%[x_ptr]),0 \n\t"
|
||||
"vleg %%v22 , 48(%[x_ptr]),1 \n\t"
|
||||
"vleg %%v23 , 56(%[x_ptr]),1 \n\t"
|
||||
"vfmdb %%v16, %%v21, %%v25 \n\t"
|
||||
"vfmdb %%v17, %%v20, %%v25 \n\t"
|
||||
"vfmdb %%v18, %%v23, %%v25 \n\t"
|
||||
"vfmdb %%v19, %%v22, %%v25 \n\t"
|
||||
"vfmsdb %%v16, %%v20, %%v24 ,%%v16 \n\t"
|
||||
"vfmadb %%v17, %%v21, %%v24, %%v17 \n\t"
|
||||
"vfmsdb %%v18, %%v22, %%v24, %%v18 \n\t"
|
||||
"vfmadb %%v19, %%v23, %%v24, %%v19 \n\t"
|
||||
|
||||
"vsteg %%v16 , 0(%2),0 \n\t"
|
||||
"vsteg %%v17 , 8(%2),0 \n\t"
|
||||
"vsteg %%v16 , 16(%2),1 \n\t"
|
||||
"vsteg %%v17 , 24(%2),1 \n\t"
|
||||
|
||||
"vsteg %%v18 , 32(%2),0 \n\t"
|
||||
"vsteg %%v19 , 40(%2),0 \n\t"
|
||||
"vsteg %%v18 , 48(%2),1 \n\t"
|
||||
"vsteg %%v19 , 56(%2),1 \n\t"
|
||||
|
||||
"vleg %%v20 , 64(%2),0 \n\t"
|
||||
"vleg %%v21 , 72(%2),0 \n\t"
|
||||
"vleg %%v20 , 80(%2),1 \n\t"
|
||||
"vleg %%v21 , 88(%2),1 \n\t"
|
||||
|
||||
"vleg %%v22 , 96(%2),0 \n\t"
|
||||
"vleg %%v23 , 104(%2),0 \n\t"
|
||||
"vleg %%v22 , 112(%2),1 \n\t"
|
||||
"vleg %%v23 , 120(%2),1 \n\t"
|
||||
|
||||
"vfmdb %%v16, %%v21, %%v25 \n\t"
|
||||
"vfmdb %%v17, %%v20, %%v25 \n\t"
|
||||
"vfmdb %%v18, %%v23, %%v25 \n\t"
|
||||
"vfmdb %%v19, %%v22, %%v25 \n\t"
|
||||
|
||||
"vsteg %%v16 , 0(%[x_ptr]),0 \n\t"
|
||||
"vsteg %%v17 , 8(%[x_ptr]),0 \n\t"
|
||||
"vsteg %%v16 , 16(%[x_ptr]),1 \n\t"
|
||||
"vsteg %%v17 , 24(%[x_ptr]),1 \n\t"
|
||||
"vsteg %%v18 , 32(%[x_ptr]),0 \n\t"
|
||||
"vsteg %%v19 , 40(%[x_ptr]),0 \n\t"
|
||||
"vsteg %%v18 , 48(%[x_ptr]),1 \n\t"
|
||||
"vsteg %%v19 , 56(%[x_ptr]),1 \n\t"
|
||||
"vleg %%v20 , 64(%[x_ptr]),0 \n\t"
|
||||
"vleg %%v21 , 72(%[x_ptr]),0 \n\t"
|
||||
"vleg %%v20 , 80(%[x_ptr]),1 \n\t"
|
||||
"vleg %%v21 , 88(%[x_ptr]),1 \n\t"
|
||||
"vleg %%v22 , 96(%[x_ptr]),0 \n\t"
|
||||
"vleg %%v23 , 104(%[x_ptr]),0 \n\t"
|
||||
"vleg %%v22 , 112(%[x_ptr]),1 \n\t"
|
||||
"vleg %%v23 , 120(%[x_ptr]),1 \n\t"
|
||||
"vfmdb %%v16, %%v21, %%v25 \n\t"
|
||||
"vfmdb %%v17, %%v20, %%v25 \n\t"
|
||||
"vfmdb %%v18, %%v23, %%v25 \n\t"
|
||||
"vfmdb %%v19, %%v22, %%v25 \n\t"
|
||||
"vfmsdb %%v16, %%v20, %%v24 ,%%v16 \n\t"
|
||||
"vfmadb %%v17, %%v21, %%v24, %%v17 \n\t"
|
||||
"vfmsdb %%v18, %%v22, %%v24, %%v18 \n\t"
|
||||
"vfmadb %%v19, %%v23, %%v24, %%v19 \n\t"
|
||||
"vsteg %%v16 , 64(%[x_ptr]),0 \n\t"
|
||||
"vsteg %%v17 , 72(%[x_ptr]),0 \n\t"
|
||||
"vsteg %%v16 , 80(%[x_ptr]),1 \n\t"
|
||||
"vsteg %%v17 , 88(%[x_ptr]),1 \n\t"
|
||||
"vsteg %%v18 , 96(%[x_ptr]),0 \n\t"
|
||||
"vsteg %%v19 , 104(%[x_ptr]),0 \n\t"
|
||||
"vsteg %%v18 , 112(%[x_ptr]),1 \n\t"
|
||||
"vsteg %%v19 , 120(%[x_ptr]),1 \n\t"
|
||||
|
||||
"vsteg %%v16 , 64(%2),0 \n\t"
|
||||
"vsteg %%v17 , 72(%2),0 \n\t"
|
||||
"vsteg %%v16 , 80(%2),1 \n\t"
|
||||
"vsteg %%v17 , 88(%2),1 \n\t"
|
||||
|
||||
"vsteg %%v18 , 96(%2),0 \n\t"
|
||||
"vsteg %%v19 , 104(%2),0 \n\t"
|
||||
"vsteg %%v18 , 112(%2),1 \n\t"
|
||||
"vsteg %%v19 , 120(%2),1 \n\t"
|
||||
|
||||
"la %2,128(%2) \n\t"
|
||||
"clgrjl %2,%%r0,1b \n\t"
|
||||
:
|
||||
: "r"(n), "a"(alpha), "a"(x)
|
||||
"la %[x_ptr],128(%[x_ptr]) \n\t"
|
||||
"clgrjl %[x_ptr],%%r0,1b \n\t"
|
||||
: [mem] "+m" (*(double (*)[2*n])x) ,[x_ptr] "+&a"(x)
|
||||
: [n] "r"(n), [alpha_r] "f"(da_r),[alpha_i] "f"(da_i)
|
||||
: "cc", "memory","r0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25"
|
||||
);
|
||||
|
||||
|
||||
}
|
||||
|
||||
static void __attribute__ ((noinline)) zscal_kernel_8_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) {
|
||||
static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT da_i, FLOAT *x) {
|
||||
|
||||
__asm__ ( "pfd 2, 0(%1) \n\t"
|
||||
"ld %%f0,8(%2) \n\t"
|
||||
"lcdbr %%f1,%%f0 \n\t"
|
||||
"lgdr %%r0,%%f1 \n\t"
|
||||
"vlvgg %%v0,%%r0,1 \n\t"
|
||||
"vlr %%v16,%%v0 \n\t"
|
||||
"vlr %%v17 ,%%v0 \n\t"
|
||||
"vlr %%v1,%%v0 \n\t"
|
||||
"sllg %%r0,%0,4 \n\t"
|
||||
"agr %%r0,%1 \n\t"
|
||||
__asm__ ( "pfd 2, 0(%1) \n\t"
|
||||
"lgdr %%r0,%[alpha] \n\t"
|
||||
"vlvgg %%v16,%%r0,0 \n\t"
|
||||
"lcdbr %[alpha],%[alpha] \n\t"
|
||||
"lgdr %%r0,%[alpha] \n\t"
|
||||
"vlvgg %%v16,%%r0,1 \n\t"
|
||||
"vlr %%v17 ,%%v16 \n\t"
|
||||
"sllg %%r0,%[n],4 \n\t"
|
||||
"agr %%r0,%[x_ptr] \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"vl %%v24, 0(%1) \n\t"
|
||||
"vfmdb %%v24,%%v24,%%v0 \n\t"
|
||||
"vsteg %%v24, 0(%1),1 \n\t"
|
||||
"vsteg %%v24, 8(%1),0 \n\t"
|
||||
"vl %%v25, 16(%1) \n\t"
|
||||
"vfmdb %%v25,%%v25,%%v1 \n\t"
|
||||
"vsteg %%v25, 16(%1),1 \n\t"
|
||||
"vsteg %%v25, 24(%1),0 \n\t"
|
||||
"vl %%v26, 32(%1) \n\t"
|
||||
"vfmdb %%v26,%%v26,%%v16 \n\t"
|
||||
"vsteg %%v26, 32(%1),1 \n\t"
|
||||
"vsteg %%v26, 40(%1),0 \n\t"
|
||||
"vl %%v27, 48(%1) \n\t"
|
||||
"1: \n\t"
|
||||
"vl %%v24, 0(%[x_ptr]) \n\t"
|
||||
"vfmdb %%v24,%%v24,%%v16 \n\t"
|
||||
"vsteg %%v24, 0(%[x_ptr]),1 \n\t"
|
||||
"vsteg %%v24, 8(%[x_ptr]),0 \n\t"
|
||||
"vl %%v25, 16(%[x_ptr]) \n\t"
|
||||
"vfmdb %%v25,%%v25,%%v17 \n\t"
|
||||
"vsteg %%v25, 16(%[x_ptr]),1 \n\t"
|
||||
"vsteg %%v25, 24(%[x_ptr]),0 \n\t"
|
||||
"vl %%v26, 32(%[x_ptr]) \n\t"
|
||||
"vfmdb %%v26,%%v26,%%v16 \n\t"
|
||||
"vsteg %%v26, 32(%[x_ptr]),1 \n\t"
|
||||
"vsteg %%v26, 40(%[x_ptr]),0 \n\t"
|
||||
"vl %%v27, 48(%[x_ptr]) \n\t"
|
||||
"vfmdb %%v27,%%v27,%%v17 \n\t"
|
||||
"vsteg %%v27, 40(%1),1 \n\t"
|
||||
"vsteg %%v27, 48(%1),0 \n\t"
|
||||
"vl %%v28, 64(%1) \n\t"
|
||||
"vfmdb %%v28,%%v28,%%v0 \n\t"
|
||||
"vsteg %%v28, 64(%1),1 \n\t"
|
||||
"vsteg %%v28, 72(%1),0 \n\t"
|
||||
"vl %%v29, 80(%1) \n\t"
|
||||
"vfmdb %%v29,%%v29,%%v1 \n\t"
|
||||
"vsteg %%v29, 80(%1),1 \n\t"
|
||||
"vsteg %%v29, 88(%1),0 \n\t"
|
||||
"vl %%v30, 96(%1) \n\t"
|
||||
"vfmdb %%v30,%%v30,%%v16 \n\t"
|
||||
"vsteg %%v27, 96(%1),1 \n\t"
|
||||
"vsteg %%v27, 104(%1),0 \n\t"
|
||||
"vl %%v31, 112(%1) \n\t"
|
||||
"vsteg %%v27, 40(%[x_ptr]),1 \n\t"
|
||||
"vsteg %%v27, 48(%[x_ptr]),0 \n\t"
|
||||
"vl %%v28, 64(%[x_ptr]) \n\t"
|
||||
"vfmdb %%v28,%%v28,%%v16 \n\t"
|
||||
"vsteg %%v28, 64(%[x_ptr]),1 \n\t"
|
||||
"vsteg %%v28, 72(%[x_ptr]),0 \n\t"
|
||||
"vl %%v29, 80(%[x_ptr]) \n\t"
|
||||
"vfmdb %%v29,%%v29,%%v17 \n\t"
|
||||
"vsteg %%v29, 80(%[x_ptr]),1 \n\t"
|
||||
"vsteg %%v29, 88(%[x_ptr]),0 \n\t"
|
||||
"vl %%v30, 96(%[x_ptr]) \n\t"
|
||||
"vfmdb %%v30,%%v30,%%v16 \n\t"
|
||||
"vsteg %%v27, 96(%[x_ptr]),1 \n\t"
|
||||
"vsteg %%v27, 104(%[x_ptr]),0 \n\t"
|
||||
"vl %%v31, 112(%[x_ptr]) \n\t"
|
||||
"vfmdb %%v31,%%v31,%%v17 \n\t"
|
||||
"vsteg %%v31, 112(%1),1 \n\t"
|
||||
"vsteg %%v31, 120(%1),0 \n\t"
|
||||
"la %1,128(%1) \n\t"
|
||||
"clgrjl %1,%%r0,1b \n\t"
|
||||
:
|
||||
:"r"(n),"a"(x) ,"a"(alpha)
|
||||
:"cc", "memory","r0","f0", "f1","v0","v1","v18","v19","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
"vsteg %%v31, 112(%[x_ptr]),1 \n\t"
|
||||
"vsteg %%v31, 120(%[x_ptr]),0 \n\t"
|
||||
"la %[x_ptr],128(%[x_ptr]) \n\t"
|
||||
"clgrjl %[x_ptr],%%r0,1b \n\t"
|
||||
: [mem] "+m" (*(double (*)[2*n])x) ,[x_ptr] "+&a"(x)
|
||||
: [n] "r"(n),[alpha] "f"(da_i)
|
||||
:"cc", "r0","f0", "f1","v16","v17" ,"v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
|
||||
|
||||
}
|
||||
|
||||
static void __attribute__ ((noinline)) zscal_kernel_8_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) {
|
||||
__asm__ ("pfd 2, 0(%1) \n\t"
|
||||
"vlrepg %%v18,0(%2) \n\t"
|
||||
"vlr %%v19,%%v18 \n\t"
|
||||
"vlr %%v16 ,%%v18 \n\t"
|
||||
"vlr %%v17,%%v18 \n\t"
|
||||
"sllg %%r0,%0,4 \n\t"
|
||||
"agr %%r0,%1 \n\t"
|
||||
static void zscal_kernel_8_zero_i(BLASLONG n, FLOAT da_r, FLOAT *x) {
|
||||
__asm__ ("pfd 2, 0(%[x_ptr]) \n\t"
|
||||
"lgdr %%r0,%[alpha] \n\t"
|
||||
"vlvgp %%v18,%%r0,%%r0 \n\t"
|
||||
"vlr %%v19,%%v18 \n\t"
|
||||
"vlr %%v16,%%v18 \n\t"
|
||||
"vlr %%v17,%%v18 \n\t"
|
||||
"sllg %%r0,%[n],4 \n\t"
|
||||
"agr %%r0,%[x_ptr] \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"vl %%v24, 0(%1) \n\t"
|
||||
"vfmdb %%v24,%%v24,%%v18 \n\t"
|
||||
"vst %%v24, 0(%1) \n\t"
|
||||
"vl %%v25, 16(%1) \n\t"
|
||||
"vfmdb %%v25,%%v25,%%v19 \n\t"
|
||||
"vst %%v25, 16(%1) \n\t"
|
||||
"vl %%v26, 32(%1) \n\t"
|
||||
"vfmdb %%v26,%%v26,%%v16 \n\t"
|
||||
"vst %%v26, 32(%1) \n\t"
|
||||
"vl %%v27, 48(%1) \n\t"
|
||||
"vfmdb %%v27,%%v27,%%v17 \n\t"
|
||||
"vst %%v27, 48(%1) \n\t"
|
||||
"vl %%v28, 64(%1) \n\t"
|
||||
"vfmdb %%v28,%%v28,%%v18 \n\t"
|
||||
"vst %%v28, 64(%1) \n\t"
|
||||
"vl %%v29, 80(%1) \n\t"
|
||||
"vfmdb %%v29,%%v29,%%v19 \n\t"
|
||||
"vst %%v29, 80(%1) \n\t"
|
||||
"vl %%v30, 96(%1) \n\t"
|
||||
"vfmdb %%v30,%%v30,%%v16 \n\t"
|
||||
"vst %%v30, 96(%1) \n\t"
|
||||
"vl %%v31, 112(%1) \n\t"
|
||||
"vfmdb %%v31,%%v31,%%v17 \n\t"
|
||||
"vst %%v31, 112(%1) \n\t"
|
||||
"la %1,128(%1) \n\t"
|
||||
"clgrjl %1,%%r0,1b \n\t"
|
||||
:
|
||||
:"r"(n),"a"(x) ,"a"(alpha)
|
||||
:"cc", "memory","r0","v16", "v17","v18","v19","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
"1: \n\t"
|
||||
"vl %%v24, 0(%[x_ptr]) \n\t"
|
||||
"vfmdb %%v24,%%v24,%%v18 \n\t"
|
||||
"vst %%v24, 0(%[x_ptr]) \n\t"
|
||||
"vl %%v25, 16(%[x_ptr]) \n\t"
|
||||
"vfmdb %%v25,%%v25,%%v19 \n\t"
|
||||
"vst %%v25, 16(%[x_ptr]) \n\t"
|
||||
"vl %%v26, 32(%[x_ptr]) \n\t"
|
||||
"vfmdb %%v26,%%v26,%%v16 \n\t"
|
||||
"vst %%v26, 32(%[x_ptr]) \n\t"
|
||||
"vl %%v27, 48(%[x_ptr]) \n\t"
|
||||
"vfmdb %%v27,%%v27,%%v17 \n\t"
|
||||
"vst %%v27, 48(%[x_ptr]) \n\t"
|
||||
"vl %%v28, 64(%[x_ptr]) \n\t"
|
||||
"vfmdb %%v28,%%v28,%%v18 \n\t"
|
||||
"vst %%v28, 64(%[x_ptr]) \n\t"
|
||||
"vl %%v29, 80(%[x_ptr]) \n\t"
|
||||
"vfmdb %%v29,%%v29,%%v19 \n\t"
|
||||
"vst %%v29, 80(%[x_ptr]) \n\t"
|
||||
"vl %%v30, 96(%[x_ptr]) \n\t"
|
||||
"vfmdb %%v30,%%v30,%%v16 \n\t"
|
||||
"vst %%v30, 96(%[x_ptr]) \n\t"
|
||||
"vl %%v31,112(%[x_ptr]) \n\t"
|
||||
"vfmdb %%v31,%%v31,%%v17 \n\t"
|
||||
"vst %%v31,112(%[x_ptr]) \n\t"
|
||||
"la %[x_ptr],128(%[x_ptr]) \n\t"
|
||||
"clgrjl %[x_ptr],%%r0,1b \n\t"
|
||||
: [mem] "+m" (*(double (*)[2*n])x) ,[x_ptr] "+&a"(x)
|
||||
: [n] "r"(n),[alpha] "f"(da_r)
|
||||
: "cc", "r0","v16", "v17","v18","v19","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
static void __attribute__ ((noinline)) zscal_kernel_8_zero(BLASLONG n, FLOAT *x) {
|
||||
static void zscal_kernel_8_zero(BLASLONG n, FLOAT *x) {
|
||||
|
||||
__asm__ ( "pfd 2, 0(%1) \n\t"
|
||||
__asm__ ( "pfd 2, 0(%[x_ptr]) \n\t"
|
||||
"vzero %%v24 \n\t"
|
||||
"vzero %%v25 \n\t"
|
||||
"vzero %%v26 \n\t"
|
||||
"vzero %%v27 \n\t"
|
||||
"sllg %%r0,%0,4 \n\t"
|
||||
"agr %%r0,%1 \n\t"
|
||||
"sllg %%r0,%[n],4 \n\t"
|
||||
"agr %%r0,%[x_ptr] \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"pfd 2, 256( %1) \n\t"
|
||||
"vst %%v24, 0( %1) \n\t"
|
||||
"vst %%v25, 16( %1) \n\t"
|
||||
"vst %%v26, 32( %1) \n\t"
|
||||
"vst %%v27, 48( %1) \n\t"
|
||||
"vst %%v24, 64( %1) \n\t"
|
||||
"vst %%v25, 80( %1) \n\t"
|
||||
"vst %%v26, 96( %1) \n\t"
|
||||
"vst %%v27,112( %1) \n\t"
|
||||
"pfd 2, 256( %[x_ptr]) \n\t"
|
||||
"vst %%v24, 0( %[x_ptr]) \n\t"
|
||||
"vst %%v25, 16( %[x_ptr]) \n\t"
|
||||
"vst %%v26, 32( %[x_ptr]) \n\t"
|
||||
"vst %%v27, 48( %[x_ptr]) \n\t"
|
||||
"vst %%v24, 64( %[x_ptr]) \n\t"
|
||||
"vst %%v25, 80( %[x_ptr]) \n\t"
|
||||
"vst %%v26, 96( %[x_ptr]) \n\t"
|
||||
"vst %%v27,112( %[x_ptr]) \n\t"
|
||||
|
||||
"la %1,128(%1) \n\t"
|
||||
"clgrjl %1,%%r0,1b \n\t"
|
||||
:
|
||||
:"r"(n),"a"(x)
|
||||
:"cc" , "memory" ,"r0","v24","v25","v26","v27"
|
||||
"la %[x_ptr],128(%[x_ptr]) \n\t"
|
||||
"clgrjl %[x_ptr],%%r0,1b \n\t"
|
||||
: [mem] "+m" (*(double (*)[2*n])x),[x_ptr] "+&a"(x)
|
||||
: [n] "r"(n)
|
||||
:"cc" ,"r0","v24","v25","v26","v27"
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
static void zscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_x) __attribute__ ((noinline));
|
||||
|
||||
static void zscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_x) {
|
||||
|
||||
static void zscal_kernel_inc_8(BLASLONG n, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x) {
|
||||
|
||||
BLASLONG i;
|
||||
BLASLONG inc_x2 = 2 * inc_x;
|
||||
BLASLONG inc_x3 = inc_x2 + inc_x;
|
||||
FLOAT t0, t1, t2, t3;
|
||||
FLOAT da_r = alpha[0];
|
||||
FLOAT da_i = alpha[1];
|
||||
|
||||
for (i = 0; i < n; i += 4) {
|
||||
t0 = da_r * x[0] - da_i * x[1];
|
||||
|
@ -280,7 +269,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
|
|||
BLASLONG i = 0, j = 0;
|
||||
FLOAT temp0;
|
||||
FLOAT temp1;
|
||||
FLOAT alpha[2];
|
||||
|
||||
|
||||
if (inc_x != 1) {
|
||||
inc_x <<= 1;
|
||||
|
@ -373,9 +362,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
|
|||
|
||||
BLASLONG n1 = n & -8;
|
||||
if (n1 > 0) {
|
||||
alpha[0] = da_r;
|
||||
alpha[1] = da_i;
|
||||
zscal_kernel_inc_8(n1, alpha, x, inc_x);
|
||||
zscal_kernel_inc_8(n1, da_r,da_i, x, inc_x);
|
||||
j = n1;
|
||||
i = n1 * inc_x;
|
||||
}
|
||||
|
@ -401,19 +388,17 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
|
|||
BLASLONG n1 = n & -8;
|
||||
if (n1 > 0) {
|
||||
|
||||
alpha[0] = da_r;
|
||||
alpha[1] = da_i;
|
||||
|
||||
if (da_r == 0.0)
|
||||
if (da_i == 0)
|
||||
zscal_kernel_8_zero(n1, x);
|
||||
else
|
||||
zscal_kernel_8_zero_r(n1, alpha, x);
|
||||
zscal_kernel_8_zero_r(n1, da_i, x);
|
||||
else
|
||||
if (da_i == 0)
|
||||
zscal_kernel_8_zero_i(n1, alpha, x);
|
||||
zscal_kernel_8_zero_i(n1, da_r, x);
|
||||
else
|
||||
zscal_kernel_8(n1, alpha, x);
|
||||
zscal_kernel_8(n1, da_r,da_i, x);
|
||||
|
||||
i = n1 << 1;
|
||||
j = n1;
|
||||
|
|
|
@ -29,99 +29,211 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#include "common.h"
|
||||
|
||||
|
||||
#if defined(Z13_SWAP_A)
|
||||
/*
 * zswap_kernel_16 -- variant selected when Z13_SWAP_A is defined.
 *
 * Exchanges the contents of the arrays x and y with one inline-asm loop.
 * Each iteration loads 16-byte vectors from x and y at the same byte offset
 * and stores them back crosswise, covering byte offsets 0..255 of both
 * arrays before the shared offset (r1) is advanced by 256.
 *
 * n : element count; the asm shifts it right by 4 to get the iteration count
 *     and uses it as the loop counter, so the local copy is consumed.
 *     Presumably callers pass a positive multiple of 16 -- TODO confirm.
 * x, y : base addresses; both arrays are read and written.
 *
 * NOTE(review): the memory operands are typed as double[2*n], so FLOAT is
 * presumably double here -- confirm against common.h.
 */
static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
|
||||
{
|
||||
__asm__ volatile(
|
||||
/* Prefetch the first line of each array (code 1 = fetch access, code 2 = store access). */
/* NOTE(review): x is also written below; the in-loop prefetches use code 2 for both arrays. */
"pfd 1, 0(%[ptr_x]) \n\t"
|
||||
"pfd 2, 0(%[ptr_y]) \n\t"
|
||||
/* Iteration count = n >> 4, kept in the same register that received n. */
"srlg %[n_tmp],%[n_tmp],4 \n\t"
|
||||
/* r1 = 0: running byte offset applied to both x and y. */
"xgr %%r1,%%r1 \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
/* Prefetch 256 bytes ahead of the current offset in both arrays. */
"pfd 2, 256(%%r1,%[ptr_x]) \n\t"
|
||||
"pfd 2, 256(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
/* Swap pattern, repeated for every 16-byte slot: load x-slot and y-slot, */
/* then store the x value into y and the y value into x. */
/* v24..v31 carry data from x, v16..v23 carry data from y. */
"vl %%v24, 0(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v16, 0(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v24, 0(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v16, 0(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v25, 16(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v17, 16(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v25, 16(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v17, 16(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v26, 32(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v18, 32(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v26, 32(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v18, 32(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v27, 48(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v19, 48(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v27, 48(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v19, 48(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v28, 64(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v20, 64(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v28, 64(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v20, 64(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v29, 80(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v21, 80(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v29, 80(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v21, 80(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v30, 96(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v22, 96(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v30, 96(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v22, 96(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v31, 112(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v23, 112(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v31, 112(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v23, 112(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
/* Second half of the 256-byte stride: the same registers are reused for offsets 128..240. */
"vl %%v24, 128(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v16, 128(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v24, 128(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v16, 128(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v25, 144(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v17, 144(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v25, 144(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v17, 144(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v26, 160(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v18, 160(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v26, 160(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v18, 160(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v27, 176(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v19, 176(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v27, 176(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v19, 176(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v28, 192(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v20, 192(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v28, 192(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v20, 192(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v29, 208(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v21, 208(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v29, 208(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v21, 208(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v30, 224(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v22, 224(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v30, 224(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v22, 224(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v31, 240(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v23, 240(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v31, 240(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v23, 240(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
/* Advance the shared offset by 256 bytes, then decrement the counter and loop while it is non-zero. */
/* NOTE(review): the counter is decremented before it is tested, so a starting count of 0 */
/* (n < 16) would not exit after one pass -- appears to rely on the caller; confirm. */
"la %%r1,256(%%r1) \n\t"
|
||||
"brctg %[n_tmp],1b"
|
||||
/* Outputs: both arrays are declared as read-write memory (2*n doubles each), */
/* and n is a read-write, early-clobber register because the asm overwrites it. */
: [mem_x] "+m" (*(double (*)[2*n])x),
|
||||
[mem_y] "+m" (*(double (*)[2*n])y),
|
||||
[n_tmp] "+&r"(n)
|
||||
/* Inputs: base addresses in address registers ("a"), left unmodified; r1 carries the offset. */
: [ptr_x] "a"(x), [ptr_y] "a"(y)
|
||||
/* Clobbers: condition code, the offset register r1, and the vector registers v16..v31 used above. */
: "cc", "r1", "v16","v17","v18","v19","v20","v21","v22","v23"
|
||||
,"v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
return;
|
||||
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
static void __attribute__ ((noinline)) zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
|
||||
{
|
||||
__asm__ volatile(
|
||||
"pfd 2, 0(%1) \n\t"
|
||||
"pfd 2, 0(%2) \n\t"
|
||||
"srlg %%r0,%0,4 \n\t"
|
||||
"pfd 2, 0(%[ptr_x]) \n\t"
|
||||
"pfd 2, 0(%[ptr_y]) \n\t"
|
||||
"srlg %[n_tmp],%[n_tmp],4 \n\t"
|
||||
"xgr %%r1,%%r1 \n\t"
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"pfd 2, 256(%%r1,%1) \n\t"
|
||||
"pfd 2, 256(%%r1,%2) \n\t"
|
||||
"pfd 2, 256(%%r1,%[ptr_x]) \n\t"
|
||||
"pfd 2, 256(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
"vl %%v16, 0(%%r1,%1) \n\t"
|
||||
"vl %%v17, 16(%%r1,%1) \n\t"
|
||||
"vl %%v18, 32(%%r1,%1) \n\t"
|
||||
"vl %%v19, 48(%%r1,%1) \n\t"
|
||||
"vl %%v20, 64(%%r1,%1) \n\t"
|
||||
"vl %%v21, 80(%%r1,%1) \n\t"
|
||||
"vl %%v22, 96(%%r1,%1) \n\t"
|
||||
"vl %%v23, 112(%%r1,%1) \n\t"
|
||||
"vl %%v24, 128(%%r1,%1) \n\t"
|
||||
"vl %%v25, 144(%%r1,%1) \n\t"
|
||||
"vl %%v26, 160(%%r1,%1) \n\t"
|
||||
"vl %%v27, 176(%%r1,%1) \n\t"
|
||||
"vl %%v28, 192(%%r1,%1) \n\t"
|
||||
"vl %%v29, 208(%%r1,%1) \n\t"
|
||||
"vl %%v30, 224(%%r1,%1) \n\t"
|
||||
"vl %%v31, 240(%%r1,%1) \n\t"
|
||||
"vl %%v16, 0(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v17, 16(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v18, 32(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v19, 48(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v20, 64(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v21, 80(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v22, 96(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v23, 112(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v24, 128(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v25, 144(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v26, 160(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v27, 176(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v28, 192(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v29, 208(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v30, 224(%%r1,%[ptr_x]) \n\t"
|
||||
"vl %%v31, 240(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
|
||||
"vl %%v0, 0(%%r1,%2) \n\t"
|
||||
"vl %%v1, 16(%%r1,%2) \n\t"
|
||||
"vl %%v2, 32(%%r1,%2) \n\t"
|
||||
"vl %%v3, 48(%%r1,%2) \n\t"
|
||||
"vl %%v4, 64(%%r1,%2) \n\t"
|
||||
"vl %%v5, 80(%%r1,%2) \n\t"
|
||||
"vl %%v6, 96(%%r1,%2) \n\t"
|
||||
"vl %%v7, 112(%%r1,%2) \n\t"
|
||||
"vst %%v0, 0(%%r1,%1) \n\t"
|
||||
"vst %%v1, 16(%%r1,%1) \n\t"
|
||||
"vst %%v2, 32(%%r1,%1) \n\t"
|
||||
"vst %%v3, 48(%%r1,%1) \n\t"
|
||||
"vst %%v4, 64(%%r1,%1) \n\t"
|
||||
"vst %%v5, 80(%%r1,%1) \n\t"
|
||||
"vst %%v6, 96(%%r1,%1) \n\t"
|
||||
"vst %%v7, 112(%%r1,%1) \n\t"
|
||||
"vl %%v0, 0(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v1, 16(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v2, 32(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v3, 48(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v4, 64(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v5, 80(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v6, 96(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v7, 112(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v0, 0(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v1, 16(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v2, 32(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v3, 48(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v4, 64(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v5, 80(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v6, 96(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v7, 112(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vl %%v0, 128(%%r1,%2) \n\t"
|
||||
"vl %%v1, 144(%%r1,%2) \n\t"
|
||||
"vl %%v2, 160(%%r1,%2) \n\t"
|
||||
"vl %%v3, 176(%%r1,%2) \n\t"
|
||||
"vl %%v4, 192(%%r1,%2) \n\t"
|
||||
"vl %%v5, 208(%%r1,%2) \n\t"
|
||||
"vl %%v6, 224(%%r1,%2) \n\t"
|
||||
"vl %%v7, 240(%%r1,%2) \n\t"
|
||||
"vst %%v0, 128(%%r1,%1) \n\t"
|
||||
"vst %%v1, 144(%%r1,%1) \n\t"
|
||||
"vst %%v2, 160(%%r1,%1) \n\t"
|
||||
"vst %%v3, 176(%%r1,%1) \n\t"
|
||||
"vst %%v4, 192(%%r1,%1) \n\t"
|
||||
"vst %%v5, 208(%%r1,%1) \n\t"
|
||||
"vst %%v6, 224(%%r1,%1) \n\t"
|
||||
"vst %%v7, 240(%%r1,%1) \n\t"
|
||||
"vl %%v0, 128(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v1, 144(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v2, 160(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v3, 176(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v4, 192(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v5, 208(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v6, 224(%%r1,%[ptr_y]) \n\t"
|
||||
"vl %%v7, 240(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v0, 128(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v1, 144(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v2, 160(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v3, 176(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v4, 192(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v5, 208(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v6, 224(%%r1,%[ptr_x]) \n\t"
|
||||
"vst %%v7, 240(%%r1,%[ptr_x]) \n\t"
|
||||
|
||||
"vst %%v16, 0(%%r1,%2) \n\t"
|
||||
"vst %%v17, 16(%%r1,%2) \n\t"
|
||||
"vst %%v18, 32(%%r1,%2) \n\t"
|
||||
"vst %%v19, 48(%%r1,%2) \n\t"
|
||||
"vst %%v20, 64(%%r1,%2) \n\t"
|
||||
"vst %%v21, 80(%%r1,%2) \n\t"
|
||||
"vst %%v22, 96(%%r1,%2) \n\t"
|
||||
"vst %%v23, 112(%%r1,%2) \n\t"
|
||||
"vst %%v24, 128(%%r1,%2) \n\t"
|
||||
"vst %%v25, 144(%%r1,%2) \n\t"
|
||||
"vst %%v26, 160(%%r1,%2) \n\t"
|
||||
"vst %%v27, 176(%%r1,%2) \n\t"
|
||||
"vst %%v28, 192(%%r1,%2) \n\t"
|
||||
"vst %%v29, 208(%%r1,%2) \n\t"
|
||||
"vst %%v30, 224(%%r1,%2) \n\t"
|
||||
"vst %%v31, 240(%%r1,%2) \n\t"
|
||||
"vst %%v16, 0(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v17, 16(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v18, 32(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v19, 48(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v20, 64(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v21, 80(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v22, 96(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v23, 112(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v24, 128(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v25, 144(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v26, 160(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v27, 176(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v28, 192(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v29, 208(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v30, 224(%%r1,%[ptr_y]) \n\t"
|
||||
"vst %%v31, 240(%%r1,%[ptr_y]) \n\t"
|
||||
|
||||
|
||||
"la %%r1,256(%%r1) \n\t"
|
||||
"brctg %%r0,1b"
|
||||
:
|
||||
: "r"(n), "a"(x), "a"(y)
|
||||
:"cc", "memory","r0","r1", "v0","v1","v2","v3","v4","v5","v6","v7","v16",
|
||||
"la %%r1,256(%%r1) \n\t"
|
||||
"brctg %[n_tmp],1b"
|
||||
: [mem_x] "+m" (*(double (*)[2*n])x),
|
||||
[mem_y] "+m" (*(double (*)[2*n])y),
|
||||
[n_tmp] "+&r"(n)
|
||||
: [ptr_x] "a"(x), [ptr_y] "a"(y)
|
||||
: "cc", "memory", "r1", "v0","v1","v2","v3","v4","v5","v6","v7","v16",
|
||||
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||
);
|
||||
return;
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue