diff --git a/kernel/generic/trmm_ltcopy_2.c b/kernel/generic/trmm_ltcopy_2.c index 60cdeed1c..e9ad45fa0 100644 --- a/kernel/generic/trmm_ltcopy_2.c +++ b/kernel/generic/trmm_ltcopy_2.c @@ -116,22 +116,34 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (m & 1) { if (X > posY) { - /* ao1 += 1; - ao2 += 1; */ + ao1 += 1; + ao2 += 1; b += 2; } else -#ifdef UNIT if (X < posY) { -#endif - b[ 0] = *(ao1 + 0); -#ifdef UNIT + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + b[ 0] = data01; + b[ 1] = data02; + ao1 += lda; + b += 2; } else { +#ifdef UNIT + data02 = *(ao1 + 1); b[ 0] = ONE; + b[ 1] = data02; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + b[ 0] = data01; + b[ 1] = data02; +#endif + ao1 += 2; + b += 2; } -#endif - b[ 1] = *(ao1 + 1); - b += 2; } posY += 2; @@ -178,7 +190,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } while (i > 0); } - // posY += 1; + posY += 1; } return 0; diff --git a/kernel/generic/trmm_utcopy_16.c b/kernel/generic/trmm_utcopy_16.c index 12642e7db..b83989f55 100644 --- a/kernel/generic/trmm_utcopy_16.c +++ b/kernel/generic/trmm_utcopy_16.c @@ -518,7 +518,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i = (m & 15); if (i > 0) { if (X < posY) { - /* a01 += i; + a01 += i; a02 += i; a03 += i; a04 += i; @@ -533,7 +533,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON a13 += i; a14 += i; a15 += i; - a16 += i; */ + a16 += i; b += 16 * i; } else if (X > posY) { @@ -1130,14 +1130,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i = (m & 7); if (i > 0) { if (X < posY) { - /* a01 += i; + a01 += i; a02 += i; a03 += i; a04 += i; a05 += i; a06 += i; a07 += i; - a08 += i; */ + a08 += i; b += 8 * i; } else if (X > posY) { @@ -1156,13 +1156,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b += 8; } - /* a02 += i * lda; + a02 += i * lda; a03 += i * lda; a04 += i * lda; a05 += i * lda; a06 += i * lda; a07 += i * lda; - a08 += i * lda; */ + a08 += i * lda; } else { #ifdef UNIT b[ 0] = ONE; @@ -1371,10 +1371,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i = (m & 3); if (i > 0) { if (X < posY) { - /* a01 += i; + a01 += i; a02 += i; a03 += i; - a04 += i; */ + a04 += i; b += 4 * i; } else if (X > posY) { @@ -1387,9 +1387,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON a01 += lda; b += 4; } - /* a02 += lda; + a02 += lda; a03 += lda; - a04 += lda; */ + a04 += lda; } else { #ifdef UNIT @@ -1487,19 +1487,23 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (X < posY) { a01 ++; a02 ++; - } else { -#ifdef UNIT + b += 2; + } else if (X > posY) { -#endif b[ 0] = *(a01 + 0); -#ifdef UNIT + b[ 1] = *(a01 + 1); + a01 += lda; + b += 2; } else { +#ifdef UNIT b[ 0] = ONE; - } + b[ 1] = *(a01 + 1); +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); #endif - b[ 1] = *(a01 + 1); - } - b += 2; + b += 2; + } } posY += 2; } @@ -1518,25 +1522,28 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (i > 0) { do { if (X < posY) { - a01 ++; - } else { -#ifdef UNIT + a01 += 1; + b ++; + } else if (X > posY) { -#endif b[ 0] = *(a01 + 0); -#ifdef UNIT + a01 += lda; + b ++; } else { +#ifdef UNIT b[ 0] = ONE; - } +#else + b[ 0] = *(a01 + 0); #endif - a01 += lda; - } - b ++; - X ++; - i --; + a01 += lda; + b ++; + } + + X += 1; + i --; } while (i > 0); } - // posY += 1; + posY += 1; } return 0; diff --git a/kernel/generic/trmm_utcopy_2.c b/kernel/generic/trmm_utcopy_2.c index 75076c382..ae4a19e32 100644 --- a/kernel/generic/trmm_utcopy_2.c +++ b/kernel/generic/trmm_utcopy_2.c @@ -117,8 +117,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (m & 1) { if (X < posY) { - /* ao1 += 1; - ao2 += 1; */ + ao1 += 1; + ao2 += 1; b += 2; } else if (X > posY) { @@ -127,7 +127,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 0] = data01; b[ 1] = data02; - // ao1 += lda; + ao1 += lda; b += 2; } else { #ifdef UNIT @@ -139,7 +139,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 0] = data01; b[ 1] = ZERO; #endif - // ao1 += lda; + ao1 += lda; b += 2; } } @@ -161,18 +161,27 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i = m; if (m > 0) { do { + if (X < posY) { + b += 1; + ao1 += 1; + } else + if (X > posY) { + data01 = *(ao1 + 0); + b[ 0] = data01; + b += 1; + ao1 += lda; + } else { #ifdef UNIT - if (X > posY) { + b[ 0] = ONE; +#else + data01 = *(ao1 + 0); + b[ 0] = data01; #endif - b[ 0] = *(ao1 + 0); -#ifdef UNIT - } else { - b[ 0] = ONE; - } -#endif - b ++; - ao1 += lda; - X ++; + b += 1; + ao1 += lda; + } + + X += 1; i --; } while (i > 0); } diff --git a/kernel/generic/trmm_utcopy_4.c b/kernel/generic/trmm_utcopy_4.c index e5844094e..441f7338b 100644 --- a/kernel/generic/trmm_utcopy_4.c +++ b/kernel/generic/trmm_utcopy_4.c @@ -201,18 +201,18 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (X < posY) { if (m & 2) { - /* ao1 += 2; + ao1 += 2; ao2 += 2; ao3 += 2; - ao4 += 2; */ + ao4 += 2; b += 8; } if (m & 1) { - /* ao1 += 1; + ao1 += 1; ao2 += 1; ao3 += 1; - ao4 += 1; */ + ao4 += 1; b += 4; } @@ -238,7 +238,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 7] = data08; ao1 += 2 * lda; - // ao2 += 2 * lda; + ao2 += 2 * lda; b += 8; } @@ -253,7 +253,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 2] = data03; b[ 3] = data04; - // ao1 += lda; + ao1 += lda; b += 4; } @@ -401,7 +401,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (i) { if (X < posY) { - // ao1 += 2; + ao1 += 2; b += 2; } else if (X > posY) { @@ -411,7 +411,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 0] = data01; b[ 1] = data02; - // ao1 += lda; + ao1 += lda; b += 2; } else { #ifdef UNIT @@ -443,21 +443,26 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON do { if (X < posY) { + b += 1; ao1 += 1; - } else { -#ifdef UNIT + } else if (X > posY) { -#endif - b[ 0] = *(ao1 + 0); -#ifdef UNIT + data01 = *(ao1 + 0); + b[ 0] = data01; + ao1 += lda; + b += 1; } else { +#ifdef UNIT b[ 0] = ONE; - } +#else + data01 = *(ao1 + 0); + b[ 0] = data01; #endif - ao1 += lda; - } - b ++; - X ++; + ao1 += lda; + b += 1; + } + + X += 1; i --; } while (i > 0); } diff --git a/kernel/generic/trsm_ltcopy_4.c b/kernel/generic/trsm_ltcopy_4.c index 07bb137d4..12043eb33 100644 --- a/kernel/generic/trsm_ltcopy_4.c +++ b/kernel/generic/trsm_ltcopy_4.c @@ -206,7 +206,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT } a1 += 2 * lda; - // a2 += 2 * lda; + a2 += 2 * lda; b += 8; ii += 2; diff --git a/kernel/generic/ztrmm_ltcopy_2.c b/kernel/generic/ztrmm_ltcopy_2.c index 7969f4f3d..457890ceb 100644 --- a/kernel/generic/ztrmm_ltcopy_2.c +++ b/kernel/generic/ztrmm_ltcopy_2.c @@ -139,18 +139,48 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } if (m & 1) { -#ifdef UNIT + + if (X > posY) { + ao1 += 2; + ao2 += 2; + b += 4; + + } else if (X < posY) { -#endif - b[ 0] = *(ao1 + 0); - b[ 1] = *(ao1 + 1); -#ifdef UNIT + data1 = *(ao1 + 0); + data2 = *(ao1 + 1); + data3 = *(ao1 + 2); + data4 = *(ao1 + 3); + + b[ 0] = data1; + b[ 1] = data2; + b[ 2] = data3; + b[ 3] = data4; + + ao1 += lda; + b += 4; } else { +#ifdef UNIT + data3 = *(ao1 + 2); + data4 = *(ao1 + 3); + b[ 0] = ONE; b[ 1] = ZERO; - } + b[ 2] = data3; + b[ 3] = data4; +#else + data1 = *(ao1 + 0); + data2 = *(ao1 + 1); + data3 = *(ao1 + 2); + data4 = *(ao1 + 3); + + b[ 0] = data1; + b[ 1] = data2; + b[ 2] = data3; + b[ 3] = data4; #endif - b += 4; + b += 4; + } } posY += 2; @@ -203,7 +233,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } while (i > 0); } - // posY += 1; + posY += 1; } return 0; diff --git a/kernel/generic/ztrsm_utcopy_1.c b/kernel/generic/ztrsm_utcopy_1.c index 0e33a7d18..08f85e891 100644 --- a/kernel/generic/ztrsm_utcopy_1.c +++ b/kernel/generic/ztrsm_utcopy_1.c @@ -43,7 +43,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT BLASLONG i, ii, j, jj; - FLOAT data01 = 0.0, data02 = 0.0; + FLOAT data01, data02; FLOAT *a1; lda *= 2; diff --git a/kernel/generic/ztrsm_utcopy_2.c b/kernel/generic/ztrsm_utcopy_2.c index c34d741ee..387bb2532 100644 --- a/kernel/generic/ztrsm_utcopy_2.c +++ b/kernel/generic/ztrsm_utcopy_2.c @@ -43,8 +43,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT BLASLONG i, ii, j, jj; - FLOAT data01 = 0.0, data02 = 0.0, data03, data04; - FLOAT data05, data06, data07 = 0.0, data08 = 0.0; + FLOAT data01, data02, data03, data04; + FLOAT data05, data06, data07, data08; FLOAT *a1, *a2; lda *= 2;