diff --git a/kernel/generic/zhemm_ltcopy_16.c b/kernel/generic/zhemm_ltcopy_16.c new file mode 100644 index 000000000..8797891ea --- /dev/null +++ b/kernel/generic/zhemm_ltcopy_16.c @@ -0,0 +1,1170 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT data17, data18, data19, data20, data21, data22, data23, data24; + FLOAT data25, data26, data27, data28, data29, data30, data31, data32; + + FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6, *ao7, *ao8; + FLOAT *ao9, *ao10, *ao11, *ao12, *ao13, *ao14, *ao15, *ao16; + + lda *= 2; + + js = (n >> 4); + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda; + if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda; + if (offset > -4) ao5 = a + (posX + 4) * 2 + posY * lda; else ao5 = a + posY * 2 + (posX + 4) * lda; + if (offset > -5) ao6 = a + (posX + 5) * 2 + posY * lda; else ao6 = a + posY * 2 + (posX + 5) * lda; + if (offset > -6) ao7 = a + (posX + 6) * 2 + posY * lda; else ao7 = a + posY * 2 + (posX + 6) * lda; + if (offset > -7) ao8 = a + (posX + 7) * 2 + posY * lda; else ao8 = a + posY * 2 + (posX + 7) * lda; + if (offset > -8) ao9 = a + (posX + 8) * 2 + posY * lda; else ao9 = a + posY * 2 + (posX + 8) * lda; + if (offset > -9) ao10 = a + (posX + 9) * 2 + posY * lda; else ao10 = a + posY * 2 + (posX + 9) * lda; + if (offset > -10) ao11 = a + (posX + 10) * 2 + posY * lda; else ao11 = a + posY * 2 + (posX + 10) * lda; + if (offset > -11) ao12 = a + (posX + 11) * 2 + posY * lda; else ao12 = a + posY * 2 + (posX + 11) * lda; + if (offset > -12) ao13 = a 
+ (posX + 12) * 2 + posY * lda; else ao13 = a + posY * 2 + (posX + 12) * lda; + if (offset > -13) ao14 = a + (posX + 13) * 2 + posY * lda; else ao14 = a + posY * 2 + (posX + 13) * lda; + if (offset > -14) ao15 = a + (posX + 14) * 2 + posY * lda; else ao15 = a + posY * 2 + (posX + 14) * lda; + if (offset > -15) ao16 = a + (posX + 15) * 2 + posY * lda; else ao16 = a + posY * 2 + (posX + 15) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + data09 = *(ao5 + 0); + data10 = *(ao5 + 1); + data11 = *(ao6 + 0); + data12 = *(ao6 + 1); + data13 = *(ao7 + 0); + data14 = *(ao7 + 1); + data15 = *(ao8 + 0); + data16 = *(ao8 + 1); + data17 = *(ao9 + 0); + data18 = *(ao9 + 1); + data19 = *(ao10 + 0); + data20 = *(ao10 + 1); + data21 = *(ao11 + 0); + data22 = *(ao11 + 1); + data23 = *(ao12 + 0); + data24 = *(ao12 + 1); + data25 = *(ao13 + 0); + data26 = *(ao13 + 1); + data27 = *(ao14 + 0); + data28 = *(ao14 + 1); + data29 = *(ao15 + 0); + data30 = *(ao15 + 1); + data31 = *(ao16 + 0); + data32 = *(ao16 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + if (offset > -2) ao3 += lda; else ao3 += 2; + if (offset > -3) ao4 += lda; else ao4 += 2; + if (offset > -4) ao5 += lda; else ao5 += 2; + if (offset > -5) ao6 += lda; else ao6 += 2; + if (offset > -6) ao7 += lda; else ao7 += 2; + if (offset > -7) ao8 += lda; else ao8 += 2; + if (offset > -8) ao9 += lda; else ao9 += 2; + if (offset > -9) ao10 += lda; else ao10 += 2; + if (offset > -10) ao11 += lda; else ao11 += 2; + if (offset > -11) ao12 += lda; else ao12 += 2; + if (offset > -12) ao13 += lda; else ao13 += 2; + if (offset > -13) ao14 += lda; else ao14 += 2; + if (offset > -14) ao15 += lda; else ao15 += 2; + if (offset > -15) ao16 += lda; else ao16 += 2; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = data02; + b[ 
2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + } else + if (offset < -15) { + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + b[16] = data17; + b[17] = -data18; + b[18] = data19; + b[19] = -data20; + b[20] = data21; + b[21] = -data22; + b[22] = data23; + b[23] = -data24; + b[24] = data25; + b[25] = -data26; + b[26] = data27; + b[27] = -data28; + b[28] = data29; + b[29] = -data30; + b[30] = data31; + b[31] = -data32; + } else { + switch (offset) { + case 0 : + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + break; + case -1 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = ZERO; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] 
= data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + break; + case -2 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = ZERO; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + break; + case -3 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = ZERO; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + break; + case -4 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = ZERO; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = 
data23; + b[23] = data24; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + break; + case -5 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = ZERO; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + break; + case -6 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = ZERO; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + break; + case -7 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = ZERO; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + break; + case -8 : + b[ 
0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + b[16] = data17; + b[17] = ZERO; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + break; + case -9 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + b[16] = data17; + b[17] = -data18; + b[18] = data19; + b[19] = ZERO; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + break; + case -10 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + b[16] = data17; + b[17] = -data18; + b[18] = data19; + b[19] = -data20; + b[20] = data21; + b[21] = ZERO; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + break; + case -11 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + 
b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + b[16] = data17; + b[17] = -data18; + b[18] = data19; + b[19] = -data20; + b[20] = data21; + b[21] = -data22; + b[22] = data23; + b[23] = ZERO; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + break; + case -12 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + b[16] = data17; + b[17] = -data18; + b[18] = data19; + b[19] = -data20; + b[20] = data21; + b[21] = -data22; + b[22] = data23; + b[23] = -data24; + b[24] = data25; + b[25] = ZERO; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + break; + case -13 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + b[16] = data17; + b[17] = -data18; + b[18] = data19; + b[19] = -data20; + b[20] = data21; + b[21] = -data22; + b[22] = data23; + b[23] = -data24; + b[24] = data25; + b[25] = -data26; + b[26] = data27; + b[27] = ZERO; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + break; + case -14 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + b[16] = data17; + b[17] = -data18; + b[18] = data19; + b[19] = 
-data20; + b[20] = data21; + b[21] = -data22; + b[22] = data23; + b[23] = -data24; + b[24] = data25; + b[25] = -data26; + b[26] = data27; + b[27] = -data28; + b[28] = data29; + b[29] = ZERO; + b[30] = data31; + b[31] = data32; + break; + case -15 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + b[16] = data17; + b[17] = -data18; + b[18] = data19; + b[19] = -data20; + b[20] = data21; + b[21] = -data22; + b[22] = data23; + b[23] = -data24; + b[24] = data25; + b[25] = -data26; + b[26] = data27; + b[27] = -data28; + b[28] = data29; + b[29] = -data30; + b[30] = data31; + b[31] = ZERO; + break; + } + } + + b += 32; + + offset --; + i --; + } + + posX += 16; + js --; + } + + if (n & 8) { + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda; + if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda; + if (offset > -4) ao5 = a + (posX + 4) * 2 + posY * lda; else ao5 = a + posY * 2 + (posX + 4) * lda; + if (offset > -5) ao6 = a + (posX + 5) * 2 + posY * lda; else ao6 = a + posY * 2 + (posX + 5) * lda; + if (offset > -6) ao7 = a + (posX + 6) * 2 + posY * lda; else ao7 = a + posY * 2 + (posX + 6) * lda; + if (offset > -7) ao8 = a + (posX + 7) * 2 + posY * lda; else ao8 = a + posY * 2 + (posX + 7) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + 
data09 = *(ao5 + 0); + data10 = *(ao5 + 1); + data11 = *(ao6 + 0); + data12 = *(ao6 + 1); + data13 = *(ao7 + 0); + data14 = *(ao7 + 1); + data15 = *(ao8 + 0); + data16 = *(ao8 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + if (offset > -2) ao3 += lda; else ao3 += 2; + if (offset > -3) ao4 += lda; else ao4 += 2; + if (offset > -4) ao5 += lda; else ao5 += 2; + if (offset > -5) ao6 += lda; else ao6 += 2; + if (offset > -6) ao7 += lda; else ao7 += 2; + if (offset > -7) ao8 += lda; else ao8 += 2; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + } else + if (offset < -7) { + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + } else { + switch (offset) { + case 0 : + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + break; + case -1 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = ZERO; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + break; + case -2 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = ZERO; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = 
data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + break; + case -3 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = ZERO; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + break; + case -4 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = ZERO; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + break; + case -5 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = ZERO; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + break; + case -6 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = ZERO; + b[14] = data15; + b[15] = data16; + break; + case -7 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = ZERO; + break; + } + } + + b += 16; + + offset --; + i --; + } + + posX += 8; + } + + if (n & 4) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + 
(posX + 1) * lda; + if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda; + if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + if (offset > -2) ao3 += lda; else ao3 += 2; + if (offset > -3) ao4 += lda; else ao4 += 2; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + } else + if (offset < -3) { + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + } else { + switch (offset) { + case 0 : + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + break; + case -1 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = ZERO; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + break; + case -2 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = ZERO; + b[ 6] = data07; + b[ 7] = data08; + break; + case -3 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = ZERO; + break; + } + } + + b += 8; + + offset --; + i --; + } + + posX += 4; + } + + if (n & 2) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + + i = m; + + while (i > 0) { 
+ data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + } else + if (offset < -1) { + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + } else { + switch (offset) { + case 0 : + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data03; + b[ 3] = data04; + break; + case -1 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = ZERO; + break; + } + } + + b += 4; + + offset --; + i --; + } + + posX += 2; + + } + + if (n & 1) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = data02; + } else + if (offset < 0) { + b[ 0] = data01; + b[ 1] = -data02; + } else { + b[ 0] = data01; + b[ 1] = ZERO; + } + + b += 2; + + offset --; + i --; + } + + } + + return 0; +} diff --git a/kernel/generic/zhemm_utcopy_16.c b/kernel/generic/zhemm_utcopy_16.c new file mode 100644 index 000000000..822483a83 --- /dev/null +++ b/kernel/generic/zhemm_utcopy_16.c @@ -0,0 +1,1168 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT data17, data18, data19, data20, data21, data22, data23, data24; + FLOAT data25, data26, data27, data28, data29, data30, data31, data32; + + FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6, *ao7, *ao8; + FLOAT *ao9, *ao10, *ao11, *ao12, *ao13, *ao14, *ao15, *ao16; + + lda *= 2; + + js = (n >> 4); + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda; + if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda; + if (offset > -4) ao5 = a + posY * 2 + (posX + 4) * lda; else ao5 = a + (posX + 4) * 2 + posY * lda; + if (offset > -5) ao6 = a + posY * 2 + (posX + 5) * lda; else ao6 = a + (posX + 5) * 2 + posY * lda; + if (offset > -6) ao7 = a + posY * 2 + (posX + 6) * lda; else ao7 = a + (posX + 6) * 2 + posY * lda; + if (offset > -7) ao8 = a + posY * 2 + (posX + 7) * lda; else ao8 = a + (posX + 7) * 2 + posY * lda; + if (offset > -8) ao9 = a + posY * 2 + (posX + 8) * lda; else ao9 = a + (posX + 8) * 2 + posY * lda; + if (offset > -9) ao10 = a + posY * 2 + (posX + 9) * lda; else ao10 = a + (posX + 9) * 2 + posY * lda; + if (offset > -10) ao11 = a + posY * 2 + (posX + 10) * lda; else ao11 = a + (posX + 10) * 2 + posY * lda; + if (offset > -11) ao12 = a + posY * 2 + (posX + 11) * lda; else ao12 = a + (posX + 11) * 2 + posY * lda; + if (offset > -12) ao13 = a 
+ posY * 2 + (posX + 12) * lda; else ao13 = a + (posX + 12) * 2 + posY * lda; + if (offset > -13) ao14 = a + posY * 2 + (posX + 13) * lda; else ao14 = a + (posX + 13) * 2 + posY * lda; + if (offset > -14) ao15 = a + posY * 2 + (posX + 14) * lda; else ao15 = a + (posX + 14) * 2 + posY * lda; + if (offset > -15) ao16 = a + posY * 2 + (posX + 15) * lda; else ao16 = a + (posX + 15) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + data09 = *(ao5 + 0); + data10 = *(ao5 + 1); + data11 = *(ao6 + 0); + data12 = *(ao6 + 1); + data13 = *(ao7 + 0); + data14 = *(ao7 + 1); + data15 = *(ao8 + 0); + data16 = *(ao8 + 1); + data17 = *(ao9 + 0); + data18 = *(ao9 + 1); + data19 = *(ao10 + 0); + data20 = *(ao10 + 1); + data21 = *(ao11 + 0); + data22 = *(ao11 + 1); + data23 = *(ao12 + 0); + data24 = *(ao12 + 1); + data25 = *(ao13 + 0); + data26 = *(ao13 + 1); + data27 = *(ao14 + 0); + data28 = *(ao14 + 1); + data29 = *(ao15 + 0); + data30 = *(ao15 + 1); + data31 = *(ao16 + 0); + data32 = *(ao16 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + if (offset > -2) ao3 += 2; else ao3 += lda; + if (offset > -3) ao4 += 2; else ao4 += lda; + if (offset > -4) ao5 += 2; else ao5 += lda; + if (offset > -5) ao6 += 2; else ao6 += lda; + if (offset > -6) ao7 += 2; else ao7 += lda; + if (offset > -7) ao8 += 2; else ao8 += lda; + if (offset > -8) ao9 += 2; else ao9 += lda; + if (offset > -9) ao10 += 2; else ao10 += lda; + if (offset > -10) ao11 += 2; else ao11 += lda; + if (offset > -11) ao12 += 2; else ao12 += lda; + if (offset > -12) ao13 += 2; else ao13 += lda; + if (offset > -13) ao14 += 2; else ao14 += lda; + if (offset > -14) ao15 += 2; else ao15 += lda; + if (offset > -15) ao16 += 2; else ao16 += lda; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = -data02; + 
b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + b[16] = data17; + b[17] = -data18; + b[18] = data19; + b[19] = -data20; + b[20] = data21; + b[21] = -data22; + b[22] = data23; + b[23] = -data24; + b[24] = data25; + b[25] = -data26; + b[26] = data27; + b[27] = -data28; + b[28] = data29; + b[29] = -data30; + b[30] = data31; + b[31] = -data32; + } else + if (offset < -15) { + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + } else { + switch (offset) { + case 0 : + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + b[16] = data17; + b[17] = -data18; + b[18] = data19; + b[19] = -data20; + b[20] = data21; + b[21] = -data22; + b[22] = data23; + b[23] = -data24; + b[24] = data25; + b[25] = -data26; + b[26] = data27; + b[27] = -data28; + b[28] = data29; + b[29] = -data30; + b[30] = data31; + b[31] = -data32; + break; + case -1 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = ZERO; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + 
b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + b[16] = data17; + b[17] = -data18; + b[18] = data19; + b[19] = -data20; + b[20] = data21; + b[21] = -data22; + b[22] = data23; + b[23] = -data24; + b[24] = data25; + b[25] = -data26; + b[26] = data27; + b[27] = -data28; + b[28] = data29; + b[29] = -data30; + b[30] = data31; + b[31] = -data32; + break; + case -2 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = ZERO; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + b[16] = data17; + b[17] = -data18; + b[18] = data19; + b[19] = -data20; + b[20] = data21; + b[21] = -data22; + b[22] = data23; + b[23] = -data24; + b[24] = data25; + b[25] = -data26; + b[26] = data27; + b[27] = -data28; + b[28] = data29; + b[29] = -data30; + b[30] = data31; + b[31] = -data32; + break; + case -3 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = ZERO; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + b[16] = data17; + b[17] = -data18; + b[18] = data19; + b[19] = -data20; + b[20] = data21; + b[21] = -data22; + b[22] = data23; + b[23] = -data24; + b[24] = data25; + b[25] = -data26; + b[26] = data27; + b[27] = -data28; + b[28] = data29; + b[29] = -data30; + b[30] = data31; + b[31] = -data32; + break; + case -4 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = ZERO; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + b[16] = data17; + b[17] = -data18; + b[18] = data19; + b[19] = 
-data20; + b[20] = data21; + b[21] = -data22; + b[22] = data23; + b[23] = -data24; + b[24] = data25; + b[25] = -data26; + b[26] = data27; + b[27] = -data28; + b[28] = data29; + b[29] = -data30; + b[30] = data31; + b[31] = -data32; + break; + case -5 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = ZERO; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + b[16] = data17; + b[17] = -data18; + b[18] = data19; + b[19] = -data20; + b[20] = data21; + b[21] = -data22; + b[22] = data23; + b[23] = -data24; + b[24] = data25; + b[25] = -data26; + b[26] = data27; + b[27] = -data28; + b[28] = data29; + b[29] = -data30; + b[30] = data31; + b[31] = -data32; + break; + case -6 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = ZERO; + b[14] = data15; + b[15] = -data16; + b[16] = data17; + b[17] = -data18; + b[18] = data19; + b[19] = -data20; + b[20] = data21; + b[21] = -data22; + b[22] = data23; + b[23] = -data24; + b[24] = data25; + b[25] = -data26; + b[26] = data27; + b[27] = -data28; + b[28] = data29; + b[29] = -data30; + b[30] = data31; + b[31] = -data32; + break; + case -7 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = ZERO; + b[16] = data17; + b[17] = -data18; + b[18] = data19; + b[19] = -data20; + b[20] = data21; + b[21] = -data22; + b[22] = data23; + b[23] = -data24; + b[24] = data25; + b[25] = -data26; + b[26] = data27; + b[27] = -data28; + b[28] = data29; + b[29] = 
-data30; + b[30] = data31; + b[31] = -data32; + break; + case -8 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = ZERO; + b[18] = data19; + b[19] = -data20; + b[20] = data21; + b[21] = -data22; + b[22] = data23; + b[23] = -data24; + b[24] = data25; + b[25] = -data26; + b[26] = data27; + b[27] = -data28; + b[28] = data29; + b[29] = -data30; + b[30] = data31; + b[31] = -data32; + break; + case -9 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = ZERO; + b[20] = data21; + b[21] = -data22; + b[22] = data23; + b[23] = -data24; + b[24] = data25; + b[25] = -data26; + b[26] = data27; + b[27] = -data28; + b[28] = data29; + b[29] = -data30; + b[30] = data31; + b[31] = -data32; + break; + case -10 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = ZERO; + b[22] = data23; + b[23] = -data24; + b[24] = data25; + b[25] = -data26; + b[26] = data27; + b[27] = -data28; + b[28] = data29; + b[29] = -data30; + b[30] = data31; + b[31] = -data32; + break; + case -11 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + 
b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = ZERO; + b[24] = data25; + b[25] = -data26; + b[26] = data27; + b[27] = -data28; + b[28] = data29; + b[29] = -data30; + b[30] = data31; + b[31] = -data32; + break; + case -12 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = ZERO; + b[26] = data27; + b[27] = -data28; + b[28] = data29; + b[29] = -data30; + b[30] = data31; + b[31] = -data32; + break; + case -13 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = ZERO; + b[28] = data29; + b[29] = -data30; + b[30] = data31; + b[31] = -data32; + break; + case -14 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = data18; + 
b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = ZERO; + b[30] = data31; + b[31] = -data32; + break; + case -15 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = ZERO; + break; + } + } + + b += 32; + + offset --; + i --; + } + + posX += 16; + js --; + } + + if (n & 8) { + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda; + if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda; + if (offset > -4) ao5 = a + posY * 2 + (posX + 4) * lda; else ao5 = a + (posX + 4) * 2 + posY * lda; + if (offset > -5) ao6 = a + posY * 2 + (posX + 5) * lda; else ao6 = a + (posX + 5) * 2 + posY * lda; + if (offset > -6) ao7 = a + posY * 2 + (posX + 6) * lda; else ao7 = a + (posX + 6) * 2 + posY * lda; + if (offset > -7) ao8 = a + posY * 2 + (posX + 7) * lda; else ao8 = a + (posX + 7) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 
1); + data09 = *(ao5 + 0); + data10 = *(ao5 + 1); + data11 = *(ao6 + 0); + data12 = *(ao6 + 1); + data13 = *(ao7 + 0); + data14 = *(ao7 + 1); + data15 = *(ao8 + 0); + data16 = *(ao8 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + if (offset > -2) ao3 += 2; else ao3 += lda; + if (offset > -3) ao4 += 2; else ao4 += lda; + if (offset > -4) ao5 += 2; else ao5 += lda; + if (offset > -5) ao6 += 2; else ao6 += lda; + if (offset > -6) ao7 += 2; else ao7 += lda; + if (offset > -7) ao8 += 2; else ao8 += lda; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + } else + if (offset < -7) { + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + } else { + switch (offset) { + case 0 : + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + break; + case -1 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = ZERO; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + break; + case -2 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = ZERO; + b[ 6] = data07; + b[ 7] = 
-data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + break; + case -3 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = ZERO; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + break; + case -4 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = ZERO; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + break; + case -5 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = ZERO; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + break; + case -6 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = ZERO; + b[14] = data15; + b[15] = -data16; + break; + case -7 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = ZERO; + break; + } + } + + b += 16; + + offset --; + i --; + } + + posX += 8; + } + + if (n & 4) { + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + 
(posX + 1) * 2 + posY * lda; + if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda; + if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + if (offset > -2) ao3 += 2; else ao3 += lda; + if (offset > -3) ao4 += 2; else ao4 += lda; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + } else + if (offset < -3) { + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + } else { + switch (offset) { + case 0 : + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + break; + case -1 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = ZERO; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + break; + case -2 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = ZERO; + b[ 6] = data07; + b[ 7] = -data08; + break; + case -3 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = ZERO; + break; + } + } + + b += 8; + + offset --; + i --; + } + + posX += 4; + } + + if (n & 2) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + + i = m; + + while 
(i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + } else + if (offset < -1) { + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + } else { + switch (offset) { + case 0 : + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data03; + b[ 3] = -data04; + break; + case -1 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = ZERO; + break; + } + } + + b += 4; + + offset --; + i --; + } + + posX += 2; + } + + if (n & 1) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = -data02; + } else + if (offset < 0) { + b[ 0] = data01; + b[ 1] = data02; + } else { + b[ 0] = data01; + b[ 1] = ZERO; + } + + b += 2; + + offset --; + i --; + } + + } + + return 0; +} diff --git a/kernel/generic/zneg_tcopy_16.c b/kernel/generic/zneg_tcopy_16.c new file mode 100644 index 000000000..50f5a3d37 --- /dev/null +++ b/kernel/generic/zneg_tcopy_16.c @@ -0,0 +1,587 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + + BLASLONG i, j; + + FLOAT *aoffset; + FLOAT *aoffset1, *aoffset2; + + FLOAT *boffset; + + FLOAT ctemp01, ctemp02, ctemp03, ctemp04; + FLOAT ctemp05, ctemp06, ctemp07, ctemp08; + FLOAT ctemp09, ctemp10, ctemp11, ctemp12; + FLOAT ctemp13, ctemp14, ctemp15, ctemp16; + FLOAT ctemp17, ctemp18, ctemp19, ctemp20; + FLOAT ctemp21, ctemp22, ctemp23, ctemp24; + FLOAT ctemp25, ctemp26, ctemp27, ctemp28; + FLOAT ctemp29, ctemp30, ctemp31, ctemp32; + + FLOAT ctemp33, ctemp34, ctemp35, ctemp36; + FLOAT ctemp37, ctemp38, ctemp39, ctemp40; + FLOAT ctemp41, ctemp42, ctemp43, ctemp44; + FLOAT ctemp45, ctemp46, ctemp47, ctemp48; + FLOAT ctemp49, ctemp50, ctemp51, ctemp52; + FLOAT ctemp53, ctemp54, ctemp55, ctemp56; + FLOAT ctemp57, ctemp58, ctemp59, ctemp60; + FLOAT ctemp61, ctemp62, ctemp63, ctemp64; + + aoffset = a; + boffset = b; + lda *= 2; + +#if 0 + fprintf(stderr, "M = %d N = %d\n", m, n); +#endif + + j = (n >> 4); + if (j > 0){ + do{ + aoffset1 = aoffset; + aoffset2 = aoffset + lda; + aoffset += 32; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + ctemp09 = *(aoffset1 + 8); + ctemp10 = *(aoffset1 + 9); + ctemp11 = *(aoffset1 + 10); + ctemp12 = *(aoffset1 + 11); + ctemp13 = *(aoffset1 + 12); + ctemp14 = *(aoffset1 + 13); + ctemp15 = *(aoffset1 + 14); + ctemp16 = *(aoffset1 + 15); + ctemp17 = *(aoffset1 + 16); + ctemp18 = *(aoffset1 + 17); + ctemp19 = *(aoffset1 + 18); + ctemp20 = *(aoffset1 + 19); + ctemp21 = *(aoffset1 + 20); + ctemp22 = *(aoffset1 + 21); + ctemp23 = *(aoffset1 + 22); + ctemp24 = *(aoffset1 + 23); + ctemp25 = *(aoffset1 + 24); + ctemp26 = *(aoffset1 + 
25); + ctemp27 = *(aoffset1 + 26); + ctemp28 = *(aoffset1 + 27); + ctemp29 = *(aoffset1 + 28); + ctemp30 = *(aoffset1 + 29); + ctemp31 = *(aoffset1 + 30); + ctemp32 = *(aoffset1 + 31); + + ctemp33 = *(aoffset2 + 0); + ctemp34 = *(aoffset2 + 1); + ctemp35 = *(aoffset2 + 2); + ctemp36 = *(aoffset2 + 3); + ctemp37 = *(aoffset2 + 4); + ctemp38 = *(aoffset2 + 5); + ctemp39 = *(aoffset2 + 6); + ctemp40 = *(aoffset2 + 7); + ctemp41 = *(aoffset2 + 8); + ctemp42 = *(aoffset2 + 9); + ctemp43 = *(aoffset2 + 10); + ctemp44 = *(aoffset2 + 11); + ctemp45 = *(aoffset2 + 12); + ctemp46 = *(aoffset2 + 13); + ctemp47 = *(aoffset2 + 14); + ctemp48 = *(aoffset2 + 15); + ctemp49 = *(aoffset2 + 16); + ctemp50 = *(aoffset2 + 17); + ctemp51 = *(aoffset2 + 18); + ctemp52 = *(aoffset2 + 19); + ctemp53 = *(aoffset2 + 20); + ctemp54 = *(aoffset2 + 21); + ctemp55 = *(aoffset2 + 22); + ctemp56 = *(aoffset2 + 23); + ctemp57 = *(aoffset2 + 24); + ctemp58 = *(aoffset2 + 25); + ctemp59 = *(aoffset2 + 26); + ctemp60 = *(aoffset2 + 27); + ctemp61 = *(aoffset2 + 28); + ctemp62 = *(aoffset2 + 29); + ctemp63 = *(aoffset2 + 30); + ctemp64 = *(aoffset2 + 31); + + *(boffset + 0) = -ctemp01; + *(boffset + 1) = -ctemp02; + *(boffset + 2) = -ctemp03; + *(boffset + 3) = -ctemp04; + *(boffset + 4) = -ctemp05; + *(boffset + 5) = -ctemp06; + *(boffset + 6) = -ctemp07; + *(boffset + 7) = -ctemp08; + + *(boffset + 8) = -ctemp09; + *(boffset + 9) = -ctemp10; + *(boffset + 10) = -ctemp11; + *(boffset + 11) = -ctemp12; + *(boffset + 12) = -ctemp13; + *(boffset + 13) = -ctemp14; + *(boffset + 14) = -ctemp15; + *(boffset + 15) = -ctemp16; + + *(boffset + 16) = -ctemp17; + *(boffset + 17) = -ctemp18; + *(boffset + 18) = -ctemp19; + *(boffset + 19) = -ctemp20; + *(boffset + 20) = -ctemp21; + *(boffset + 21) = -ctemp22; + *(boffset + 22) = -ctemp23; + *(boffset + 23) = -ctemp24; + + *(boffset + 24) = -ctemp25; + *(boffset + 25) = -ctemp26; + *(boffset + 26) = -ctemp27; + *(boffset + 27) = -ctemp28; + *(boffset + 28) = 
-ctemp29; + *(boffset + 29) = -ctemp30; + *(boffset + 30) = -ctemp31; + *(boffset + 31) = -ctemp32; + + *(boffset + 32) = -ctemp33; + *(boffset + 33) = -ctemp34; + *(boffset + 34) = -ctemp35; + *(boffset + 35) = -ctemp36; + *(boffset + 36) = -ctemp37; + *(boffset + 37) = -ctemp38; + *(boffset + 38) = -ctemp39; + *(boffset + 39) = -ctemp40; + + *(boffset + 40) = -ctemp41; + *(boffset + 41) = -ctemp42; + *(boffset + 42) = -ctemp43; + *(boffset + 43) = -ctemp44; + *(boffset + 44) = -ctemp45; + *(boffset + 45) = -ctemp46; + *(boffset + 46) = -ctemp47; + *(boffset + 47) = -ctemp48; + + *(boffset + 48) = -ctemp49; + *(boffset + 49) = -ctemp50; + *(boffset + 50) = -ctemp51; + *(boffset + 51) = -ctemp52; + *(boffset + 52) = -ctemp53; + *(boffset + 53) = -ctemp54; + *(boffset + 54) = -ctemp55; + *(boffset + 55) = -ctemp56; + + *(boffset + 56) = -ctemp57; + *(boffset + 57) = -ctemp58; + *(boffset + 58) = -ctemp59; + *(boffset + 59) = -ctemp60; + *(boffset + 60) = -ctemp61; + *(boffset + 61) = -ctemp62; + *(boffset + 62) = -ctemp63; + *(boffset + 63) = -ctemp64; + + aoffset1 += 2 * lda; + aoffset2 += 2 * lda; + boffset += 64; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + ctemp09 = *(aoffset1 + 8); + ctemp10 = *(aoffset1 + 9); + ctemp11 = *(aoffset1 + 10); + ctemp12 = *(aoffset1 + 11); + ctemp13 = *(aoffset1 + 12); + ctemp14 = *(aoffset1 + 13); + ctemp15 = *(aoffset1 + 14); + ctemp16 = *(aoffset1 + 15); + ctemp17 = *(aoffset1 + 16); + ctemp18 = *(aoffset1 + 17); + ctemp19 = *(aoffset1 + 18); + ctemp20 = *(aoffset1 + 19); + ctemp21 = *(aoffset1 + 20); + ctemp22 = *(aoffset1 + 21); + ctemp23 = *(aoffset1 + 22); + ctemp24 = *(aoffset1 + 23); + ctemp25 = *(aoffset1 + 24); + ctemp26 = *(aoffset1 + 25); + ctemp27 = *(aoffset1 + 26); + ctemp28 = 
*(aoffset1 + 27); + ctemp29 = *(aoffset1 + 28); + ctemp30 = *(aoffset1 + 29); + ctemp31 = *(aoffset1 + 30); + ctemp32 = *(aoffset1 + 31); + + *(boffset + 0) = -ctemp01; + *(boffset + 1) = -ctemp02; + *(boffset + 2) = -ctemp03; + *(boffset + 3) = -ctemp04; + *(boffset + 4) = -ctemp05; + *(boffset + 5) = -ctemp06; + *(boffset + 6) = -ctemp07; + *(boffset + 7) = -ctemp08; + + *(boffset + 8) = -ctemp09; + *(boffset + 9) = -ctemp10; + *(boffset + 10) = -ctemp11; + *(boffset + 11) = -ctemp12; + *(boffset + 12) = -ctemp13; + *(boffset + 13) = -ctemp14; + *(boffset + 14) = -ctemp15; + *(boffset + 15) = -ctemp16; + + *(boffset + 16) = -ctemp17; + *(boffset + 17) = -ctemp18; + *(boffset + 18) = -ctemp19; + *(boffset + 19) = -ctemp20; + *(boffset + 20) = -ctemp21; + *(boffset + 21) = -ctemp22; + *(boffset + 22) = -ctemp23; + *(boffset + 23) = -ctemp24; + + *(boffset + 24) = -ctemp25; + *(boffset + 25) = -ctemp26; + *(boffset + 26) = -ctemp27; + *(boffset + 27) = -ctemp28; + *(boffset + 28) = -ctemp29; + *(boffset + 29) = -ctemp30; + *(boffset + 30) = -ctemp31; + *(boffset + 31) = -ctemp32; + + boffset += 32; + } + + j--; + }while(j > 0); + } /* end of if(j > 0) */ + + if (n & 8){ + aoffset1 = aoffset; + aoffset2 = aoffset + lda; + aoffset += 16; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + ctemp09 = *(aoffset1 + 8); + ctemp10 = *(aoffset1 + 9); + ctemp11 = *(aoffset1 + 10); + ctemp12 = *(aoffset1 + 11); + ctemp13 = *(aoffset1 + 12); + ctemp14 = *(aoffset1 + 13); + ctemp15 = *(aoffset1 + 14); + ctemp16 = *(aoffset1 + 15); + + ctemp17 = *(aoffset2 + 0); + ctemp18 = *(aoffset2 + 1); + ctemp19 = *(aoffset2 + 2); + ctemp20 = *(aoffset2 + 3); + ctemp21 = *(aoffset2 + 4); + ctemp22 = *(aoffset2 + 5); + ctemp23 = *(aoffset2 + 6); + ctemp24 = *(aoffset2 + 7); + 
ctemp25 = *(aoffset2 + 8); + ctemp26 = *(aoffset2 + 9); + ctemp27 = *(aoffset2 + 10); + ctemp28 = *(aoffset2 + 11); + ctemp29 = *(aoffset2 + 12); + ctemp30 = *(aoffset2 + 13); + ctemp31 = *(aoffset2 + 14); + ctemp32 = *(aoffset2 + 15); + + *(boffset + 0) = -ctemp01; + *(boffset + 1) = -ctemp02; + *(boffset + 2) = -ctemp03; + *(boffset + 3) = -ctemp04; + *(boffset + 4) = -ctemp05; + *(boffset + 5) = -ctemp06; + *(boffset + 6) = -ctemp07; + *(boffset + 7) = -ctemp08; + + *(boffset + 8) = -ctemp09; + *(boffset + 9) = -ctemp10; + *(boffset + 10) = -ctemp11; + *(boffset + 11) = -ctemp12; + *(boffset + 12) = -ctemp13; + *(boffset + 13) = -ctemp14; + *(boffset + 14) = -ctemp15; + *(boffset + 15) = -ctemp16; + + *(boffset + 16) = -ctemp17; + *(boffset + 17) = -ctemp18; + *(boffset + 18) = -ctemp19; + *(boffset + 19) = -ctemp20; + *(boffset + 20) = -ctemp21; + *(boffset + 21) = -ctemp22; + *(boffset + 22) = -ctemp23; + *(boffset + 23) = -ctemp24; + + *(boffset + 24) = -ctemp25; + *(boffset + 25) = -ctemp26; + *(boffset + 26) = -ctemp27; + *(boffset + 27) = -ctemp28; + *(boffset + 28) = -ctemp29; + *(boffset + 29) = -ctemp30; + *(boffset + 30) = -ctemp31; + *(boffset + 31) = -ctemp32; + + aoffset1 += 2 * lda; + aoffset2 += 2 * lda; + boffset += 32; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + ctemp09 = *(aoffset1 + 8); + ctemp10 = *(aoffset1 + 9); + ctemp11 = *(aoffset1 + 10); + ctemp12 = *(aoffset1 + 11); + ctemp13 = *(aoffset1 + 12); + ctemp14 = *(aoffset1 + 13); + ctemp15 = *(aoffset1 + 14); + ctemp16 = *(aoffset1 + 15); + + *(boffset + 0) = -ctemp01; + *(boffset + 1) = -ctemp02; + *(boffset + 2) = -ctemp03; + *(boffset + 3) = -ctemp04; + *(boffset + 4) = -ctemp05; + *(boffset + 5) = -ctemp06; + *(boffset + 6) = -ctemp07; + *(boffset + 7) = 
-ctemp08; + + *(boffset + 8) = -ctemp09; + *(boffset + 9) = -ctemp10; + *(boffset + 10) = -ctemp11; + *(boffset + 11) = -ctemp12; + *(boffset + 12) = -ctemp13; + *(boffset + 13) = -ctemp14; + *(boffset + 14) = -ctemp15; + *(boffset + 15) = -ctemp16; + + boffset += 16; + } + } + + if (n & 4){ + aoffset1 = aoffset; + aoffset2 = aoffset + lda; + aoffset += 8; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + + ctemp09 = *(aoffset2 + 0); + ctemp10 = *(aoffset2 + 1); + ctemp11 = *(aoffset2 + 2); + ctemp12 = *(aoffset2 + 3); + ctemp13 = *(aoffset2 + 4); + ctemp14 = *(aoffset2 + 5); + ctemp15 = *(aoffset2 + 6); + ctemp16 = *(aoffset2 + 7); + + *(boffset + 0) = -ctemp01; + *(boffset + 1) = -ctemp02; + *(boffset + 2) = -ctemp03; + *(boffset + 3) = -ctemp04; + *(boffset + 4) = -ctemp05; + *(boffset + 5) = -ctemp06; + *(boffset + 6) = -ctemp07; + *(boffset + 7) = -ctemp08; + + *(boffset + 8) = -ctemp09; + *(boffset + 9) = -ctemp10; + *(boffset + 10) = -ctemp11; + *(boffset + 11) = -ctemp12; + *(boffset + 12) = -ctemp13; + *(boffset + 13) = -ctemp14; + *(boffset + 14) = -ctemp15; + *(boffset + 15) = -ctemp16; + + aoffset1 += 2 * lda; + aoffset2 += 2 * lda; + boffset += 16; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + + *(boffset + 0) = -ctemp01; + *(boffset + 1) = -ctemp02; + *(boffset + 2) = -ctemp03; + *(boffset + 3) = -ctemp04; + *(boffset + 4) = -ctemp05; + *(boffset + 5) = -ctemp06; + *(boffset + 6) = -ctemp07; + *(boffset + 7) = -ctemp08; + + boffset += 8; + } + } + + if (n & 2){ + aoffset1 = aoffset; + aoffset2 = aoffset 
+ lda; + aoffset += 4; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + + ctemp05 = *(aoffset2 + 0); + ctemp06 = *(aoffset2 + 1); + ctemp07 = *(aoffset2 + 2); + ctemp08 = *(aoffset2 + 3); + + *(boffset + 0) = -ctemp01; + *(boffset + 1) = -ctemp02; + *(boffset + 2) = -ctemp03; + *(boffset + 3) = -ctemp04; + *(boffset + 4) = -ctemp05; + *(boffset + 5) = -ctemp06; + *(boffset + 6) = -ctemp07; + *(boffset + 7) = -ctemp08; + + aoffset1 += 2 * lda; + aoffset2 += 2 * lda; + boffset += 8; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + + *(boffset + 0) = -ctemp01; + *(boffset + 1) = -ctemp02; + *(boffset + 2) = -ctemp03; + *(boffset + 3) = -ctemp04; + + boffset += 4; + } + } + + if (n & 1){ + aoffset1 = aoffset; + aoffset2 = aoffset + lda; + // aoffset += 2; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + + *(boffset + 0) = -ctemp01; + *(boffset + 1) = -ctemp02; + *(boffset + 2) = -ctemp03; + *(boffset + 3) = -ctemp04; + + aoffset1 += 2 * lda; + aoffset2 += 2 * lda; + boffset += 4; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + + *(boffset + 0) = -ctemp01; + *(boffset + 1) = -ctemp02; + // boffset += 2; + } + } + + return 0; +} diff --git a/kernel/generic/zsymm_lcopy_16.c b/kernel/generic/zsymm_lcopy_16.c new file mode 100644 index 000000000..b32374a5e --- /dev/null +++ b/kernel/generic/zsymm_lcopy_16.c @@ -0,0 +1,333 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#include <stdio.h> +#include "common.h" +/* Pack routine; the file name suggests the lower-storage ZSYMM copy (confirm against the caller). n is consumed in groups of 16, then 8, 4, 2 and 1; for each group, m iterations each store one two-FLOAT element per group member into b. Always returns 0. */ +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT data17, data18, data19, data20, data21, data22, data23, data24; + FLOAT data25, data26, data27, data28, data29, data30, data31, data32; + + FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6, *ao7, *ao8; + FLOAT *ao9, *ao10, *ao11, *ao12, *ao13, *ao14, *ao15, *ao16; + + lda *= 2; /* express lda in FLOATs: each element is two FLOATs (indices 0 and 1 are read below) */ + + js = (n >> 4); + while (js > 0){ + + offset = posX - posY; /* group member k uses the first address form below while offset > -k, the swapped form otherwise */ + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda; + if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda; + if (offset > -4) ao5 = a + (posX + 4) * 2 + posY * lda; else ao5 = a + posY * 2 + (posX + 4) * lda; + if (offset > -5) ao6 = a + (posX + 5) * 2 + posY * lda; else ao6 = a + posY * 2 + (posX + 5) * lda; + if (offset > -6) ao7 = a + (posX + 6) * 2 + posY * lda; else ao7 = a + posY * 2 + (posX + 6) * lda; + if (offset > -7) ao8 = a + (posX + 7) * 2 + posY * lda; else ao8 = a + posY * 2 + (posX + 7) * lda; + if (offset > -8) ao9 = a + (posX + 8) * 2 + posY * lda; else ao9 = a + posY * 2 + (posX + 8) * lda; + if (offset > -9) ao10 = a + (posX + 9) * 2 + posY * lda; else ao10 = a + posY * 2 + (posX + 9) * lda; + if (offset > -10) ao11 = a + (posX + 10) * 2 + posY * lda; else ao11 = a + posY * 2 + (posX + 10) * lda; + if (offset > -11) ao12 = a + (posX + 11) * 2 + posY * lda; else ao12 = a + posY * 2 + (posX + 11) * lda; + if (offset > -12) ao13 = a 
+ (posX + 12) * 2 + posY * lda; else ao13 = a + posY * 2 + (posX + 12) * lda; + if (offset > -13) ao14 = a + (posX + 13) * 2 + posY * lda; else ao14 = a + posY * 2 + (posX + 13) * lda; + if (offset > -14) ao15 = a + (posX + 14) * 2 + posY * lda; else ao15 = a + posY * 2 + (posX + 14) * lda; + if (offset > -15) ao16 = a + (posX + 15) * 2 + posY * lda; else ao16 = a + posY * 2 + (posX + 15) * lda; + + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + data09 = *(ao5 + 0); + data10 = *(ao5 + 1); + data11 = *(ao6 + 0); + data12 = *(ao6 + 1); + data13 = *(ao7 + 0); + data14 = *(ao7 + 1); + data15 = *(ao8 + 0); + data16 = *(ao8 + 1); + data17 = *(ao9 + 0); + data18 = *(ao9 + 1); + data19 = *(ao10 + 0); + data20 = *(ao10 + 1); + data21 = *(ao11 + 0); + data22 = *(ao11 + 1); + data23 = *(ao12 + 0); + data24 = *(ao12 + 1); + data25 = *(ao13 + 0); + data26 = *(ao13 + 1); + data27 = *(ao14 + 0); + data28 = *(ao14 + 1); + data29 = *(ao15 + 0); + data30 = *(ao15 + 1); + data31 = *(ao16 + 0); + data32 = *(ao16 + 1); +/* same tests as at setup, re-evaluated on every iteration because offset is decremented below: step by lda while the test holds, by 2 afterwards */ + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + if (offset > -2) ao3 += lda; else ao3 += 2; + if (offset > -3) ao4 += lda; else ao4 += 2; + if (offset > -4) ao5 += lda; else ao5 += 2; + if (offset > -5) ao6 += lda; else ao6 += 2; + if (offset > -6) ao7 += lda; else ao7 += 2; + if (offset > -7) ao8 += lda; else ao8 += 2; + if (offset > -8) ao9 += lda; else ao9 += 2; + if (offset > -9) ao10 += lda; else ao10 += 2; + if (offset > -10) ao11 += lda; else ao11 += 2; + if (offset > -11) ao12 += lda; else ao12 += 2; + if (offset > -12) ao13 += lda; else ao13 += 2; + if (offset > -13) ao14 += lda; else ao14 += 2; + if (offset > -14) ao15 += lda; else ao15 += 2; + if (offset > -15) ao16 += lda; else ao16 += 2; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 
3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + + b += 32; + + offset --; + i --; + } + + posX += 16; + js --; + } + + if (n & 8) { /* remaining 8-wide group; the 4-, 2- and 1-wide tails below follow the same scheme */ + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda; + if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda; + if (offset > -4) ao5 = a + (posX + 4) * 2 + posY * lda; else ao5 = a + posY * 2 + (posX + 4) * lda; + if (offset > -5) ao6 = a + (posX + 5) * 2 + posY * lda; else ao6 = a + posY * 2 + (posX + 5) * lda; + if (offset > -6) ao7 = a + (posX + 6) * 2 + posY * lda; else ao7 = a + posY * 2 + (posX + 6) * lda; + if (offset > -7) ao8 = a + (posX + 7) * 2 + posY * lda; else ao8 = a + posY * 2 + (posX + 7) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + data09 = *(ao5 + 0); + data10 = *(ao5 + 1); + data11 = *(ao6 + 0); + data12 = *(ao6 + 1); + data13 = *(ao7 + 0); + data14 = *(ao7 + 1); + data15 = *(ao8 + 0); + data16 = *(ao8 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + if (offset > -2) ao3 += lda; else ao3 += 2; + if (offset > 
-3) ao4 += lda; else ao4 += 2; + if (offset > -4) ao5 += lda; else ao5 += 2; + if (offset > -5) ao6 += lda; else ao6 += 2; + if (offset > -6) ao7 += lda; else ao7 += 2; + if (offset > -7) ao8 += lda; else ao8 += 2; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + + b += 16; + + offset --; + i --; + } + + posX += 8; + } + + if (n & 4) { + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda; + if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + if (offset > -2) ao3 += lda; else ao3 += 2; + if (offset > -3) ao4 += lda; else ao4 += 2; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b += 8; + + offset --; + i --; + } + + posX += 4; + } + + if (n & 2) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + + if 
(offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b += 4; + + offset --; + i --; + } + + posX += 2; + + } + + if (n & 1) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + + b[ 0] = data01; + b[ 1] = data02; + + b += 2; + + offset --; + i --; + } + + } + + return 0; +} diff --git a/kernel/generic/zsymm_ucopy_16.c b/kernel/generic/zsymm_ucopy_16.c new file mode 100644 index 000000000..cb19bea47 --- /dev/null +++ b/kernel/generic/zsymm_ucopy_16.c @@ -0,0 +1,332 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include <stdio.h> +#include "common.h" +/* Pack routine; the file name suggests the upper-storage ZSYMM copy (confirm against the caller). n is consumed in groups of 16, then 8, 4, 2 and 1; for each group, m iterations each store one two-FLOAT element per group member into b. Always returns 0. */ +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT data17, data18, data19, data20, data21, data22, data23, data24; + FLOAT data25, data26, data27, data28, data29, data30, data31, data32; + + FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6, *ao7, *ao8; + FLOAT *ao9, *ao10, *ao11, *ao12, *ao13, *ao14, *ao15, *ao16; + + lda *= 2; /* express lda in FLOATs: each element is two FLOATs (indices 0 and 1 are read below) */ + + js = (n >> 4); + while (js > 0){ + + offset = posX - posY; /* group member k uses the first address form below while offset > -k, the swapped form otherwise */ + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda; + if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda; + if (offset > -4) ao5 = a + posY * 2 + (posX + 4) * lda; else ao5 = a + (posX + 4) * 2 + posY * lda; + if (offset > -5) ao6 = a + posY * 2 + (posX + 5) * lda; else ao6 = a + (posX + 5) * 2 + posY * lda; + if (offset > -6) ao7 = a + posY * 2 + (posX + 6) * lda; else ao7 = a + (posX + 6) * 2 + posY * lda; + if (offset > -7) ao8 = a + 
posY * 2 + (posX + 7) * lda; else ao8 = a + (posX + 7) * 2 + posY * lda; + if (offset > -8) ao9 = a + posY * 2 + (posX + 8) * lda; else ao9 = a + (posX + 8) * 2 + posY * lda; + if (offset > -9) ao10 = a + posY * 2 + (posX + 9) * lda; else ao10 = a + (posX + 9) * 2 + posY * lda; + if (offset > -10) ao11 = a + posY * 2 + (posX + 10) * lda; else ao11 = a + (posX + 10) * 2 + posY * lda; + if (offset > -11) ao12 = a + posY * 2 + (posX + 11) * lda; else ao12 = a + (posX + 11) * 2 + posY * lda; + if (offset > -12) ao13 = a + posY * 2 + (posX + 12) * lda; else ao13 = a + (posX + 12) * 2 + posY * lda; + if (offset > -13) ao14 = a + posY * 2 + (posX + 13) * lda; else ao14 = a + (posX + 13) * 2 + posY * lda; + if (offset > -14) ao15 = a + posY * 2 + (posX + 14) * lda; else ao15 = a + (posX + 14) * 2 + posY * lda; + if (offset > -15) ao16 = a + posY * 2 + (posX + 15) * lda; else ao16 = a + (posX + 15) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + data09 = *(ao5 + 0); + data10 = *(ao5 + 1); + data11 = *(ao6 + 0); + data12 = *(ao6 + 1); + data13 = *(ao7 + 0); + data14 = *(ao7 + 1); + data15 = *(ao8 + 0); + data16 = *(ao8 + 1); + data17 = *(ao9 + 0); + data18 = *(ao9 + 1); + data19 = *(ao10 + 0); + data20 = *(ao10 + 1); + data21 = *(ao11 + 0); + data22 = *(ao11 + 1); + data23 = *(ao12 + 0); + data24 = *(ao12 + 1); + data25 = *(ao13 + 0); + data26 = *(ao13 + 1); + data27 = *(ao14 + 0); + data28 = *(ao14 + 1); + data29 = *(ao15 + 0); + data30 = *(ao15 + 1); + data31 = *(ao16 + 0); + data32 = *(ao16 + 1); +/* same tests as at setup, re-evaluated on every iteration because offset is decremented below: step by 2 while the test holds, by lda afterwards */ + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + if (offset > -2) ao3 += 2; else ao3 += lda; + if (offset > -3) ao4 += 2; else ao4 += lda; + if (offset > -4) ao5 += 2; else ao5 += lda; + if (offset > -5) ao6 += 2; else ao6 += lda; + if (offset > -6) 
ao7 += 2; else ao7 += lda; + if (offset > -7) ao8 += 2; else ao8 += lda; + if (offset > -8) ao9 += 2; else ao9 += lda; + if (offset > -9) ao10 += 2; else ao10 += lda; + if (offset > -10) ao11 += 2; else ao11 += lda; + if (offset > -11) ao12 += 2; else ao12 += lda; + if (offset > -12) ao13 += 2; else ao13 += lda; + if (offset > -13) ao14 += 2; else ao14 += lda; + if (offset > -14) ao15 += 2; else ao15 += lda; + if (offset > -15) ao16 += 2; else ao16 += lda; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + + b += 32; + + offset --; + i --; + } + + posX += 16; + js --; + } + + if (n & 8) { /* remaining 8-wide group; the 4-, 2- and 1-wide tails below follow the same scheme */ + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda; + if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda; + if (offset > -4) ao5 = a + posY * 2 + (posX + 4) * lda; else ao5 = a + (posX + 4) * 2 + posY * lda; + if (offset > -5) ao6 = a + posY * 2 + (posX + 5) * lda; else ao6 = a + (posX + 5) * 2 + posY * lda; + if (offset > -6) ao7 = a + posY * 2 + (posX + 6) * lda; else ao7 = a + (posX + 6) * 2 + posY * lda; + if (offset > -7) ao8 = a + posY * 2 + (posX + 7) * lda; else ao8 = a + (posX + 7) * 2 + posY * lda; + + i = m; + + while (i > 0) { + 
data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + data09 = *(ao5 + 0); + data10 = *(ao5 + 1); + data11 = *(ao6 + 0); + data12 = *(ao6 + 1); + data13 = *(ao7 + 0); + data14 = *(ao7 + 1); + data15 = *(ao8 + 0); + data16 = *(ao8 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + if (offset > -2) ao3 += 2; else ao3 += lda; + if (offset > -3) ao4 += 2; else ao4 += lda; + if (offset > -4) ao5 += 2; else ao5 += lda; + if (offset > -5) ao6 += 2; else ao6 += lda; + if (offset > -6) ao7 += 2; else ao7 += lda; + if (offset > -7) ao8 += 2; else ao8 += lda; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + + b += 16; + + offset --; + i --; + } + + posX += 8; + } + + if (n & 4) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda; + if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + if (offset > -2) ao3 += 2; else ao3 += lda; + if (offset > -3) ao4 += 2; else ao4 += lda; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] 
= data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b += 8; + + offset --; + i --; + } + + posX += 4; + } + + if (n & 2) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b += 4; + + offset --; + i --; + } + + posX += 2; + } + + if (n & 1) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + + b[ 0] = data01; + b[ 1] = data02; + + b += 2; + + offset --; + i --; + } + + } + + return 0; +} diff --git a/kernel/generic/ztrmm_lncopy_16.c b/kernel/generic/ztrmm_lncopy_16.c new file mode 100644 index 000000000..d7fb23176 --- /dev/null +++ b/kernel/generic/ztrmm_lncopy_16.c @@ -0,0 +1,2310 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X, ii; + + FLOAT *a01, *a02, *a03, *a04, *a05, *a06, *a07, *a08; + FLOAT *a09, *a10, *a11, *a12, *a13, *a14, *a15, *a16; + + lda += lda; + + js = (n >> 4); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + a03 = a + posY * 2 + (posX + 2) * lda; + a04 = a + posY * 2 + (posX + 3) * lda; + a05 = a + posY * 2 + (posX + 4) * lda; + a06 = a + posY * 2 + (posX + 5) * lda; + a07 = a + posY * 2 + (posX + 6) * lda; + a08 = a + posY * 2 + (posX + 7) * lda; + a09 = a + posY * 2 + (posX + 8) * lda; + a10 = a + posY * 2 + (posX + 9) * lda; + a11 = a + posY * 2 + (posX + 10) * lda; + a12 = a + posY * 2 + (posX + 11) * lda; + a13 = a + posY * 2 + (posX + 12) * lda; + a14 = a + posY * 2 + (posX + 13) * lda; + 
a15 = a + posY * 2 + (posX + 14) * lda; + a16 = a + posY * 2 + (posX + 15) * lda; + } else { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + a03 = a + posX * 2 + (posY + 2) * lda; + a04 = a + posX * 2 + (posY + 3) * lda; + a05 = a + posX * 2 + (posY + 4) * lda; + a06 = a + posX * 2 + (posY + 5) * lda; + a07 = a + posX * 2 + (posY + 6) * lda; + a08 = a + posX * 2 + (posY + 7) * lda; + a09 = a + posX * 2 + (posY + 8) * lda; + a10 = a + posX * 2 + (posY + 9) * lda; + a11 = a + posX * 2 + (posY + 10) * lda; + a12 = a + posX * 2 + (posY + 11) * lda; + a13 = a + posX * 2 + (posY + 12) * lda; + a14 = a + posX * 2 + (posY + 13) * lda; + a15 = a + posX * 2 + (posY + 14) * lda; + a16 = a + posX * 2 + (posY + 15) * lda; + } + + i = (m >> 4); + if (i > 0) { + do { + if (X > posY) { + for (ii = 0; ii < 16; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a03 + 0); + b[ 5] = *(a03 + 1); + b[ 6] = *(a04 + 0); + b[ 7] = *(a04 + 1); + + b[ 8] = *(a05 + 0); + b[ 9] = *(a05 + 1); + b[ 10] = *(a06 + 0); + b[ 11] = *(a06 + 1); + b[ 12] = *(a07 + 0); + b[ 13] = *(a07 + 1); + b[ 14] = *(a08 + 0); + b[ 15] = *(a08 + 1); + + b[ 16] = *(a09 + 0); + b[ 17] = *(a09 + 1); + b[ 18] = *(a10 + 0); + b[ 19] = *(a10 + 1); + b[ 20] = *(a11 + 0); + b[ 21] = *(a11 + 1); + b[ 22] = *(a12 + 0); + b[ 23] = *(a12 + 1); + + b[ 24] = *(a13 + 0); + b[ 25] = *(a13 + 1); + b[ 26] = *(a14 + 0); + b[ 27] = *(a14 + 1); + b[ 28] = *(a15 + 0); + b[ 29] = *(a15 + 1); + b[ 30] = *(a16 + 0); + b[ 31] = *(a16 + 1); + + a01 += 2; + a02 += 2; + a03 += 2; + a04 += 2; + a05 += 2; + a06 += 2; + a07 += 2; + a08 += 2; + a09 += 2; + a10 += 2; + a11 += 2; + a12 += 2; + a13 += 2; + a14 += 2; + a15 += 2; + a16 += 2; + b += 32; + } + } else + if (X < posY) { + a01 += 16 * lda; + a02 += 16 * lda; + a03 += 16 * lda; + a04 += 16 * lda; + a05 += 16 * lda; + a06 += 16 * lda; + a07 += 16 * lda; + a08 += 16 * lda; + a09 += 16 * lda; + a10 += 
16 * lda; + a11 += 16 * lda; + a12 += 16 * lda; + a13 += 16 * lda; + a14 += 16 * lda; + a15 += 16 * lda; + a16 += 16 * lda; + + b += 512; + + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + + b[ 32] = *(a01 + 2); + b[ 33] = *(a01 + 3); +#ifdef UNIT + b[ 34] = ONE; + b[ 35] = ZERO; +#else + b[ 34] = *(a02 + 2); + b[ 35] = *(a02 + 3); +#endif + b[ 36] = ZERO; + b[ 37] = ZERO; + b[ 38] = ZERO; + b[ 39] = ZERO; + b[ 40] = ZERO; + b[ 41] = ZERO; + b[ 42] = ZERO; + b[ 43] = ZERO; + b[ 44] = ZERO; + b[ 45] = ZERO; + b[ 46] = ZERO; + b[ 47] = ZERO; + b[ 48] = ZERO; + b[ 49] = ZERO; + b[ 50] = ZERO; + b[ 51] = ZERO; + b[ 52] = ZERO; + b[ 53] = ZERO; + b[ 54] = ZERO; + b[ 55] = ZERO; + b[ 56] = ZERO; + b[ 57] = ZERO; + b[ 58] = ZERO; + b[ 59] = ZERO; + b[ 60] = ZERO; + b[ 61] = ZERO; + b[ 62] = ZERO; + b[ 63] = ZERO; + + b[ 64] = *(a01 + 4); + b[ 65] = *(a01 + 5); + b[ 66] = *(a02 + 4); + b[ 67] = *(a02 + 5); +#ifdef UNIT + b[ 68] = ONE; + b[ 69] = ZERO; +#else + b[ 68] = *(a03 + 4); + b[ 69] = *(a03 + 5); +#endif + b[ 70] = ZERO; + b[ 71] = ZERO; + b[ 72] = ZERO; + b[ 73] = ZERO; + b[ 74] = ZERO; + b[ 75] = ZERO; + b[ 76] = ZERO; + b[ 77] = ZERO; + b[ 78] = ZERO; + b[ 79] = ZERO; + b[ 80] = ZERO; + b[ 81] = ZERO; + b[ 82] = ZERO; + b[ 83] = ZERO; + b[ 84] = ZERO; + b[ 85] = ZERO; + b[ 86] = ZERO; + b[ 87] = ZERO; + b[ 88] = ZERO; + b[ 89] = ZERO; + b[ 90] = ZERO; + b[ 91] = ZERO; + b[ 92] = ZERO; + b[ 93] = ZERO; + 
b[ 94] = ZERO; + b[ 95] = ZERO; + + b[ 96] = *(a01 + 6); + b[ 97] = *(a01 + 7); + b[ 98] = *(a02 + 6); + b[ 99] = *(a02 + 7); + b[100] = *(a03 + 6); + b[101] = *(a03 + 7); +#ifdef UNIT + b[102] = ONE; + b[103] = ZERO; +#else + b[102] = *(a04 + 6); + b[103] = *(a04 + 7); +#endif + b[104] = ZERO; + b[105] = ZERO; + b[106] = ZERO; + b[107] = ZERO; + b[108] = ZERO; + b[109] = ZERO; + b[110] = ZERO; + b[111] = ZERO; + b[112] = ZERO; + b[113] = ZERO; + b[114] = ZERO; + b[115] = ZERO; + b[116] = ZERO; + b[117] = ZERO; + b[118] = ZERO; + b[119] = ZERO; + b[120] = ZERO; + b[121] = ZERO; + b[122] = ZERO; + b[123] = ZERO; + b[124] = ZERO; + b[125] = ZERO; + b[126] = ZERO; + b[127] = ZERO; + + b[128] = *(a01 + 8); + b[129] = *(a01 + 9); + b[130] = *(a02 + 8); + b[131] = *(a02 + 9); + b[132] = *(a03 + 8); + b[133] = *(a03 + 9); + b[134] = *(a04 + 8); + b[135] = *(a04 + 9); +#ifdef UNIT + b[136] = ONE; + b[137] = ZERO; +#else + b[136] = *(a05 + 8); + b[137] = *(a05 + 9); +#endif + b[138] = ZERO; + b[139] = ZERO; + b[140] = ZERO; + b[141] = ZERO; + b[142] = ZERO; + b[143] = ZERO; + b[144] = ZERO; + b[145] = ZERO; + b[146] = ZERO; + b[147] = ZERO; + b[148] = ZERO; + b[149] = ZERO; + b[150] = ZERO; + b[151] = ZERO; + b[152] = ZERO; + b[153] = ZERO; + b[154] = ZERO; + b[155] = ZERO; + b[156] = ZERO; + b[157] = ZERO; + b[158] = ZERO; + b[159] = ZERO; + + b[160] = *(a01 + 10); + b[161] = *(a01 + 11); + b[162] = *(a02 + 10); + b[163] = *(a02 + 11); + b[164] = *(a03 + 10); + b[165] = *(a03 + 11); + b[166] = *(a04 + 10); + b[167] = *(a04 + 11); + b[168] = *(a05 + 10); + b[169] = *(a05 + 11); +#ifdef UNIT + b[170] = ONE; + b[171] = ZERO; +#else + b[170] = *(a06 + 10); + b[171] = *(a06 + 11); +#endif + b[172] = ZERO; + b[173] = ZERO; + b[174] = ZERO; + b[175] = ZERO; + b[176] = ZERO; + b[177] = ZERO; + b[178] = ZERO; + b[179] = ZERO; + b[180] = ZERO; + b[181] = ZERO; + b[182] = ZERO; + b[183] = ZERO; + b[184] = ZERO; + b[185] = ZERO; + b[186] = ZERO; + b[187] = ZERO; + b[188] = ZERO; + 
b[189] = ZERO; + b[190] = ZERO; + b[191] = ZERO; + + b[192] = *(a01 + 12); + b[193] = *(a01 + 13); + b[194] = *(a02 + 12); + b[195] = *(a02 + 13); + b[196] = *(a03 + 12); + b[197] = *(a03 + 13); + b[198] = *(a04 + 12); + b[199] = *(a04 + 13); + b[200] = *(a05 + 12); + b[201] = *(a05 + 13); + b[202] = *(a06 + 12); + b[203] = *(a06 + 13); +#ifdef UNIT + b[204] = ONE; + b[205] = ZERO; +#else + b[204] = *(a07 + 12); + b[205] = *(a07 + 13); +#endif + b[206] = ZERO; + b[207] = ZERO; + b[208] = ZERO; + b[209] = ZERO; + b[210] = ZERO; + b[211] = ZERO; + b[212] = ZERO; + b[213] = ZERO; + b[214] = ZERO; + b[215] = ZERO; + b[216] = ZERO; + b[217] = ZERO; + b[218] = ZERO; + b[219] = ZERO; + b[220] = ZERO; + b[221] = ZERO; + b[222] = ZERO; + b[223] = ZERO; + + b[224] = *(a01 + 14); + b[225] = *(a01 + 15); + b[226] = *(a02 + 14); + b[227] = *(a02 + 15); + b[228] = *(a03 + 14); + b[229] = *(a03 + 15); + b[230] = *(a04 + 14); + b[231] = *(a04 + 15); + b[232] = *(a05 + 14); + b[233] = *(a05 + 15); + b[234] = *(a06 + 14); + b[235] = *(a06 + 15); + b[236] = *(a07 + 14); + b[237] = *(a07 + 15); +#ifdef UNIT + b[238] = ONE; + b[239] = ZERO; +#else + b[238] = *(a08 + 14); + b[239] = *(a08 + 15); +#endif + b[240] = ZERO; + b[241] = ZERO; + b[242] = ZERO; + b[243] = ZERO; + b[244] = ZERO; + b[245] = ZERO; + b[246] = ZERO; + b[247] = ZERO; + b[248] = ZERO; + b[249] = ZERO; + b[250] = ZERO; + b[251] = ZERO; + b[252] = ZERO; + b[253] = ZERO; + b[254] = ZERO; + b[255] = ZERO; + + b[256] = *(a01 + 16); + b[257] = *(a01 + 17); + b[258] = *(a02 + 16); + b[259] = *(a02 + 17); + b[260] = *(a03 + 16); + b[261] = *(a03 + 17); + b[262] = *(a04 + 16); + b[263] = *(a04 + 17); + b[264] = *(a05 + 16); + b[265] = *(a05 + 17); + b[266] = *(a06 + 16); + b[267] = *(a06 + 17); + b[268] = *(a07 + 16); + b[269] = *(a07 + 17); + b[270] = *(a08 + 16); + b[271] = *(a08 + 17); +#ifdef UNIT + b[272] = ONE; + b[273] = ZERO; +#else + b[272] = *(a09 + 16); + b[273] = *(a09 + 17); +#endif + b[274] = ZERO; + b[275] = 
ZERO; + b[276] = ZERO; + b[277] = ZERO; + b[278] = ZERO; + b[279] = ZERO; + b[280] = ZERO; + b[281] = ZERO; + b[282] = ZERO; + b[283] = ZERO; + b[284] = ZERO; + b[285] = ZERO; + b[286] = ZERO; + b[287] = ZERO; + + b[288] = *(a01 + 18); + b[289] = *(a01 + 19); + b[290] = *(a02 + 18); + b[291] = *(a02 + 19); + b[292] = *(a03 + 18); + b[293] = *(a03 + 19); + b[294] = *(a04 + 18); + b[295] = *(a04 + 19); + b[296] = *(a05 + 18); + b[297] = *(a05 + 19); + b[298] = *(a06 + 18); + b[299] = *(a06 + 19); + b[300] = *(a07 + 18); + b[301] = *(a07 + 19); + b[302] = *(a08 + 18); + b[303] = *(a08 + 19); + b[304] = *(a09 + 18); + b[305] = *(a09 + 19); +#ifdef UNIT + b[306] = ONE; + b[307] = ZERO; +#else + b[306] = *(a10 + 18); + b[307] = *(a10 + 19); +#endif + b[308] = ZERO; + b[309] = ZERO; + b[310] = ZERO; + b[311] = ZERO; + b[312] = ZERO; + b[313] = ZERO; + b[314] = ZERO; + b[315] = ZERO; + b[316] = ZERO; + b[317] = ZERO; + b[318] = ZERO; + b[319] = ZERO; + + b[320] = *(a01 + 20); + b[321] = *(a01 + 21); + b[322] = *(a02 + 20); + b[323] = *(a02 + 21); + b[324] = *(a03 + 20); + b[325] = *(a03 + 21); + b[326] = *(a04 + 20); + b[327] = *(a04 + 21); + b[328] = *(a05 + 20); + b[329] = *(a05 + 21); + b[330] = *(a06 + 20); + b[331] = *(a06 + 21); + b[332] = *(a07 + 20); + b[333] = *(a07 + 21); + b[334] = *(a08 + 20); + b[335] = *(a08 + 21); + b[336] = *(a09 + 20); + b[337] = *(a09 + 21); + b[338] = *(a10 + 20); + b[339] = *(a10 + 21); +#ifdef UNIT + b[340] = ONE; + b[341] = ZERO; +#else + b[340] = *(a11 + 20); + b[341] = *(a11 + 21); +#endif + b[342] = ZERO; + b[343] = ZERO; + b[344] = ZERO; + b[345] = ZERO; + b[346] = ZERO; + b[347] = ZERO; + b[348] = ZERO; + b[349] = ZERO; + b[350] = ZERO; + b[351] = ZERO; + + b[352] = *(a01 + 22); + b[353] = *(a01 + 23); + b[354] = *(a02 + 22); + b[355] = *(a02 + 23); + b[356] = *(a03 + 22); + b[357] = *(a03 + 23); + b[358] = *(a04 + 22); + b[359] = *(a04 + 23); + b[360] = *(a05 + 22); + b[361] = *(a05 + 23); + b[362] = *(a06 + 22); + b[363] = 
*(a06 + 23); + b[364] = *(a07 + 22); + b[365] = *(a07 + 23); + b[366] = *(a08 + 22); + b[367] = *(a08 + 23); + b[368] = *(a09 + 22); + b[369] = *(a09 + 23); + b[370] = *(a10 + 22); + b[371] = *(a10 + 23); + b[372] = *(a11 + 22); + b[373] = *(a11 + 23); +#ifdef UNIT + b[374] = ONE; + b[375] = ZERO; +#else + b[374] = *(a12 + 22); + b[375] = *(a12 + 23); +#endif + b[376] = ZERO; + b[377] = ZERO; + b[378] = ZERO; + b[379] = ZERO; + b[380] = ZERO; + b[381] = ZERO; + b[382] = ZERO; + b[383] = ZERO; + + b[384] = *(a01 + 24); + b[385] = *(a01 + 25); + b[386] = *(a02 + 24); + b[387] = *(a02 + 25); + b[388] = *(a03 + 24); + b[389] = *(a03 + 25); + b[390] = *(a04 + 24); + b[391] = *(a04 + 25); + b[392] = *(a05 + 24); + b[393] = *(a05 + 25); + b[394] = *(a06 + 24); + b[395] = *(a06 + 25); + b[396] = *(a07 + 24); + b[397] = *(a07 + 25); + b[398] = *(a08 + 24); + b[399] = *(a08 + 25); + b[400] = *(a09 + 24); + b[401] = *(a09 + 25); + b[402] = *(a10 + 24); + b[403] = *(a10 + 25); + b[404] = *(a11 + 24); + b[405] = *(a11 + 25); + b[406] = *(a12 + 24); + b[407] = *(a12 + 25); +#ifdef UNIT + b[408] = ONE; + b[409] = ZERO; +#else + b[408] = *(a13 + 24); + b[409] = *(a13 + 25); +#endif + b[410] = ZERO; + b[411] = ZERO; + b[412] = ZERO; + b[413] = ZERO; + b[414] = ZERO; + b[415] = ZERO; + + b[416] = *(a01 + 26); + b[417] = *(a01 + 27); + b[418] = *(a02 + 26); + b[419] = *(a02 + 27); + b[420] = *(a03 + 26); + b[421] = *(a03 + 27); + b[422] = *(a04 + 26); + b[423] = *(a04 + 27); + b[424] = *(a05 + 26); + b[425] = *(a05 + 27); + b[426] = *(a06 + 26); + b[427] = *(a06 + 27); + b[428] = *(a07 + 26); + b[429] = *(a07 + 27); + b[430] = *(a08 + 26); + b[431] = *(a08 + 27); + b[432] = *(a09 + 26); + b[433] = *(a09 + 27); + b[434] = *(a10 + 26); + b[435] = *(a10 + 27); + b[436] = *(a11 + 26); + b[437] = *(a11 + 27); + b[438] = *(a12 + 26); + b[439] = *(a12 + 27); + b[440] = *(a13 + 26); + b[441] = *(a13 + 27); +#ifdef UNIT + b[442] = ONE; + b[443] = ZERO; +#else + b[442] = *(a14 + 26); + b[443] 
= *(a14 + 27); +#endif + b[444] = ZERO; + b[445] = ZERO; + b[446] = ZERO; + b[447] = ZERO; + + b[448] = *(a01 + 28); + b[449] = *(a01 + 29); + b[450] = *(a02 + 28); + b[451] = *(a02 + 29); + b[452] = *(a03 + 28); + b[453] = *(a03 + 29); + b[454] = *(a04 + 28); + b[455] = *(a04 + 29); + b[456] = *(a05 + 28); + b[457] = *(a05 + 29); + b[458] = *(a06 + 28); + b[459] = *(a06 + 29); + b[460] = *(a07 + 28); + b[461] = *(a07 + 29); + b[462] = *(a08 + 28); + b[463] = *(a08 + 29); + b[464] = *(a09 + 28); + b[465] = *(a09 + 29); + b[466] = *(a10 + 28); + b[467] = *(a10 + 29); + b[468] = *(a11 + 28); + b[469] = *(a11 + 29); + b[470] = *(a12 + 28); + b[471] = *(a12 + 29); + b[472] = *(a13 + 28); + b[473] = *(a13 + 29); + b[474] = *(a14 + 28); + b[475] = *(a14 + 29); +#ifdef UNIT + b[476] = ONE; + b[477] = ZERO; +#else + b[476] = *(a15 + 28); + b[477] = *(a15 + 29); +#endif + b[478] = ZERO; + b[479] = ZERO; + + b[480] = *(a01 + 30); + b[481] = *(a01 + 31); + b[482] = *(a02 + 30); + b[483] = *(a02 + 31); + b[484] = *(a03 + 30); + b[485] = *(a03 + 31); + b[486] = *(a04 + 30); + b[487] = *(a04 + 31); + b[488] = *(a05 + 30); + b[489] = *(a05 + 31); + b[490] = *(a06 + 30); + b[491] = *(a06 + 31); + b[492] = *(a07 + 30); + b[493] = *(a07 + 31); + b[494] = *(a08 + 30); + b[495] = *(a08 + 31); + b[496] = *(a09 + 30); + b[497] = *(a09 + 31); + b[498] = *(a10 + 30); + b[499] = *(a10 + 31); + b[500] = *(a11 + 30); + b[501] = *(a11 + 31); + b[502] = *(a12 + 30); + b[503] = *(a12 + 31); + b[504] = *(a13 + 30); + b[505] = *(a13 + 31); + b[506] = *(a14 + 30); + b[507] = *(a14 + 31); + b[508] = *(a15 + 30); + b[509] = *(a15 + 31); +#ifdef UNIT + b[510] = ONE; + b[511] = ZERO; +#else + b[510] = *(a16 + 30); + b[511] = *(a16 + 31); +#endif + + a01 += 32; + a02 += 32; + a03 += 32; + a04 += 32; + a05 += 32; + a06 += 32; + a07 += 32; + a08 += 32; + a09 += 32; + a10 += 32; + a11 += 32; + a12 += 32; + a13 += 32; + a14 += 32; + a15 += 32; + a16 += 32; + b += 512; + } + + X += 16; + i --; + } while (i 
> 0); + } + + i = (m & 15); + if (i) { + + if (X > posY) { + + for (ii = 0; ii < i; ii++){ + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a03 + 0); + b[ 5] = *(a03 + 1); + b[ 6] = *(a04 + 0); + b[ 7] = *(a04 + 1); + b[ 8] = *(a05 + 0); + b[ 9] = *(a05 + 1); + b[ 10] = *(a06 + 0); + b[ 11] = *(a06 + 1); + b[ 12] = *(a07 + 0); + b[ 13] = *(a07 + 1); + b[ 14] = *(a08 + 0); + b[ 15] = *(a08 + 1); + + b[ 16] = *(a09 + 0); + b[ 17] = *(a09 + 1); + b[ 18] = *(a10 + 0); + b[ 19] = *(a10 + 1); + b[ 20] = *(a11 + 0); + b[ 21] = *(a11 + 1); + b[ 22] = *(a12 + 0); + b[ 23] = *(a12 + 1); + b[ 24] = *(a13 + 0); + b[ 25] = *(a13 + 1); + b[ 26] = *(a14 + 0); + b[ 27] = *(a14 + 1); + b[ 28] = *(a15 + 0); + b[ 29] = *(a15 + 1); + b[ 30] = *(a16 + 0); + b[ 31] = *(a16 + 1); + + a01 += 2; + a02 += 2; + a03 += 2; + a04 += 2; + a05 += 2; + a06 += 2; + a07 += 2; + a08 += 2; + a09 += 2; + a10 += 2; + a11 += 2; + a12 += 2; + a13 += 2; + a14 += 2; + a15 += 2; + a16 += 2; + b += 32; + } + } else + if (X < posY) { + /* a01 += i * lda; + a02 += i * lda; + a03 += i * lda; + a04 += i * lda; + a05 += i * lda; + a06 += i * lda; + a07 += i * lda; + a08 += i * lda; + a09 += i * lda; + a10 += i * lda; + a11 += i * lda; + a12 += i * lda; + a13 += i * lda; + a14 += i * lda; + a15 += i * lda; + a16 += i * lda; */ + b += 32 * i; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + + if 
(i >= 2) { + b[ 0] = *(a01 + 2); + b[ 1] = *(a01 + 3); +#ifdef UNIT + b[ 2] = ONE; + b[ 3] = ZERO; +#else + b[ 2] = *(a02 + 2); + b[ 3] = *(a02 + 3); +#endif + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 3) { + b[ 0] = *(a01 + 4); + b[ 1] = *(a01 + 5); + b[ 2] = *(a02 + 4); + b[ 3] = *(a02 + 5); +#ifdef UNIT + b[ 4] = ONE; + b[ 5] = ZERO; +#else + b[ 4] = *(a03 + 4); + b[ 5] = *(a03 + 5); +#endif + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 4) { + b[ 0] = *(a01 + 6); + b[ 1] = *(a01 + 7); + b[ 2] = *(a02 + 6); + b[ 3] = *(a02 + 7); + b[ 4] = *(a03 + 6); + b[ 5] = *(a03 + 7); +#ifdef UNIT + b[ 6] = ONE; + b[ 7] = ZERO; +#else + b[ 6] = *(a04 + 6); + b[ 7] = *(a04 + 7); +#endif + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 5) { + 
b[ 0] = *(a01 + 8); + b[ 1] = *(a01 + 9); + b[ 2] = *(a02 + 8); + b[ 3] = *(a02 + 9); + b[ 4] = *(a03 + 8); + b[ 5] = *(a03 + 9); + b[ 6] = *(a04 + 8); + b[ 7] = *(a04 + 9); +#ifdef UNIT + b[ 8] = ONE; + b[ 9] = ZERO; +#else + b[ 8] = *(a05 + 8); + b[ 9] = *(a05 + 9); +#endif + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 6) { + b[ 0] = *(a01 + 10); + b[ 1] = *(a01 + 11); + b[ 2] = *(a02 + 10); + b[ 3] = *(a02 + 11); + b[ 4] = *(a03 + 10); + b[ 5] = *(a03 + 11); + b[ 6] = *(a04 + 10); + b[ 7] = *(a04 + 11); + b[ 8] = *(a05 + 10); + b[ 9] = *(a05 + 11); +#ifdef UNIT + b[10] = ONE; + b[11] = ZERO; +#else + b[10] = *(a06 + 10); + b[11] = *(a06 + 11); +#endif + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 7) { + b[ 0] = *(a01 + 12); + b[ 1] = *(a01 + 13); + b[ 2] = *(a02 + 12); + b[ 3] = *(a02 + 13); + b[ 4] = *(a03 + 12); + b[ 5] = *(a03 + 13); + b[ 6] = *(a04 + 12); + b[ 7] = *(a04 + 13); + b[ 8] = *(a05 + 12); + b[ 9] = *(a05 + 13); + b[10] = *(a06 + 12); + b[11] = *(a06 + 13); +#ifdef UNIT + b[12] = ONE; + b[13] = ZERO; +#else + b[12] = *(a07 + 12); + b[13] = *(a07 + 13); +#endif + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 
27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 8) { + b[ 0] = *(a01 + 14); + b[ 1] = *(a01 + 15); + b[ 2] = *(a02 + 14); + b[ 3] = *(a02 + 15); + b[ 4] = *(a03 + 14); + b[ 5] = *(a03 + 15); + b[ 6] = *(a04 + 14); + b[ 7] = *(a04 + 15); + b[ 8] = *(a05 + 14); + b[ 9] = *(a05 + 15); + b[ 10] = *(a06 + 14); + b[ 11] = *(a06 + 15); + b[ 12] = *(a07 + 14); + b[ 13] = *(a07 + 15); +#ifdef UNIT + b[ 14] = ONE; + b[ 15] = ZERO; +#else + b[ 14] = *(a08 + 14); + b[ 15] = *(a08 + 15); +#endif + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 9) { + b[ 0] = *(a01 + 16); + b[ 1] = *(a01 + 17); + b[ 2] = *(a02 + 16); + b[ 3] = *(a02 + 17); + b[ 4] = *(a03 + 16); + b[ 5] = *(a03 + 17); + b[ 6] = *(a04 + 16); + b[ 7] = *(a04 + 17); + b[ 8] = *(a05 + 16); + b[ 9] = *(a05 + 17); + b[ 10] = *(a06 + 16); + b[ 11] = *(a06 + 17); + b[ 12] = *(a07 + 16); + b[ 13] = *(a07 + 17); + b[ 14] = *(a08 + 16); + b[ 15] = *(a08 + 17); +#ifdef UNIT + b[ 16] = ONE; + b[ 17] = ZERO; +#else + b[ 16] = *(a09 + 16); + b[ 17] = *(a09 + 17); +#endif + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 10) { + b[ 0] = *(a01 + 18); + b[ 1] = *(a01 + 19); + b[ 2] = *(a02 + 18); + b[ 3] = *(a02 + 19); + b[ 4] = *(a03 + 18); + b[ 5] = *(a03 + 19); + b[ 6] = *(a04 + 18); + b[ 7] = *(a04 + 19); + b[ 8] = *(a05 + 18); + b[ 9] = *(a05 + 19); + b[ 10] = *(a06 + 18); + b[ 11] = *(a06 + 19); + b[ 12] = *(a07 + 18); + b[ 13] = *(a07 + 19); + b[ 14] = *(a08 + 18); + b[ 15] = *(a08 + 19); + b[ 16] = *(a09 
+ 18); + b[ 17] = *(a09 + 19); +#ifdef UNIT + b[ 18] = ONE; + b[ 19] = ZERO; +#else + b[ 18] = *(a10 + 18); + b[ 19] = *(a10 + 19); +#endif + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 11) { + b[ 0] = *(a01 + 20); + b[ 1] = *(a01 + 21); + b[ 2] = *(a02 + 20); + b[ 3] = *(a02 + 21); + b[ 4] = *(a03 + 20); + b[ 5] = *(a03 + 21); + b[ 6] = *(a04 + 20); + b[ 7] = *(a04 + 21); + b[ 8] = *(a05 + 20); + b[ 9] = *(a05 + 21); + b[ 10] = *(a06 + 20); + b[ 11] = *(a06 + 21); + b[ 12] = *(a07 + 20); + b[ 13] = *(a07 + 21); + b[ 14] = *(a08 + 20); + b[ 15] = *(a08 + 21); + b[ 16] = *(a09 + 20); + b[ 17] = *(a09 + 21); + b[ 18] = *(a10 + 20); + b[ 19] = *(a10 + 21); +#ifdef UNIT + b[ 20] = ONE; + b[ 21] = ZERO; +#else + b[ 20] = *(a11 + 20); + b[ 21] = *(a11 + 21); +#endif + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 12) { + b[ 0] = *(a01 + 22); + b[ 1] = *(a01 + 23); + b[ 2] = *(a02 + 22); + b[ 3] = *(a02 + 23); + b[ 4] = *(a03 + 22); + b[ 5] = *(a03 + 23); + b[ 6] = *(a04 + 22); + b[ 7] = *(a04 + 23); + b[ 8] = *(a05 + 22); + b[ 9] = *(a05 + 23); + b[ 10] = *(a06 + 22); + b[ 11] = *(a06 + 23); + b[ 12] = *(a07 + 22); + b[ 13] = *(a07 + 23); + b[ 14] = *(a08 + 22); + b[ 15] = *(a08 + 23); + b[ 16] = *(a09 + 22); + b[ 17] = *(a09 + 23); + b[ 18] = *(a10 + 22); + b[ 19] = *(a10 + 23); + b[ 20] = *(a11 + 22); + b[ 21] = *(a11 + 23); +#ifdef UNIT + b[ 22] = ONE; + b[ 23] = ZERO; +#else + b[ 22] = *(a12 + 22); + b[ 23] = *(a12 + 23); +#endif + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 13) { + b[ 0] = *(a01 + 24); + b[ 1] = *(a01 + 
25); + b[ 2] = *(a02 + 24); + b[ 3] = *(a02 + 25); + b[ 4] = *(a03 + 24); + b[ 5] = *(a03 + 25); + b[ 6] = *(a04 + 24); + b[ 7] = *(a04 + 25); + b[ 8] = *(a05 + 24); + b[ 9] = *(a05 + 25); + b[ 10] = *(a06 + 24); + b[ 11] = *(a06 + 25); + b[ 12] = *(a07 + 24); + b[ 13] = *(a07 + 25); + b[ 14] = *(a08 + 24); + b[ 15] = *(a08 + 25); + b[ 16] = *(a09 + 24); + b[ 17] = *(a09 + 25); + b[ 18] = *(a10 + 24); + b[ 19] = *(a10 + 25); + b[ 20] = *(a11 + 24); + b[ 21] = *(a11 + 25); + b[ 22] = *(a12 + 24); + b[ 23] = *(a12 + 25); +#ifdef UNIT + b[ 24] = ONE; + b[ 25] = ZERO; +#else + b[ 24] = *(a13 + 24); + b[ 25] = *(a13 + 25); +#endif + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 14) { + b[ 0] = *(a01 + 26); + b[ 1] = *(a01 + 27); + b[ 2] = *(a02 + 26); + b[ 3] = *(a02 + 27); + b[ 4] = *(a03 + 26); + b[ 5] = *(a03 + 27); + b[ 6] = *(a04 + 26); + b[ 7] = *(a04 + 27); + b[ 8] = *(a05 + 26); + b[ 9] = *(a05 + 27); + b[ 10] = *(a06 + 26); + b[ 11] = *(a06 + 27); + b[ 12] = *(a07 + 26); + b[ 13] = *(a07 + 27); + b[ 14] = *(a08 + 26); + b[ 15] = *(a08 + 27); + b[ 16] = *(a09 + 26); + b[ 17] = *(a09 + 27); + b[ 18] = *(a10 + 26); + b[ 19] = *(a10 + 27); + b[ 20] = *(a11 + 26); + b[ 21] = *(a11 + 27); + b[ 22] = *(a12 + 26); + b[ 23] = *(a12 + 27); + b[ 24] = *(a13 + 26); + b[ 25] = *(a13 + 27); +#ifdef UNIT + b[ 26] = ONE; + b[ 27] = ZERO; +#else + b[ 26] = *(a14 + 26); + b[ 27] = *(a14 + 27); +#endif + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 15) { + b[ 0] = *(a01 + 28); + b[ 1] = *(a01 + 29); + b[ 2] = *(a02 + 28); + b[ 3] = *(a02 + 29); + b[ 4] = *(a03 + 28); + b[ 5] = *(a03 + 29); + b[ 6] = *(a04 + 28); + b[ 7] = *(a04 + 29); + b[ 8] = *(a05 + 28); + b[ 9] = *(a05 + 29); + b[ 10] = *(a06 + 28); + b[ 11] = *(a06 + 29); + b[ 12] = *(a07 + 28); + b[ 13] = *(a07 + 29); + b[ 14] = *(a08 + 28); + b[ 15] = *(a08 + 29); + b[ 16] = *(a09 + 28); + b[ 
17] = *(a09 + 29); + b[ 18] = *(a10 + 28); + b[ 19] = *(a10 + 29); + b[ 20] = *(a11 + 28); + b[ 21] = *(a11 + 29); + b[ 22] = *(a12 + 28); + b[ 23] = *(a12 + 29); + b[ 24] = *(a13 + 28); + b[ 25] = *(a13 + 29); + b[ 26] = *(a14 + 28); + b[ 27] = *(a14 + 29); +#ifdef UNIT + b[ 28] = ONE; + b[ 29] = ZERO; +#else + b[ 28] = *(a15 + 28); + b[ 29] = *(a15 + 29); +#endif + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + } + } + + posY += 16; + js --; + } while (js > 0); + } /* End of main loop */ + + + if (n & 8){ + X = posX; + + if (posX <= posY) { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + a03 = a + posY * 2 + (posX + 2) * lda; + a04 = a + posY * 2 + (posX + 3) * lda; + a05 = a + posY * 2 + (posX + 4) * lda; + a06 = a + posY * 2 + (posX + 5) * lda; + a07 = a + posY * 2 + (posX + 6) * lda; + a08 = a + posY * 2 + (posX + 7) * lda; + } else { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + a03 = a + posX * 2 + (posY + 2) * lda; + a04 = a + posX * 2 + (posY + 3) * lda; + a05 = a + posX * 2 + (posY + 4) * lda; + a06 = a + posX * 2 + (posY + 5) * lda; + a07 = a + posX * 2 + (posY + 6) * lda; + a08 = a + posX * 2 + (posY + 7) * lda; + } + + i = (m >> 3); + if (i > 0) { + do { + if (X > posY) { + for (ii = 0; ii < 8; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a03 + 0); + b[ 5] = *(a03 + 1); + b[ 6] = *(a04 + 0); + b[ 7] = *(a04 + 1); + + b[ 8] = *(a05 + 0); + b[ 9] = *(a05 + 1); + b[ 10] = *(a06 + 0); + b[ 11] = *(a06 + 1); + b[ 12] = *(a07 + 0); + b[ 13] = *(a07 + 1); + b[ 14] = *(a08 + 0); + b[ 15] = *(a08 + 1); + + a01 += 2; + a02 += 2; + a03 += 2; + a04 += 2; + a05 += 2; + a06 += 2; + a07 += 2; + a08 += 2; + b += 16; + } + } else + if (X < posY) { + a01 += 8 * lda; + a02 += 8 * lda; + a03 += 8 * lda; + a04 += 8 * lda; + a05 += 8 * lda; + a06 += 8 * lda; + a07 += 8 * lda; + a08 += 8 * lda; + + b += 128; + } else { +#ifdef UNIT + b[ 
0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + + b[ 16] = *(a01 + 2); + b[ 17] = *(a01 + 3); +#ifdef UNIT + b[ 18] = ONE; + b[ 19] = ZERO; +#else + b[ 18] = *(a02 + 2); + b[ 19] = *(a02 + 3); +#endif + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + + b[ 32] = *(a01 + 4); + b[ 33] = *(a01 + 5); + b[ 34] = *(a02 + 4); + b[ 35] = *(a02 + 5); +#ifdef UNIT + b[ 36] = ONE; + b[ 37] = ZERO; +#else + b[ 36] = *(a03 + 4); + b[ 37] = *(a03 + 5); +#endif + b[ 38] = ZERO; + b[ 39] = ZERO; + b[ 40] = ZERO; + b[ 41] = ZERO; + b[ 42] = ZERO; + b[ 43] = ZERO; + b[ 44] = ZERO; + b[ 45] = ZERO; + b[ 46] = ZERO; + b[ 47] = ZERO; + + b[ 48] = *(a01 + 6); + b[ 49] = *(a01 + 7); + b[ 50] = *(a02 + 6); + b[ 51] = *(a02 + 7); + b[ 52] = *(a03 + 6); + b[ 53] = *(a03 + 7); +#ifdef UNIT + b[ 54] = ONE; + b[ 55] = ZERO; +#else + b[ 54] = *(a04 + 6); + b[ 55] = *(a04 + 7); +#endif + b[ 56] = ZERO; + b[ 57] = ZERO; + b[ 58] = ZERO; + b[ 59] = ZERO; + b[ 60] = ZERO; + b[ 61] = ZERO; + b[ 62] = ZERO; + b[ 63] = ZERO; + + b[ 64] = *(a01 + 8); + b[ 65] = *(a01 + 9); + b[ 66] = *(a02 + 8); + b[ 67] = *(a02 + 9); + b[ 68] = *(a03 + 8); + b[ 69] = *(a03 + 9); + b[ 70] = *(a04 + 8); + b[ 71] = *(a04 + 9); +#ifdef UNIT + b[ 72] = ONE; + b[ 73] = ZERO; +#else + b[ 72] = *(a05 + 8); + b[ 73] = *(a05 + 9); +#endif + b[ 74] = ZERO; + b[ 75] = ZERO; + b[ 76] = ZERO; + b[ 77] = ZERO; + b[ 78] = ZERO; + b[ 79] = ZERO; + + b[ 80] = *(a01 + 10); + b[ 81] = *(a01 + 11); + b[ 82] = *(a02 + 10); + b[ 83] = *(a02 + 11); + b[ 84] = *(a03 + 10); + b[ 85] = *(a03 + 11); + b[ 86] = *(a04 + 10); + b[ 
87] = *(a04 + 11); + b[ 88] = *(a05 + 10); + b[ 89] = *(a05 + 11); +#ifdef UNIT + b[ 90] = ONE; + b[ 91] = ZERO; +#else + b[ 90] = *(a06 + 10); + b[ 91] = *(a06 + 11); +#endif + b[ 92] = ZERO; + b[ 93] = ZERO; + b[ 94] = ZERO; + b[ 95] = ZERO; + + b[ 96] = *(a01 + 12); + b[ 97] = *(a01 + 13); + b[ 98] = *(a02 + 12); + b[ 99] = *(a02 + 13); + b[100] = *(a03 + 12); + b[101] = *(a03 + 13); + b[102] = *(a04 + 12); + b[103] = *(a04 + 13); + b[104] = *(a05 + 12); + b[105] = *(a05 + 13); + b[106] = *(a06 + 12); + b[107] = *(a06 + 13); +#ifdef UNIT + b[108] = ONE; + b[109] = ZERO; +#else + b[108] = *(a07 + 12); + b[109] = *(a07 + 13); +#endif + b[110] = ZERO; + b[111] = ZERO; + + b[112] = *(a01 + 14); + b[113] = *(a01 + 15); + b[114] = *(a02 + 14); + b[115] = *(a02 + 15); + b[116] = *(a03 + 14); + b[117] = *(a03 + 15); + b[118] = *(a04 + 14); + b[119] = *(a04 + 15); + b[120] = *(a05 + 14); + b[121] = *(a05 + 15); + b[122] = *(a06 + 14); + b[123] = *(a06 + 15); + b[124] = *(a07 + 14); + b[125] = *(a07 + 15); +#ifdef UNIT + b[126] = ONE; + b[127] = ZERO; +#else + b[126] = *(a08 + 14); + b[127] = *(a08 + 15); +#endif + + a01 += 16; + a02 += 16; + a03 += 16; + a04 += 16; + a05 += 16; + a06 += 16; + a07 += 16; + a08 += 16; + b += 128; + } + + X += 8; + i --; + } while (i > 0); + } + + i = (m & 7); + if (i) { + + if (X > posY) { + for (ii = 0; ii < i; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a03 + 0); + b[ 5] = *(a03 + 1); + b[ 6] = *(a04 + 0); + b[ 7] = *(a04 + 1); + + b[ 8] = *(a05 + 0); + b[ 9] = *(a05 + 1); + b[ 10] = *(a06 + 0); + b[ 11] = *(a06 + 1); + b[ 12] = *(a07 + 0); + b[ 13] = *(a07 + 1); + b[ 14] = *(a08 + 0); + b[ 15] = *(a08 + 1); + + a01 += 2; + a02 += 2; + a03 += 2; + a04 += 2; + a05 += 2; + a06 += 2; + a07 += 2; + a08 += 2; + b += 16; + } + } else + if (X < posY) { + /* a01 += i * lda; + a02 += i * lda; + a03 += i * lda; + a04 += i * lda; + a05 += i * lda; + a06 += i * lda; + a07 += i * lda; + 
a08 += i * lda; */ + b += 16 * i; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b += 16; + + if (i >= 2) { + b[ 0] = *(a01 + 2); + b[ 1] = *(a01 + 3); +#ifdef UNIT + b[ 2] = ONE; + b[ 3] = ZERO; +#else + b[ 2] = *(a02 + 2); + b[ 3] = *(a02 + 3); +#endif + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 3) { + b[ 0] = *(a01 + 4); + b[ 1] = *(a01 + 5); + b[ 2] = *(a02 + 4); + b[ 3] = *(a02 + 5); +#ifdef UNIT + b[ 4] = ONE; + b[ 5] = ZERO; +#else + b[ 4] = *(a03 + 4); + b[ 5] = *(a03 + 5); +#endif + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 4) { + b[ 0] = *(a01 + 6); + b[ 1] = *(a01 + 7); + b[ 2] = *(a02 + 6); + b[ 3] = *(a02 + 7); + b[ 4] = *(a03 + 6); + b[ 5] = *(a03 + 7); +#ifdef UNIT + b[ 6] = ONE; + b[ 7] = ZERO; +#else + b[ 6] = *(a04 + 6); + b[ 7] = *(a04 + 7); +#endif + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 5) { + b[ 0] = *(a01 + 8); + b[ 1] = *(a01 + 9); + b[ 2] = *(a02 + 8); + b[ 3] = *(a02 + 9); + b[ 4] = *(a03 + 8); + b[ 5] = *(a03 + 9); + b[ 6] = *(a04 + 8); + b[ 7] = *(a04 + 9); +#ifdef UNIT + b[ 8] = ONE; + b[ 9] = ZERO; +#else + b[ 8] = *(a05 + 8); + b[ 9] = *(a05 + 9); +#endif + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 6) { + b[ 0] = *(a01 + 10); + b[ 1] = 
*(a01 + 11); + b[ 2] = *(a02 + 10); + b[ 3] = *(a02 + 11); + b[ 4] = *(a03 + 10); + b[ 5] = *(a03 + 11); + b[ 6] = *(a04 + 10); + b[ 7] = *(a04 + 11); + b[ 8] = *(a05 + 10); + b[ 9] = *(a05 + 11); +#ifdef UNIT + b[10] = ONE; + b[11] = ZERO; +#else + b[10] = *(a06 + 10); + b[11] = *(a06 + 11); +#endif + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 7) { + b[ 0] = *(a01 + 12); + b[ 1] = *(a01 + 13); + b[ 2] = *(a02 + 12); + b[ 3] = *(a02 + 13); + b[ 4] = *(a03 + 12); + b[ 5] = *(a03 + 13); + b[ 6] = *(a04 + 12); + b[ 7] = *(a04 + 13); + b[ 8] = *(a05 + 12); + b[ 9] = *(a05 + 13); + b[10] = *(a06 + 12); + b[11] = *(a06 + 13); +#ifdef UNIT + b[12] = ONE; + b[13] = ZERO; +#else + b[12] = *(a07 + 12); + b[13] = *(a07 + 13); +#endif + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + } + } + + posY += 8; + } + + + if (n & 4){ + X = posX; + + if (posX <= posY) { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + a03 = a + posY * 2 + (posX + 2) * lda; + a04 = a + posY * 2 + (posX + 3) * lda; + } else { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + a03 = a + posX * 2 + (posY + 2) * lda; + a04 = a + posX * 2 + (posY + 3) * lda; + } + + i = (m >> 2); + if (i > 0) { + do { + if (X > posY) { + for (ii = 0; ii < 4; ii++){ + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a03 + 0); + b[ 5] = *(a03 + 1); + b[ 6] = *(a04 + 0); + b[ 7] = *(a04 + 1); + + a01 += 2; + a02 += 2; + a03 += 2; + a04 += 2; + b += 8; + } + } else + if (X < posY) { + a01 += 4 * lda; + a02 += 4 * lda; + a03 += 4 * lda; + a04 += 4 * lda; + b += 32; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = *(a01 + 2); + b[ 9] = *(a01 + 3); +#ifdef UNIT + b[ 10] = ONE; + b[ 11] = ZERO; +#else + 
b[ 10] = *(a02 + 2); + b[ 11] = *(a02 + 3); +#endif + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + + b[ 16] = *(a01 + 4); + b[ 17] = *(a01 + 5); + b[ 18] = *(a02 + 4); + b[ 19] = *(a02 + 5); +#ifdef UNIT + b[ 20] = ONE; + b[ 21] = ZERO; +#else + b[ 20] = *(a03 + 4); + b[ 21] = *(a03 + 5); +#endif + b[ 22] = ZERO; + b[ 23] = ZERO; + + b[ 24] = *(a01 + 6); + b[ 25] = *(a01 + 7); + b[ 26] = *(a02 + 6); + b[ 27] = *(a02 + 7); + b[ 28] = *(a03 + 6); + b[ 29] = *(a03 + 7); +#ifdef UNIT + b[ 30] = ONE; + b[ 31] = ZERO; +#else + b[ 30] = *(a04 + 6); + b[ 31] = *(a04 + 7); +#endif + + a01 += 8; + a02 += 8; + a03 += 8; + a04 += 8; + b += 32; + } + + X += 4; + i --; + } while (i > 0); + } + + i = (m & 3); + if (i) { + + if (X > posY) { + + for (ii = 0; ii < i; ii++){ + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a03 + 0); + b[ 5] = *(a03 + 1); + b[ 6] = *(a04 + 0); + b[ 7] = *(a04 + 1); + + a01 += 2; + a02 += 2; + a03 += 2; + a04 += 2; + b += 8; + } + } else + if (X < posY) { + /* a01 += i * lda; + a02 += i * lda; + a03 += i * lda; + a04 += i * lda; */ + b += 8 * i; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + + if (i >= 2) { + b[ 0] = *(a01 + 2); + b[ 1] = *(a01 + 3); +#ifdef UNIT + b[ 2] = ONE; + b[ 3] = ZERO; +#else + b[ 2] = *(a02 + 2); + b[ 3] = *(a02 + 3); +#endif + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } + + if (i >= 3) { + b[ 0] = *(a01 + 4); + b[ 1] = *(a01 + 5); + b[ 2] = *(a02 + 4); + b[ 3] = *(a02 + 5); +#ifdef UNIT + b[ 4] = ONE; + b[ 5] = ZERO; +#else + b[ 4] = *(a03 + 4); + b[ 5] = *(a03 + 5); +#endif + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } + } + } + + posY += 4; + } + + if (n & 2){ + X = posX; + + if (posX <= posY) { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + 
posY * 2 + (posX + 1) * lda; + } else { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X > posY) { + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a01 + 2); + b[ 5] = *(a01 + 3); + b[ 6] = *(a02 + 2); + b[ 7] = *(a02 + 3); + + a01 += 4; + a02 += 4; + b += 8; + } else + if (X < posY) { + a01 += 2 * lda; + a02 += 2 * lda; + b += 8; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + + b[ 4] = *(a01 + 2); + b[ 5] = *(a01 + 3); +#ifdef UNIT + b[ 6] = ONE; + b[ 7] = ZERO; +#else + b[ 6] = *(a02 + 2); + b[ 7] = *(a02 + 3); +#endif + a01 += 4; + a02 += 4; + b += 8; + } + + X += 2; + i --; + } while (i > 0); + } + + if (m & 1) { + + if (X > posY) { + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + /* a01 += 2; + a02 += 2; */ + b += 4; + } else + if (X < posY) { + /* a01 += 2 * lda; + a02 += 2 * lda; */ + b += 4; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b += 4; + } + } + posY += 2; + } + + if (n & 1){ + X = posX; + + if (posX <= posY) { + a01 = a + posY * 2 + (posX + 0) * lda; + } else { + a01 = a + posX * 2 + (posY + 0) * lda; + } + + i = m; + if (m > 0) { + do { + if (X > posY) { + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + a01 += 2; + b += 2; + } else + if (X < posY) { + a01 += lda; + b += 2; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + a01 += 2; + b += 2; + } + + X += 1; + i --; + } while (i > 0); + } + } + + return 0; +} diff --git a/kernel/generic/ztrmm_ltcopy_16.c b/kernel/generic/ztrmm_ltcopy_16.c new file mode 100644 index 000000000..8d585e70b --- /dev/null +++ b/kernel/generic/ztrmm_ltcopy_16.c @@ -0,0 +1,2313 @@ 
+/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X, ii; + + FLOAT *a01, *a02, *a03, *a04, *a05, *a06, *a07, *a08; + FLOAT *a09, *a10, *a11, *a12, *a13, *a14, *a15, *a16; + + lda += lda; + + js = (n >> 4); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + a03 = a + posY * 2 + (posX + 2) * lda; + a04 = a + posY * 2 + (posX + 3) * lda; + a05 = a + posY * 2 + (posX + 4) * lda; + a06 = a + posY * 2 + (posX + 5) * lda; + a07 = a + posY * 2 + (posX + 6) * lda; + a08 = a + posY * 2 + (posX + 7) * lda; + a09 = a + posY * 2 + (posX + 8) * lda; + a10 = a + posY * 2 + (posX + 9) * lda; + a11 = a + posY * 2 + (posX + 10) * lda; + a12 = a + posY * 2 + (posX + 11) * lda; + a13 = a + posY * 2 + (posX + 12) * lda; + a14 = a + posY * 2 + (posX + 13) * lda; + a15 = a + posY * 2 + (posX + 14) * lda; + a16 = a + posY * 2 + (posX + 15) * lda; + } else { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + a03 = a + posX * 2 + (posY + 2) * lda; + a04 = a + posX * 2 + (posY + 3) * lda; + a05 = a + posX * 2 + (posY + 4) * lda; + a06 = a + posX * 2 + (posY + 5) * lda; + a07 = a + posX * 2 + (posY + 6) * lda; + a08 = a + posX * 2 + (posY + 7) * lda; + a09 = a + posX * 2 + (posY + 8) * lda; + a10 = a + posX * 2 + (posY + 9) * lda; + a11 = a + posX * 2 + (posY + 10) * lda; + a12 = a + posX * 2 + (posY + 11) * lda; + a13 = a + posX * 2 + (posY + 12) * lda; + a14 = a + posX * 2 + (posY + 13) * lda; + a15 = a + posX * 2 + (posY + 14) * lda; + a16 = a + posX * 2 + (posY + 15) * lda; + } + + i = (m >> 4); + if (i > 0) { + do { + if (X > posY) { + a01 += 32; + a02 += 32; + a03 += 32; + a04 += 32; + a05 += 32; + a06 += 32; + a07 += 32; + a08 += 32; + a09 += 32; + a10 += 32; + a11 += 
32; + a12 += 32; + a13 += 32; + a14 += 32; + a15 += 32; + a16 += 32; + b += 512; + } else + if (X < posY) { + for (ii = 0; ii < 16; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + b[ 8] = *(a01 + 8); + b[ 9] = *(a01 + 9); + b[ 10] = *(a01 + 10); + b[ 11] = *(a01 + 11); + b[ 12] = *(a01 + 12); + b[ 13] = *(a01 + 13); + b[ 14] = *(a01 + 14); + b[ 15] = *(a01 + 15); + + b[ 16] = *(a01 + 16); + b[ 17] = *(a01 + 17); + b[ 18] = *(a01 + 18); + b[ 19] = *(a01 + 19); + b[ 20] = *(a01 + 20); + b[ 21] = *(a01 + 21); + b[ 22] = *(a01 + 22); + b[ 23] = *(a01 + 23); + + b[ 24] = *(a01 + 24); + b[ 25] = *(a01 + 25); + b[ 26] = *(a01 + 26); + b[ 27] = *(a01 + 27); + b[ 28] = *(a01 + 28); + b[ 29] = *(a01 + 29); + b[ 30] = *(a01 + 30); + b[ 31] = *(a01 + 31); + + a01 += lda; + b += 32; + } + a02 += 16 * lda; + a03 += 16 * lda; + a04 += 16 * lda; + a05 += 16 * lda; + a06 += 16 * lda; + a07 += 16 * lda; + a08 += 16 * lda; + a09 += 16 * lda; + a10 += 16 * lda; + a11 += 16 * lda; + a12 += 16 * lda; + a13 += 16 * lda; + a14 += 16 * lda; + a15 += 16 * lda; + a16 += 16 * lda; + + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + b[ 8] = *(a01 + 8); + b[ 9] = *(a01 + 9); + b[ 10] = *(a01 + 10); + b[ 11] = *(a01 + 11); + b[ 12] = *(a01 + 12); + b[ 13] = *(a01 + 13); + b[ 14] = *(a01 + 14); + b[ 15] = *(a01 + 15); + b[ 16] = *(a01 + 16); + b[ 17] = *(a01 + 17); + b[ 18] = *(a01 + 18); + b[ 19] = *(a01 + 19); + b[ 20] = *(a01 + 20); + b[ 21] = *(a01 + 21); + b[ 22] = *(a01 + 22); + b[ 23] = *(a01 + 23); + b[ 24] = *(a01 + 24); + b[ 25] = *(a01 + 25); + b[ 26] = *(a01 + 26); + b[ 27] = *(a01 + 27); + b[ 28] = *(a01 + 28); + b[ 29] = *(a01 + 29); + b[ 30] = *(a01 
+ 30); + b[ 31] = *(a01 + 31); + + b[ 32] = ZERO; + b[ 33] = ZERO; +#ifdef UNIT + b[ 34] = ONE; + b[ 35] = ZERO; +#else + b[ 34] = *(a02 + 2); + b[ 35] = *(a02 + 3); +#endif + b[ 36] = *(a02 + 4); + b[ 37] = *(a02 + 5); + b[ 38] = *(a02 + 6); + b[ 39] = *(a02 + 7); + b[ 40] = *(a02 + 8); + b[ 41] = *(a02 + 9); + b[ 42] = *(a02 + 10); + b[ 43] = *(a02 + 11); + b[ 44] = *(a02 + 12); + b[ 45] = *(a02 + 13); + b[ 46] = *(a02 + 14); + b[ 47] = *(a02 + 15); + b[ 48] = *(a02 + 16); + b[ 49] = *(a02 + 17); + b[ 50] = *(a02 + 18); + b[ 51] = *(a02 + 19); + b[ 52] = *(a02 + 20); + b[ 53] = *(a02 + 21); + b[ 54] = *(a02 + 22); + b[ 55] = *(a02 + 23); + b[ 56] = *(a02 + 24); + b[ 57] = *(a02 + 25); + b[ 58] = *(a02 + 26); + b[ 59] = *(a02 + 27); + b[ 60] = *(a02 + 28); + b[ 61] = *(a02 + 29); + b[ 62] = *(a02 + 30); + b[ 63] = *(a02 + 31); + + b[ 64] = ZERO; + b[ 65] = ZERO; + b[ 66] = ZERO; + b[ 67] = ZERO; +#ifdef UNIT + b[ 68] = ONE; + b[ 69] = ZERO; +#else + b[ 68] = *(a03 + 4); + b[ 69] = *(a03 + 5); +#endif + b[ 70] = *(a03 + 6); + b[ 71] = *(a03 + 7); + b[ 72] = *(a03 + 8); + b[ 73] = *(a03 + 9); + b[ 74] = *(a03 + 10); + b[ 75] = *(a03 + 11); + b[ 76] = *(a03 + 12); + b[ 77] = *(a03 + 13); + b[ 78] = *(a03 + 14); + b[ 79] = *(a03 + 15); + b[ 80] = *(a03 + 16); + b[ 81] = *(a03 + 17); + b[ 82] = *(a03 + 18); + b[ 83] = *(a03 + 19); + b[ 84] = *(a03 + 20); + b[ 85] = *(a03 + 21); + b[ 86] = *(a03 + 22); + b[ 87] = *(a03 + 23); + b[ 88] = *(a03 + 24); + b[ 89] = *(a03 + 25); + b[ 90] = *(a03 + 26); + b[ 91] = *(a03 + 27); + b[ 92] = *(a03 + 28); + b[ 93] = *(a03 + 29); + b[ 94] = *(a03 + 30); + b[ 95] = *(a03 + 31); + + b[ 96] = ZERO; + b[ 97] = ZERO; + b[ 98] = ZERO; + b[ 99] = ZERO; + b[100] = ZERO; + b[101] = ZERO; +#ifdef UNIT + b[102] = ONE; + b[103] = ZERO; +#else + b[102] = *(a04 + 6); + b[103] = *(a04 + 7); +#endif + b[104] = *(a04 + 8); + b[105] = *(a04 + 9); + b[106] = *(a04 + 10); + b[107] = *(a04 + 11); + b[108] = *(a04 + 12); + b[109] = *(a04 + 13); + b[110] 
= *(a04 + 14); + b[111] = *(a04 + 15); + b[112] = *(a04 + 16); + b[113] = *(a04 + 17); + b[114] = *(a04 + 18); + b[115] = *(a04 + 19); + b[116] = *(a04 + 20); + b[117] = *(a04 + 21); + b[118] = *(a04 + 22); + b[119] = *(a04 + 23); + b[120] = *(a04 + 24); + b[121] = *(a04 + 25); + b[122] = *(a04 + 26); + b[123] = *(a04 + 27); + b[124] = *(a04 + 28); + b[125] = *(a04 + 29); + b[126] = *(a04 + 30); + b[127] = *(a04 + 31); + + b[128] = ZERO; + b[129] = ZERO; + b[130] = ZERO; + b[131] = ZERO; + b[132] = ZERO; + b[133] = ZERO; + b[134] = ZERO; + b[135] = ZERO; +#ifdef UNIT + b[136] = ONE; + b[137] = ZERO; +#else + b[136] = *(a05 + 8); + b[137] = *(a05 + 9); +#endif + b[138] = *(a05 + 10); + b[139] = *(a05 + 11); + b[140] = *(a05 + 12); + b[141] = *(a05 + 13); + b[142] = *(a05 + 14); + b[143] = *(a05 + 15); + b[144] = *(a05 + 16); + b[145] = *(a05 + 17); + b[146] = *(a05 + 18); + b[147] = *(a05 + 19); + b[148] = *(a05 + 20); + b[149] = *(a05 + 21); + b[150] = *(a05 + 22); + b[151] = *(a05 + 23); + b[152] = *(a05 + 24); + b[153] = *(a05 + 25); + b[154] = *(a05 + 26); + b[155] = *(a05 + 27); + b[156] = *(a05 + 28); + b[157] = *(a05 + 29); + b[158] = *(a05 + 30); + b[159] = *(a05 + 31); + + b[160] = ZERO; + b[161] = ZERO; + b[162] = ZERO; + b[163] = ZERO; + b[164] = ZERO; + b[165] = ZERO; + b[166] = ZERO; + b[167] = ZERO; + b[168] = ZERO; + b[169] = ZERO; +#ifdef UNIT + b[170] = ONE; + b[171] = ZERO; +#else + b[170] = *(a06 + 10); + b[171] = *(a06 + 11); +#endif + b[172] = *(a06 + 12); + b[173] = *(a06 + 13); + b[174] = *(a06 + 14); + b[175] = *(a06 + 15); + b[176] = *(a06 + 16); + b[177] = *(a06 + 17); + b[178] = *(a06 + 18); + b[179] = *(a06 + 19); + b[180] = *(a06 + 20); + b[181] = *(a06 + 21); + b[182] = *(a06 + 22); + b[183] = *(a06 + 23); + b[184] = *(a06 + 24); + b[185] = *(a06 + 25); + b[186] = *(a06 + 26); + b[187] = *(a06 + 27); + b[188] = *(a06 + 28); + b[189] = *(a06 + 29); + b[190] = *(a06 + 30); + b[191] = *(a06 + 31); + + b[192] = ZERO; + b[193] = ZERO; + 
b[194] = ZERO; + b[195] = ZERO; + b[196] = ZERO; + b[197] = ZERO; + b[198] = ZERO; + b[199] = ZERO; + b[200] = ZERO; + b[201] = ZERO; + b[202] = ZERO; + b[203] = ZERO; +#ifdef UNIT + b[204] = ONE; + b[205] = ZERO; +#else + b[204] = *(a07 + 12); + b[205] = *(a07 + 13); +#endif + b[206] = *(a07 + 14); + b[207] = *(a07 + 15); + b[208] = *(a07 + 16); + b[209] = *(a07 + 17); + b[210] = *(a07 + 18); + b[211] = *(a07 + 19); + b[212] = *(a07 + 20); + b[213] = *(a07 + 21); + b[214] = *(a07 + 22); + b[215] = *(a07 + 23); + b[216] = *(a07 + 24); + b[217] = *(a07 + 25); + b[218] = *(a07 + 26); + b[219] = *(a07 + 27); + b[220] = *(a07 + 28); + b[221] = *(a07 + 29); + b[222] = *(a07 + 30); + b[223] = *(a07 + 31); + + b[224] = ZERO; + b[225] = ZERO; + b[226] = ZERO; + b[227] = ZERO; + b[228] = ZERO; + b[229] = ZERO; + b[230] = ZERO; + b[231] = ZERO; + b[232] = ZERO; + b[233] = ZERO; + b[234] = ZERO; + b[235] = ZERO; + b[236] = ZERO; + b[237] = ZERO; +#ifdef UNIT + b[238] = ONE; + b[239] = ZERO; +#else + b[238] = *(a08 + 14); + b[239] = *(a08 + 15); +#endif + b[240] = *(a08 + 16); + b[241] = *(a08 + 17); + b[242] = *(a08 + 18); + b[243] = *(a08 + 19); + b[244] = *(a08 + 20); + b[245] = *(a08 + 21); + b[246] = *(a08 + 22); + b[247] = *(a08 + 23); + b[248] = *(a08 + 24); + b[249] = *(a08 + 25); + b[250] = *(a08 + 26); + b[251] = *(a08 + 27); + b[252] = *(a08 + 28); + b[253] = *(a08 + 29); + b[254] = *(a08 + 30); + b[255] = *(a08 + 31); + + b[256] = ZERO; + b[257] = ZERO; + b[258] = ZERO; + b[259] = ZERO; + b[260] = ZERO; + b[261] = ZERO; + b[262] = ZERO; + b[263] = ZERO; + b[264] = ZERO; + b[265] = ZERO; + b[266] = ZERO; + b[267] = ZERO; + b[268] = ZERO; + b[269] = ZERO; + b[270] = ZERO; + b[271] = ZERO; +#ifdef UNIT + b[272] = ONE; + b[273] = ZERO; +#else + b[272] = *(a09 + 16); + b[273] = *(a09 + 17); +#endif + b[274] = *(a09 + 18); + b[275] = *(a09 + 19); + b[276] = *(a09 + 20); + b[277] = *(a09 + 21); + b[278] = *(a09 + 22); + b[279] = *(a09 + 23); + b[280] = *(a09 + 24); + 
b[281] = *(a09 + 25); + b[282] = *(a09 + 26); + b[283] = *(a09 + 27); + b[284] = *(a09 + 28); + b[285] = *(a09 + 29); + b[286] = *(a09 + 30); + b[287] = *(a09 + 31); + + b[288] = ZERO; + b[289] = ZERO; + b[290] = ZERO; + b[291] = ZERO; + b[292] = ZERO; + b[293] = ZERO; + b[294] = ZERO; + b[295] = ZERO; + b[296] = ZERO; + b[297] = ZERO; + b[298] = ZERO; + b[299] = ZERO; + b[300] = ZERO; + b[301] = ZERO; + b[302] = ZERO; + b[303] = ZERO; + b[304] = ZERO; + b[305] = ZERO; +#ifdef UNIT + b[306] = ONE; + b[307] = ZERO; +#else + b[306] = *(a10 + 18); + b[307] = *(a10 + 19); +#endif + b[308] = *(a10 + 20); + b[309] = *(a10 + 21); + b[310] = *(a10 + 22); + b[311] = *(a10 + 23); + b[312] = *(a10 + 24); + b[313] = *(a10 + 25); + b[314] = *(a10 + 26); + b[315] = *(a10 + 27); + b[316] = *(a10 + 28); + b[317] = *(a10 + 29); + b[318] = *(a10 + 30); + b[319] = *(a10 + 31); + + b[320] = ZERO; + b[321] = ZERO; + b[322] = ZERO; + b[323] = ZERO; + b[324] = ZERO; + b[325] = ZERO; + b[326] = ZERO; + b[327] = ZERO; + b[328] = ZERO; + b[329] = ZERO; + b[330] = ZERO; + b[331] = ZERO; + b[332] = ZERO; + b[333] = ZERO; + b[334] = ZERO; + b[335] = ZERO; + b[336] = ZERO; + b[337] = ZERO; + b[338] = ZERO; + b[339] = ZERO; +#ifdef UNIT + b[340] = ONE; + b[341] = ZERO; +#else + b[340] = *(a11 + 20); + b[341] = *(a11 + 21); +#endif + b[342] = *(a11 + 22); + b[343] = *(a11 + 23); + b[344] = *(a11 + 24); + b[345] = *(a11 + 25); + b[346] = *(a11 + 26); + b[347] = *(a11 + 27); + b[348] = *(a11 + 28); + b[349] = *(a11 + 29); + b[350] = *(a11 + 30); + b[351] = *(a11 + 31); + + b[352] = ZERO; + b[353] = ZERO; + b[354] = ZERO; + b[355] = ZERO; + b[356] = ZERO; + b[357] = ZERO; + b[358] = ZERO; + b[359] = ZERO; + b[360] = ZERO; + b[361] = ZERO; + b[362] = ZERO; + b[363] = ZERO; + b[364] = ZERO; + b[365] = ZERO; + b[366] = ZERO; + b[367] = ZERO; + b[368] = ZERO; + b[369] = ZERO; + b[370] = ZERO; + b[371] = ZERO; + b[372] = ZERO; + b[373] = ZERO; +#ifdef UNIT + b[374] = ONE; + b[375] = ZERO; +#else + b[374] 
= *(a12 + 22); + b[375] = *(a12 + 23); +#endif + b[376] = *(a12 + 24); + b[377] = *(a12 + 25); + b[378] = *(a12 + 26); + b[379] = *(a12 + 27); + b[380] = *(a12 + 28); + b[381] = *(a12 + 29); + b[382] = *(a12 + 30); + b[383] = *(a12 + 31); + + b[384] = ZERO; + b[385] = ZERO; + b[386] = ZERO; + b[387] = ZERO; + b[388] = ZERO; + b[389] = ZERO; + b[390] = ZERO; + b[391] = ZERO; + b[392] = ZERO; + b[393] = ZERO; + b[394] = ZERO; + b[395] = ZERO; + b[396] = ZERO; + b[397] = ZERO; + b[398] = ZERO; + b[399] = ZERO; + b[400] = ZERO; + b[401] = ZERO; + b[402] = ZERO; + b[403] = ZERO; + b[404] = ZERO; + b[405] = ZERO; + b[406] = ZERO; + b[407] = ZERO; +#ifdef UNIT + b[408] = ONE; + b[409] = ZERO; +#else + b[408] = *(a13 + 24); + b[409] = *(a13 + 25); +#endif + b[410] = *(a13 + 26); + b[411] = *(a13 + 27); + b[412] = *(a13 + 28); + b[413] = *(a13 + 29); + b[414] = *(a13 + 30); + b[415] = *(a13 + 31); + + b[416] = ZERO; + b[417] = ZERO; + b[418] = ZERO; + b[419] = ZERO; + b[420] = ZERO; + b[421] = ZERO; + b[422] = ZERO; + b[423] = ZERO; + b[424] = ZERO; + b[425] = ZERO; + b[426] = ZERO; + b[427] = ZERO; + b[428] = ZERO; + b[429] = ZERO; + b[430] = ZERO; + b[431] = ZERO; + b[432] = ZERO; + b[433] = ZERO; + b[434] = ZERO; + b[435] = ZERO; + b[436] = ZERO; + b[437] = ZERO; + b[438] = ZERO; + b[439] = ZERO; + b[440] = ZERO; + b[441] = ZERO; +#ifdef UNIT + b[442] = ONE; + b[443] = ZERO; +#else + b[442] = *(a14 + 26); + b[443] = *(a14 + 27); +#endif + b[444] = *(a14 + 28); + b[445] = *(a14 + 29); + b[446] = *(a14 + 30); + b[447] = *(a14 + 31); + + b[448] = ZERO; + b[449] = ZERO; + b[450] = ZERO; + b[451] = ZERO; + b[452] = ZERO; + b[453] = ZERO; + b[454] = ZERO; + b[455] = ZERO; + b[456] = ZERO; + b[457] = ZERO; + b[458] = ZERO; + b[459] = ZERO; + b[460] = ZERO; + b[461] = ZERO; + b[462] = ZERO; + b[463] = ZERO; + b[464] = ZERO; + b[465] = ZERO; + b[466] = ZERO; + b[467] = ZERO; + b[468] = ZERO; + b[469] = ZERO; + b[470] = ZERO; + b[471] = ZERO; + b[472] = ZERO; + b[473] = ZERO; + 
b[474] = ZERO; + b[475] = ZERO; +#ifdef UNIT + b[476] = ONE; + b[477] = ZERO; +#else + b[476] = *(a15 + 28); + b[477] = *(a15 + 29); +#endif + b[478] = *(a15 + 30); + b[479] = *(a15 + 31); + + b[480] = ZERO; + b[481] = ZERO; + b[482] = ZERO; + b[483] = ZERO; + b[484] = ZERO; + b[485] = ZERO; + b[486] = ZERO; + b[487] = ZERO; + b[488] = ZERO; + b[489] = ZERO; + b[490] = ZERO; + b[491] = ZERO; + b[492] = ZERO; + b[493] = ZERO; + b[494] = ZERO; + b[495] = ZERO; + b[496] = ZERO; + b[497] = ZERO; + b[498] = ZERO; + b[499] = ZERO; + b[500] = ZERO; + b[501] = ZERO; + b[502] = ZERO; + b[503] = ZERO; + b[504] = ZERO; + b[505] = ZERO; + b[506] = ZERO; + b[507] = ZERO; + b[508] = ZERO; + b[509] = ZERO; +#ifdef UNIT + b[510] = ONE; + b[511] = ZERO; +#else + b[510] = *(a16 + 30); + b[511] = *(a16 + 31); +#endif + + a01 += 32; + a02 += 32; + a03 += 32; + a04 += 32; + a05 += 32; + a06 += 32; + a07 += 32; + a08 += 32; + a09 += 32; + a10 += 32; + a11 += 32; + a12 += 32; + a13 += 32; + a14 += 32; + a15 += 32; + a16 += 32; + b += 512; + } + + X += 16; + i --; + } while (i > 0); + } + + i = (m & 15); + if (i) { + + if (X > posY) { + /* a01 += i * lda; + a02 += i * lda; + a03 += i * lda; + a04 += i * lda; + a05 += i * lda; + a06 += i * lda; + a07 += i * lda; + a08 += i * lda; + a09 += i * lda; + a10 += i * lda; + a11 += i * lda; + a12 += i * lda; + a13 += i * lda; + a14 += i * lda; + a15 += i * lda; + a16 += i * lda; */ + b += 32 * i; + } else + if (X < posY) { + for (ii = 0; ii < i; ii++){ + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + b[ 8] = *(a01 + 8); + b[ 9] = *(a01 + 9); + b[ 10] = *(a01 + 10); + b[ 11] = *(a01 + 11); + b[ 12] = *(a01 + 12); + b[ 13] = *(a01 + 13); + b[ 14] = *(a01 + 14); + b[ 15] = *(a01 + 15); + + b[ 16] = *(a01 + 16); + b[ 17] = *(a01 + 17); + b[ 18] = *(a01 + 18); + b[ 19] = *(a01 + 19); + b[ 20] = *(a01 + 20); + b[ 21] = *(a01 + 
21); + b[ 22] = *(a01 + 22); + b[ 23] = *(a01 + 23); + b[ 24] = *(a01 + 24); + b[ 25] = *(a01 + 25); + b[ 26] = *(a01 + 26); + b[ 27] = *(a01 + 27); + b[ 28] = *(a01 + 28); + b[ 29] = *(a01 + 29); + b[ 30] = *(a01 + 30); + b[ 31] = *(a01 + 31); + + a01 += lda; + a02 += lda; + a03 += lda; + a04 += lda; + a05 += lda; + a06 += lda; + a07 += lda; + a08 += lda; + a09 += lda; + a10 += lda; + a11 += lda; + a12 += lda; + a13 += lda; + a14 += lda; + a15 += lda; + a16 += lda; + b += 32; + } + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + b[ 8] = *(a01 + 8); + b[ 9] = *(a01 + 9); + b[ 10] = *(a01 + 10); + b[ 11] = *(a01 + 11); + b[ 12] = *(a01 + 12); + b[ 13] = *(a01 + 13); + b[ 14] = *(a01 + 14); + b[ 15] = *(a01 + 15); + b[ 16] = *(a01 + 16); + b[ 17] = *(a01 + 17); + b[ 18] = *(a01 + 18); + b[ 19] = *(a01 + 19); + b[ 20] = *(a01 + 20); + b[ 21] = *(a01 + 21); + b[ 22] = *(a01 + 22); + b[ 23] = *(a01 + 23); + b[ 24] = *(a01 + 24); + b[ 25] = *(a01 + 25); + b[ 26] = *(a01 + 26); + b[ 27] = *(a01 + 27); + b[ 28] = *(a01 + 28); + b[ 29] = *(a01 + 29); + b[ 30] = *(a01 + 30); + b[ 31] = *(a01 + 31); + b += 32; + + if (i >= 2) { + b[ 0] = ZERO; + b[ 1] = ZERO; +#ifdef UNIT + b[ 2] = ONE; + b[ 3] = ZERO; +#else + b[ 2] = *(a02 + 2); + b[ 3] = *(a02 + 3); +#endif + b[ 4] = *(a02 + 4); + b[ 5] = *(a02 + 5); + b[ 6] = *(a02 + 6); + b[ 7] = *(a02 + 7); + b[ 8] = *(a02 + 8); + b[ 9] = *(a02 + 9); + b[ 10] = *(a02 + 10); + b[ 11] = *(a02 + 11); + b[ 12] = *(a02 + 12); + b[ 13] = *(a02 + 13); + b[ 14] = *(a02 + 14); + b[ 15] = *(a02 + 15); + b[ 16] = *(a02 + 16); + b[ 17] = *(a02 + 17); + b[ 18] = *(a02 + 18); + b[ 19] = *(a02 + 19); + b[ 20] = *(a02 + 20); + b[ 21] = *(a02 + 21); + b[ 22] = *(a02 + 22); + b[ 23] = *(a02 + 23); + b[ 24] = *(a02 + 24); + b[ 25] = *(a02 + 25); + b[ 26] = 
*(a02 + 26); + b[ 27] = *(a02 + 27); + b[ 28] = *(a02 + 28); + b[ 29] = *(a02 + 29); + b[ 30] = *(a02 + 30); + b[ 31] = *(a02 + 31); + b += 32; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; +#ifdef UNIT + b[ 4] = ONE; + b[ 5] = ZERO; +#else + b[ 4] = *(a03 + 4); + b[ 5] = *(a03 + 5); +#endif + b[ 6] = *(a03 + 6); + b[ 7] = *(a03 + 7); + b[ 8] = *(a03 + 8); + b[ 9] = *(a03 + 9); + b[ 10] = *(a03 + 10); + b[ 11] = *(a03 + 11); + b[ 12] = *(a03 + 12); + b[ 13] = *(a03 + 13); + b[ 14] = *(a03 + 14); + b[ 15] = *(a03 + 15); + b[ 16] = *(a03 + 16); + b[ 17] = *(a03 + 17); + b[ 18] = *(a03 + 18); + b[ 19] = *(a03 + 19); + b[ 20] = *(a03 + 20); + b[ 21] = *(a03 + 21); + b[ 22] = *(a03 + 22); + b[ 23] = *(a03 + 23); + b[ 24] = *(a03 + 24); + b[ 25] = *(a03 + 25); + b[ 26] = *(a03 + 26); + b[ 27] = *(a03 + 27); + b[ 28] = *(a03 + 28); + b[ 29] = *(a03 + 29); + b[ 30] = *(a03 + 30); + b[ 31] = *(a03 + 31); + b += 32; + } + + if (i >= 4) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; +#ifdef UNIT + b[ 6] = ONE; + b[ 7] = ZERO; +#else + b[ 6] = *(a04 + 6); + b[ 7] = *(a04 + 7); +#endif + b[ 8] = *(a04 + 8); + b[ 9] = *(a04 + 9); + b[ 10] = *(a04 + 10); + b[ 11] = *(a04 + 11); + b[ 12] = *(a04 + 12); + b[ 13] = *(a04 + 13); + b[ 14] = *(a04 + 14); + b[ 15] = *(a04 + 15); + b[ 16] = *(a04 + 16); + b[ 17] = *(a04 + 17); + b[ 18] = *(a04 + 18); + b[ 19] = *(a04 + 19); + b[ 20] = *(a04 + 20); + b[ 21] = *(a04 + 21); + b[ 22] = *(a04 + 22); + b[ 23] = *(a04 + 23); + b[ 24] = *(a04 + 24); + b[ 25] = *(a04 + 25); + b[ 26] = *(a04 + 26); + b[ 27] = *(a04 + 27); + b[ 28] = *(a04 + 28); + b[ 29] = *(a04 + 29); + b[ 30] = *(a04 + 30); + b[ 31] = *(a04 + 31); + b += 32; + } + + if (i >= 5) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; +#ifdef UNIT + b[ 8] = ONE; + b[ 9] = ZERO; +#else + b[ 8] = *(a05 + 8); + b[ 9] = 
*(a05 + 9); +#endif + b[ 10] = *(a05 + 10); + b[ 11] = *(a05 + 11); + b[ 12] = *(a05 + 12); + b[ 13] = *(a05 + 13); + b[ 14] = *(a05 + 14); + b[ 15] = *(a05 + 15); + b[ 16] = *(a05 + 16); + b[ 17] = *(a05 + 17); + b[ 18] = *(a05 + 18); + b[ 19] = *(a05 + 19); + b[ 20] = *(a05 + 20); + b[ 21] = *(a05 + 21); + b[ 22] = *(a05 + 22); + b[ 23] = *(a05 + 23); + b[ 24] = *(a05 + 24); + b[ 25] = *(a05 + 25); + b[ 26] = *(a05 + 26); + b[ 27] = *(a05 + 27); + b[ 28] = *(a05 + 28); + b[ 29] = *(a05 + 29); + b[ 30] = *(a05 + 30); + b[ 31] = *(a05 + 31); + b += 32; + } + + if (i >= 6) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; +#ifdef UNIT + b[10] = ONE; + b[11] = ZERO; +#else + b[10] = *(a06 + 10); + b[11] = *(a06 + 11); +#endif + b[ 12] = *(a06 + 12); + b[ 13] = *(a06 + 13); + b[ 14] = *(a06 + 14); + b[ 15] = *(a06 + 15); + b[ 16] = *(a06 + 16); + b[ 17] = *(a06 + 17); + b[ 18] = *(a06 + 18); + b[ 19] = *(a06 + 19); + b[ 20] = *(a06 + 20); + b[ 21] = *(a06 + 21); + b[ 22] = *(a06 + 22); + b[ 23] = *(a06 + 23); + b[ 24] = *(a06 + 24); + b[ 25] = *(a06 + 25); + b[ 26] = *(a06 + 26); + b[ 27] = *(a06 + 27); + b[ 28] = *(a06 + 28); + b[ 29] = *(a06 + 29); + b[ 30] = *(a06 + 30); + b[ 31] = *(a06 + 31); + b += 32; + } + + if (i >= 7) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; +#ifdef UNIT + b[12] = ONE; + b[13] = ZERO; +#else + b[12] = *(a07 + 12); + b[13] = *(a07 + 13); +#endif + b[ 14] = *(a07 + 14); + b[ 15] = *(a07 + 15); + b[ 16] = *(a07 + 16); + b[ 17] = *(a07 + 17); + b[ 18] = *(a07 + 18); + b[ 19] = *(a07 + 19); + b[ 20] = *(a07 + 20); + b[ 21] = *(a07 + 21); + b[ 22] = *(a07 + 22); + b[ 23] = *(a07 + 23); + b[ 24] = *(a07 + 24); + b[ 25] = *(a07 + 25); + b[ 26] = *(a07 + 26); + b[ 27] = *(a07 + 27); + b[ 
28] = *(a07 + 28); + b[ 29] = *(a07 + 29); + b[ 30] = *(a07 + 30); + b[ 31] = *(a07 + 31); + b += 32; + } + + if (i >= 8) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; +#ifdef UNIT + b[ 14] = ONE; + b[ 15] = ZERO; +#else + b[ 14] = *(a08 + 14); + b[ 15] = *(a08 + 15); +#endif + b[ 16] = *(a08 + 16); + b[ 17] = *(a08 + 17); + b[ 18] = *(a08 + 18); + b[ 19] = *(a08 + 19); + b[ 20] = *(a08 + 20); + b[ 21] = *(a08 + 21); + b[ 22] = *(a08 + 22); + b[ 23] = *(a08 + 23); + b[ 24] = *(a08 + 24); + b[ 25] = *(a08 + 25); + b[ 26] = *(a08 + 26); + b[ 27] = *(a08 + 27); + b[ 28] = *(a08 + 28); + b[ 29] = *(a08 + 29); + b[ 30] = *(a08 + 30); + b[ 31] = *(a08 + 31); + b += 32; + } + + if (i >= 9) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; +#ifdef UNIT + b[ 16] = ONE; + b[ 17] = ZERO; +#else + b[ 16] = *(a09 + 16); + b[ 17] = *(a09 + 17); +#endif + b[ 18] = *(a09 + 18); + b[ 19] = *(a09 + 19); + b[ 20] = *(a09 + 20); + b[ 21] = *(a09 + 21); + b[ 22] = *(a09 + 22); + b[ 23] = *(a09 + 23); + b[ 24] = *(a09 + 24); + b[ 25] = *(a09 + 25); + b[ 26] = *(a09 + 26); + b[ 27] = *(a09 + 27); + b[ 28] = *(a09 + 28); + b[ 29] = *(a09 + 29); + b[ 30] = *(a09 + 30); + b[ 31] = *(a09 + 31); + b += 32; + } + + if (i >= 10) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; +#ifdef UNIT + b[ 18] = ONE; + b[ 19] = ZERO; +#else + b[ 18] = *(a10 + 18); + b[ 
19] = *(a10 + 19); +#endif + b[ 20] = *(a10 + 20); + b[ 21] = *(a10 + 21); + b[ 22] = *(a10 + 22); + b[ 23] = *(a10 + 23); + b[ 24] = *(a10 + 24); + b[ 25] = *(a10 + 25); + b[ 26] = *(a10 + 26); + b[ 27] = *(a10 + 27); + b[ 28] = *(a10 + 28); + b[ 29] = *(a10 + 29); + b[ 30] = *(a10 + 30); + b[ 31] = *(a10 + 31); + b += 32; + } + + if (i >= 11) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; +#ifdef UNIT + b[ 20] = ONE; + b[ 21] = ZERO; +#else + b[ 20] = *(a11 + 20); + b[ 21] = *(a11 + 21); +#endif + b[ 22] = *(a11 + 22); + b[ 23] = *(a11 + 23); + b[ 24] = *(a11 + 24); + b[ 25] = *(a11 + 25); + b[ 26] = *(a11 + 26); + b[ 27] = *(a11 + 27); + b[ 28] = *(a11 + 28); + b[ 29] = *(a11 + 29); + b[ 30] = *(a11 + 30); + b[ 31] = *(a11 + 31); + b += 32; + } + + if (i >= 12) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; +#ifdef UNIT + b[ 22] = ONE; + b[ 23] = ZERO; +#else + b[ 22] = *(a12 + 22); + b[ 23] = *(a12 + 23); +#endif + b[ 24] = *(a12 + 24); + b[ 25] = *(a12 + 25); + b[ 26] = *(a12 + 26); + b[ 27] = *(a12 + 27); + b[ 28] = *(a12 + 28); + b[ 29] = *(a12 + 29); + b[ 30] = *(a12 + 30); + b[ 31] = *(a12 + 31); + b += 32; + } + + if (i >= 13) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = 
ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; +#ifdef UNIT + b[ 24] = ONE; + b[ 25] = ZERO; +#else + b[ 24] = *(a13 + 24); + b[ 25] = *(a13 + 25); +#endif + b[ 26] = *(a13 + 26); + b[ 27] = *(a13 + 27); + b[ 28] = *(a13 + 28); + b[ 29] = *(a12 + 29); + b[ 30] = *(a13 + 30); + b[ 31] = *(a13 + 31); + b += 32; + } + + if (i >= 14) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; +#ifdef UNIT + b[ 26] = ONE; + b[ 27] = ZERO; +#else + b[ 26] = *(a14 + 26); + b[ 27] = *(a14 + 27); +#endif + b[ 28] = *(a14 + 28); + b[ 29] = *(a14 + 29); + b[ 30] = *(a14 + 30); + b[ 31] = *(a14 + 31); + b += 32; + } + + if (i >= 15) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; +#ifdef UNIT + b[ 28] = ONE; + b[ 29] = ZERO; +#else + b[ 28] = *(a15 + 28); + b[ 29] = *(a15 + 29); +#endif + b[ 30] = *(a15 + 30); + b[ 31] = *(a15 + 31); + b += 32; + } + } + } + + posY += 16; + js --; + } while (js > 0); + } /* End of main loop */ + + + if (n & 8){ + X = posX; + + if (posX <= posY) { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + a03 = a + posY * 2 + (posX 
+ 2) * lda; + a04 = a + posY * 2 + (posX + 3) * lda; + a05 = a + posY * 2 + (posX + 4) * lda; + a06 = a + posY * 2 + (posX + 5) * lda; + a07 = a + posY * 2 + (posX + 6) * lda; + a08 = a + posY * 2 + (posX + 7) * lda; + } else { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + a03 = a + posX * 2 + (posY + 2) * lda; + a04 = a + posX * 2 + (posY + 3) * lda; + a05 = a + posX * 2 + (posY + 4) * lda; + a06 = a + posX * 2 + (posY + 5) * lda; + a07 = a + posX * 2 + (posY + 6) * lda; + a08 = a + posX * 2 + (posY + 7) * lda; + } + + i = (m >> 3); + if (i > 0) { + do { + if (X > posY) { + a01 += 16; + a02 += 16; + a03 += 16; + a04 += 16; + a05 += 16; + a06 += 16; + a07 += 16; + a08 += 16; + b += 128; + } else + if (X < posY) { + for (ii = 0; ii < 8; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + b[ 8] = *(a01 + 8); + b[ 9] = *(a01 + 9); + b[ 10] = *(a01 + 10); + b[ 11] = *(a01 + 11); + b[ 12] = *(a01 + 12); + b[ 13] = *(a01 + 13); + b[ 14] = *(a01 + 14); + b[ 15] = *(a01 + 15); + + a01 += lda; + b += 16; + } + a02 += 8 * lda; + a03 += 8 * lda; + a04 += 8 * lda; + a05 += 8 * lda; + a06 += 8 * lda; + a07 += 8 * lda; + a08 += 8 * lda; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + b[ 8] = *(a01 + 8); + b[ 9] = *(a01 + 9); + b[ 10] = *(a01 + 10); + b[ 11] = *(a01 + 11); + b[ 12] = *(a01 + 12); + b[ 13] = *(a01 + 13); + b[ 14] = *(a01 + 14); + b[ 15] = *(a01 + 15); + + b[ 16] = ZERO; + b[ 17] = ZERO; +#ifdef UNIT + b[ 18] = ONE; + b[ 19] = ZERO; +#else + b[ 18] = *(a02 + 2); + b[ 19] = *(a02 + 3); +#endif + b[ 20] = *(a02 + 4); + b[ 21] = *(a02 + 5); + b[ 22] = *(a02 + 6); + b[ 23] = *(a02 + 7); + b[ 24] = 
*(a02 + 8); + b[ 25] = *(a02 + 9); + b[ 26] = *(a02 + 10); + b[ 27] = *(a02 + 11); + b[ 28] = *(a02 + 12); + b[ 29] = *(a02 + 13); + b[ 30] = *(a02 + 14); + b[ 31] = *(a02 + 15); + + b[ 32] = ZERO; + b[ 33] = ZERO; + b[ 34] = ZERO; + b[ 35] = ZERO; +#ifdef UNIT + b[ 36] = ONE; + b[ 37] = ZERO; +#else + b[ 36] = *(a03 + 4); + b[ 37] = *(a03 + 5); +#endif + b[ 38] = *(a03 + 6); + b[ 39] = *(a03 + 7); + b[ 40] = *(a03 + 8); + b[ 41] = *(a03 + 9); + b[ 42] = *(a03 + 10); + b[ 43] = *(a03 + 11); + b[ 44] = *(a03 + 12); + b[ 45] = *(a03 + 13); + b[ 46] = *(a03 + 14); + b[ 47] = *(a03 + 15); + + b[ 48] = ZERO; + b[ 49] = ZERO; + b[ 50] = ZERO; + b[ 51] = ZERO; + b[ 52] = ZERO; + b[ 53] = ZERO; +#ifdef UNIT + b[ 54] = ONE; + b[ 55] = ZERO; +#else + b[ 54] = *(a04 + 6); + b[ 55] = *(a04 + 7); +#endif + b[ 56] = *(a04 + 8); + b[ 57] = *(a04 + 9); + b[ 58] = *(a04 + 10); + b[ 59] = *(a04 + 11); + b[ 60] = *(a04 + 12); + b[ 61] = *(a04 + 13); + b[ 62] = *(a04 + 14); + b[ 63] = *(a04 + 15); + + b[ 64] = ZERO; + b[ 65] = ZERO; + b[ 66] = ZERO; + b[ 67] = ZERO; + b[ 68] = ZERO; + b[ 69] = ZERO; + b[ 70] = ZERO; + b[ 71] = ZERO; +#ifdef UNIT + b[ 72] = ONE; + b[ 73] = ZERO; +#else + b[ 72] = *(a05 + 8); + b[ 73] = *(a05 + 9); +#endif + b[ 74] = *(a05 + 10); + b[ 75] = *(a05 + 11); + b[ 76] = *(a05 + 12); + b[ 77] = *(a05 + 13); + b[ 78] = *(a05 + 14); + b[ 79] = *(a05 + 15); + + b[ 80] = ZERO; + b[ 81] = ZERO; + b[ 82] = ZERO; + b[ 83] = ZERO; + b[ 84] = ZERO; + b[ 85] = ZERO; + b[ 86] = ZERO; + b[ 87] = ZERO; + b[ 88] = ZERO; + b[ 89] = ZERO; +#ifdef UNIT + b[ 90] = ONE; + b[ 91] = ZERO; +#else + b[ 90] = *(a06 + 10); + b[ 91] = *(a06 + 11); +#endif + b[ 92] = *(a06 + 12); + b[ 93] = *(a06 + 13); + b[ 94] = *(a06 + 14); + b[ 95] = *(a06 + 15); + + b[ 96] = ZERO; + b[ 97] = ZERO; + b[ 98] = ZERO; + b[ 99] = ZERO; + b[100] = ZERO; + b[101] = ZERO; + b[102] = ZERO; + b[103] = ZERO; + b[104] = ZERO; + b[105] = ZERO; + b[106] = ZERO; + b[107] = ZERO; +#ifdef UNIT + b[108] = ONE; + 
b[109] = ZERO; +#else + b[108] = *(a07 + 12); + b[109] = *(a07 + 13); +#endif + b[110] = *(a07 + 14); + b[111] = *(a07 + 15); + + b[112] = ZERO; + b[113] = ZERO; + b[114] = ZERO; + b[115] = ZERO; + b[116] = ZERO; + b[117] = ZERO; + b[118] = ZERO; + b[119] = ZERO; + b[120] = ZERO; + b[121] = ZERO; + b[122] = ZERO; + b[123] = ZERO; + b[124] = ZERO; + b[125] = ZERO; +#ifdef UNIT + b[126] = ONE; + b[127] = ZERO; +#else + b[126] = *(a08 + 14); + b[127] = *(a08 + 15); +#endif + + a01 += 16; + a02 += 16; + a03 += 16; + a04 += 16; + a05 += 16; + a06 += 16; + a07 += 16; + a08 += 16; + b += 128; + } + + X += 8; + i --; + } while (i > 0); + } + + i = (m & 7); + if (i) { + + if (X > posY) { + /* a01 += 2 * i; + a02 += 2 * i; + a03 += 2 * i; + a04 += 2 * i; + a05 += 2 * i; + a06 += 2 * i; + a07 += 2 * i; + a08 += 2 * i; */ + b += 16 * i; + } else + if (X < posY) { + for (ii = 0; ii < i; ii++){ + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + b[ 8] = *(a01 + 8); + b[ 9] = *(a01 + 9); + b[ 10] = *(a01 + 10); + b[ 11] = *(a01 + 11); + b[ 12] = *(a01 + 12); + b[ 13] = *(a01 + 13); + b[ 14] = *(a01 + 14); + b[ 15] = *(a01 + 15); + + a01 += lda; + a02 += lda; + a03 += lda; + a04 += lda; + a05 += lda; + a06 += lda; + a07 += lda; + a08 += lda; + b += 16; + } + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + b[ 8] = *(a01 + 8); + b[ 9] = *(a01 + 9); + b[ 10] = *(a01 + 10); + b[ 11] = *(a01 + 11); + b[ 12] = *(a01 + 12); + b[ 13] = *(a01 + 13); + b[ 14] = *(a01 + 14); + b[ 15] = *(a01 + 15); + b += 16; + + if (i >= 2) { + b[ 0] = ZERO; + b[ 1] = ZERO; +#ifdef UNIT + b[ 2] = ONE; + b[ 3] = ZERO; +#else + b[ 2] = *(a02 + 2); + b[ 3] = *(a02 + 3); +#endif + b[ 4] = 
*(a02 + 4); + b[ 5] = *(a02 + 5); + b[ 6] = *(a02 + 6); + b[ 7] = *(a02 + 7); + + b[ 8] = *(a02 + 8); + b[ 9] = *(a02 + 9); + b[10] = *(a02 + 10); + b[11] = *(a02 + 11); + b[12] = *(a02 + 12); + b[13] = *(a02 + 13); + b[14] = *(a02 + 14); + b[15] = *(a02 + 15); + b += 16; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; +#ifdef UNIT + b[ 4] = ONE; + b[ 5] = ZERO; +#else + b[ 4] = *(a03 + 4); + b[ 5] = *(a03 + 5); +#endif + b[ 6] = *(a03 + 6); + b[ 7] = *(a03 + 7); + + b[ 8] = *(a03 + 8); + b[ 9] = *(a03 + 9); + b[10] = *(a03 + 10); + b[11] = *(a03 + 11); + b[12] = *(a03 + 12); + b[13] = *(a03 + 13); + b[14] = *(a03 + 14); + b[15] = *(a03 + 15); + b += 16; + } + + if (i >= 4) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; +#ifdef UNIT + b[ 6] = ONE; + b[ 7] = ZERO; +#else + b[ 6] = *(a04 + 6); + b[ 7] = *(a04 + 7); +#endif + + b[ 8] = *(a04 + 8); + b[ 9] = *(a04 + 9); + b[10] = *(a04 + 10); + b[11] = *(a04 + 11); + b[12] = *(a04 + 12); + b[13] = *(a04 + 13); + b[14] = *(a04 + 14); + b[15] = *(a04 + 15); + b += 16; + } + + if (i >= 5) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + +#ifdef UNIT + b[ 8] = ONE; + b[ 9] = ZERO; +#else + b[ 8] = *(a05 + 8); + b[ 9] = *(a05 + 9); +#endif + b[10] = *(a05 + 10); + b[11] = *(a05 + 11); + b[12] = *(a05 + 12); + b[13] = *(a05 + 13); + b[14] = *(a05 + 14); + b[15] = *(a05 + 15); + b += 16; + } + + if (i >= 6) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = ZERO; + b[ 9] = ZERO; +#ifdef UNIT + b[10] = ONE; + b[11] = ZERO; +#else + b[10] = *(a06 + 10); + b[11] = *(a06 + 11); +#endif + b[12] = *(a06 + 12); + b[13] = *(a06 + 13); + b[14] = *(a06 + 14); + b[15] = *(a06 + 15); + b += 16; + } + + if (i >= 7) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; 
+ b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; +#ifdef UNIT + b[12] = ONE; + b[13] = ZERO; +#else + b[12] = *(a07 + 12); + b[13] = *(a07 + 13); +#endif + b[14] = *(a07 + 14); + b[15] = *(a07 + 15); + b += 16; + } + } + } + + posY += 8; + } + + + if (n & 4){ + X = posX; + + if (posX <= posY) { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + a03 = a + posY * 2 + (posX + 2) * lda; + a04 = a + posY * 2 + (posX + 3) * lda; + } else { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + a03 = a + posX * 2 + (posY + 2) * lda; + a04 = a + posX * 2 + (posY + 3) * lda; + } + + i = (m >> 2); + if (i > 0) { + do { + if (X > posY) { + a01 += 8; + a02 += 8; + a03 += 8; + a04 += 8; + b += 32; + } else + if (X < posY) { + for (ii = 0; ii < 4; ii++){ + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + a01 += lda; + b += 8; + } + + a02 += 4 * lda; + a03 += 4 * lda; + a04 += 4 * lda; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + b[ 8] = ZERO; + b[ 9] = ZERO; +#ifdef UNIT + b[ 10] = ONE; + b[ 11] = ZERO; +#else + b[ 10] = *(a02 + 2); + b[ 11] = *(a02 + 3); +#endif + b[ 12] = *(a02 + 4); + b[ 13] = *(a02 + 5); + b[ 14] = *(a02 + 6); + b[ 15] = *(a02 + 7); + + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; +#ifdef UNIT + b[ 20] = ONE; + b[ 21] = ZERO; +#else + b[ 20] = *(a03 + 4); + b[ 21] = *(a03 + 5); +#endif + b[ 22] = *(a03 + 6); + b[ 23] = *(a03 + 7); + + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; +#ifdef UNIT + b[ 30] = ONE; + b[ 31] 
= ZERO; +#else + b[ 30] = *(a04 + 6); + b[ 31] = *(a04 + 7); +#endif + + a01 += 8; + a02 += 8; + a03 += 8; + a04 += 8; + b += 32; + } + + X += 4; + i --; + } while (i > 0); + } + + i = (m & 3); + if (i > 0) { + if (X > posY) { + /* a01 += 2 * i; + a02 += 2 * i; + a03 += 2 * i; + a04 += 2 * i; */ + b += 8 * i; + } else + if (X < posY) { + for (ii = 0; ii < i; ii++){ + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + a01 += lda; + a02 += lda; + a03 += lda; + a04 += lda; + b += 8; + } + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + b += 8; + + if (i >= 2) { + b[ 0] = ZERO; + b[ 1] = ZERO; +#ifdef UNIT + b[ 2] = ONE; + b[ 3] = ZERO; +#else + b[ 2] = *(a02 + 2); + b[ 3] = *(a02 + 3); +#endif + b[ 4] = *(a02 + 4); + b[ 5] = *(a02 + 5); + b[ 6] = *(a02 + 6); + b[ 7] = *(a02 + 7); + b += 8; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; +#ifdef UNIT + b[ 4] = ONE; + b[ 5] = ZERO; +#else + b[ 4] = *(a03 + 4); + b[ 5] = *(a03 + 5); +#endif + b[ 6] = *(a03 + 6); + b[ 7] = *(a03 + 7); + b += 8; + } + } + } + posY += 4; + } + + if (n & 2){ + X = posX; + + if (posX <= posY) { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + } else { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X > posY) { + a01 += 4; + a02 += 4; + b += 8; + } else + if (X < posY) { + b[0] = *(a01 + 0); + b[1] = *(a01 + 1); + b[2] = *(a01 + 2); + b[3] = *(a01 + 3); + b[4] = *(a02 + 0); + b[5] = *(a02 + 1); + b[6] = *(a02 + 2); + b[7] = *(a02 + 3); + a01 += 2 * lda; + a02 += 2 * lda; + b += 8; + } else { +#ifdef UNIT + b[0] = ONE; + 
b[1] = ZERO; +#else + b[0] = *(a01 + 0); + b[1] = *(a01 + 1); +#endif + b[2] = *(a01 + 2); + b[3] = *(a01 + 3); + + b[4] = ZERO; + b[5] = ZERO; +#ifdef UNIT + b[6] = ONE; + b[7] = ZERO; +#else + b[6] = *(a02 + 2); + b[7] = *(a02 + 3); +#endif + a01 += 4; + a02 += 4; + b += 8; + } + + X += 2; + i --; + } while (i > 0); + } + + i = (m & 1); + if (i > 0) { + if (X > posY) { + /* a01 += 2; + a02 += 2; */ + b += 4; + } else + if (X < posY) { + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + + /* a01 += lda; + a02 += lda; */ + b += 4; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b += 4; + } + } + posY += 2; + } + + if (n & 1){ + X = posX; + + if (posX <= posY) { + a01 = a + posY * 2 + (posX + 0) * lda; + } else { + a01 = a + posX * 2 + (posY + 0) * lda; + } + + i = m; + if (i > 0) { + do { + + if (X > posY) { + a01 += 2; + b += 2; + } else + if (X < posY) { + b[0] = *(a01 + 0); + b[1] = *(a01 + 1); + a01 += lda; + b += 2; + } else { +#ifdef UNIT + b[0] = ONE; + b[1] = ZERO; +#else + b[0] = *(a01 + 0); + b[1] = *(a01 + 1); +#endif + a01 += 2; + b += 2; + } + + X += 1; + i --; + } while (i > 0); + } + // posY += 1; + } + + return 0; +} diff --git a/kernel/generic/ztrmm_uncopy_16.c b/kernel/generic/ztrmm_uncopy_16.c new file mode 100644 index 000000000..40b85db38 --- /dev/null +++ b/kernel/generic/ztrmm_uncopy_16.c @@ -0,0 +1,2316 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X, ii; + + FLOAT *a01, *a02, *a03, *a04, *a05, *a06, *a07, *a08; + FLOAT *a09, *a10, *a11, *a12, *a13, *a14, *a15, *a16; + + lda += lda; + + js = (n >> 4); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + a03 = a + posX * 2 + (posY + 2) * lda; + a04 = a + posX * 2 + (posY + 3) * lda; + a05 = a + posX * 2 + (posY + 4) * lda; + a06 = a + posX * 2 + (posY + 5) * lda; + a07 = a + posX * 2 + (posY + 6) * lda; + a08 = a + posX * 2 + (posY + 7) * lda; + a09 = a + posX * 2 + (posY + 8) * lda; + a10 = a + posX * 2 + (posY + 9) * lda; + a11 = a + posX * 2 + (posY + 10) * lda; + a12 = a + posX * 2 + (posY + 11) * lda; + a13 = a + posX * 2 + (posY + 12) * lda; + a14 = a + posX * 2 + (posY + 13) * lda; + a15 = a + posX * 2 + (posY + 14) * lda; + a16 = a + posX * 2 + (posY + 15) * lda; + } else { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + a03 = a + posY * 2 + (posX + 2) * lda; + a04 = a + posY * 2 + (posX + 3) * lda; + a05 = a + posY * 2 + (posX + 4) * lda; + a06 = a + posY * 2 + (posX + 5) * lda; + a07 = a + posY * 2 + (posX + 6) * lda; + a08 = a + posY * 2 + (posX + 7) * lda; + a09 = a + posY * 2 + (posX + 8) * lda; + a10 = a + posY * 2 + (posX + 9) * lda; + a11 = a + posY * 2 + (posX + 10) * lda; + a12 = a + posY * 2 + (posX + 11) * lda; + a13 = a + posY * 2 + (posX + 12) * lda; + a14 = a + posY * 2 + (posX + 13) * lda; + a15 = a + posY * 2 + (posX + 14) * lda; + a16 = a + posY * 2 + (posX + 15) * lda; + } + + i = (m >> 4); + if (i > 0) { + do { + if (X < posY) { + for (ii = 0; ii < 16; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a03 + 
0); + b[ 5] = *(a03 + 1); + b[ 6] = *(a04 + 0); + b[ 7] = *(a04 + 1); + + b[ 8] = *(a05 + 0); + b[ 9] = *(a05 + 1); + b[ 10] = *(a06 + 0); + b[ 11] = *(a06 + 1); + b[ 12] = *(a07 + 0); + b[ 13] = *(a07 + 1); + b[ 14] = *(a08 + 0); + b[ 15] = *(a08 + 1); + + b[ 16] = *(a09 + 0); + b[ 17] = *(a09 + 1); + b[ 18] = *(a10 + 0); + b[ 19] = *(a10 + 1); + b[ 20] = *(a11 + 0); + b[ 21] = *(a11 + 1); + b[ 22] = *(a12 + 0); + b[ 23] = *(a12 + 1); + + b[ 24] = *(a13 + 0); + b[ 25] = *(a13 + 1); + b[ 26] = *(a14 + 0); + b[ 27] = *(a14 + 1); + b[ 28] = *(a15 + 0); + b[ 29] = *(a15 + 1); + b[ 30] = *(a16 + 0); + b[ 31] = *(a16 + 1); + + a01 += 2; + a02 += 2; + a03 += 2; + a04 += 2; + a05 += 2; + a06 += 2; + a07 += 2; + a08 += 2; + a09 += 2; + a10 += 2; + a11 += 2; + a12 += 2; + a13 += 2; + a14 += 2; + a15 += 2; + a16 += 2; + b += 32; + } + } else + if (X > posY) { + a01 += 16 * lda; + a02 += 16 * lda; + a03 += 16 * lda; + a04 += 16 * lda; + a05 += 16 * lda; + a06 += 16 * lda; + a07 += 16 * lda; + a08 += 16 * lda; + a09 += 16 * lda; + a10 += 16 * lda; + a11 += 16 * lda; + a12 += 16 * lda; + a13 += 16 * lda; + a14 += 16 * lda; + a15 += 16 * lda; + a16 += 16 * lda; + + b += 512; + + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a03 + 0); + b[ 5] = *(a03 + 1); + b[ 6] = *(a04 + 0); + b[ 7] = *(a04 + 1); + b[ 8] = *(a05 + 0); + b[ 9] = *(a05 + 1); + b[ 10] = *(a06 + 0); + b[ 11] = *(a06 + 1); + b[ 12] = *(a07 + 0); + b[ 13] = *(a07 + 1); + b[ 14] = *(a08 + 0); + b[ 15] = *(a08 + 1); + b[ 16] = *(a09 + 0); + b[ 17] = *(a09 + 1); + b[ 18] = *(a10 + 0); + b[ 19] = *(a10 + 1); + b[ 20] = *(a11 + 0); + b[ 21] = *(a11 + 1); + b[ 22] = *(a12 + 0); + b[ 23] = *(a12 + 1); + b[ 24] = *(a13 + 0); + b[ 25] = *(a13 + 1); + b[ 26] = *(a14 + 0); + b[ 27] = *(a14 + 1); + b[ 28] = *(a15 + 0); + b[ 29] = *(a15 + 1); + b[ 30] = *(a16 + 0); + b[ 31] = *(a16 + 1); + + b[ 32] = ZERO; + 
b[ 33] = ZERO; +#ifdef UNIT + b[ 34] = ONE; + b[ 35] = ZERO; +#else + b[ 34] = *(a02 + 2); + b[ 35] = *(a02 + 3); +#endif + b[ 36] = *(a03 + 2); + b[ 37] = *(a03 + 3); + b[ 38] = *(a04 + 2); + b[ 39] = *(a04 + 3); + b[ 40] = *(a05 + 2); + b[ 41] = *(a05 + 3); + b[ 42] = *(a06 + 2); + b[ 43] = *(a06 + 3); + b[ 44] = *(a07 + 2); + b[ 45] = *(a07 + 3); + b[ 46] = *(a08 + 2); + b[ 47] = *(a08 + 3); + b[ 48] = *(a09 + 2); + b[ 49] = *(a09 + 3); + b[ 50] = *(a10 + 2); + b[ 51] = *(a10 + 3); + b[ 52] = *(a11 + 2); + b[ 53] = *(a11 + 3); + b[ 54] = *(a12 + 2); + b[ 55] = *(a12 + 3); + b[ 56] = *(a13 + 2); + b[ 57] = *(a13 + 3); + b[ 58] = *(a14 + 2); + b[ 59] = *(a14 + 3); + b[ 60] = *(a15 + 2); + b[ 61] = *(a15 + 3); + b[ 62] = *(a16 + 2); + b[ 63] = *(a16 + 3); + + b[ 64] = ZERO; + b[ 65] = ZERO; + b[ 66] = ZERO; + b[ 67] = ZERO; +#ifdef UNIT + b[ 68] = ONE; + b[ 69] = ZERO; +#else + b[ 68] = *(a03 + 4); + b[ 69] = *(a03 + 5); +#endif + b[ 70] = *(a04 + 4); + b[ 71] = *(a04 + 5); + b[ 72] = *(a05 + 4); + b[ 73] = *(a05 + 5); + b[ 74] = *(a06 + 4); + b[ 75] = *(a06 + 5); + b[ 76] = *(a07 + 4); + b[ 77] = *(a07 + 5); + b[ 78] = *(a08 + 4); + b[ 79] = *(a08 + 5); + b[ 80] = *(a09 + 4); + b[ 81] = *(a09 + 5); + b[ 82] = *(a10 + 4); + b[ 83] = *(a10 + 5); + b[ 84] = *(a11 + 4); + b[ 85] = *(a11 + 5); + b[ 86] = *(a12 + 4); + b[ 87] = *(a12 + 5); + b[ 88] = *(a13 + 4); + b[ 89] = *(a13 + 5); + b[ 90] = *(a14 + 4); + b[ 91] = *(a14 + 5); + b[ 92] = *(a15 + 4); + b[ 93] = *(a15 + 5); + b[ 94] = *(a16 + 4); + b[ 95] = *(a16 + 5); + + b[ 96] = ZERO; + b[ 97] = ZERO; + b[ 98] = ZERO; + b[ 99] = ZERO; + b[100] = ZERO; + b[101] = ZERO; +#ifdef UNIT + b[102] = ONE; + b[103] = ZERO; +#else + b[102] = *(a04 + 6); + b[103] = *(a04 + 7); +#endif + b[104] = *(a05 + 6); + b[105] = *(a05 + 7); + b[106] = *(a06 + 6); + b[107] = *(a06 + 7); + b[108] = *(a07 + 6); + b[109] = *(a07 + 7); + b[110] = *(a08 + 6); + b[111] = *(a08 + 7); + b[112] = *(a09 + 6); + b[113] = *(a09 + 7); + b[114] = *(a10 
+ 6); + b[115] = *(a10 + 7); + b[116] = *(a11 + 6); + b[117] = *(a11 + 7); + b[118] = *(a12 + 6); + b[119] = *(a12 + 7); + b[120] = *(a13 + 6); + b[121] = *(a13 + 7); + b[122] = *(a14 + 6); + b[123] = *(a14 + 7); + b[124] = *(a15 + 6); + b[125] = *(a15 + 7); + b[126] = *(a16 + 6); + b[127] = *(a16 + 7); + + b[128] = ZERO; + b[129] = ZERO; + b[130] = ZERO; + b[131] = ZERO; + b[132] = ZERO; + b[133] = ZERO; + b[134] = ZERO; + b[135] = ZERO; +#ifdef UNIT + b[136] = ONE; + b[137] = ZERO; +#else + b[136] = *(a05 + 8); + b[137] = *(a05 + 9); +#endif + b[138] = *(a06 + 8); + b[139] = *(a06 + 9); + b[140] = *(a07 + 8); + b[141] = *(a07 + 9); + b[142] = *(a08 + 8); + b[143] = *(a08 + 9); + b[144] = *(a09 + 8); + b[145] = *(a09 + 9); + b[146] = *(a10 + 8); + b[147] = *(a10 + 9); + b[148] = *(a11 + 8); + b[149] = *(a11 + 9); + b[150] = *(a12 + 8); + b[151] = *(a12 + 9); + b[152] = *(a13 + 8); + b[153] = *(a13 + 9); + b[154] = *(a14 + 8); + b[155] = *(a14 + 9); + b[156] = *(a15 + 8); + b[157] = *(a15 + 9); + b[158] = *(a16 + 8); + b[159] = *(a16 + 9); + + b[160] = ZERO; + b[161] = ZERO; + b[162] = ZERO; + b[163] = ZERO; + b[164] = ZERO; + b[165] = ZERO; + b[166] = ZERO; + b[167] = ZERO; + b[168] = ZERO; + b[169] = ZERO; +#ifdef UNIT + b[170] = ONE; + b[171] = ZERO; +#else + b[170] = *(a06 + 10); + b[171] = *(a06 + 11); +#endif + b[172] = *(a07 + 10); + b[173] = *(a07 + 11); + b[174] = *(a08 + 10); + b[175] = *(a08 + 11); + b[176] = *(a09 + 10); + b[177] = *(a09 + 11); + b[178] = *(a10 + 10); + b[179] = *(a10 + 11); + b[180] = *(a11 + 10); + b[181] = *(a11 + 11); + b[182] = *(a12 + 10); + b[183] = *(a12 + 11); + b[184] = *(a13 + 10); + b[185] = *(a13 + 11); + b[186] = *(a14 + 10); + b[187] = *(a14 + 11); + b[188] = *(a15 + 10); + b[189] = *(a15 + 11); + b[190] = *(a16 + 10); + b[191] = *(a16 + 11); + + b[192] = ZERO; + b[193] = ZERO; + b[194] = ZERO; + b[195] = ZERO; + b[196] = ZERO; + b[197] = ZERO; + b[198] = ZERO; + b[199] = ZERO; + b[200] = ZERO; + b[201] = ZERO; + b[202] = 
ZERO; + b[203] = ZERO; +#ifdef UNIT + b[204] = ONE; + b[205] = ZERO; +#else + b[204] = *(a07 + 12); + b[205] = *(a07 + 13); +#endif + b[206] = *(a08 + 12); + b[207] = *(a08 + 13); + b[208] = *(a09 + 12); + b[209] = *(a09 + 13); + b[210] = *(a10 + 12); + b[211] = *(a10 + 13); + b[212] = *(a11 + 12); + b[213] = *(a11 + 13); + b[214] = *(a12 + 12); + b[215] = *(a12 + 13); + b[216] = *(a13 + 12); + b[217] = *(a13 + 13); + b[218] = *(a14 + 12); + b[219] = *(a14 + 13); + b[220] = *(a15 + 12); + b[221] = *(a15 + 13); + b[222] = *(a16 + 12); + b[223] = *(a16 + 13); + + b[224] = ZERO; + b[225] = ZERO; + b[226] = ZERO; + b[227] = ZERO; + b[228] = ZERO; + b[229] = ZERO; + b[230] = ZERO; + b[231] = ZERO; + b[232] = ZERO; + b[233] = ZERO; + b[234] = ZERO; + b[235] = ZERO; + b[236] = ZERO; + b[237] = ZERO; +#ifdef UNIT + b[238] = ONE; + b[239] = ZERO; +#else + b[238] = *(a08 + 14); + b[239] = *(a08 + 15); +#endif + b[240] = *(a09 + 14); + b[241] = *(a09 + 15); + b[242] = *(a10 + 14); + b[243] = *(a10 + 15); + b[244] = *(a11 + 14); + b[245] = *(a11 + 15); + b[246] = *(a12 + 14); + b[247] = *(a12 + 15); + b[248] = *(a13 + 14); + b[249] = *(a13 + 15); + b[250] = *(a14 + 14); + b[251] = *(a14 + 15); + b[252] = *(a15 + 14); + b[253] = *(a15 + 15); + b[254] = *(a16 + 14); + b[255] = *(a16 + 15); + + b[256] = ZERO; + b[257] = ZERO; + b[258] = ZERO; + b[259] = ZERO; + b[260] = ZERO; + b[261] = ZERO; + b[262] = ZERO; + b[263] = ZERO; + b[264] = ZERO; + b[265] = ZERO; + b[266] = ZERO; + b[267] = ZERO; + b[268] = ZERO; + b[269] = ZERO; + b[270] = ZERO; + b[271] = ZERO; +#ifdef UNIT + b[272] = ONE; + b[273] = ZERO; +#else + b[272] = *(a09 + 16); + b[273] = *(a09 + 17); +#endif + b[274] = *(a10 + 16); + b[275] = *(a10 + 17); + b[276] = *(a11 + 16); + b[277] = *(a11 + 17); + b[278] = *(a12 + 16); + b[279] = *(a12 + 17); + b[280] = *(a13 + 16); + b[281] = *(a13 + 17); + b[282] = *(a14 + 16); + b[283] = *(a14 + 17); + b[284] = *(a15 + 16); + b[285] = *(a15 + 17); + b[286] = *(a16 + 16); + 
b[287] = *(a16 + 17); + + b[288] = ZERO; + b[289] = ZERO; + b[290] = ZERO; + b[291] = ZERO; + b[292] = ZERO; + b[293] = ZERO; + b[294] = ZERO; + b[295] = ZERO; + b[296] = ZERO; + b[297] = ZERO; + b[298] = ZERO; + b[299] = ZERO; + b[300] = ZERO; + b[301] = ZERO; + b[302] = ZERO; + b[303] = ZERO; + b[304] = ZERO; + b[305] = ZERO; +#ifdef UNIT + b[306] = ONE; + b[307] = ZERO; +#else + b[306] = *(a10 + 18); + b[307] = *(a10 + 19); +#endif + b[308] = *(a11 + 18); + b[309] = *(a11 + 19); + b[310] = *(a12 + 18); + b[311] = *(a12 + 19); + b[312] = *(a13 + 18); + b[313] = *(a13 + 19); + b[314] = *(a14 + 18); + b[315] = *(a14 + 19); + b[316] = *(a15 + 18); + b[317] = *(a15 + 19); + b[318] = *(a16 + 18); + b[319] = *(a16 + 19); + + b[320] = ZERO; + b[321] = ZERO; + b[322] = ZERO; + b[323] = ZERO; + b[324] = ZERO; + b[325] = ZERO; + b[326] = ZERO; + b[327] = ZERO; + b[328] = ZERO; + b[329] = ZERO; + b[330] = ZERO; + b[331] = ZERO; + b[332] = ZERO; + b[333] = ZERO; + b[334] = ZERO; + b[335] = ZERO; + b[336] = ZERO; + b[337] = ZERO; + b[338] = ZERO; + b[339] = ZERO; +#ifdef UNIT + b[340] = ONE; + b[341] = ZERO; +#else + b[340] = *(a11 + 20); + b[341] = *(a11 + 21); +#endif + b[342] = *(a12 + 20); + b[343] = *(a12 + 21); + b[344] = *(a13 + 20); + b[345] = *(a13 + 21); + b[346] = *(a14 + 20); + b[347] = *(a14 + 21); + b[348] = *(a15 + 20); + b[349] = *(a15 + 21); + b[350] = *(a16 + 20); + b[351] = *(a16 + 21); + + b[352] = ZERO; + b[353] = ZERO; + b[354] = ZERO; + b[355] = ZERO; + b[356] = ZERO; + b[357] = ZERO; + b[358] = ZERO; + b[359] = ZERO; + b[360] = ZERO; + b[361] = ZERO; + b[362] = ZERO; + b[363] = ZERO; + b[364] = ZERO; + b[365] = ZERO; + b[366] = ZERO; + b[367] = ZERO; + b[368] = ZERO; + b[369] = ZERO; + b[370] = ZERO; + b[371] = ZERO; + b[372] = ZERO; + b[373] = ZERO; +#ifdef UNIT + b[374] = ONE; + b[375] = ZERO; +#else + b[374] = *(a12 + 22); + b[375] = *(a12 + 23); +#endif + b[376] = *(a13 + 22); + b[377] = *(a13 + 23); + b[378] = *(a14 + 22); + b[379] = *(a14 + 23); 
+ b[380] = *(a15 + 22); + b[381] = *(a15 + 23); + b[382] = *(a16 + 22); + b[383] = *(a16 + 23); + + b[384] = ZERO; + b[385] = ZERO; + b[386] = ZERO; + b[387] = ZERO; + b[388] = ZERO; + b[389] = ZERO; + b[390] = ZERO; + b[391] = ZERO; + b[392] = ZERO; + b[393] = ZERO; + b[394] = ZERO; + b[395] = ZERO; + b[396] = ZERO; + b[397] = ZERO; + b[398] = ZERO; + b[399] = ZERO; + b[400] = ZERO; + b[401] = ZERO; + b[402] = ZERO; + b[403] = ZERO; + b[404] = ZERO; + b[405] = ZERO; + b[406] = ZERO; + b[407] = ZERO; +#ifdef UNIT + b[408] = ONE; + b[409] = ZERO; +#else + b[408] = *(a13 + 24); + b[409] = *(a13 + 25); +#endif + b[410] = *(a14 + 24); + b[411] = *(a14 + 25); + b[412] = *(a15 + 24); + b[413] = *(a15 + 25); + b[414] = *(a16 + 24); + b[415] = *(a16 + 25); + + b[416] = ZERO; + b[417] = ZERO; + b[418] = ZERO; + b[419] = ZERO; + b[420] = ZERO; + b[421] = ZERO; + b[422] = ZERO; + b[423] = ZERO; + b[424] = ZERO; + b[425] = ZERO; + b[426] = ZERO; + b[427] = ZERO; + b[428] = ZERO; + b[429] = ZERO; + b[430] = ZERO; + b[431] = ZERO; + b[432] = ZERO; + b[433] = ZERO; + b[434] = ZERO; + b[435] = ZERO; + b[436] = ZERO; + b[437] = ZERO; + b[438] = ZERO; + b[439] = ZERO; + b[440] = ZERO; + b[441] = ZERO; +#ifdef UNIT + b[442] = ONE; + b[443] = ZERO; +#else + b[442] = *(a14 + 26); + b[443] = *(a14 + 27); +#endif + b[444] = *(a15 + 26); + b[445] = *(a15 + 27); + b[446] = *(a16 + 26); + b[447] = *(a16 + 27); + + b[448] = ZERO; + b[449] = ZERO; + b[450] = ZERO; + b[451] = ZERO; + b[452] = ZERO; + b[453] = ZERO; + b[454] = ZERO; + b[455] = ZERO; + b[456] = ZERO; + b[457] = ZERO; + b[458] = ZERO; + b[459] = ZERO; + b[460] = ZERO; + b[461] = ZERO; + b[462] = ZERO; + b[463] = ZERO; + b[464] = ZERO; + b[465] = ZERO; + b[466] = ZERO; + b[467] = ZERO; + b[468] = ZERO; + b[469] = ZERO; + b[470] = ZERO; + b[471] = ZERO; + b[472] = ZERO; + b[473] = ZERO; + b[474] = ZERO; + b[475] = ZERO; +#ifdef UNIT + b[476] = ONE; + b[477] = ZERO; +#else + b[476] = *(a15 + 28); + b[477] = *(a15 + 29); +#endif + 
b[478] = *(a16 + 28); + b[479] = *(a16 + 29); + + b[480] = ZERO; + b[481] = ZERO; + b[482] = ZERO; + b[483] = ZERO; + b[484] = ZERO; + b[485] = ZERO; + b[486] = ZERO; + b[487] = ZERO; + b[488] = ZERO; + b[489] = ZERO; + b[490] = ZERO; + b[491] = ZERO; + b[492] = ZERO; + b[493] = ZERO; + b[494] = ZERO; + b[495] = ZERO; + b[496] = ZERO; + b[497] = ZERO; + b[498] = ZERO; + b[499] = ZERO; + b[500] = ZERO; + b[501] = ZERO; + b[502] = ZERO; + b[503] = ZERO; + b[504] = ZERO; + b[505] = ZERO; + b[506] = ZERO; + b[507] = ZERO; + b[508] = ZERO; + b[509] = ZERO; +#ifdef UNIT + b[510] = ONE; + b[511] = ZERO; +#else + b[510] = *(a16 + 30); + b[511] = *(a16 + 31); +#endif + + a01 += 16 * lda; + a02 += 16 * lda; + a03 += 16 * lda; + a04 += 16 * lda; + a05 += 16 * lda; + a06 += 16 * lda; + a07 += 16 * lda; + a08 += 16 * lda; + a09 += 16 * lda; + a10 += 16 * lda; + a11 += 16 * lda; + a12 += 16 * lda; + a13 += 16 * lda; + a14 += 16 * lda; + a15 += 16 * lda; + a16 += 16 * lda; + b += 512; + } + + X += 16; + i --; + } while (i > 0); + } + + i = (m & 15); + if (i) { + + if (X < posY) { + + for (ii = 0; ii < i; ii++){ + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a03 + 0); + b[ 5] = *(a03 + 1); + b[ 6] = *(a04 + 0); + b[ 7] = *(a04 + 1); + b[ 8] = *(a05 + 0); + b[ 9] = *(a05 + 1); + b[ 10] = *(a06 + 0); + b[ 11] = *(a06 + 1); + b[ 12] = *(a07 + 0); + b[ 13] = *(a07 + 1); + b[ 14] = *(a08 + 0); + b[ 15] = *(a08 + 1); + + b[ 16] = *(a09 + 0); + b[ 17] = *(a09 + 1); + b[ 18] = *(a10 + 0); + b[ 19] = *(a10 + 1); + b[ 20] = *(a11 + 0); + b[ 21] = *(a11 + 1); + b[ 22] = *(a12 + 0); + b[ 23] = *(a12 + 1); + b[ 24] = *(a13 + 0); + b[ 25] = *(a13 + 1); + b[ 26] = *(a14 + 0); + b[ 27] = *(a14 + 1); + b[ 28] = *(a15 + 0); + b[ 29] = *(a15 + 1); + b[ 30] = *(a16 + 0); + b[ 31] = *(a16 + 1); + + a01 += 2; + a02 += 2; + a03 += 2; + a04 += 2; + a05 += 2; + a06 += 2; + a07 += 2; + a08 += 2; + a09 += 2; + a10 += 2; + a11 += 2; + a12 += 2; + a13 += 2; 
+ a14 += 2; + a15 += 2; + a16 += 2; + b += 32; + } + } else + if (X > posY) { + /* a01 += i * lda; + a02 += i * lda; + a03 += i * lda; + a04 += i * lda; + a05 += i * lda; + a06 += i * lda; + a07 += i * lda; + a08 += i * lda; + a09 += i * lda; + a10 += i * lda; + a11 += i * lda; + a12 += i * lda; + a13 += i * lda; + a14 += i * lda; + a15 += i * lda; + a16 += i * lda; */ + b += 32 * i; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a03 + 0); + b[ 5] = *(a03 + 1); + b[ 6] = *(a04 + 0); + b[ 7] = *(a04 + 1); + b[ 8] = *(a05 + 0); + b[ 9] = *(a05 + 1); + b[ 10] = *(a06 + 0); + b[ 11] = *(a06 + 1); + b[ 12] = *(a07 + 0); + b[ 13] = *(a07 + 1); + b[ 14] = *(a08 + 0); + b[ 15] = *(a08 + 1); + b[ 16] = *(a09 + 0); + b[ 17] = *(a09 + 1); + b[ 18] = *(a10 + 0); + b[ 19] = *(a10 + 1); + b[ 20] = *(a11 + 0); + b[ 21] = *(a11 + 1); + b[ 22] = *(a12 + 0); + b[ 23] = *(a12 + 1); + b[ 24] = *(a13 + 0); + b[ 25] = *(a13 + 1); + b[ 26] = *(a14 + 0); + b[ 27] = *(a14 + 1); + b[ 28] = *(a15 + 0); + b[ 29] = *(a15 + 1); + b[ 30] = *(a16 + 0); + b[ 31] = *(a16 + 1); + b += 32; + + if (i >= 2) { + b[ 0] = ZERO; + b[ 1] = ZERO; +#ifdef UNIT + b[ 2] = ONE; + b[ 3] = ZERO; +#else + b[ 2] = *(a02 + 2); + b[ 3] = *(a02 + 3); +#endif + b[ 4] = *(a03 + 2); + b[ 5] = *(a03 + 3); + b[ 6] = *(a04 + 2); + b[ 7] = *(a04 + 3); + b[ 8] = *(a05 + 2); + b[ 9] = *(a05 + 3); + b[ 10] = *(a06 + 2); + b[ 11] = *(a06 + 3); + b[ 12] = *(a07 + 2); + b[ 13] = *(a07 + 3); + b[ 14] = *(a08 + 2); + b[ 15] = *(a08 + 3); + b[ 16] = *(a09 + 2); + b[ 17] = *(a09 + 3); + b[ 18] = *(a10 + 2); + b[ 19] = *(a10 + 3); + b[ 20] = *(a11 + 2); + b[ 21] = *(a11 + 3); + b[ 22] = *(a12 + 2); + b[ 23] = *(a12 + 3); + b[ 24] = *(a13 + 2); + b[ 25] = *(a13 + 3); + b[ 26] = *(a14 + 2); + b[ 27] = *(a14 + 3); + b[ 28] = *(a15 + 2); + b[ 29] = *(a15 + 3); + b[ 30] = *(a16 + 2); + b[ 31] = *(a16 + 3); + b += 32; + } 
+ + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; +#ifdef UNIT + b[ 4] = ONE; + b[ 5] = ZERO; +#else + b[ 4] = *(a03 + 4); + b[ 5] = *(a03 + 5); +#endif + b[ 6] = *(a04 + 4); + b[ 7] = *(a04 + 5); + b[ 8] = *(a05 + 4); + b[ 9] = *(a05 + 5); + b[ 10] = *(a06 + 4); + b[ 11] = *(a06 + 5); + b[ 12] = *(a07 + 4); + b[ 13] = *(a07 + 5); + b[ 14] = *(a08 + 4); + b[ 15] = *(a08 + 5); + b[ 16] = *(a09 + 4); + b[ 17] = *(a09 + 5); + b[ 18] = *(a10 + 4); + b[ 19] = *(a10 + 5); + b[ 20] = *(a11 + 4); + b[ 21] = *(a11 + 5); + b[ 22] = *(a12 + 4); + b[ 23] = *(a12 + 5); + b[ 24] = *(a13 + 4); + b[ 25] = *(a13 + 5); + b[ 26] = *(a14 + 4); + b[ 27] = *(a14 + 5); + b[ 28] = *(a15 + 4); + b[ 29] = *(a15 + 5); + b[ 30] = *(a16 + 4); + b[ 31] = *(a16 + 5); + b += 32; + } + + if (i >= 4) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; +#ifdef UNIT + b[ 6] = ONE; + b[ 7] = ZERO; +#else + b[ 6] = *(a04 + 6); + b[ 7] = *(a04 + 7); +#endif + b[ 8] = *(a05 + 6); + b[ 9] = *(a05 + 7); + b[ 10] = *(a06 + 6); + b[ 11] = *(a06 + 7); + b[ 12] = *(a07 + 6); + b[ 13] = *(a07 + 7); + b[ 14] = *(a08 + 6); + b[ 15] = *(a08 + 7); + b[ 16] = *(a09 + 6); + b[ 17] = *(a09 + 7); + b[ 18] = *(a10 + 6); + b[ 19] = *(a10 + 7); + b[ 20] = *(a11 + 6); + b[ 21] = *(a11 + 7); + b[ 22] = *(a12 + 6); + b[ 23] = *(a12 + 7); + b[ 24] = *(a13 + 6); + b[ 25] = *(a13 + 7); + b[ 26] = *(a14 + 6); + b[ 27] = *(a14 + 7); + b[ 28] = *(a15 + 6); + b[ 29] = *(a15 + 7); + b[ 30] = *(a16 + 6); + b[ 31] = *(a16 + 7); + b += 32; + } + + if (i >= 5) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; +#ifdef UNIT + b[ 8] = ONE; + b[ 9] = ZERO; +#else + b[ 8] = *(a05 + 8); + b[ 9] = *(a05 + 9); +#endif + b[ 10] = *(a06 + 8); + b[ 11] = *(a06 + 9); + b[ 12] = *(a07 + 8); + b[ 13] = *(a07 + 9); + b[ 14] = *(a08 + 8); + b[ 15] = *(a08 + 9); + b[ 16] = *(a09 + 8); + b[ 17] = 
*(a09 + 9); + b[ 18] = *(a10 + 8); + b[ 19] = *(a10 + 9); + b[ 20] = *(a11 + 8); + b[ 21] = *(a11 + 9); + b[ 22] = *(a12 + 8); + b[ 23] = *(a12 + 9); + b[ 24] = *(a13 + 8); + b[ 25] = *(a13 + 9); + b[ 26] = *(a14 + 8); + b[ 27] = *(a14 + 9); + b[ 28] = *(a15 + 8); + b[ 29] = *(a15 + 9); + b[ 30] = *(a16 + 8); + b[ 31] = *(a16 + 9); + b += 32; + } + + if (i >= 6) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; +#ifdef UNIT + b[10] = ONE; + b[11] = ZERO; +#else + b[10] = *(a06 + 10); + b[11] = *(a06 + 11); +#endif + b[ 12] = *(a07 + 10); + b[ 13] = *(a07 + 11); + b[ 14] = *(a08 + 10); + b[ 15] = *(a08 + 11); + b[ 16] = *(a09 + 10); + b[ 17] = *(a09 + 11); + b[ 18] = *(a10 + 10); + b[ 19] = *(a10 + 11); + b[ 20] = *(a11 + 10); + b[ 21] = *(a11 + 11); + b[ 22] = *(a12 + 10); + b[ 23] = *(a12 + 11); + b[ 24] = *(a13 + 10); + b[ 25] = *(a13 + 11); + b[ 26] = *(a14 + 10); + b[ 27] = *(a14 + 11); + b[ 28] = *(a15 + 10); + b[ 29] = *(a15 + 11); + b[ 30] = *(a16 + 10); + b[ 31] = *(a16 + 11); + b += 32; + } + + if (i >= 7) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; +#ifdef UNIT + b[12] = ONE; + b[13] = ZERO; +#else + b[12] = *(a07 + 12); + b[13] = *(a07 + 13); +#endif + b[ 14] = *(a08 + 12); + b[ 15] = *(a08 + 13); + b[ 16] = *(a09 + 12); + b[ 17] = *(a09 + 13); + b[ 18] = *(a10 + 12); + b[ 19] = *(a10 + 13); + b[ 20] = *(a11 + 12); + b[ 21] = *(a11 + 13); + b[ 22] = *(a12 + 12); + b[ 23] = *(a12 + 13); + b[ 24] = *(a13 + 12); + b[ 25] = *(a13 + 13); + b[ 26] = *(a14 + 12); + b[ 27] = *(a14 + 13); + b[ 28] = *(a15 + 12); + b[ 29] = *(a15 + 13); + b[ 30] = *(a16 + 12); + b[ 31] = *(a16 + 13); + b += 32; + } + + if (i >= 8) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = 
ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; +#ifdef UNIT + b[ 14] = ONE; + b[ 15] = ZERO; +#else + b[ 14] = *(a08 + 14); + b[ 15] = *(a08 + 15); +#endif + b[ 16] = *(a09 + 14); + b[ 17] = *(a09 + 15); + b[ 18] = *(a10 + 14); + b[ 19] = *(a10 + 15); + b[ 20] = *(a11 + 14); + b[ 21] = *(a11 + 15); + b[ 22] = *(a12 + 14); + b[ 23] = *(a12 + 15); + b[ 24] = *(a13 + 14); + b[ 25] = *(a13 + 15); + b[ 26] = *(a14 + 14); + b[ 27] = *(a14 + 15); + b[ 28] = *(a15 + 14); + b[ 29] = *(a15 + 15); + b[ 30] = *(a16 + 14); + b[ 31] = *(a16 + 15); + b += 32; + } + + if (i >= 9) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; +#ifdef UNIT + b[ 16] = ONE; + b[ 17] = ZERO; +#else + b[ 16] = *(a09 + 16); + b[ 17] = *(a09 + 17); +#endif + b[ 18] = *(a10 + 16); + b[ 19] = *(a10 + 17); + b[ 20] = *(a11 + 16); + b[ 21] = *(a11 + 17); + b[ 22] = *(a12 + 16); + b[ 23] = *(a12 + 17); + b[ 24] = *(a13 + 16); + b[ 25] = *(a13 + 17); + b[ 26] = *(a14 + 16); + b[ 27] = *(a14 + 17); + b[ 28] = *(a15 + 16); + b[ 29] = *(a15 + 17); + b[ 30] = *(a16 + 16); + b[ 31] = *(a16 + 17); + b += 32; + } + + if (i >= 10) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; +#ifdef UNIT + b[ 18] = ONE; + b[ 19] = ZERO; +#else + b[ 18] = *(a10 + 18); + b[ 19] = *(a10 + 19); +#endif + b[ 20] = *(a11 + 18); + b[ 21] = *(a11 + 19); + b[ 22] = *(a12 + 18); + b[ 23] = *(a12 + 19); + b[ 24] = *(a13 + 18); + b[ 25] = *(a13 + 19); + b[ 26] = *(a14 + 18); + b[ 27] = *(a14 + 19); + b[ 28] 
= *(a15 + 18); + b[ 29] = *(a15 + 19); + b[ 30] = *(a16 + 18); + b[ 31] = *(a16 + 19); + b += 32; + } + + if (i >= 11) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; +#ifdef UNIT + b[ 20] = ONE; + b[ 21] = ZERO; +#else + b[ 20] = *(a11 + 20); + b[ 21] = *(a11 + 21); +#endif + b[ 22] = *(a12 + 20); + b[ 23] = *(a12 + 21); + b[ 24] = *(a13 + 20); + b[ 25] = *(a13 + 21); + b[ 26] = *(a14 + 20); + b[ 27] = *(a14 + 21); + b[ 28] = *(a15 + 20); + b[ 29] = *(a15 + 21); + b[ 30] = *(a16 + 20); + b[ 31] = *(a16 + 21); + b += 32; + } + + if (i >= 12) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; +#ifdef UNIT + b[ 22] = ONE; + b[ 23] = ZERO; +#else + b[ 22] = *(a12 + 22); + b[ 23] = *(a12 + 23); +#endif + b[ 24] = *(a13 + 22); + b[ 25] = *(a13 + 23); + b[ 26] = *(a14 + 22); + b[ 27] = *(a14 + 23); + b[ 28] = *(a15 + 22); + b[ 29] = *(a15 + 23); + b[ 30] = *(a16 + 22); + b[ 31] = *(a16 + 23); + b += 32; + } + + if (i >= 13) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; +#ifdef UNIT + b[ 24] = ONE; + b[ 25] = ZERO; +#else + b[ 24] = *(a13 + 24); + 
b[ 25] = *(a13 + 25); +#endif + b[ 26] = *(a14 + 24); + b[ 27] = *(a14 + 25); + b[ 28] = *(a15 + 24); + b[ 29] = *(a15 + 25); + b[ 30] = *(a16 + 24); + b[ 31] = *(a16 + 25); + b += 32; + } + + if (i >= 14) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; +#ifdef UNIT + b[ 26] = ONE; + b[ 27] = ZERO; +#else + b[ 26] = *(a14 + 26); + b[ 27] = *(a14 + 27); +#endif + b[ 28] = *(a15 + 26); + b[ 29] = *(a15 + 27); + b[ 30] = *(a16 + 26); + b[ 31] = *(a16 + 27); + b += 32; + } + + if (i >= 15) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; +#ifdef UNIT + b[ 28] = ONE; + b[ 29] = ZERO; +#else + b[ 28] = *(a15 + 28); + b[ 29] = *(a15 + 29); +#endif + b[ 30] = *(a16 + 28); + b[ 31] = *(a16 + 29); + b += 32; + } + } + } + + posY += 16; + js --; + } while (js > 0); + } /* End of main loop */ + + + if (n & 8){ + X = posX; + + if (posX <= posY) { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + a03 = a + posX * 2 + (posY + 2) * lda; + a04 = a + posX * 2 + (posY + 3) * lda; + a05 = a + posX * 2 + (posY + 4) * lda; + a06 = a + posX * 2 + (posY + 5) * lda; + a07 = a + posX * 2 + (posY + 6) * lda; + a08 = a + posX * 2 + (posY + 7) * lda; + } else { + a01 = a + posY 
* 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + a03 = a + posY * 2 + (posX + 2) * lda; + a04 = a + posY * 2 + (posX + 3) * lda; + a05 = a + posY * 2 + (posX + 4) * lda; + a06 = a + posY * 2 + (posX + 5) * lda; + a07 = a + posY * 2 + (posX + 6) * lda; + a08 = a + posY * 2 + (posX + 7) * lda; + } + + i = (m >> 3); + if (i > 0) { + do { + if (X < posY) { + for (ii = 0; ii < 8; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a03 + 0); + b[ 5] = *(a03 + 1); + b[ 6] = *(a04 + 0); + b[ 7] = *(a04 + 1); + + b[ 8] = *(a05 + 0); + b[ 9] = *(a05 + 1); + b[ 10] = *(a06 + 0); + b[ 11] = *(a06 + 1); + b[ 12] = *(a07 + 0); + b[ 13] = *(a07 + 1); + b[ 14] = *(a08 + 0); + b[ 15] = *(a08 + 1); + + a01 += 2; + a02 += 2; + a03 += 2; + a04 += 2; + a05 += 2; + a06 += 2; + a07 += 2; + a08 += 2; + b += 16; + } + } else + if (X > posY) { + a01 += 8 * lda; + a02 += 8 * lda; + a03 += 8 * lda; + a04 += 8 * lda; + a05 += 8 * lda; + a06 += 8 * lda; + a07 += 8 * lda; + a08 += 8 * lda; + + b += 128; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a03 + 0); + b[ 5] = *(a03 + 1); + b[ 6] = *(a04 + 0); + b[ 7] = *(a04 + 1); + + b[ 8] = *(a05 + 0); + b[ 9] = *(a05 + 1); + b[ 10] = *(a06 + 0); + b[ 11] = *(a06 + 1); + b[ 12] = *(a07 + 0); + b[ 13] = *(a07 + 1); + b[ 14] = *(a08 + 0); + b[ 15] = *(a08 + 1); + + b[ 16] = ZERO; + b[ 17] = ZERO; +#ifdef UNIT + b[ 18] = ONE; + b[ 19] = ZERO; +#else + b[ 18] = *(a02 + 2); + b[ 19] = *(a02 + 3); +#endif + b[ 20] = *(a03 + 2); + b[ 21] = *(a03 + 3); + b[ 22] = *(a04 + 2); + b[ 23] = *(a04 + 3); + b[ 24] = *(a05 + 2); + b[ 25] = *(a05 + 3); + b[ 26] = *(a06 + 2); + b[ 27] = *(a06 + 3); + b[ 28] = *(a07 + 2); + b[ 29] = *(a07 + 3); + b[ 30] = *(a08 + 2); + b[ 31] = *(a08 + 3); + + b[ 32] = ZERO; + b[ 33] = ZERO; + b[ 34] = ZERO; + b[ 35] = ZERO; +#ifdef UNIT + b[ 
36] = ONE; + b[ 37] = ZERO; +#else + b[ 36] = *(a03 + 4); + b[ 37] = *(a03 + 5); +#endif + b[ 38] = *(a04 + 4); + b[ 39] = *(a04 + 5); + b[ 40] = *(a05 + 4); + b[ 41] = *(a05 + 5); + b[ 42] = *(a06 + 4); + b[ 43] = *(a06 + 5); + b[ 44] = *(a07 + 4); + b[ 45] = *(a07 + 5); + b[ 46] = *(a08 + 4); + b[ 47] = *(a08 + 5); + + b[ 48] = ZERO; + b[ 49] = ZERO; + b[ 50] = ZERO; + b[ 51] = ZERO; + b[ 52] = ZERO; + b[ 53] = ZERO; +#ifdef UNIT + b[ 54] = ONE; + b[ 55] = ZERO; +#else + b[ 54] = *(a04 + 6); + b[ 55] = *(a04 + 7); +#endif + b[ 56] = *(a05 + 6); + b[ 57] = *(a05 + 7); + b[ 58] = *(a06 + 6); + b[ 59] = *(a06 + 7); + b[ 60] = *(a07 + 6); + b[ 61] = *(a07 + 7); + b[ 62] = *(a08 + 6); + b[ 63] = *(a08 + 7); + + b[ 64] = ZERO; + b[ 65] = ZERO; + b[ 66] = ZERO; + b[ 67] = ZERO; + b[ 68] = ZERO; + b[ 69] = ZERO; + b[ 70] = ZERO; + b[ 71] = ZERO; +#ifdef UNIT + b[ 72] = ONE; + b[ 73] = ZERO; +#else + b[ 72] = *(a05 + 8); + b[ 73] = *(a05 + 9); +#endif + b[ 74] = *(a06 + 8); + b[ 75] = *(a06 + 9); + b[ 76] = *(a07 + 8); + b[ 77] = *(a07 + 9); + b[ 78] = *(a08 + 8); + b[ 79] = *(a08 + 9); + + b[ 80] = ZERO; + b[ 81] = ZERO; + b[ 82] = ZERO; + b[ 83] = ZERO; + b[ 84] = ZERO; + b[ 85] = ZERO; + b[ 86] = ZERO; + b[ 87] = ZERO; + b[ 88] = ZERO; + b[ 89] = ZERO; +#ifdef UNIT + b[ 90] = ONE; + b[ 91] = ZERO; +#else + b[ 90] = *(a06 + 10); + b[ 91] = *(a06 + 11); +#endif + b[ 92] = *(a07 + 10); + b[ 93] = *(a07 + 11); + b[ 94] = *(a08 + 10); + b[ 95] = *(a08 + 11); + + b[ 96] = ZERO; + b[ 97] = ZERO; + b[ 98] = ZERO; + b[ 99] = ZERO; + b[100] = ZERO; + b[101] = ZERO; + b[102] = ZERO; + b[103] = ZERO; + b[104] = ZERO; + b[105] = ZERO; + b[106] = ZERO; + b[107] = ZERO; +#ifdef UNIT + b[108] = ONE; + b[109] = ZERO; +#else + b[108] = *(a07 + 12); + b[109] = *(a07 + 13); +#endif + b[110] = *(a08 + 12); + b[111] = *(a08 + 13); + + b[112] = ZERO; + b[113] = ZERO; + b[114] = ZERO; + b[115] = ZERO; + b[116] = ZERO; + b[117] = ZERO; + b[118] = ZERO; + b[119] = ZERO; + b[120] = ZERO; + 
b[121] = ZERO; + b[122] = ZERO; + b[123] = ZERO; + b[124] = ZERO; + b[125] = ZERO; +#ifdef UNIT + b[126] = ONE; + b[127] = ZERO; +#else + b[126] = *(a08 + 14); + b[127] = *(a08 + 15); +#endif + + a01 += 8 * lda; + a02 += 8 * lda; + a03 += 8 * lda; + a04 += 8 * lda; + a05 += 8 * lda; + a06 += 8 * lda; + a07 += 8 * lda; + a08 += 8 * lda; + b += 128; + } + + X += 8; + i --; + } while (i > 0); + } + + i = (m & 7); + if (i) { + + if (X < posY) { + for (ii = 0; ii < i; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a03 + 0); + b[ 5] = *(a03 + 1); + b[ 6] = *(a04 + 0); + b[ 7] = *(a04 + 1); + + b[ 8] = *(a05 + 0); + b[ 9] = *(a05 + 1); + b[ 10] = *(a06 + 0); + b[ 11] = *(a06 + 1); + b[ 12] = *(a07 + 0); + b[ 13] = *(a07 + 1); + b[ 14] = *(a08 + 0); + b[ 15] = *(a08 + 1); + + a01 += 2; + a02 += 2; + a03 += 2; + a04 += 2; + a05 += 2; + a06 += 2; + a07 += 2; + a08 += 2; + b += 16; + } + } else + if (X > posY) { + /* a01 += i * lda; + a02 += i * lda; + a03 += i * lda; + a04 += i * lda; + a05 += i * lda; + a06 += i * lda; + a07 += i * lda; + a08 += i * lda; */ + b += 16 * i; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a03 + 0); + b[ 5] = *(a03 + 1); + b[ 6] = *(a04 + 0); + b[ 7] = *(a04 + 1); + b[ 8] = *(a05 + 0); + b[ 9] = *(a05 + 1); + b[10] = *(a06 + 0); + b[11] = *(a06 + 1); + b[12] = *(a07 + 0); + b[13] = *(a07 + 1); + b[14] = *(a08 + 0); + b[15] = *(a08 + 1); + b += 16; + + if(i >= 2) { + b[ 0] = ZERO; + b[ 1] = ZERO; +#ifdef UNIT + b[ 2] = ONE; + b[ 3] = ZERO; +#else + b[ 2] = *(a02 + 2); + b[ 3] = *(a02 + 3); +#endif + b[ 4] = *(a03 + 2); + b[ 5] = *(a03 + 3); + b[ 6] = *(a04 + 2); + b[ 7] = *(a04 + 3); + b[ 8] = *(a05 + 2); + b[ 9] = *(a05 + 3); + b[10] = *(a06 + 2); + b[11] = *(a06 + 3); + b[12] = *(a07 + 2); + b[13] = *(a07 + 3); + b[14] = *(a08 + 2); + b[15] = *(a08 + 3); + b += 
16; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; +#ifdef UNIT + b[ 4] = ONE; + b[ 5] = ZERO; +#else + b[ 4] = *(a03 + 4); + b[ 5] = *(a03 + 5); +#endif + b[ 6] = *(a04 + 4); + b[ 7] = *(a04 + 5); + b[ 8] = *(a05 + 4); + b[ 9] = *(a05 + 5); + b[10] = *(a06 + 4); + b[11] = *(a06 + 5); + b[12] = *(a07 + 4); + b[13] = *(a07 + 5); + b[14] = *(a08 + 4); + b[15] = *(a08 + 5); + b += 16; + } + + if (i >= 4) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; +#ifdef UNIT + b[ 6] = ONE; + b[ 7] = ZERO; +#else + b[ 6] = *(a04 + 6); + b[ 7] = *(a04 + 7); +#endif + b[ 8] = *(a05 + 6); + b[ 9] = *(a05 + 7); + b[10] = *(a06 + 6); + b[11] = *(a06 + 7); + b[12] = *(a07 + 6); + b[13] = *(a07 + 7); + b[14] = *(a08 + 6); + b[15] = *(a08 + 7); + b += 16; + } + + if (i >= 5) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; +#ifdef UNIT + b[ 8] = ONE; + b[ 9] = ZERO; +#else + b[ 8] = *(a05 + 8); + b[ 9] = *(a05 + 9); +#endif + b[10] = *(a06 + 8); + b[11] = *(a06 + 9); + b[12] = *(a07 + 8); + b[13] = *(a07 + 9); + b[14] = *(a08 + 8); + b[15] = *(a08 + 9); + b += 16; + } + + if (i >= 6) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; +#ifdef UNIT + b[10] = ONE; + b[11] = ZERO; +#else + b[10] = *(a06 + 10); + b[11] = *(a06 + 11); +#endif + b[12] = *(a07 + 10); + b[13] = *(a07 + 11); + b[14] = *(a08 + 10); + b[15] = *(a08 + 11); + b += 16; + } + + if (i >= 7) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; +#ifdef UNIT + b[12] = ONE; + b[13] = ZERO; +#else + b[12] = *(a07 + 12); + b[13] = *(a07 + 13); +#endif + b[14] = *(a08 + 12); + b[15] = *(a08 + 13); + b += 16; + } + 
} + } + + posY += 8; + } + + + if (n & 4){ + X = posX; + + if (posX <= posY) { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + a03 = a + posX * 2 + (posY + 2) * lda; + a04 = a + posX * 2 + (posY + 3) * lda; + } else { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + a03 = a + posY * 2 + (posX + 2) * lda; + a04 = a + posY * 2 + (posX + 3) * lda; + } + + i = (m >> 2); + if (i > 0) { + do { + if (X < posY) { + for (ii = 0; ii < 4; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a03 + 0); + b[ 5] = *(a03 + 1); + b[ 6] = *(a04 + 0); + b[ 7] = *(a04 + 1); + + a01 += 2; + a02 += 2; + a03 += 2; + a04 += 2; + b += 8; + } + } else + if (X > posY) { + a01 += 4 * lda; + a02 += 4 * lda; + a03 += 4 * lda; + a04 += 4 * lda; + b += 32; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a03 + 0); + b[ 5] = *(a03 + 1); + b[ 6] = *(a04 + 0); + b[ 7] = *(a04 + 1); + + b[ 8] = ZERO; + b[ 9] = ZERO; +#ifdef UNIT + b[ 10] = ONE; + b[ 11] = ZERO; +#else + b[ 10] = *(a02 + 2); + b[ 11] = *(a02 + 3); +#endif + b[ 12] = *(a03 + 2); + b[ 13] = *(a03 + 3); + b[ 14] = *(a04 + 2); + b[ 15] = *(a04 + 3); + + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; +#ifdef UNIT + b[ 20] = ONE; + b[ 21] = ZERO; +#else + b[ 20] = *(a03 + 4); + b[ 21] = *(a03 + 5); +#endif + b[ 22] = *(a04 + 4); + b[ 23] = *(a04 + 5); + + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; +#ifdef UNIT + b[ 30] = ONE; + b[ 31] = ZERO; +#else + b[ 30] = *(a04 + 6); + b[ 31] = *(a04 + 7); +#endif + + a01 += 4 * lda; + a02 += 4 * lda; + a03 += 4 * lda; + a04 += 4 * lda; + + b += 32; + } + + X += 4; + i --; + } while (i > 0); + } + + i = (m & 3); + if (i) { + + if (X < posY) { + + for (ii = 0; ii < i; ii++){ + b[ 0] = 
*(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a03 + 0); + b[ 5] = *(a03 + 1); + b[ 6] = *(a04 + 0); + b[ 7] = *(a04 + 1); + + a01 += 2; + a02 += 2; + a03 += 2; + a04 += 2; + b += 8; + } + } else + if (X > posY) { + /* a01 += i * lda; + a02 += i * lda; + a03 += i * lda; + a04 += i * lda; */ + b += 8 * i; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a03 + 0); + b[ 5] = *(a03 + 1); + b[ 6] = *(a04 + 0); + b[ 7] = *(a04 + 1); + b += 8; + + if(i >= 2) { + b[ 0] = ZERO; + b[ 1] = ZERO; +#ifdef UNIT + b[ 2] = ONE; + b[ 3] = ZERO; +#else + b[ 2] = *(a02 + 2); + b[ 3] = *(a02 + 3); +#endif + b[ 4] = *(a03 + 2); + b[ 5] = *(a03 + 3); + b[ 6] = *(a04 + 2); + b[ 7] = *(a04 + 3); + b += 8; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; +#ifdef UNIT + b[ 4] = ONE; + b[ 5] = ZERO; +#else + b[ 4] = *(a03 + 4); + b[ 5] = *(a03 + 5); +#endif + b[ 6] = *(a04 + 4); + b[ 7] = *(a04 + 5); + b += 8; + } + } + } + + posY += 4; + } + + if (n & 2){ + X = posX; + + if (posX <= posY) { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + } else { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X < posY) { + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b[ 4] = *(a01 + 2); + b[ 5] = *(a01 + 3); + b[ 6] = *(a02 + 2); + b[ 7] = *(a02 + 3); + + a01 += 4; + a02 += 4; + b += 8; + } else + if (X > posY) { + a01 += 2 * lda; + a02 += 2 * lda; + b += 8; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + + b[ 4] = ZERO; + b[ 5] = ZERO; +#ifdef UNIT + b[ 6] = ONE; + b[ 7] = ZERO; +#else + b[ 6] = *(a02 + 2); + b[ 7] = *(a02 + 3); +#endif + 
+ a01 += 2 * lda; + a02 += 2 * lda; + b += 8; + } + + X += 2; + i --; + } while (i > 0); + } + + if (m & 1) { + + if (X < posY) { + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + /* a01 += 2; + a02 += 2; */ + b += 4; + } else + if (X > posY) { + /* a01 += 2 * lda; + a02 += 2 * lda; */ + b += 4; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); +#endif + b += 2; + } + } + posY += 2; + } + + if (n & 1){ + X = posX; + + if (posX <= posY) { + a01 = a + posX * 2 + (posY + 0) * lda; + } else { + a01 = a + posY * 2 + (posX + 0) * lda; + } + + i = m; + if (m > 0) { + do { + if (X < posY) { + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + a01 += 2; + b += 2; + } else + if (X > posY) { + a01 += lda; + b += 2; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + a01 += lda; + b += 2; + } + + X += 1; + i --; + } while (i > 0); + } + } + + return 0; +} diff --git a/kernel/generic/ztrmm_utcopy_16.c b/kernel/generic/ztrmm_utcopy_16.c new file mode 100644 index 000000000..5aba3727a --- /dev/null +++ b/kernel/generic/ztrmm_utcopy_16.c @@ -0,0 +1,2318 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X, ii; + + FLOAT *a01, *a02, *a03, *a04, *a05, *a06, *a07, *a08; + FLOAT *a09, *a10, *a11, *a12, *a13, *a14, *a15, *a16; + + lda += lda; + + js = (n >> 4); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + a03 = a + posX * 2 + (posY + 2) * lda; + a04 = a + posX * 2 + (posY + 3) * lda; + a05 = a + posX * 2 + (posY + 4) * lda; + a06 = a + posX * 2 + (posY + 5) * lda; + a07 = a + posX * 2 + (posY + 6) * lda; + a08 = a + posX * 2 + (posY + 7) * lda; + a09 = a + posX * 2 + (posY + 8) * lda; + a10 = a + posX * 2 + (posY + 9) * lda; + a11 = a + posX * 2 + (posY + 10) * lda; + a12 = a + posX * 2 + (posY + 11) * lda; + a13 = a + posX * 2 + (posY + 12) * lda; + a14 = a + posX * 2 + (posY + 13) * lda; + 
a15 = a + posX * 2 + (posY + 14) * lda; + a16 = a + posX * 2 + (posY + 15) * lda; + } else { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + a03 = a + posY * 2 + (posX + 2) * lda; + a04 = a + posY * 2 + (posX + 3) * lda; + a05 = a + posY * 2 + (posX + 4) * lda; + a06 = a + posY * 2 + (posX + 5) * lda; + a07 = a + posY * 2 + (posX + 6) * lda; + a08 = a + posY * 2 + (posX + 7) * lda; + a09 = a + posY * 2 + (posX + 8) * lda; + a10 = a + posY * 2 + (posX + 9) * lda; + a11 = a + posY * 2 + (posX + 10) * lda; + a12 = a + posY * 2 + (posX + 11) * lda; + a13 = a + posY * 2 + (posX + 12) * lda; + a14 = a + posY * 2 + (posX + 13) * lda; + a15 = a + posY * 2 + (posX + 14) * lda; + a16 = a + posY * 2 + (posX + 15) * lda; + } + + i = (m >> 4); + if (i > 0) { + do { + if (X < posY) { + a01 += 32; + a02 += 32; + a03 += 32; + a04 += 32; + a05 += 32; + a06 += 32; + a07 += 32; + a08 += 32; + a09 += 32; + a10 += 32; + a11 += 32; + a12 += 32; + a13 += 32; + a14 += 32; + a15 += 32; + a16 += 32; + + b += 512; + } else + if (X > posY) { + for (ii = 0; ii < 16; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + b[ 8] = *(a01 + 8); + b[ 9] = *(a01 + 9); + b[ 10] = *(a01 + 10); + b[ 11] = *(a01 + 11); + b[ 12] = *(a01 + 12); + b[ 13] = *(a01 + 13); + b[ 14] = *(a01 + 14); + b[ 15] = *(a01 + 15); + + b[ 16] = *(a01 + 16); + b[ 17] = *(a01 + 17); + b[ 18] = *(a01 + 18); + b[ 19] = *(a01 + 19); + b[ 20] = *(a01 + 20); + b[ 21] = *(a01 + 21); + b[ 22] = *(a01 + 22); + b[ 23] = *(a01 + 23); + + b[ 24] = *(a01 + 24); + b[ 25] = *(a01 + 25); + b[ 26] = *(a01 + 26); + b[ 27] = *(a01 + 27); + b[ 28] = *(a01 + 28); + b[ 29] = *(a01 + 29); + b[ 30] = *(a01 + 30); + b[ 31] = *(a01 + 31); + + a01 += lda; + b += 32; + } + + a02 += 16 * lda; + a03 += 16 * lda; + a04 += 16 * lda; + a05 += 16 * lda; + a06 += 16 * lda; + a07 += 16 * lda; 
+ a08 += 16 * lda; + a09 += 16 * lda; + a10 += 16 * lda; + a11 += 16 * lda; + a12 += 16 * lda; + a13 += 16 * lda; + a14 += 16 * lda; + a15 += 16 * lda; + a16 += 16 * lda; + + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + + b[ 32] = *(a02 + 0); + b[ 33] = *(a02 + 1); +#ifdef UNIT + b[ 34] = ONE; + b[ 35] = ZERO; +#else + b[ 34] = *(a02 + 2); + b[ 35] = *(a02 + 3); +#endif + b[ 36] = ZERO; + b[ 37] = ZERO; + b[ 38] = ZERO; + b[ 39] = ZERO; + b[ 40] = ZERO; + b[ 41] = ZERO; + b[ 42] = ZERO; + b[ 43] = ZERO; + b[ 44] = ZERO; + b[ 45] = ZERO; + b[ 46] = ZERO; + b[ 47] = ZERO; + b[ 48] = ZERO; + b[ 49] = ZERO; + b[ 50] = ZERO; + b[ 51] = ZERO; + b[ 52] = ZERO; + b[ 53] = ZERO; + b[ 54] = ZERO; + b[ 55] = ZERO; + b[ 56] = ZERO; + b[ 57] = ZERO; + b[ 58] = ZERO; + b[ 59] = ZERO; + b[ 60] = ZERO; + b[ 61] = ZERO; + b[ 62] = ZERO; + b[ 63] = ZERO; + + b[ 64] = *(a03 + 0); + b[ 65] = *(a03 + 1); + b[ 66] = *(a03 + 2); + b[ 67] = *(a03 + 3); +#ifdef UNIT + b[ 68] = ONE; + b[ 69] = ZERO; +#else + b[ 68] = *(a03 + 4); + b[ 69] = *(a03 + 5); +#endif + b[ 70] = ZERO; + b[ 71] = ZERO; + b[ 72] = ZERO; + b[ 73] = ZERO; + b[ 74] = ZERO; + b[ 75] = ZERO; + b[ 76] = ZERO; + b[ 77] = ZERO; + b[ 78] = ZERO; + b[ 79] = ZERO; + b[ 80] = ZERO; + b[ 81] = ZERO; + b[ 82] = ZERO; + b[ 83] = ZERO; + b[ 84] = ZERO; + b[ 85] = ZERO; + b[ 86] = ZERO; + b[ 87] = ZERO; + b[ 88] = ZERO; + b[ 89] = ZERO; + b[ 90] = ZERO; + b[ 91] = ZERO; + b[ 
92] = ZERO; + b[ 93] = ZERO; + b[ 94] = ZERO; + b[ 95] = ZERO; + + b[ 96] = *(a04 + 0); + b[ 97] = *(a04 + 1); + b[ 98] = *(a04 + 2); + b[ 99] = *(a04 + 3); + b[100] = *(a04 + 4); + b[101] = *(a04 + 5); +#ifdef UNIT + b[102] = ONE; + b[103] = ZERO; +#else + b[102] = *(a04 + 6); + b[103] = *(a04 + 7); +#endif + b[104] = ZERO; + b[105] = ZERO; + b[106] = ZERO; + b[107] = ZERO; + b[108] = ZERO; + b[109] = ZERO; + b[110] = ZERO; + b[111] = ZERO; + b[112] = ZERO; + b[113] = ZERO; + b[114] = ZERO; + b[115] = ZERO; + b[116] = ZERO; + b[117] = ZERO; + b[118] = ZERO; + b[119] = ZERO; + b[120] = ZERO; + b[121] = ZERO; + b[122] = ZERO; + b[123] = ZERO; + b[124] = ZERO; + b[125] = ZERO; + b[126] = ZERO; + b[127] = ZERO; + + b[128] = *(a05 + 0); + b[129] = *(a05 + 1); + b[130] = *(a05 + 2); + b[131] = *(a05 + 3); + b[132] = *(a05 + 4); + b[133] = *(a05 + 5); + b[134] = *(a05 + 6); + b[135] = *(a05 + 7); +#ifdef UNIT + b[136] = ONE; + b[137] = ZERO; +#else + b[136] = *(a05 + 8); + b[137] = *(a05 + 9); +#endif + b[138] = ZERO; + b[139] = ZERO; + b[140] = ZERO; + b[141] = ZERO; + b[142] = ZERO; + b[143] = ZERO; + b[144] = ZERO; + b[145] = ZERO; + b[146] = ZERO; + b[147] = ZERO; + b[148] = ZERO; + b[149] = ZERO; + b[150] = ZERO; + b[151] = ZERO; + b[152] = ZERO; + b[153] = ZERO; + b[154] = ZERO; + b[155] = ZERO; + b[156] = ZERO; + b[157] = ZERO; + b[158] = ZERO; + b[159] = ZERO; + + b[160] = *(a06 + 0); + b[161] = *(a06 + 1); + b[162] = *(a06 + 2); + b[163] = *(a06 + 3); + b[164] = *(a06 + 4); + b[165] = *(a06 + 5); + b[166] = *(a06 + 6); + b[167] = *(a06 + 7); + b[168] = *(a06 + 8); + b[169] = *(a06 + 9); +#ifdef UNIT + b[170] = ONE; + b[171] = ZERO; +#else + b[170] = *(a06 + 10); + b[171] = *(a06 + 11); +#endif + b[172] = ZERO; + b[173] = ZERO; + b[174] = ZERO; + b[175] = ZERO; + b[176] = ZERO; + b[177] = ZERO; + b[178] = ZERO; + b[179] = ZERO; + b[180] = ZERO; + b[181] = ZERO; + b[182] = ZERO; + b[183] = ZERO; + b[184] = ZERO; + b[185] = ZERO; + b[186] = ZERO; + b[187] = ZERO; + 
b[188] = ZERO; + b[189] = ZERO; + b[190] = ZERO; + b[191] = ZERO; + + b[192] = *(a07 + 0); + b[193] = *(a07 + 1); + b[194] = *(a07 + 2); + b[195] = *(a07 + 3); + b[196] = *(a07 + 4); + b[197] = *(a07 + 5); + b[198] = *(a07 + 6); + b[199] = *(a07 + 7); + b[200] = *(a07 + 8); + b[201] = *(a07 + 9); + b[202] = *(a07 + 10); + b[203] = *(a07 + 11); +#ifdef UNIT + b[204] = ONE; + b[205] = ZERO; +#else + b[204] = *(a07 + 12); + b[205] = *(a07 + 13); +#endif + b[206] = ZERO; + b[207] = ZERO; + b[208] = ZERO; + b[209] = ZERO; + b[210] = ZERO; + b[211] = ZERO; + b[212] = ZERO; + b[213] = ZERO; + b[214] = ZERO; + b[215] = ZERO; + b[216] = ZERO; + b[217] = ZERO; + b[218] = ZERO; + b[219] = ZERO; + b[220] = ZERO; + b[221] = ZERO; + b[222] = ZERO; + b[223] = ZERO; + + b[224] = *(a08 + 0); + b[225] = *(a08 + 1); + b[226] = *(a08 + 2); + b[227] = *(a08 + 3); + b[228] = *(a08 + 4); + b[229] = *(a08 + 5); + b[230] = *(a08 + 6); + b[231] = *(a08 + 7); + b[232] = *(a08 + 8); + b[233] = *(a08 + 9); + b[234] = *(a08 + 10); + b[235] = *(a08 + 11); + b[236] = *(a08 + 12); + b[237] = *(a08 + 13); +#ifdef UNIT + b[238] = ONE; + b[239] = ZERO; +#else + b[238] = *(a08 + 14); + b[239] = *(a08 + 15); +#endif + b[240] = ZERO; + b[241] = ZERO; + b[242] = ZERO; + b[243] = ZERO; + b[244] = ZERO; + b[245] = ZERO; + b[246] = ZERO; + b[247] = ZERO; + b[248] = ZERO; + b[249] = ZERO; + b[250] = ZERO; + b[251] = ZERO; + b[252] = ZERO; + b[253] = ZERO; + b[254] = ZERO; + b[255] = ZERO; + + b[256] = *(a09 + 0); + b[257] = *(a09 + 1); + b[258] = *(a09 + 2); + b[259] = *(a09 + 3); + b[260] = *(a09 + 4); + b[261] = *(a09 + 5); + b[262] = *(a09 + 6); + b[263] = *(a09 + 7); + b[264] = *(a09 + 8); + b[265] = *(a09 + 9); + b[266] = *(a09 + 10); + b[267] = *(a09 + 11); + b[268] = *(a09 + 12); + b[269] = *(a09 + 13); + b[270] = *(a09 + 14); + b[271] = *(a09 + 15); +#ifdef UNIT + b[272] = ONE; + b[273] = ZERO; +#else + b[272] = *(a09 + 16); + b[273] = *(a09 + 17); +#endif + b[274] = ZERO; + b[275] = ZERO; + b[276] = 
ZERO; + b[277] = ZERO; + b[278] = ZERO; + b[279] = ZERO; + b[280] = ZERO; + b[281] = ZERO; + b[282] = ZERO; + b[283] = ZERO; + b[284] = ZERO; + b[285] = ZERO; + b[286] = ZERO; + b[287] = ZERO; + + b[288] = *(a10 + 0); + b[289] = *(a10 + 1); + b[290] = *(a10 + 2); + b[291] = *(a10 + 3); + b[292] = *(a10 + 4); + b[293] = *(a10 + 5); + b[294] = *(a10 + 6); + b[295] = *(a10 + 7); + b[296] = *(a10 + 8); + b[297] = *(a10 + 9); + b[298] = *(a10 + 10); + b[299] = *(a10 + 11); + b[300] = *(a10 + 12); + b[301] = *(a10 + 13); + b[302] = *(a10 + 14); + b[303] = *(a10 + 15); + b[304] = *(a10 + 16); + b[305] = *(a10 + 17); +#ifdef UNIT + b[306] = ONE; + b[307] = ZERO; +#else + b[306] = *(a10 + 18); + b[307] = *(a10 + 19); +#endif + b[308] = ZERO; + b[309] = ZERO; + b[310] = ZERO; + b[311] = ZERO; + b[312] = ZERO; + b[313] = ZERO; + b[314] = ZERO; + b[315] = ZERO; + b[316] = ZERO; + b[317] = ZERO; + b[318] = ZERO; + b[319] = ZERO; + + b[320] = *(a11 + 0); + b[321] = *(a11 + 1); + b[322] = *(a11 + 2); + b[323] = *(a11 + 3); + b[324] = *(a11 + 4); + b[325] = *(a11 + 5); + b[326] = *(a11 + 6); + b[327] = *(a11 + 7); + b[328] = *(a11 + 8); + b[329] = *(a11 + 9); + b[330] = *(a11 + 10); + b[331] = *(a11 + 11); + b[332] = *(a11 + 12); + b[333] = *(a11 + 13); + b[334] = *(a11 + 14); + b[335] = *(a11 + 15); + b[336] = *(a11 + 16); + b[337] = *(a11 + 17); + b[338] = *(a11 + 18); + b[339] = *(a11 + 19); +#ifdef UNIT + b[340] = ONE; + b[341] = ZERO; +#else + b[340] = *(a11 + 20); + b[341] = *(a11 + 21); +#endif + b[342] = ZERO; + b[343] = ZERO; + b[344] = ZERO; + b[345] = ZERO; + b[346] = ZERO; + b[347] = ZERO; + b[348] = ZERO; + b[349] = ZERO; + b[350] = ZERO; + b[351] = ZERO; + + b[352] = *(a12 + 0); + b[353] = *(a12 + 1); + b[354] = *(a12 + 2); + b[355] = *(a12 + 3); + b[356] = *(a12 + 4); + b[357] = *(a12 + 5); + b[358] = *(a12 + 6); + b[359] = *(a12 + 7); + b[360] = *(a12 + 8); + b[361] = *(a12 + 9); + b[362] = *(a12 + 10); + b[363] = *(a12 + 11); + b[364] = *(a12 + 12); + b[365] = 
*(a12 + 13); + b[366] = *(a12 + 14); + b[367] = *(a12 + 15); + b[368] = *(a12 + 16); + b[369] = *(a12 + 17); + b[370] = *(a12 + 18); + b[371] = *(a12 + 19); + b[372] = *(a12 + 20); + b[373] = *(a12 + 21); +#ifdef UNIT + b[374] = ONE; + b[375] = ZERO; +#else + b[374] = *(a12 + 22); + b[375] = *(a12 + 23); +#endif + b[376] = ZERO; + b[377] = ZERO; + b[378] = ZERO; + b[379] = ZERO; + b[380] = ZERO; + b[381] = ZERO; + b[382] = ZERO; + b[383] = ZERO; + + b[384] = *(a13 + 0); + b[385] = *(a13 + 1); + b[386] = *(a13 + 2); + b[387] = *(a13 + 3); + b[388] = *(a13 + 4); + b[389] = *(a13 + 5); + b[390] = *(a13 + 6); + b[391] = *(a13 + 7); + b[392] = *(a13 + 8); + b[393] = *(a13 + 9); + b[394] = *(a13 + 10); + b[395] = *(a13 + 11); + b[396] = *(a13 + 12); + b[397] = *(a13 + 13); + b[398] = *(a13 + 14); + b[399] = *(a13 + 15); + b[400] = *(a13 + 16); + b[401] = *(a13 + 17); + b[402] = *(a13 + 18); + b[403] = *(a13 + 19); + b[404] = *(a13 + 20); + b[405] = *(a13 + 21); + b[406] = *(a13 + 22); + b[407] = *(a13 + 23); +#ifdef UNIT + b[408] = ONE; + b[409] = ZERO; +#else + b[408] = *(a13 + 24); + b[409] = *(a13 + 25); +#endif + b[410] = ZERO; + b[411] = ZERO; + b[412] = ZERO; + b[413] = ZERO; + b[414] = ZERO; + b[415] = ZERO; + + b[416] = *(a14 + 0); + b[417] = *(a14 + 1); + b[418] = *(a14 + 2); + b[419] = *(a14 + 3); + b[420] = *(a14 + 4); + b[421] = *(a14 + 5); + b[422] = *(a14 + 6); + b[423] = *(a14 + 7); + b[424] = *(a14 + 8); + b[425] = *(a14 + 9); + b[426] = *(a14 + 10); + b[427] = *(a14 + 11); + b[428] = *(a14 + 12); + b[429] = *(a14 + 13); + b[430] = *(a14 + 14); + b[431] = *(a14 + 15); + b[432] = *(a14 + 16); + b[433] = *(a14 + 17); + b[434] = *(a14 + 18); + b[435] = *(a14 + 19); + b[436] = *(a14 + 20); + b[437] = *(a14 + 21); + b[438] = *(a14 + 22); + b[439] = *(a14 + 23); + b[440] = *(a14 + 24); + b[441] = *(a14 + 25); +#ifdef UNIT + b[442] = ONE; + b[443] = ZERO; +#else + b[442] = *(a14 + 26); + b[443] = *(a14 + 27); +#endif + b[444] = ZERO; + b[445] = ZERO; + b[446] = 
ZERO; + b[447] = ZERO; + + b[448] = *(a15 + 0); + b[449] = *(a15 + 1); + b[450] = *(a15 + 2); + b[451] = *(a15 + 3); + b[452] = *(a15 + 4); + b[453] = *(a15 + 5); + b[454] = *(a15 + 6); + b[455] = *(a15 + 7); + b[456] = *(a15 + 8); + b[457] = *(a15 + 9); + b[458] = *(a15 + 10); + b[459] = *(a15 + 11); + b[460] = *(a15 + 12); + b[461] = *(a15 + 13); + b[462] = *(a15 + 14); + b[463] = *(a15 + 15); + b[464] = *(a15 + 16); + b[465] = *(a15 + 17); + b[466] = *(a15 + 18); + b[467] = *(a15 + 19); + b[468] = *(a15 + 20); + b[469] = *(a15 + 21); + b[470] = *(a15 + 22); + b[471] = *(a15 + 23); + b[472] = *(a15 + 24); + b[473] = *(a15 + 25); + b[474] = *(a15 + 26); + b[475] = *(a15 + 27); +#ifdef UNIT + b[476] = ONE; + b[477] = ZERO; +#else + b[476] = *(a15 + 28); + b[477] = *(a15 + 29); +#endif + b[478] = ZERO; + b[479] = ZERO; + + b[480] = *(a16 + 0); + b[481] = *(a16 + 1); + b[482] = *(a16 + 2); + b[483] = *(a16 + 3); + b[484] = *(a16 + 4); + b[485] = *(a16 + 5); + b[486] = *(a16 + 6); + b[487] = *(a16 + 7); + b[488] = *(a16 + 8); + b[489] = *(a16 + 9); + b[490] = *(a16 + 10); + b[491] = *(a16 + 11); + b[492] = *(a16 + 12); + b[493] = *(a16 + 13); + b[494] = *(a16 + 14); + b[495] = *(a16 + 15); + b[496] = *(a16 + 16); + b[497] = *(a16 + 17); + b[498] = *(a16 + 18); + b[499] = *(a16 + 19); + b[500] = *(a16 + 20); + b[501] = *(a16 + 21); + b[502] = *(a16 + 22); + b[503] = *(a16 + 23); + b[504] = *(a16 + 24); + b[505] = *(a16 + 25); + b[506] = *(a16 + 26); + b[507] = *(a16 + 27); + b[508] = *(a16 + 28); + b[509] = *(a16 + 29); +#ifdef UNIT + b[510] = ONE; + b[511] = ZERO; +#else + b[510] = *(a16 + 30); + b[511] = *(a16 + 31); +#endif + + a01 += 16 * lda; + a02 += 16 * lda; + a03 += 16 * lda; + a04 += 16 * lda; + a05 += 16 * lda; + a06 += 16 * lda; + a07 += 16 * lda; + a08 += 16 * lda; + a09 += 16 * lda; + a10 += 16 * lda; + a11 += 16 * lda; + a12 += 16 * lda; + a13 += 16 * lda; + a14 += 16 * lda; + a15 += 16 * lda; + a16 += 16 * lda; + b += 512; + } + + X += 16; + i --; + } 
while (i > 0); + } + + i = (m & 15); + if (i) { + + if (X < posY) { + // a01 += 2 * i; + // a02 += 2 * i; + // a03 += 2 * i; + // a04 += 2 * i; + // a05 += 2 * i; + // a06 += 2 * i; + // a07 += 2 * i; + // a08 += 2 * i; + // a09 += 2 * i; + // a10 += 2 * i; + // a11 += 2 * i; + // a12 += 2 * i; + // a13 += 2 * i; + // a14 += 2 * i; + // a15 += 2 * i; + // a16 += 2 * i; + b += 32 * i; + + } else + if (X > posY) { + for (ii = 0; ii < i; ii++){ + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + b[ 8] = *(a01 + 8); + b[ 9] = *(a01 + 9); + b[ 10] = *(a01 + 10); + b[ 11] = *(a01 + 11); + b[ 12] = *(a01 + 12); + b[ 13] = *(a01 + 13); + b[ 14] = *(a01 + 14); + b[ 15] = *(a01 + 15); + + b[ 16] = *(a01 + 16); + b[ 17] = *(a01 + 17); + b[ 18] = *(a01 + 18); + b[ 19] = *(a01 + 19); + b[ 20] = *(a01 + 20); + b[ 21] = *(a01 + 21); + b[ 22] = *(a01 + 22); + b[ 23] = *(a01 + 23); + b[ 24] = *(a01 + 24); + b[ 25] = *(a01 + 25); + b[ 26] = *(a01 + 26); + b[ 27] = *(a01 + 27); + b[ 28] = *(a01 + 28); + b[ 29] = *(a01 + 29); + b[ 30] = *(a01 + 30); + b[ 31] = *(a01 + 31); + + a01 += lda; + a02 += lda; + a03 += lda; + a04 += lda; + a05 += lda; + a06 += lda; + a07 += lda; + a08 += lda; + a09 += lda; + a10 += lda; + a11 += lda; + a12 += lda; + a13 += lda; + a14 += lda; + a15 += lda; + a16 += lda; + b += 32; + } + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = 
ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + + if (i >= 2) { + b[ 0] = *(a02 + 0); + b[ 1] = *(a02 + 1); +#ifdef UNIT + b[ 2] = ONE; + b[ 3] = ZERO; +#else + b[ 2] = *(a02 + 2); + b[ 3] = *(a02 + 3); +#endif + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 3) { + b[ 0] = *(a03 + 0); + b[ 1] = *(a03 + 1); + b[ 2] = *(a03 + 2); + b[ 3] = *(a03 + 3); +#ifdef UNIT + b[ 4] = ONE; + b[ 5] = ZERO; +#else + b[ 4] = *(a03 + 4); + b[ 5] = *(a03 + 5); +#endif + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 4) { + b[ 0] = *(a04 + 0); + b[ 1] = *(a04 + 1); + b[ 2] = *(a04 + 2); + b[ 3] = *(a04 + 3); + b[ 4] = *(a04 + 4); + b[ 5] = *(a04 + 5); +#ifdef UNIT + b[ 6] = ONE; + b[ 7] = ZERO; +#else + b[ 6] = *(a04 + 6); + b[ 7] = *(a04 + 7); +#endif + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = 
ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 5) { + b[ 0] = *(a05 + 0); + b[ 1] = *(a05 + 1); + b[ 2] = *(a05 + 2); + b[ 3] = *(a05 + 3); + b[ 4] = *(a05 + 4); + b[ 5] = *(a05 + 5); + b[ 6] = *(a05 + 6); + b[ 7] = *(a05 + 7); +#ifdef UNIT + b[ 8] = ONE; + b[ 9] = ZERO; +#else + b[ 8] = *(a05 + 8); + b[ 9] = *(a05 + 9); +#endif + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 6) { + b[ 0] = *(a06 + 0); + b[ 1] = *(a06 + 1); + b[ 2] = *(a06 + 2); + b[ 3] = *(a06 + 3); + b[ 4] = *(a06 + 4); + b[ 5] = *(a06 + 5); + b[ 6] = *(a06 + 6); + b[ 7] = *(a06 + 7); + b[ 8] = *(a06 + 8); + b[ 9] = *(a06 + 9); +#ifdef UNIT + b[10] = ONE; + b[11] = ZERO; +#else + b[10] = *(a06 + 10); + b[11] = *(a06 + 11); +#endif + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 7) { + b[ 0] = *(a07 + 0); + b[ 1] = *(a07 + 1); + b[ 2] = *(a07 + 2); + b[ 3] = *(a07 + 3); + b[ 4] = *(a07 + 4); + b[ 5] = *(a07 + 5); + b[ 6] = *(a07 + 6); + b[ 7] = *(a07 + 7); + b[ 8] = *(a07 + 8); + b[ 9] = *(a07 + 9); + b[10] = *(a07 + 10); + b[11] = *(a07 + 11); +#ifdef UNIT + b[12] = ONE; + b[13] = ZERO; +#else + b[12] = *(a07 + 12); + b[13] = *(a07 + 13); +#endif + b[ 14] = ZERO; + b[ 15] = ZERO; + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + 
b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 8) { + b[ 0] = *(a08 + 0); + b[ 1] = *(a08 + 1); + b[ 2] = *(a08 + 2); + b[ 3] = *(a08 + 3); + b[ 4] = *(a08 + 4); + b[ 5] = *(a08 + 5); + b[ 6] = *(a08 + 6); + b[ 7] = *(a08 + 7); + b[ 8] = *(a08 + 8); + b[ 9] = *(a08 + 9); + b[ 10] = *(a08 + 10); + b[ 11] = *(a08 + 11); + b[ 12] = *(a08 + 12); + b[ 13] = *(a08 + 13); +#ifdef UNIT + b[ 14] = ONE; + b[ 15] = ZERO; +#else + b[ 14] = *(a08 + 14); + b[ 15] = *(a08 + 15); +#endif + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 9) { + b[ 0] = *(a09 + 0); + b[ 1] = *(a09 + 1); + b[ 2] = *(a09 + 2); + b[ 3] = *(a09 + 3); + b[ 4] = *(a09 + 4); + b[ 5] = *(a09 + 5); + b[ 6] = *(a09 + 6); + b[ 7] = *(a09 + 7); + b[ 8] = *(a09 + 8); + b[ 9] = *(a09 + 9); + b[ 10] = *(a09 + 10); + b[ 11] = *(a09 + 11); + b[ 12] = *(a09 + 12); + b[ 13] = *(a09 + 13); + b[ 14] = *(a09 + 14); + b[ 15] = *(a09 + 15); +#ifdef UNIT + b[ 16] = ONE; + b[ 17] = ZERO; +#else + b[ 16] = *(a09 + 16); + b[ 17] = *(a09 + 17); +#endif + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 10) { + b[ 0] = *(a10 + 0); + b[ 1] = *(a10 + 1); + b[ 2] = *(a10 + 2); + b[ 3] = *(a10 + 3); + b[ 4] = *(a10 + 4); + b[ 5] = *(a10 + 5); + b[ 6] = *(a10 + 6); + b[ 7] = *(a10 + 7); + b[ 8] = *(a10 + 8); + b[ 9] = *(a10 + 9); + b[ 10] = *(a10 + 10); + b[ 11] = *(a10 + 11); + b[ 12] = *(a10 + 12); + b[ 13] = *(a10 + 13); + b[ 14] = *(a10 + 14); + b[ 15] = *(a10 
+ 15); + b[ 16] = *(a10 + 16); + b[ 17] = *(a10 + 17); +#ifdef UNIT + b[ 18] = ONE; + b[ 19] = ZERO; +#else + b[ 18] = *(a10 + 18); + b[ 19] = *(a10 + 19); +#endif + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 11) { + b[ 0] = *(a11 + 0); + b[ 1] = *(a11 + 1); + b[ 2] = *(a11 + 2); + b[ 3] = *(a11 + 3); + b[ 4] = *(a11 + 4); + b[ 5] = *(a11 + 5); + b[ 6] = *(a11 + 6); + b[ 7] = *(a11 + 7); + b[ 8] = *(a11 + 8); + b[ 9] = *(a11 + 9); + b[ 10] = *(a11 + 10); + b[ 11] = *(a11 + 11); + b[ 12] = *(a11 + 12); + b[ 13] = *(a11 + 13); + b[ 14] = *(a11 + 14); + b[ 15] = *(a11 + 15); + b[ 16] = *(a11 + 16); + b[ 17] = *(a11 + 17); + b[ 18] = *(a11 + 18); + b[ 19] = *(a11 + 19); +#ifdef UNIT + b[ 20] = ONE; + b[ 21] = ZERO; +#else + b[ 20] = *(a11 + 20); + b[ 21] = *(a11 + 21); +#endif + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 12) { + b[ 0] = *(a12 + 0); + b[ 1] = *(a12 + 1); + b[ 2] = *(a12 + 2); + b[ 3] = *(a12 + 3); + b[ 4] = *(a12 + 4); + b[ 5] = *(a12 + 5); + b[ 6] = *(a12 + 6); + b[ 7] = *(a12 + 7); + b[ 8] = *(a12 + 8); + b[ 9] = *(a12 + 9); + b[ 10] = *(a12 + 10); + b[ 11] = *(a12 + 11); + b[ 12] = *(a12 + 12); + b[ 13] = *(a12 + 13); + b[ 14] = *(a12 + 14); + b[ 15] = *(a12 + 15); + b[ 16] = *(a12 + 16); + b[ 17] = *(a12 + 17); + b[ 18] = *(a12 + 18); + b[ 19] = *(a12 + 19); + b[ 20] = *(a12 + 20); + b[ 21] = *(a12 + 21); +#ifdef UNIT + b[ 22] = ONE; + b[ 23] = ZERO; +#else + b[ 22] = *(a12 + 22); + b[ 23] = *(a12 + 23); +#endif + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 13) { + b[ 0] = *(a13 + 0); + b[ 1] = *(a13 
+ 1); + b[ 2] = *(a13 + 2); + b[ 3] = *(a13 + 3); + b[ 4] = *(a13 + 4); + b[ 5] = *(a13 + 5); + b[ 6] = *(a13 + 6); + b[ 7] = *(a13 + 7); + b[ 8] = *(a13 + 8); + b[ 9] = *(a13 + 9); + b[ 10] = *(a13 + 10); + b[ 11] = *(a13 + 11); + b[ 12] = *(a13 + 12); + b[ 13] = *(a13 + 13); + b[ 14] = *(a13 + 14); + b[ 15] = *(a13 + 15); + b[ 16] = *(a13 + 16); + b[ 17] = *(a13 + 17); + b[ 18] = *(a13 + 18); + b[ 19] = *(a13 + 19); + b[ 20] = *(a13 + 20); + b[ 21] = *(a13 + 21); + b[ 22] = *(a13 + 22); + b[ 23] = *(a13 + 23); +#ifdef UNIT + b[ 24] = ONE; + b[ 25] = ZERO; +#else + b[ 24] = *(a13 + 24); + b[ 25] = *(a13 + 25); +#endif + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 14) { + b[ 0] = *(a14 + 0); + b[ 1] = *(a14 + 1); + b[ 2] = *(a14 + 2); + b[ 3] = *(a14 + 3); + b[ 4] = *(a14 + 4); + b[ 5] = *(a14 + 5); + b[ 6] = *(a14 + 6); + b[ 7] = *(a14 + 7); + b[ 8] = *(a14 + 8); + b[ 9] = *(a14 + 9); + b[ 10] = *(a14 + 10); + b[ 11] = *(a14 + 11); + b[ 12] = *(a14 + 12); + b[ 13] = *(a14 + 13); + b[ 14] = *(a14 + 14); + b[ 15] = *(a14 + 15); + b[ 16] = *(a14 + 16); + b[ 17] = *(a14 + 17); + b[ 18] = *(a14 + 18); + b[ 19] = *(a14 + 19); + b[ 20] = *(a14 + 20); + b[ 21] = *(a14 + 21); + b[ 22] = *(a14 + 22); + b[ 23] = *(a14 + 23); + b[ 24] = *(a14 + 24); + b[ 25] = *(a14 + 25); +#ifdef UNIT + b[ 26] = ONE; + b[ 27] = ZERO; +#else + b[ 26] = *(a14 + 26); + b[ 27] = *(a14 + 27); +#endif + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + + if (i >= 15) { + b[ 0] = *(a15 + 0); + b[ 1] = *(a15 + 1); + b[ 2] = *(a15 + 2); + b[ 3] = *(a15 + 3); + b[ 4] = *(a15 + 4); + b[ 5] = *(a15 + 5); + b[ 6] = *(a15 + 6); + b[ 7] = *(a15 + 7); + b[ 8] = *(a15 + 8); + b[ 9] = *(a15 + 9); + b[ 10] = *(a15 + 10); + b[ 11] = *(a15 + 11); + b[ 12] = *(a15 + 12); + b[ 13] = *(a15 + 13); + b[ 14] = *(a15 + 14); + b[ 15] = *(a15 + 15); + b[ 16] = *(a15 + 16); + b[ 17] = *(a15 + 17); + b[ 18] 
= *(a15 + 18); + b[ 19] = *(a15 + 19); + b[ 20] = *(a15 + 20); + b[ 21] = *(a15 + 21); + b[ 22] = *(a15 + 22); + b[ 23] = *(a15 + 23); + b[ 24] = *(a15 + 24); + b[ 25] = *(a15 + 25); + b[ 26] = *(a15 + 26); + b[ 27] = *(a15 + 27); +#ifdef UNIT + b[ 28] = ONE; + b[ 29] = ZERO; +#else + b[ 28] = *(a15 + 28); + b[ 29] = *(a15 + 29); +#endif + b[ 30] = ZERO; + b[ 31] = ZERO; + b += 32; + } + } + } + + posY += 16; + js --; + } while (js > 0); + } /* End of main loop */ + + + if (n & 8){ + X = posX; + + if (posX <= posY) { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + a03 = a + posX * 2 + (posY + 2) * lda; + a04 = a + posX * 2 + (posY + 3) * lda; + a05 = a + posX * 2 + (posY + 4) * lda; + a06 = a + posX * 2 + (posY + 5) * lda; + a07 = a + posX * 2 + (posY + 6) * lda; + a08 = a + posX * 2 + (posY + 7) * lda; + } else { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + a03 = a + posY * 2 + (posX + 2) * lda; + a04 = a + posY * 2 + (posX + 3) * lda; + a05 = a + posY * 2 + (posX + 4) * lda; + a06 = a + posY * 2 + (posX + 5) * lda; + a07 = a + posY * 2 + (posX + 6) * lda; + a08 = a + posY * 2 + (posX + 7) * lda; + } + + i = (m >> 3); + if (i > 0) { + do { + if (X < posY) { + a01 += 16; + a02 += 16; + a03 += 16; + a04 += 16; + a05 += 16; + a06 += 16; + a07 += 16; + a08 += 16; + b += 128; + } else + if (X > posY) { + for (ii = 0; ii < 8; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + b[ 8] = *(a01 + 8); + b[ 9] = *(a01 + 9); + b[ 10] = *(a01 + 10); + b[ 11] = *(a01 + 11); + b[ 12] = *(a01 + 12); + b[ 13] = *(a01 + 13); + b[ 14] = *(a01 + 14); + b[ 15] = *(a01 + 15); + + a01 += lda; + b += 16; + } + a02 += 8 * lda; + a03 += 8 * lda; + a04 += 8 * lda; + a05 += 8 * lda; + a06 += 8 * lda; + a07 += 8 * lda; + a08 += 8 * lda; + + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] 
= ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + + b[ 16] = *(a02 + 0); + b[ 17] = *(a02 + 1); +#ifdef UNIT + b[ 18] = ONE; + b[ 19] = ZERO; +#else + b[ 18] = *(a02 + 2); + b[ 19] = *(a02 + 3); +#endif + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + + b[ 32] = *(a03 + 0); + b[ 33] = *(a03 + 1); + b[ 34] = *(a03 + 2); + b[ 35] = *(a03 + 3); +#ifdef UNIT + b[ 36] = ONE; + b[ 37] = ZERO; +#else + b[ 36] = *(a03 + 4); + b[ 37] = *(a03 + 5); +#endif + b[ 38] = ZERO; + b[ 39] = ZERO; + b[ 40] = ZERO; + b[ 41] = ZERO; + b[ 42] = ZERO; + b[ 43] = ZERO; + b[ 44] = ZERO; + b[ 45] = ZERO; + b[ 46] = ZERO; + b[ 47] = ZERO; + + b[ 48] = *(a04 + 0); + b[ 49] = *(a04 + 1); + b[ 50] = *(a04 + 2); + b[ 51] = *(a04 + 3); + b[ 52] = *(a04 + 4); + b[ 53] = *(a04 + 5); +#ifdef UNIT + b[ 54] = ONE; + b[ 55] = ZERO; +#else + b[ 54] = *(a04 + 6); + b[ 55] = *(a04 + 7); +#endif + b[ 56] = ZERO; + b[ 57] = ZERO; + b[ 58] = ZERO; + b[ 59] = ZERO; + b[ 60] = ZERO; + b[ 61] = ZERO; + b[ 62] = ZERO; + b[ 63] = ZERO; + + b[ 64] = *(a05 + 0); + b[ 65] = *(a05 + 1); + b[ 66] = *(a05 + 2); + b[ 67] = *(a05 + 3); + b[ 68] = *(a05 + 4); + b[ 69] = *(a05 + 5); + b[ 70] = *(a05 + 6); + b[ 71] = *(a05 + 7); +#ifdef UNIT + b[ 72] = ONE; + b[ 73] = ZERO; +#else + b[ 72] = *(a05 + 8); + b[ 73] = *(a05 + 9); +#endif + b[ 74] = ZERO; + b[ 75] = ZERO; + b[ 76] = ZERO; + b[ 77] = ZERO; + b[ 78] = ZERO; + b[ 79] = ZERO; + + b[ 80] = *(a06 + 0); + b[ 81] = *(a06 + 1); + b[ 82] = *(a06 + 2); + b[ 83] = *(a06 + 3); + b[ 84] = *(a06 + 4); + b[ 85] = *(a06 + 5); + b[ 86] = *(a06 + 6); + b[ 87] = *(a06 + 7); + b[ 
88] = *(a06 + 8); + b[ 89] = *(a06 + 9); +#ifdef UNIT + b[ 90] = ONE; + b[ 91] = ZERO; +#else + b[ 90] = *(a06 + 10); + b[ 91] = *(a06 + 11); +#endif + b[ 92] = ZERO; + b[ 93] = ZERO; + b[ 94] = ZERO; + b[ 95] = ZERO; + + b[ 96] = *(a07 + 0); + b[ 97] = *(a07 + 1); + b[ 98] = *(a07 + 2); + b[ 99] = *(a07 + 3); + b[100] = *(a07 + 4); + b[101] = *(a07 + 5); + b[102] = *(a07 + 6); + b[103] = *(a07 + 7); + b[104] = *(a07 + 8); + b[105] = *(a07 + 9); + b[106] = *(a07 + 10); + b[107] = *(a07 + 11); +#ifdef UNIT + b[108] = ONE; + b[109] = ZERO; +#else + b[108] = *(a07 + 12); + b[109] = *(a07 + 13); +#endif + b[110] = ZERO; + b[111] = ZERO; + + b[112] = *(a08 + 0); + b[113] = *(a08 + 1); + b[114] = *(a08 + 2); + b[115] = *(a08 + 3); + b[116] = *(a08 + 4); + b[117] = *(a08 + 5); + b[118] = *(a08 + 6); + b[119] = *(a08 + 7); + b[120] = *(a08 + 8); + b[121] = *(a08 + 9); + b[122] = *(a08 + 10); + b[123] = *(a08 + 11); + b[124] = *(a08 + 12); + b[125] = *(a08 + 13); +#ifdef UNIT + b[126] = ONE; + b[127] = ZERO; +#else + b[126] = *(a08 + 14); + b[127] = *(a08 + 15); +#endif + + a01 += 8 * lda; + a02 += 8 * lda; + a03 += 8 * lda; + a04 += 8 * lda; + a05 += 8 * lda; + a06 += 8 * lda; + a07 += 8 * lda; + a08 += 8 * lda; + b += 128; + } + + X += 8; + i --; + } while (i > 0); + } + + i = (m & 7); + if (i) { + + if (X < posY) { + /* a01 += 2 * i; + a02 += 2 * i; + a03 += 2 * i; + a04 += 2 * i; + a05 += 2 * i; + a06 += 2 * i; + a07 += 2 * i; + a08 += 2 * i; */ + b += 16 * i; + } else + if (X > posY) { + for (ii = 0; ii < i; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + b[ 8] = *(a01 + 8); + b[ 9] = *(a01 + 9); + b[ 10] = *(a01 + 10); + b[ 11] = *(a01 + 11); + b[ 12] = *(a01 + 12); + b[ 13] = *(a01 + 13); + b[ 14] = *(a01 + 14); + b[ 15] = *(a01 + 15); + + a01 += lda; + a02 += lda; + a03 += lda; + a04 += lda; + a05 += lda; + a06 += lda; + a07 += 
lda; + a08 += lda; + b += 16; + } + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + + if(i >= 2) { + b[ 0] = *(a02 + 0); + b[ 1] = *(a02 + 1); +#ifdef UNIT + b[ 2] = ONE; + b[ 3] = ZERO; +#else + b[ 2] = *(a02 + 2); + b[ 3] = *(a02 + 3); +#endif + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 3) { + b[ 0] = *(a03 + 0); + b[ 1] = *(a03 + 1); + b[ 2] = *(a03 + 2); + b[ 3] = *(a03 + 3); +#ifdef UNIT + b[ 4] = ONE; + b[ 5] = ZERO; +#else + b[ 4] = *(a03 + 4); + b[ 5] = *(a03 + 5); +#endif + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 4) { + b[ 0] = *(a04 + 0); + b[ 1] = *(a04 + 1); + b[ 2] = *(a04 + 2); + b[ 3] = *(a04 + 3); + b[ 4] = *(a04 + 4); + b[ 5] = *(a04 + 5); +#ifdef UNIT + b[ 6] = ONE; + b[ 7] = ZERO; +#else + b[ 6] = *(a04 + 6); + b[ 7] = *(a04 + 7); +#endif + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 5) { + b[ 0] = *(a05 + 0); + b[ 1] = *(a05 + 1); + b[ 2] = *(a05 + 2); + b[ 3] = *(a05 + 3); + b[ 4] = *(a05 + 4); + b[ 5] = *(a05 + 5); + b[ 6] = *(a05 + 6); + b[ 7] = *(a05 + 7); +#ifdef UNIT + b[ 8] = ONE; + b[ 9] = ZERO; +#else + b[ 8] = *(a05 + 8); + b[ 9] = *(a05 + 9); +#endif + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 6) { + b[ 0] = *(a06 + 0); + b[ 1] = *(a06 + 
1); + b[ 2] = *(a06 + 2); + b[ 3] = *(a06 + 3); + b[ 4] = *(a06 + 4); + b[ 5] = *(a06 + 5); + b[ 6] = *(a06 + 6); + b[ 7] = *(a06 + 7); + b[ 8] = *(a06 + 8); + b[ 9] = *(a06 + 9); +#ifdef UNIT + b[10] = ONE; + b[11] = ZERO; +#else + b[10] = *(a06 + 10); + b[11] = *(a06 + 11); +#endif + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 7) { + b[ 0] = *(a07 + 0); + b[ 1] = *(a07 + 1); + b[ 2] = *(a07 + 2); + b[ 3] = *(a07 + 3); + b[ 4] = *(a07 + 4); + b[ 5] = *(a07 + 5); + b[ 6] = *(a07 + 6); + b[ 7] = *(a07 + 7); + b[ 8] = *(a07 + 8); + b[ 9] = *(a07 + 9); + b[10] = *(a07 + 10); + b[11] = *(a07 + 11); +#ifdef UNIT + b[12] = ONE; + b[13] = ZERO; +#else + b[12] = *(a07 + 12); + b[13] = *(a07 + 13); +#endif + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + } + } + + posY += 8; + } + + + if (n & 4){ + X = posX; + + if (posX <= posY) { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + a03 = a + posX * 2 + (posY + 2) * lda; + a04 = a + posX * 2 + (posY + 3) * lda; + } else { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + a03 = a + posY * 2 + (posX + 2) * lda; + a04 = a + posY * 2 + (posX + 3) * lda; + } + + i = (m >> 2); + if (i > 0) { + do { + if (X < posY) { + a01 += 8; + a02 += 8; + a03 += 8; + a04 += 8; + b += 32; + } else + if (X > posY) { + + for (ii = 0; ii < 4; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + a01 += lda; + b += 8; + } + + a02 += 4 * lda; + a03 += 4 * lda; + a04 += 4 * lda; + + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = *(a02 + 0); + b[ 9] = *(a02 + 1); +#ifdef UNIT + b[ 10] = ONE; + b[ 11] = ZERO; +#else + b[ 10] = *(a02 + 2); + 
b[ 11] = *(a02 + 3); +#endif + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + + b[ 16] = *(a03 + 0); + b[ 17] = *(a03 + 1); + b[ 18] = *(a03 + 2); + b[ 19] = *(a03 + 3); +#ifdef UNIT + b[ 20] = ONE; + b[ 21] = ZERO; +#else + b[ 20] = *(a03 + 4); + b[ 21] = *(a03 + 5); +#endif + b[ 22] = ZERO; + b[ 23] = ZERO; + + b[ 24] = *(a04 + 0); + b[ 25] = *(a04 + 1); + b[ 26] = *(a04 + 2); + b[ 27] = *(a04 + 3); + b[ 28] = *(a04 + 4); + b[ 29] = *(a04 + 5); +#ifdef UNIT + b[ 30] = ONE; + b[ 31] = ZERO; +#else + b[ 30] = *(a04 + 6); + b[ 31] = *(a04 + 7); +#endif + + a01 += 4 * lda; + a02 += 4 * lda; + a03 += 4 * lda; + a04 += 4 * lda; + b += 32; + } + + X += 4; + i --; + } while (i > 0); + } + + i = (m & 3); + if (i) { + + if (X < posY) { + /* a01 += 2 * i; + a02 += 2 * i; + a03 += 2 * i; + a04 += 2 * i; */ + b += 8 * i; + } else + if (X > posY) { + + for (ii = 0; ii < i; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + a01 += lda; + a02 += lda; + a03 += lda; + a04 += lda; + b += 8; + } + } else { + +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + + if(i >= 2) { + b[ 0] = *(a02 + 0); + b[ 1] = *(a02 + 1); +#ifdef UNIT + b[ 2] = ONE; + b[ 3] = ZERO; +#else + b[ 2] = *(a02 + 2); + b[ 3] = *(a02 + 3); +#endif + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } + + if (i >= 3) { + b[ 0] = *(a03 + 0); + b[ 1] = *(a03 + 1); + b[ 2] = *(a03 + 2); + b[ 3] = *(a03 + 3); +#ifdef UNIT + b[ 4] = ONE; + b[ 5] = ZERO; +#else + b[ 4] = *(a03 + 4); + b[ 5] = *(a03 + 5); +#endif + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } + } + } + + posY += 4; + } + + + if (n & 2){ + X = posX; + + if (posX <= posY) { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 
= a + posX * 2 + (posY + 1) * lda; + } else { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X < posY) { + a01 += 4; + a02 += 4; + b += 8; + } else + if (X > posY) { + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a02 + 0); + b[ 5] = *(a02 + 1); + b[ 6] = *(a02 + 2); + b[ 7] = *(a02 + 3); + + a01 += 2 * lda; + a02 += 2 * lda; + b += 8; + } else { + +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + + b[ 4] = *(a02 + 0); + b[ 5] = *(a02 + 1); +#ifdef UNIT + b[ 6] = ONE; + b[ 7] = ZERO; +#else + b[ 6] = *(a02 + 2); + b[ 7] = *(a02 + 3); +#endif + + a01 += 2 * lda; + a02 += 2 * lda; + b += 8; + } + + X += 2; + i --; + } while (i > 0); + } + + i = (m & 1); + if (i) { + + if (X < posY) { + b += 4; + } else + if (X > posY) { + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b += 4; + } +#if 1 + } +#else + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b += 4; + } +#endif + posY += 2; + } + + if (n & 1){ + X = posX; + + if (posX <= posY) { + a01 = a + posX * 2 + (posY + 0) * lda; + } else { + a01 = a + posY * 2 + (posX + 0) * lda; + } + + i = m; + if (m > 0) { + do { + if (X < posY) { + a01 += 2; + } else { +#ifdef UNIT + if (X > posY) { +#endif + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#ifdef UNIT + } else { + b[ 0] = ONE; + b[ 1] = ZERO; + } +#endif + a01 += lda; + } + b += 2; + X ++; + i --; + } while (i > 0); + } + } + + return 0; +} diff --git a/kernel/generic/ztrsm_lncopy_16.c b/kernel/generic/ztrsm_lncopy_16.c new file mode 100644 index 000000000..4fd72c13e --- /dev/null +++ b/kernel/generic/ztrsm_lncopy_16.c @@ -0,0 +1,308 @@ 
+/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#include +#include "common.h" + /* Packs the matrix a into the buffer b panel by panel: n >> 4 panels of 16 column pointers, then one panel of 8, 4, 2 and 1 for each set bit of n & 15. Parameters: m - number of rows visited in every panel; n - number of columns to pack; a - source, read as two FLOATs per element at offsets +0 and +1 (presumably interleaved real/imaginary parts - confirm against callers); lda - distance between successive column pointers in elements, doubled on entry to count FLOATs; offset - initial value of jj, the row index that lines up with the first column's diagonal slot; b - destination, advanced by 2 * width FLOATs for every row, including rows where nothing is stored. Per row ii of a panel of the given width: if jj <= ii < jj + width, the ii - jj entries before the diagonal are copied and compinv() is called on slot ii - jj (compinv is defined outside this block; presumably it stores the inverted diagonal value - confirm); if ii - jj >= width the whole row of the panel is copied; if ii < jj nothing is written. Always returns 0. */ +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj, k; + + FLOAT *a1, *a2, *a3, *a4, *a5, *a6, *a7, *a8; + FLOAT *a9, *a10, *a11, *a12, *a13, *a14, *a15, *a16; + + FLOAT data1, data2; + + lda *= 2; /* each element occupies two FLOATs, so the column stride doubles */ + jj = offset; /* jj tracks the row at which the current panel's diagonal block starts */ + /* full panels of 16 columns */ + j = (n >> 4); + while (j > 0){ + /* a1..a16 start lda FLOATs apart and are advanced together below, so they keep that spacing */ + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + a5 = a + 4 * lda; + a6 = a + 5 * lda; + a7 = a + 6 * lda; + a8 = a + 7 * lda; + a9 = a + 8 * lda; + a10 = a + 9 * lda; + a11 = a + 10 * lda; + a12 = a + 11 * lda; + a13 = a + 12 * lda; + a14 = a + 13 * lda; + a15 = a + 14 * lda; + a16 = a + 15 * lda; + + a += 16 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + /* row ii crosses the diagonal block: copy the ii - jj entries before the diagonal, then hand the diagonal element to compinv(); the slots after it in this packed row are not written */ + if ((ii >= jj ) && (ii - jj < 16)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k * 2 + 0) = *(a1 + k * lda + 0); + *(b + k * 2 + 1) = *(a1 + k * lda + 1); + } + data1 = *(a1 + (ii - jj) * lda + 0); + data2 = *(a1 + (ii - jj) * lda + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + } + /* row lies past the diagonal block: copy one element from each of the 16 column pointers */ + if (ii - jj >= 16) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a2 + 0); + *(b + 3) = *(a2 + 1); + *(b + 4) = *(a3 + 0); + *(b + 5) = *(a3 + 1); + *(b + 6) = *(a4 + 0); + *(b + 7) = *(a4 + 1); + *(b + 8) = *(a5 + 0); + *(b + 9) = *(a5 + 1); + *(b + 10) = *(a6 + 0); + *(b + 11) = *(a6 + 1); + *(b + 12) = *(a7 + 0); + *(b + 13) = *(a7 + 1); + *(b + 14) = *(a8 + 0); + *(b + 15) = *(a8 + 1); + *(b + 16) = *(a9 + 0); + *(b + 17) = *(a9 + 1); + *(b + 18) = *(a10 + 0); + *(b + 19) = *(a10 + 1); + *(b + 20) = *(a11 + 0); + *(b + 21) = *(a11 + 1); + *(b + 22) = *(a12 + 0); + *(b + 23) = *(a12 + 1); + *(b + 24) = *(a13 + 0); + *(b + 25) = *(a13 + 1); + *(b + 26) = *(a14 + 0); + *(b + 27) = *(a14 + 1); + *(b + 28) = *(a15 + 0); + *(b + 29) = *(a15 + 1); + *(b + 30) = *(a16 + 0); + *(b + 31) = *(a16 + 1); + } + + a1 += 2; + a2 += 2; + a3 += 2; + a4 += 2; + a5 +=
2; + a6 += 2; + a7 += 2; + a8 += 2; + a9 += 2; + a10 += 2; + a11 += 2; + a12 += 2; + a13 += 2; + a14 += 2; + a15 += 2; + a16 += 2; + b += 32; /* a packed row always takes 32 FLOATs here, whether or not it was written */ + ii ++; + } + + jj += 16; /* the next panel's diagonal block starts 16 rows further on */ + j --; + } + /* the remaining panels of 8, 4, 2 and 1 columns repeat the same three-way row handling with their own width */ + if (n & 8) { + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + a5 = a + 4 * lda; + a6 = a + 5 * lda; + a7 = a + 6 * lda; + a8 = a + 7 * lda; + + a += 8 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 8)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k * 2 + 0) = *(a1 + k * lda + 0); + *(b + k * 2 + 1) = *(a1 + k * lda + 1); + } + data1 = *(a1 + (ii - jj) * lda + 0); + data2 = *(a1 + (ii - jj) * lda + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + } + + if (ii - jj >= 8) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a2 + 0); + *(b + 3) = *(a2 + 1); + *(b + 4) = *(a3 + 0); + *(b + 5) = *(a3 + 1); + *(b + 6) = *(a4 + 0); + *(b + 7) = *(a4 + 1); + *(b + 8) = *(a5 + 0); + *(b + 9) = *(a5 + 1); + *(b + 10) = *(a6 + 0); + *(b + 11) = *(a6 + 1); + *(b + 12) = *(a7 + 0); + *(b + 13) = *(a7 + 1); + *(b + 14) = *(a8 + 0); + *(b + 15) = *(a8 + 1); + } + + a1 += 2; + a2 += 2; + a3 += 2; + a4 += 2; + a5 += 2; + a6 += 2; + a7 += 2; + a8 += 2; + b += 16; + ii ++; + } + + jj += 8; + } + + if (n & 4) { + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + a += 4 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 4)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k * 2 + 0) = *(a1 + k * lda + 0); + *(b + k * 2 + 1) = *(a1 + k * lda + 1); + } + data1 = *(a1 + (ii - jj) * lda + 0); + data2 = *(a1 + (ii - jj) * lda + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + } + + if (ii - jj >= 4) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a2 + 0); + *(b + 3) = *(a2 + 1); + *(b + 4) = *(a3 + 0); + *(b + 5) = *(a3 + 1); + *(b + 6) = *(a4 + 0); + *(b + 7) = *(a4 + 1); + } + + a1 += 2; + a2 += 2; + a3 += 2; + a4 += 2; + b += 8; + ii ++; + } + + jj
+= 4; + } + + if (n & 2) { + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a += 2 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 2)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k * 2 + 0) = *(a1 + k * lda + 0); + *(b + k * 2 + 1) = *(a1 + k * lda + 1); + } + data1 = *(a1 + (ii - jj) * lda + 0); + data2 = *(a1 + (ii - jj) * lda + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + } + + if (ii - jj >= 2) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a2 + 0); + *(b + 3) = *(a2 + 1); + } + + a1 += 2; + a2 += 2; + b += 4; + ii ++; + } + + jj += 2; + } + + if (n & 1) { + /* width 1: the first condition below holds only when ii == jj, so the k loop never iterates and compinv() is the only store for that row */ + a1 = a + 0 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 1)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k * 2 + 0) = *(a1 + k * lda + 0); + *(b + k * 2 + 1) = *(a1 + k * lda + 1); + } + data1 = *(a1 + (ii - jj) * lda + 0); + data2 = *(a1 + (ii - jj) * lda + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + } + + if (ii - jj >= 1) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + } + + a1 += 2; + b += 2; + ii ++; + } + } + + return 0; +} diff --git a/kernel/generic/ztrsm_ltcopy_16.c b/kernel/generic/ztrsm_ltcopy_16.c new file mode 100644 index 000000000..e9aeae1ad --- /dev/null +++ b/kernel/generic/ztrsm_ltcopy_16.c @@ -0,0 +1,264 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj, k; + + FLOAT *a1; + FLOAT data1, data2; + + lda *= 2; + jj = offset; + + j = (n >> 4); + while (j > 0){ + + a1 = a; + a += 32; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 16)) { + + data1 = *(a1 + (ii - jj) * 2 + 0); + data2 = *(a1 + (ii - jj) * 2 + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + + for (k = ii - jj + 1; k < 16; k ++) { + *(b + k * 2 + 0) = *(a1 + k * 2 + 0); + *(b + k * 2 + 1) = *(a1 + k * 2 + 1); + } + + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a1 + 2); + *(b + 3) = *(a1 + 3); + *(b + 4) = *(a1 + 4); + *(b + 5) = *(a1 + 5); + *(b + 6) = *(a1 + 6); + *(b + 7) = *(a1 + 7); + *(b + 8) = *(a1 + 8); + *(b + 9) = *(a1 + 9); + *(b + 10) = *(a1 + 10); + *(b + 11) = *(a1 + 11); + *(b + 12) = *(a1 + 12); + *(b + 
13) = *(a1 + 13); + *(b + 14) = *(a1 + 14); + *(b + 15) = *(a1 + 15); + *(b + 16) = *(a1 + 16); + *(b + 17) = *(a1 + 17); + *(b + 18) = *(a1 + 18); + *(b + 19) = *(a1 + 19); + *(b + 20) = *(a1 + 20); + *(b + 21) = *(a1 + 21); + *(b + 22) = *(a1 + 22); + *(b + 23) = *(a1 + 23); + *(b + 24) = *(a1 + 24); + *(b + 25) = *(a1 + 25); + *(b + 26) = *(a1 + 26); + *(b + 27) = *(a1 + 27); + *(b + 28) = *(a1 + 28); + *(b + 29) = *(a1 + 29); + *(b + 30) = *(a1 + 30); + *(b + 31) = *(a1 + 31); + } + + b += 32; + a1 += lda; + ii ++; + } + + jj += 16; + j --; + } + + j = (n & 8); + if (j > 0) { + a1 = a; + a += 16; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 8)) { + + data1 = *(a1 + (ii - jj) * 2 + 0); + data2 = *(a1 + (ii - jj) * 2 + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + + for (k = ii - jj + 1; k < 8; k ++) { + *(b + k * 2 + 0) = *(a1 + k * 2 + 0); + *(b + k * 2 + 1) = *(a1 + k * 2 + 1); + } + + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a1 + 2); + *(b + 3) = *(a1 + 3); + *(b + 4) = *(a1 + 4); + *(b + 5) = *(a1 + 5); + *(b + 6) = *(a1 + 6); + *(b + 7) = *(a1 + 7); + *(b + 8) = *(a1 + 8); + *(b + 9) = *(a1 + 9); + *(b + 10) = *(a1 + 10); + *(b + 11) = *(a1 + 11); + *(b + 12) = *(a1 + 12); + *(b + 13) = *(a1 + 13); + *(b + 14) = *(a1 + 14); + *(b + 15) = *(a1 + 15); + } + + b += 16; + a1 += lda; + ii ++; + } + + jj += 8; + } + + j = (n & 4); + if (j > 0) { + + a1 = a; + a += 8; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 4)) { + + data1 = *(a1 + (ii - jj) * 2 + 0); + data2 = *(a1 + (ii - jj) * 2 + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + + for (k = ii - jj + 1; k < 4; k ++) { + *(b + k * 2 + 0) = *(a1 + k * 2 + 0); + *(b + k * 2 + 1) = *(a1 + k * 2 + 1); + } + + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a1 + 2); + *(b + 3) = *(a1 + 3); + *(b + 4) = *(a1 + 4); + *(b + 5) = *(a1 + 5); + *(b + 6) = *(a1 + 6); + 
*(b + 7) = *(a1 + 7); + } + + b += 8; + a1 += lda; + ii ++; + } + + jj += 4; + } + + j = (n & 2); + if (j > 0) { + + a1 = a; + a += 4; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 2)) { + + data1 = *(a1 + (ii - jj) * 2 + 0); + data2 = *(a1 + (ii - jj) * 2 + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + + for (k = ii - jj + 1; k < 2; k ++) { + *(b + k * 2 + 0) = *(a1 + k * 2 + 0); + *(b + k * 2 + 1) = *(a1 + k * 2 + 1); + } + + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a1 + 2); + *(b + 3) = *(a1 + 3); + } + + b += 4; + a1 += lda; + ii ++; + } + + jj += 2; + } + + j = (n & 1); + if (j > 0) { + + a1 = a; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 1)) { + data1 = *(a1 + (ii - jj) * 2 + 0); + data2 = *(a1 + (ii - jj) * 2 + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + } + + b += 2; + a1 += lda; + ii ++; + } + } + + return 0; +} diff --git a/kernel/generic/ztrsm_uncopy_16.c b/kernel/generic/ztrsm_uncopy_16.c new file mode 100644 index 000000000..e84d96891 --- /dev/null +++ b/kernel/generic/ztrsm_uncopy_16.c @@ -0,0 +1,313 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj, k; + + FLOAT *a1, *a2, *a3, *a4, *a5, *a6, *a7, *a8; + FLOAT *a9, *a10, *a11, *a12, *a13, *a14, *a15, *a16; + + FLOAT data1, data2; + + lda *= 2; + jj = offset; + + j = (n >> 4); + while (j > 0){ + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + a5 = a + 4 * lda; + a6 = a + 5 * lda; + a7 = a + 6 * lda; + a8 = a + 7 * lda; + a9 = a + 8 * lda; + a10 = a + 9 * lda; + a11 = a + 10 * lda; + a12 = a + 11 * lda; + a13 = a + 12 * lda; + a14 = a + 13 * lda; + a15 = a + 14 * lda; + a16 = a + 15 * lda; + + a += 16 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 16)) { + + data1 = *(a1 + (ii - jj) * lda + 0); + data2 = *(a1 + (ii - jj) * lda + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + + for (k = ii - jj + 1; k < 16; k ++) { + *(b + k * 2 
+ 0) = *(a1 + k * lda + 0); + *(b + k * 2 + 1) = *(a1 + k * lda + 1); + } + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a2 + 0); + *(b + 3) = *(a2 + 1); + *(b + 4) = *(a3 + 0); + *(b + 5) = *(a3 + 1); + *(b + 6) = *(a4 + 0); + *(b + 7) = *(a4 + 1); + *(b + 8) = *(a5 + 0); + *(b + 9) = *(a5 + 1); + *(b + 10) = *(a6 + 0); + *(b + 11) = *(a6 + 1); + *(b + 12) = *(a7 + 0); + *(b + 13) = *(a7 + 1); + *(b + 14) = *(a8 + 0); + *(b + 15) = *(a8 + 1); + *(b + 16) = *(a9 + 0); + *(b + 17) = *(a9 + 1); + *(b + 18) = *(a10 + 0); + *(b + 19) = *(a10 + 1); + *(b + 20) = *(a11 + 0); + *(b + 21) = *(a11 + 1); + *(b + 22) = *(a12 + 0); + *(b + 23) = *(a12 + 1); + *(b + 24) = *(a13 + 0); + *(b + 25) = *(a13 + 1); + *(b + 26) = *(a14 + 0); + *(b + 27) = *(a14 + 1); + *(b + 28) = *(a15 + 0); + *(b + 29) = *(a15 + 1); + *(b + 30) = *(a16 + 0); + *(b + 31) = *(a16 + 1); + } + + a1 += 2; + a2 += 2; + a3 += 2; + a4 += 2; + a5 += 2; + a6 += 2; + a7 += 2; + a8 += 2; + a9 += 2; + a10 += 2; + a11 += 2; + a12 += 2; + a13 += 2; + a14 += 2; + a15 += 2; + a16 += 2; + b += 32; + ii ++; + } + + jj += 16; + j --; + } + + if (n & 8) { + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + a5 = a + 4 * lda; + a6 = a + 5 * lda; + a7 = a + 6 * lda; + a8 = a + 7 * lda; + + a += 8 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 8)) { + + data1 = *(a1 + (ii - jj) * lda + 0); + data2 = *(a1 + (ii - jj) * lda + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + + for (k = ii - jj + 1; k < 8; k ++) { + *(b + k * 2 + 0) = *(a1 + k * lda + 0); + *(b + k * 2 + 1) = *(a1 + k * lda + 1); + } + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a2 + 0); + *(b + 3) = *(a2 + 1); + *(b + 4) = *(a3 + 0); + *(b + 5) = *(a3 + 1); + *(b + 6) = *(a4 + 0); + *(b + 7) = *(a4 + 1); + *(b + 8) = *(a5 + 0); + *(b + 9) = *(a5 + 1); + *(b + 10) = *(a6 + 0); + *(b + 11) = *(a6 + 1); + *(b + 
12) = *(a7 + 0); + *(b + 13) = *(a7 + 1); + *(b + 14) = *(a8 + 0); + *(b + 15) = *(a8 + 1); + } + + a1 += 2; + a2 += 2; + a3 += 2; + a4 += 2; + a5 += 2; + a6 += 2; + a7 += 2; + a8 += 2; + b += 16; + ii ++; + } + + jj += 8; + } + + if (n & 4) { + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + a += 4 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 4)) { + data1 = *(a1 + (ii - jj) * lda + 0); + data2 = *(a1 + (ii - jj) * lda + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + + for (k = ii - jj + 1; k < 4; k ++) { + *(b + k * 2 + 0) = *(a1 + k * lda + 0); + *(b + k * 2 + 1) = *(a1 + k * lda + 1); + } + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a2 + 0); + *(b + 3) = *(a2 + 1); + *(b + 4) = *(a3 + 0); + *(b + 5) = *(a3 + 1); + *(b + 6) = *(a4 + 0); + *(b + 7) = *(a4 + 1); + } + + a1 += 2; + a2 += 2; + a3 += 2; + a4 += 2; + b += 8; + ii ++; + } + + jj += 4; + } + + if (n & 2) { + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a += 2 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 2)) { + data1 = *(a1 + (ii - jj) * lda + 0); + data2 = *(a1 + (ii - jj) * lda + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + for (k = ii - jj + 1; k < 2; k ++) { + *(b + k * 2 + 0) = *(a1 + k * lda + 0); + *(b + k * 2 + 1) = *(a1 + k * lda + 1); + } + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a2 + 0); + *(b + 3) = *(a2 + 1); + } + + a1 += 2; + a2 += 2; + b += 4; + ii ++; + } + + jj += 2; + } + + if (n & 1) { + + a1 = a + 0 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 1)) { + data1 = *(a1 + (ii - jj) * lda + 0); + data2 = *(a1 + (ii - jj) * lda + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + for (k = ii - jj + 1; k < 1; k ++) { + *(b + k * 2 + 0) = *(a1 + k * lda + 0); + *(b + k * 2 + 1) = *(a1 + k * lda + 1); + } + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 
0); + *(b + 1) = *(a1 + 1); + } + + a1 += 2; + b += 2; + ii ++; + } + } + + return 0; +} diff --git a/kernel/generic/ztrsm_utcopy_16.c b/kernel/generic/ztrsm_utcopy_16.c new file mode 100644 index 000000000..efcea5c3f --- /dev/null +++ b/kernel/generic/ztrsm_utcopy_16.c @@ -0,0 +1,261 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj, k; + + FLOAT *a1, data1, data2; + + lda *= 2; + + jj = offset; + + j = (n >> 4); + while (j > 0){ + + a1 = a; + a += 32; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 16)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k * 2 + 0) = *(a1 + k * 2 + 0); + *(b + k * 2 + 1) = *(a1 + k * 2 + 1); + } + + data1 = *(a1 + (ii - jj) * 2 + 0); + data2 = *(a1 + (ii - jj) * 2 + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + } + + if (ii - jj >= 16) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a1 + 2); + *(b + 3) = *(a1 + 3); + *(b + 4) = *(a1 + 4); + *(b + 5) = *(a1 + 5); + *(b + 6) = *(a1 + 6); + *(b + 7) = *(a1 + 7); + *(b + 8) = *(a1 + 8); + *(b + 9) = *(a1 + 9); + *(b + 10) = *(a1 + 10); + *(b + 11) = *(a1 + 11); + *(b + 12) = *(a1 + 12); + *(b + 13) = *(a1 + 13); + *(b + 14) = *(a1 + 14); + *(b + 15) = *(a1 + 15); + *(b + 16) = *(a1 + 16); + *(b + 17) = *(a1 + 17); + *(b + 18) = *(a1 + 18); + *(b + 19) = *(a1 + 19); + *(b + 20) = *(a1 + 20); + *(b + 21) = *(a1 + 21); + *(b + 22) = *(a1 + 22); + *(b + 23) = *(a1 + 23); + *(b + 24) = *(a1 + 24); + *(b + 25) = *(a1 + 25); + *(b + 26) = *(a1 + 26); + *(b + 27) = *(a1 + 27); + *(b + 28) = *(a1 + 28); + *(b + 29) = *(a1 + 29); + *(b + 30) = *(a1 + 30); + *(b + 31) = *(a1 + 31); + } + + b += 32; + a1 += lda; + ii ++; + } + + jj += 16; + j --; + } + + j = (n & 8); + if (j > 0) { + a1 = a; + a += 16; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 8)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k * 2 + 0) = *(a1 + k * 2 + 0); + *(b + k * 2 + 1) = *(a1 + k * 2 + 1); + } + + data1 = *(a1 + (ii - jj) * 2 + 0); + data2 = *(a1 + (ii - jj) * 2 + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + } + + if (ii - jj >= 8) { + *(b + 0) = 
*(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a1 + 2); + *(b + 3) = *(a1 + 3); + *(b + 4) = *(a1 + 4); + *(b + 5) = *(a1 + 5); + *(b + 6) = *(a1 + 6); + *(b + 7) = *(a1 + 7); + *(b + 8) = *(a1 + 8); + *(b + 9) = *(a1 + 9); + *(b + 10) = *(a1 + 10); + *(b + 11) = *(a1 + 11); + *(b + 12) = *(a1 + 12); + *(b + 13) = *(a1 + 13); + *(b + 14) = *(a1 + 14); + *(b + 15) = *(a1 + 15); + } + + b += 16; + a1 += lda; + ii ++; + } + + jj += 8; + } + + j = (n & 4); + if (j > 0) { + + a1 = a; + a += 8; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 4)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k * 2 + 0) = *(a1 + k * 2 + 0); + *(b + k * 2 + 1) = *(a1 + k * 2 + 1); + } + + data1 = *(a1 + (ii - jj) * 2 + 0); + data2 = *(a1 + (ii - jj) * 2 + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + } + + if (ii - jj >= 4) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a1 + 2); + *(b + 3) = *(a1 + 3); + *(b + 4) = *(a1 + 4); + *(b + 5) = *(a1 + 5); + *(b + 6) = *(a1 + 6); + *(b + 7) = *(a1 + 7); + } + + b += 8; + a1 += lda; + ii ++; + } + + jj += 4; + } + + j = (n & 2); + if (j > 0) { + + a1 = a; + a += 4; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 2)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k * 2 + 0) = *(a1 + k * 2 + 0); + *(b + k * 2 + 1) = *(a1 + k * 2 + 1); + } + + data1 = *(a1 + (ii - jj) * 2 + 0); + data2 = *(a1 + (ii - jj) * 2 + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + } + + if (ii - jj >= 2) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a1 + 2); + *(b + 3) = *(a1 + 3); + } + + b += 4; + a1 += lda; + ii ++; + } + + jj += 2; + } + + j = (n & 1); + if (j > 0) { + + a1 = a; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 1)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k * 2 + 0) = *(a1 + k * 2 + 0); + *(b + k * 2 + 1) = *(a1 + k * 2 + 1); + } + + data1 = *(a1 + (ii - jj) * 2 + 0); + data2 = *(a1 + (ii - jj) * 2 + 1); + + compinv(b + (ii - jj) 
* 2, data1, data2); + } + + if (ii - jj >= 1) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + } + + b += 2; + a1 += lda; + ii ++; + } + } + + return 0; +} diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 17d15656a..ce9268b93 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -111,9 +111,13 @@ SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) SGEMVNKERNEL = sgemv_n_8_lasx.S SGEMVTKERNEL = sgemv_t_8_lasx.S -CGEMMKERNEL = cgemm_kernel_2x2_lsx.S -CGEMMONCOPY = cgemm_ncopy_2_lsx.S -CGEMMOTCOPY = cgemm_tcopy_2_lsx.S +CGEMMKERNEL = cgemm_kernel_16x4_lasx.S +CGEMMINCOPY = cgemm_ncopy_16_lasx.S +CGEMMITCOPY = cgemm_tcopy_16_lasx.S +CGEMMONCOPY = cgemm_ncopy_4_lasx.S +CGEMMOTCOPY = cgemm_tcopy_4_lasx.S +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/loongarch64/cgemm_kernel_16x4_lasx.S b/kernel/loongarch64/cgemm_kernel_16x4_lasx.S new file mode 100644 index 000000000..249abe102 --- /dev/null +++ b/kernel/loongarch64/cgemm_kernel_16x4_lasx.S @@ -0,0 +1,3757 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + + +/* Function parameters */ +#define M $r4 // param 1: bm +#define N $r5 // param 2: bn +#define K $r6 // param 3: bk +#define ALPHA_R $f0 // param 4: alphar +#define ALPHA_I $f1 // param 5: alphai +#define A $r7 // param 6: ba +#define B $r8 // param 7: bb +#define C $r9 // param 8: bc +#define LDC $r10 // param 9: ldc + +#if defined (TRMMKERNEL) +#define OFFSET $r11 // param 10: offset +#endif +#define OFF $r26 + +#define I $r12 +#define J $r13 +#define L $r14 +#define TL $r15 +#define A0 $r16 +#define B0 $r17 +#define C0 $r18 +#define C1 $r19 +#define C2 $r20 +#define C3 $r23 +#define T0 $r24 +#define T1 $r25 +#define T2 $r26 +#define T3 $r27 + +#define a1 $f2 +#define a2 $f3 +#define a3 $f4 +#define a4 $f5 +#define a5 $f6 +#define a6 $f7 +#define a7 $f8 +#define a8 $f9 +#define b1 $f10 +#define b2 $f11 +#define b3 $f12 +#define b4 $f13 +#define b5 $f14 +#define b6 $f15 +#define b7 $f16 +#define b8 
$f17 +#define c11 $f18 +#define c12 $f19 +#define c21 $f20 +#define c22 $f21 +#define c31 $f22 +#define c32 $f23 +#define c41 $f24 +#define c42 $f25 + +/* LASX vectors */ +#define U0 $xr30 +#define U1 $xr31 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define U8 $xr8 +#define U9 $xr9 +#define U10 $xr10 +#define U11 $xr11 +#define U12 $xr12 +#define U13 $xr13 +#define U14 $xr14 +#define U15 $xr15 +#define D0 $xr16 +#define D1 $xr17 +#define D2 $xr18 +#define D3 $xr19 +#define D4 $xr20 +#define D5 $xr21 +#define D6 $xr22 +#define D7 $xr23 +#define D8 $xr24 +#define D9 $xr25 +#define D10 $xr26 +#define D11 $xr27 +#define D12 $xr28 +#define D13 $xr29 +#define VALPHAR $xr28 +#define VALPHAI $xr29 + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define XVMADD1 XVFMADD +#define XVMADD2 XVFMADD +#define XVMADD3 XVNMSUB +#define XVMADD4 XVFMADD + +#define VMADD1 VFMADD +#define VMADD2 VFMADD +#define VMADD3 VNMSUB +#define VMADD4 VFMADD + +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define XVMADD1 XVFMADD +#define XVMADD2 XVFMADD +#define XVMADD3 XVFMADD +#define XVMADD4 XVNMSUB + +#define VMADD1 VFMADD +#define VMADD2 VFMADD +#define VMADD3 VFMADD +#define VMADD4 VNMSUB + +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define XVMADD1 XVFMADD +#define XVMADD2 XVNMSUB +#define XVMADD3 XVFMADD +#define XVMADD4 XVFMADD + +#define VMADD1 VFMADD +#define VMADD2 VNMSUB +#define VMADD3 VFMADD +#define VMADD4 VFMADD + +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define XVMADD1 XVFMADD +#define XVMADD2 XVNMSUB +#define XVMADD3 XVNMSUB +#define XVMADD4 XVNMSUB + +#define VMADD1 VFMADD 
+#define VMADD2 VNMSUB +#define VMADD3 VNMSUB +#define VMADD4 VNMSUB + +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 NMSUB +#define MADD4 NMSUB +#endif + + PROLOGUE + + addi.d $sp, $sp, -128 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + ST $f23, $sp, 40 + ST $f24, $sp, 48 + ST $f25, $sp, 56 + ST $f26, $sp, 64 + ST $f27, $sp, 72 + ST $f28, $sp, 80 + ST $f29, $sp, 88 + ST $f30, $sp, 96 + ST $f31, $sp, 104 + ST ALPHA_R,$sp, 112 + ST ALPHA_I,$sp, 120 + + xvldrepl.w VALPHAR, $sp, 112 + xvldrepl.w VALPHAI, $sp, 120 + +#if defined (TRMMKERNEL) && !defined(LEFT) + sub.d OFF, $r0, OFFSET +#else + xor OFF, OFF, OFF +#endif + + slli.d LDC, LDC, 2 + + move J, $r0 + srai.d T0, N, 2 //bn/4 + beq J, T0, .L19 + +.L10: /* for(j=0; j0) */ + xvld U0, S1, 0x00 //1 2 3 4 5 6 7 8 + xvld U1, S2, 0x00 //9 10 11 12 13 14 15 16 + + xvand.v D0, U0, U0 + xvand.v D1, U1, U1 + + xvshuf4i.d D0, U1, 0x88 //1 2 9 10 5 6 13 14 + xvshuf4i.d D1, U0, 0x77 //3 4 11 12 7 8 15 16 + + xvand.v U4, D0, D0 + + xvpermi.q U4, D1, 0x02 //1 2 9 10 3 4 11 12 + xvpermi.q D1, D0, 0x31 //5 6 13 14 7 8 15 16 + + xvst U4, TD, 0x00 + xvst D1, TD, 0x20 + + addi.d S1, S1, 0x20 // a_offset + addi.d S2, S2, 0x20 + addi.d TD, TD, 0x40 // b_offset + + addi.d I, I, -1 + blt ZERO, I, .L_N11 + +.L_N10: /* if(m&2) */ + andi I, M, 0x02 + beq I, ZERO, .L_N130 + + vld $vr0, S1, 0x00 + vld $vr1, S2, 0x00 + vand.v $vr8, $vr1, $vr1 + + vpermi.w $vr8, $vr0, 0x44 + vpermi.w $vr1, $vr0, 0xee + + vst $vr8, TD, 0x00 + vst $vr1, TD, 0x10 + + addi.d S1, S1, 0x10 // a_offset + addi.d S2, S2, 0x10 + addi.d TD, TD, 0x20 // b_offset + +.L_N130: /* if(m&1) */ + andi I, M, 0x01 + beq I, ZERO, .L_N20 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + fst.s F2, TD, 0x08 + fst.s F3, TD, 0x0c + + addi.d TD, TD, 0x10 + +.L_N20: /* if(n&1) */ + andi I, N, 0x01 + beq I, ZERO, .L_N00 + + move S1, TS + 
srai.d I, M, 0x02 + + beq I, ZERO, .L_N30 + +.L_N21: /* if(i>0) */ + xvld U0, S1, 0x00 + + xvst U0, TD, 0x00 + + addi.d S1, S1, 0x20 // aoffset1 + addi.d TD, TD, 0x20 // b_offset + + addi.d I, I, -1 + blt ZERO, I, .L_N21 + +.L_N30: /* if(m&2) */ + andi I, M, 0x02 + beq I, ZERO, .L_N330 + + vld $vr0, S1, 0x00 + + vst $vr0, TD, 0x00 + + addi.d S1, S1, 0x10 // aoffset1 + addi.d TD, TD, 0x10 // b_offset + +.L_N330: /* if(m&1) */ + andi I, M, 0x01 + beq I, ZERO, .L_N00 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + +.L_N00: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/cgemm_tcopy_16_lasx.S b/kernel/loongarch64/cgemm_tcopy_16_lasx.S new file mode 100644 index 000000000..7d9eb94c8 --- /dev/null +++ b/kernel/loongarch64/cgemm_tcopy_16_lasx.S @@ -0,0 +1,741 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S0 $r11 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define P0 $r20 +#define P1 $r23 +#define P2 $r24 +#define P3 $r25 +#define P4 $r26 +#define P5 $r27 +#define T0 $r28 +#define T1 $r29 +#define TL $r7 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 +#define F8 $f8 +#define F9 $f9 +#define F10 $f10 +#define F11 $f11 +#define F12 $f12 +#define F13 $f13 +#define F14 $f14 +#define F15 $f15 +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 + + PROLOGUE + + addi.d $sp, $sp, -56 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + SDARG $r28, $sp, 40 + SDARG $r29, $sp, 48 + + move S0, SRC + move P0, DST + + srai.d T0, N, 0x04 + srai.d T1, N, 0x03 + slli.d T0, T0, 0x04 + slli.d T1, T1, 0x03 + mul.d P2, M, T0 + mul.d P3, M, T1 + slli.d P2, P2, 0x03 + slli.d P3, P3, 
0x03 + add.d P2, DST, P2 + add.d P3, DST, P3 + + srai.d T0, N, 0x02 + srai.d T1, N, 0x01 + slli.d T0, T0, 0x02 + slli.d T1, T1, 0x01 + mul.d P4, M, T0 + mul.d P5, M, T1 + slli.d P4, P4, 0x03 + slli.d P5, P5, 0x03 + add.d P4, DST, P4 + add.d P5, DST, P5 + + slli.d TL, LDA, 0x03 + srai.d J, M, 0x03 + slli.d T0, TL, 0x01 + slli.d T1, M, 0x07 + beq ZERO, J, .L_M7 + +.L_J1: /* J-- */ + move S1, S0 + add.d S2, S0, TL + add.d S3, S1, T0 + add.d S4, S2, T0 + add.d S5, S3, T0 + add.d S6, S4, T0 + add.d S7, S5, T0 + add.d S8, S6, T0 + add.d S0, S7, T0 + + move P1, P0 + addi.d P0, P0, 0x400 + + srai.d I, N, 0x04 + addi.d J, J, -1 + beq ZERO, I, .L_N15 + +.L_I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S1, 0x40 + xvld U3, S1, 0x60 + xvld U4, S2, 0x00 + xvld U5, S2, 0x20 + xvld U6, S2, 0x40 + xvld U7, S2, 0x60 + + xvst U0, P1, 0x00 + xvst U1, P1, 0x20 + xvst U2, P1, 0x40 + xvst U3, P1, 0x60 + xvst U4, P1, 0x80 + xvst U5, P1, 0xA0 + xvst U6, P1, 0xC0 + xvst U7, P1, 0xE0 + + xvld U0, S3, 0x00 + xvld U1, S3, 0x20 + xvld U2, S3, 0x40 + xvld U3, S3, 0x60 + xvld U4, S4, 0x00 + xvld U5, S4, 0x20 + xvld U6, S4, 0x40 + xvld U7, S4, 0x60 + + xvst U0, P1, 0x100 + xvst U1, P1, 0x120 + xvst U2, P1, 0x140 + xvst U3, P1, 0x160 + xvst U4, P1, 0x180 + xvst U5, P1, 0x1A0 + xvst U6, P1, 0x1C0 + xvst U7, P1, 0x1E0 + + xvld U0, S5, 0x00 + xvld U1, S5, 0x20 + xvld U2, S5, 0x40 + xvld U3, S5, 0x60 + xvld U4, S6, 0x00 + xvld U5, S6, 0x20 + xvld U6, S6, 0x40 + xvld U7, S6, 0x60 + + xvst U0, P1, 0x200 + xvst U1, P1, 0x220 + xvst U2, P1, 0x240 + xvst U3, P1, 0x260 + xvst U4, P1, 0x280 + xvst U5, P1, 0x2A0 + xvst U6, P1, 0x2C0 + xvst U7, P1, 0x2E0 + + xvld U0, S7, 0x00 + xvld U1, S7, 0x20 + xvld U2, S7, 0x40 + xvld U3, S7, 0x60 + xvld U4, S8, 0x00 + xvld U5, S8, 0x20 + xvld U6, S8, 0x40 + xvld U7, S8, 0x60 + + xvst U0, P1, 0x300 + xvst U1, P1, 0x320 + xvst U2, P1, 0x340 + xvst U3, P1, 0x360 + xvst U4, P1, 0x380 + xvst U5, P1, 0x3A0 + xvst U6, P1, 0x3C0 + xvst U7, P1, 0x3E0 + + addi.d 
S1, S1, 0x80 + addi.d S2, S2, 0x80 + addi.d S3, S3, 0x80 + addi.d S4, S4, 0x80 + addi.d S5, S5, 0x80 + addi.d S6, S6, 0x80 + addi.d S7, S7, 0x80 + addi.d S8, S8, 0x80 + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, .L_I1 + +.L_N15: + andi I, N, 0x08 + beq ZERO, I, .L_N7 + + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + xvld U4, S3, 0x00 + xvld U5, S3, 0x20 + xvld U6, S4, 0x00 + xvld U7, S4, 0x20 + + xvst U0, P2, 0x00 + xvst U1, P2, 0x20 + xvst U2, P2, 0x40 + xvst U3, P2, 0x60 + xvst U4, P2, 0x80 + xvst U5, P2, 0xA0 + xvst U6, P2, 0xC0 + xvst U7, P2, 0xE0 + + xvld U0, S5, 0x00 + xvld U1, S5, 0x20 + xvld U2, S6, 0x00 + xvld U3, S6, 0x20 + xvld U4, S7, 0x00 + xvld U5, S7, 0x20 + xvld U6, S8, 0x00 + xvld U7, S8, 0x20 + + xvst U0, P2, 0x100 + xvst U1, P2, 0x120 + xvst U2, P2, 0x140 + xvst U3, P2, 0x160 + xvst U4, P2, 0x180 + xvst U5, P2, 0x1A0 + xvst U6, P2, 0x1C0 + xvst U7, P2, 0x1E0 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + addi.d S5, S5, 0x40 + addi.d S6, S6, 0x40 + addi.d S7, S7, 0x40 + addi.d S8, S8, 0x40 + addi.d P2, P2, 0x200 + +.L_N7: + andi I, N, 0x04 + beq ZERO, I, .L_N3 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + xvld U4, S5, 0x00 + xvld U5, S6, 0x00 + xvld U6, S7, 0x00 + xvld U7, S8, 0x00 + + xvst U0, P3, 0x00 + xvst U1, P3, 0x20 + xvst U2, P3, 0x40 + xvst U3, P3, 0x60 + xvst U4, P3, 0x80 + xvst U5, P3, 0xA0 + xvst U6, P3, 0xC0 + xvst U7, P3, 0xE0 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d S5, S5, 0x20 + addi.d S6, S6, 0x20 + addi.d S7, S7, 0x20 + addi.d S8, S8, 0x20 + addi.d P3, P3, 0x100 + +.L_N3: + andi I, N, 0x02 + beq ZERO, I, .L_N1 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + xvld U4, S5, 0x00 + xvld U5, S6, 0x00 + xvld U6, S7, 0x00 + xvld U7, S8, 0x00 + + xvpermi.q U0, U1, 0x02 + xvpermi.q U2, U3, 0x02 + xvpermi.q U4, U5, 0x02 + 
xvpermi.q U6, U7, 0x02 + + xvst U0, P4, 0x00 + xvst U2, P4, 0x20 + xvst U4, P4, 0x40 + xvst U6, P4, 0x60 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d S5, S5, 0x10 + addi.d S6, S6, 0x10 + addi.d S7, S7, 0x10 + addi.d S8, S8, 0x10 + addi.d P4, P4, 0x80 + +.L_N1: + andi I, N, 0x01 + beq ZERO, I, .L_N0 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fld.s F4, S3, 0x00 + fld.s F5, S3, 0x04 + + fld.s F6, S4, 0x00 + fld.s F7, S4, 0x04 + + fld.s F8, S5, 0x00 + fld.s F9, S5, 0x04 + + fld.s F10, S6, 0x00 + fld.s F11, S6, 0x04 + + fld.s F12, S7, 0x00 + fld.s F13, S7, 0x04 + + fld.s F14, S8, 0x00 + fld.s F15, S8, 0x04 + + fst.s F0, P5, 0x00 + fst.s F1, P5, 0x04 + fst.s F2, P5, 0x08 + fst.s F3, P5, 0x0c + fst.s F4, P5, 0x10 + fst.s F5, P5, 0x14 + fst.s F6, P5, 0x18 + fst.s F7, P5, 0x1c + fst.s F8, P5, 0x20 + fst.s F9, P5, 0x24 + fst.s F10, P5, 0x28 + fst.s F11, P5, 0x2c + fst.s F12, P5, 0x30 + fst.s F13, P5, 0x34 + fst.s F14, P5, 0x38 + fst.s F15, P5, 0x3c + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d S3, S3, 0x08 + addi.d S4, S4, 0x08 + addi.d S5, S5, 0x08 + addi.d S6, S6, 0x08 + addi.d S7, S7, 0x08 + addi.d S8, S8, 0x08 + addi.d P5, P5, 0x40 + +.L_N0: + blt ZERO, J, .L_J1 + +.L_M7: + andi J, M, 0x04 + beq ZERO, J, .L_M3 + + move S1, S0 + add.d S2, S0, TL + add.d S3, S1, T0 + add.d S4, S2, T0 + add.d S0, S3, T0 + + move P1, P0 + addi.d P0, P0, 0x200 + + srai.d I, N, 0x04 + beq ZERO, I, .L_4N15 + +.L_4I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S1, 0x40 + xvld U3, S1, 0x60 + xvld U4, S2, 0x00 + xvld U5, S2, 0x20 + xvld U6, S2, 0x40 + xvld U7, S2, 0x60 + + xvst U0, P1, 0x00 + xvst U1, P1, 0x20 + xvst U2, P1, 0x40 + xvst U3, P1, 0x60 + xvst U4, P1, 0x80 + xvst U5, P1, 0xA0 + xvst U6, P1, 0xC0 + xvst U7, P1, 0xE0 + + xvld U0, S3, 0x00 + xvld U1, S3, 0x20 + xvld U2, S3, 0x40 + xvld U3, S3, 0x60 + xvld U4, S4, 0x00 + xvld U5, S4, 0x20 + xvld U6, S4, 0x40 + 
xvld U7, S4, 0x60 + + xvst U0, P1, 0x100 + xvst U1, P1, 0x120 + xvst U2, P1, 0x140 + xvst U3, P1, 0x160 + xvst U4, P1, 0x180 + xvst U5, P1, 0x1A0 + xvst U6, P1, 0x1C0 + xvst U7, P1, 0x1E0 + + addi.d S1, S1, 0x80 + addi.d S2, S2, 0x80 + addi.d S3, S3, 0x80 + addi.d S4, S4, 0x80 + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, .L_4I1 + +.L_4N15: + andi I, N, 0x08 + beq ZERO, I, .L_4N7 + + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + xvld U4, S3, 0x00 + xvld U5, S3, 0x20 + xvld U6, S4, 0x00 + xvld U7, S4, 0x20 + + xvst U0, P2, 0x00 + xvst U1, P2, 0x20 + xvst U2, P2, 0x40 + xvst U3, P2, 0x60 + xvst U4, P2, 0x80 + xvst U5, P2, 0xA0 + xvst U6, P2, 0xC0 + xvst U7, P2, 0xE0 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + addi.d P2, P2, 0x100 + +.L_4N7: + andi I, N, 0x04 + beq ZERO, I, .L_4N3 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + + xvst U0, P3, 0x00 + xvst U1, P3, 0x20 + xvst U2, P3, 0x40 + xvst U3, P3, 0x60 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d P3, P3, 0x80 + +.L_4N3: + andi I, N, 0x02 + beq ZERO, I, .L_4N1 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + + xvpermi.q U0, U1, 0x02 + xvpermi.q U2, U3, 0x02 + + xvst U0, P4, 0x00 + xvst U2, P4, 0x20 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d P4, P4, 0x40 + +.L_4N1: + andi I, N, 0x01 + beq ZERO, I, .L_M3 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + + fst.d F0, P5, 0x00 + fst.d F1, P5, 0x08 + fst.d F2, P5, 0x10 + fst.d F3, P5, 0x18 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d S3, S3, 0x08 + addi.d S4, S4, 0x08 + addi.d P5, P5, 0x20 + +.L_M3: + andi J, M, 0x02 + beq ZERO, J, .L_M1 + + move S1, S0 + add.d S2, S0, TL + add.d S0, S0, T0 + + move P1, P0 + addi.d P0, P0, 0x100 + + srai.d I, N, 0x04 + beq 
ZERO, I, .L_2N15 + +.L_2I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S1, 0x40 + xvld U3, S1, 0x60 + xvld U4, S2, 0x00 + xvld U5, S2, 0x20 + xvld U6, S2, 0x40 + xvld U7, S2, 0x60 + + xvst U0, P1, 0x00 + xvst U1, P1, 0x20 + xvst U2, P1, 0x40 + xvst U3, P1, 0x60 + xvst U4, P1, 0x80 + xvst U5, P1, 0xA0 + xvst U6, P1, 0xC0 + xvst U7, P1, 0xE0 + + addi.d S1, S1, 0x80 + addi.d S2, S2, 0x80 + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, .L_2I1 + +.L_2N15: + andi I, N, 0x08 + beq ZERO, I, .L_2N7 + + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + + xvst U0, P2, 0x00 + xvst U1, P2, 0x20 + xvst U2, P2, 0x40 + xvst U3, P2, 0x60 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d P2, P2, 0x80 + +.L_2N7: + andi I, N, 0x04 + beq ZERO, I, .L_2N3 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + + xvst U0, P3, 0x00 + xvst U1, P3, 0x20 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d P3, P3, 0x40 + +.L_2N3: + andi I, N, 0x02 + beq ZERO, I, .L_2N1 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + + xvpermi.q U0, U1, 0x02 + + xvst U0, P4, 0x00 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d P4, P4, 0x20 + +.L_2N1: + andi I, N, 0x01 + beq ZERO, I, .L_M1 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + + fst.d F0, P5, 0x00 + fst.d F1, P5, 0x08 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d P5, P5, 0x10 + +.L_M1: + andi J, M, 0x01 + beq ZERO, J, .L_M0 + + move S1, S0 + add.d S2, S0, TL + + move P1, P0 + addi.d P0, P0, 0x80 + + srai.d I, N, 0x04 + beq ZERO, I, .L_1N15 + +.L_1I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S1, 0x40 + xvld U3, S1, 0x60 + + xvst U0, P1, 0x00 + xvst U1, P1, 0x20 + xvst U2, P1, 0x40 + xvst U3, P1, 0x60 + + addi.d S1, S1, 0x80 + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, .L_1I1 + +.L_1N15: + andi I, N, 0x08 + beq ZERO, I, .L_1N7 + + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + + xvst U0, P2, 0x00 + xvst U1, P2, 0x20 + + addi.d S1, S1, 0x40 + addi.d P2, P2, 0x40 + +.L_1N7: + 
andi       I,     N,     0x04
+    beq        ZERO,  I,     .L_1N3
+
+    xvld       U0,    S1,    0x00
+
+    xvst       U0,    P3,    0x00
+
+    addi.d     S1,    S1,    0x20
+    addi.d     P3,    P3,    0x20
+
+.L_1N3:
+    andi       I,     N,     0x02
+    beq        ZERO,  I,     .L_1N1
+
+    fld.d      F0,    S1,    0x00
+    fld.d      F1,    S1,    0x08
+
+    fst.d      F0,    P4,    0x00
+    fst.d      F1,    P4,    0x08
+
+    addi.d     S1,    S1,    0x10
+    addi.d     P4,    P4,    0x10
+
+.L_1N1:
+    andi       I,     N,     0x01
+    beq        ZERO,  I,     .L_M0
+
+    fld.d      F0,    S1,    0x00
+
+    fst.d      F0,    P5,    0x00
+
+    addi.d     S1,    S1,    0x08
+    addi.d     P5,    P5,    0x08
+
+.L_M0:
+    LDARG      $r23,  $sp,   0
+    LDARG      $r24,  $sp,   8
+    LDARG      $r25,  $sp,   16
+    LDARG      $r26,  $sp,   24
+    LDARG      $r27,  $sp,   32
+    LDARG      $r28,  $sp,   40
+    LDARG      $r29,  $sp,   48
+    addi.d     $sp,   $sp,   56
+    jirl       $r0,   $r1,   0x00
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/cgemm_tcopy_4_lasx.S b/kernel/loongarch64/cgemm_tcopy_4_lasx.S
new file mode 100644
index 000000000..9ff8a35b8
--- /dev/null
+++ b/kernel/loongarch64/cgemm_tcopy_4_lasx.S
@@ -0,0 +1,306 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+/* Pack kernel: copies M source rows (LDA elements apart) of N 8-byte elements each into DST. */
+/* Function parameters (integer argument registers, in call order) */
+#define M      $r4   // param 1: m   (number of source rows)
+#define N      $r5   // param 2: n   (8-byte elements per row)
+#define SRC    $r6   // param 3: src
+#define LDA    $r7   // param 4: lda (row stride, in elements)
+#define DST    $r8   // param 5: dst
+
+#define I      $r9   // inner counter / scratch for bit tests
+#define J      $r10  // counter of remaining 4-row groups
+#define S1     $r12  // read pointer, row 0 of the current group
+#define S2     $r13  // read pointer, row 1
+#define S3     $r14  // read pointer, row 2
+#define S4     $r15  // read pointer, row 3
+#define TD     $r16  // start of the next row group inside a 4-element panel
+#define TS     $r17  // start of the next source row group
+#define TL     $r18  // source row stride in bytes
+#define T0     $r19  // scratch
+#define S8     $r20  // write pointer for full 4-element chunks
+#define S9     $r23  // write pointer for the (n & 2) tail; saved on entry, restored on exit
+#define S10    $r11  // write pointer for the (n & 1) tail
+#define ZERO   $r0
+
+#define F0     $f0
+#define F1     $f1
+#define F2     $f2
+#define F3     $f3
+#define F4     $f4
+#define F5     $f5
+#define F6     $f6
+#define F7     $f7
+
+/* LASX vectors */
+#define U0     $xr0
+#define U1     $xr1
+#define U2     $xr2
+#define U3     $xr3
+#define U4     $xr4
+#define U5     $xr5
+#define U6     $xr6
+#define U7     $xr7
+#define U8     $xr8
+#define U9     $xr9
+#define U10    $xr10
+#define U11    $xr11
+#define U12    $xr12
+#define U13    $xr13
+#define U14    $xr14
+#define U15    $xr15
+
+/* DST layout: M-row panels of 4 elements first, then the (N & 2) pairs, then the (N & 1) singles. */
+    PROLOGUE
+
+    addi.d     $sp,   $sp,   -8        // one slot to preserve $r23 (S9)
+    SDARG      $r23,  $sp,   0
+
+    move       TS,    SRC              //aoffset
+    move       TD,    DST              //boffset
+    slli.d     TL,    LDA,   0x02      // TL = lda * 4
+    slli.d     TL,    TL,    0x01      // TL = lda * 8 = row stride in bytes
+
+    ori        T0,    ZERO,  0x03
+    andn       T0,    N,     T0        // T0 = n & ~3
+    mul.d      T0,    M,     T0        // 64-bit product; mul.w would keep only the low 32 bits
+    slli.d     T0,    T0,    0x01
+    slli.d     T0,    T0,    0x02      // * 8 bytes per element
+    add.d      S9,    DST,   T0        //boffset2 = dst + m * (n & ~3) elements
+
+    ori        T0,    ZERO,  0x01
+    andn       T0,    N,     T0        // T0 = n & ~1
+    mul.d      T0,    M,     T0        // 64-bit product, same reason as above
+    slli.d     T0,    T0,    0x01
+    slli.d     T0,    T0,    0x02      // * 8 bytes per element
+    add.d      S10,   DST,   T0        //boffset3 = dst + m * (n & ~1) elements
+
+    srai.d     J,     M,     0x02      //j = m >> 2
+
+    beq        J,     ZERO,  .L_M1
+
+.L_J1: /* one pass per group of 4 rows: if(j>0) j-- */
+    move       S1,    TS               //aoffset1
+    add.d      S2,    S1,    TL
+    add.d      S3,    S2,    TL
+    add.d      S4,    S3,    TL
+
+    slli.d     T0,    TL,    0x02
+    add.d      TS,    TS,    T0        // aoffset += 4 rows
+
+    move       S8,    TD               //boffset1
+    addi.d     TD,    TD,    0x80      // 4 rows * 32 bytes written per panel
+
+    srai.d     I,     N,     0x02      // i = n >> 2
+
+    beq        ZERO,  I,     .L_JN1
+
+.L_JI1: /* 4 elements (32 bytes) from each of the 4 rows: if(i>0) i-- */
+    xvld       U0,    S1,    0x00
+    xvld       U1,    S2,    0x00
+    xvld       U2,    S3,    0x00
+    xvld       U3,    S4,    0x00
+
+    xvst       U0,    S8,    0x00
+    xvst       U1,    S8,    0x20
+    xvst       U2,    S8,    0x40
+    xvst       U3,    S8,    0x60
+
+    addi.d     S1,    S1,    0x20
+    addi.d     S2,    S2,    0x20
+    addi.d     S3,    S3,    0x20
+    addi.d     S4,    S4,    0x20
+    slli.d     T0,    M,     0x05      // one full panel = m * 32 bytes
+    add.d      S8,    S8,    T0        // same rows, next panel
+
+    addi.d     I,     I,     -1
+    blt        ZERO,  I,     .L_JI1
+
+.L_JN1: /* if(n&2): 2 elements (16 bytes) per row go to boffset2 */
+    andi       I,     N,     0x02
+    beq        ZERO,  I,     .L_JN2
+
+    vld        $vr0,  S1,    0x00
+    vld        $vr1,  S2,    0x00
+    vld        $vr2,  S3,    0x00
+    vld        $vr3,  S4,    0x00
+
+    vst        $vr0,  S9,    0x00
+    vst        $vr1,  S9,    0x10
+    vst        $vr2,  S9,    0x20
+    vst        $vr3,  S9,    0x30
+
+    addi.d     S1,    S1,    0x10
+    addi.d     S2,    S2,    0x10
+    addi.d     S3,    S3,    0x10
+    addi.d     S4,    S4,    0x10
+    addi.d     S9,    S9,    0x40
+
+.L_JN2: /* if(n&1): 1 element (two 4-byte floats) per row goes to boffset3 */
+    andi       I,     N,     0x01
+    beq        ZERO,  I,     .L_J0
+
+    fld.s      F0,    S1,    0x00
+    fld.s      F1,    S1,    0x04
+
+    fld.s      F2,    S2,    0x00
+    fld.s      F3,    S2,    0x04
+
+    fld.s      F4,    S3,    0x00
+    fld.s      F5,    S3,    0x04
+
+    fld.s      F6,    S4,    0x00
+    fld.s      F7,    S4,    0x04
+
+    fst.s      F0,    S10,   0x00
+    fst.s      F1,    S10,   0x04
+    fst.s      F2,    S10,   0x08
+    fst.s      F3,    S10,   0x0c
+    fst.s      F4,    S10,   0x10
+    fst.s      F5,    S10,   0x14
+    fst.s      F6,    S10,   0x18
+    fst.s      F7,    S10,   0x1c
+
+    addi.d     S10,   S10,   0x20
+
+.L_J0:
+    addi.d     J,     J,     -1
+    blt        ZERO,  J,     .L_J1
+
+.L_M1: /* if(m&2): same scheme for a group of 2 rows */
+    andi       I,     M,     0x02
+    beq        ZERO,  I,     .L_M2
+
+    move       S1,    TS               //aoffset1
+    add.d      S2,    S1,    TL
+
+    slli.d     T0,    TL,    0x01
+    add.d      TS,    TS,    T0        // aoffset += 2 rows
+
+    move       S8,    TD               //boffset1
+    addi.d     TD,    TD,    0x40      // 2 rows * 32 bytes written per panel
+
+    srai.d     I,     N,     0x02
+    beq        ZERO,  I,     .L_M1N1
+
+.L_M1I1: /* if(i>0) */
+    xvld       U0,    S1,    0x00
+    xvld       U1,    S2,    0x00
+
+    xvst       U0,    S8,    0x00
+    xvst       U1,    S8,    0x20
+
+    addi.d     S1,    S1,    0x20
+    addi.d     S2,    S2,    0x20
+    slli.d     T0,    M,     0x05
+    add.d      S8,    S8,    T0        // next panel (m * 32 bytes ahead)
+
+    addi.d     I,     I,     -1
+    blt        ZERO,  I,     .L_M1I1
+
+.L_M1N1: /* if(n&2) */
+    andi       I,     N,     0x02
+    beq        ZERO,  I,     .L_M1N2
+
+    vld        $vr0,  S1,    0x00
+    vld        $vr1,  S2,    0x00
+
+    vst        $vr0,  S9,    0x00
+    vst        $vr1,  S9,    0x10
+
+    addi.d     S1,    S1,    0x10
+    addi.d     S2,    S2,    0x10
+    addi.d     S9,    S9,    0x20
+
+.L_M1N2: /* if(n&1) */
+    andi       I,     N,     0x01
+    beq        ZERO,  I,     .L_M2
+
+    fld.s      F0,    S1,    0x00
+    fld.s      F1,    S1,    0x04
+
+    fld.s      F2,    S2,    0x00
+    fld.s      F3,    S2,    0x04
+
+    fst.s      F0,    S10,   0x00
+    fst.s      F1,    S10,   0x04
+    fst.s      F2,    S10,   0x08
+    fst.s      F3,    S10,   0x0c
+
+    addi.d     S10,   S10,   0x10
+
+.L_M2: /* if(m&1): final single row */
+    andi       I,     M,     0x01
+    beq        ZERO,  I,     .L_M0
+
+    move       S1,    TS               //aoffset1
+    move       S8,    TD               //boffset1
+
+    srai.d     I,     N,     0x02
+    beq        ZERO,  I,     .L_M2N1
+
+.L_M2I1: /* if(i>0) */
+    xvld       U0,    S1,    0x00
+
+    xvst       U0,    S8,    0x00
+
+    addi.d     S1,    S1,    0x20
+    slli.d     T0,    M,     0x05
+    add.d      S8,    S8,    T0        // next panel (m * 32 bytes ahead)
+
+    addi.d     I,     I,     -1
+    blt        ZERO,  I,     .L_M2I1
+
+.L_M2N1: /* if(n&2) */
+    andi       I,     N,     0x02
+    beq        ZERO,  I,     .L_M2N2
+
+    vld        $vr0,  S1,    0x00
+
+    vst        $vr0,  S9,    0x00
+
+    addi.d     S1,    S1,    0x10      // S9 is not advanced: nothing is written through it afterwards
+
+.L_M2N2: /* if(n&1) */
+    andi       I,     N,     0x01
+    beq        ZERO,  I,     .L_M0
+
+    fld.s      F0,    S1,    0x00
+    fld.s      F1,    S1,    0x04
+
+    fst.s      F0,    S10,   0x00
+    fst.s      F1,    S10,   0x04
+
+.L_M0:
+    LDARG      $r23,  $sp,   0         // restore the register used as S9
+    addi.d     $sp,   $sp,   8
+    jirl       $r0,   $r1,   0x00      // return to the address in $r1
+
+    EPILOGUE
\ No newline at end of file
diff --git a/param.h b/param.h
index 5d2e960a2..8bdc03380 100644
--- a/param.h
+++ b/param.h
@@ -2845,21 +2845,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define DGEMM_DEFAULT_UNROLL_M 2
 #define SGEMM_DEFAULT_UNROLL_N 8
 #define SGEMM_DEFAULT_UNROLL_M 2
+#define CGEMM_DEFAULT_UNROLL_N 4
+#define CGEMM_DEFAULT_UNROLL_M 1
+#define ZGEMM_DEFAULT_UNROLL_N 4
+#define ZGEMM_DEFAULT_UNROLL_M 1
 #else
 #define DGEMM_DEFAULT_UNROLL_N 4
 #define DGEMM_DEFAULT_UNROLL_M 16
 #define SGEMM_DEFAULT_UNROLL_N 8
 #define SGEMM_DEFAULT_UNROLL_M 16
+#define CGEMM_DEFAULT_UNROLL_N 4
+#define CGEMM_DEFAULT_UNROLL_M 16
+#define ZGEMM_DEFAULT_UNROLL_N 4
+#define ZGEMM_DEFAULT_UNROLL_M 8
 #endif
 
 #define QGEMM_DEFAULT_UNROLL_N 2
-#define CGEMM_DEFAULT_UNROLL_N 2
-#define ZGEMM_DEFAULT_UNROLL_N 4
 #define XGEMM_DEFAULT_UNROLL_N 1
 
 #define QGEMM_DEFAULT_UNROLL_M 2
-#define CGEMM_DEFAULT_UNROLL_M 2
-#define ZGEMM_DEFAULT_UNROLL_M 8
 #define XGEMM_DEFAULT_UNROLL_M 1
 
 #define SGEMM_DEFAULT_P 256