diff --git a/kernel/generic/gemm_ncopy_6.c b/kernel/generic/gemm_ncopy_6.c
new file mode 100644
index 000000000..1ecb93c65
--- /dev/null
+++ b/kernel/generic/gemm_ncopy_6.c
@@ -0,0 +1,230 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin.
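NOTE: The files added by this patch are generic packing ("copy") kernels and TRSM helpers for a GEMM configuration using an unroll factor of 6 (the GEMM_UNROLL_M_SHIFT hunks further down target GEMM_DEFAULT_UNROLL_M == 6). The bare "#include" at the top of each new file has lost its header name in formatting; the sibling generic copy kernels begin with #include <stdio.h> followed by #include "common.h". As context for the routines below, here is a minimal sketch, an assumption about how a level-3 driver consumes panels produced by such copy kernels rather than part of the patch: the micro-kernel streams one packed M-panel of A against one packed N-panel of B. Names, the enum sizes, and the exact packed layouts are illustrative (the block layouts produced by the tcopy routine below differ slightly, but the consumption pattern is the same idea).

    /* Toy scalar micro-kernel over packed panels, double precision assumed.
     * pa holds k steps of UR_M contiguous values; pb holds k steps of UR_N
     * contiguous values; c is UR_M x UR_N with leading dimension ldc.      */
    enum { UR_M = 6, UR_N = 4 };

    static void micro_kernel_ref(long k, const double *pa, const double *pb,
                                 double *c, long ldc)
    {
        for (long p = 0; p < k; p++)            /* walk both packed panels   */
            for (long j = 0; j < UR_N; j++)
                for (long i = 0; i < UR_M; i++)
                    c[i + j * ldc] += pa[p * UR_M + i] * pb[p * UR_N + j];
    }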
*/ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG i, j; + + FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4; + FLOAT *b_offset; + FLOAT ctemp1, ctemp2, ctemp3, ctemp4; + FLOAT ctemp5, ctemp6, ctemp7, ctemp8; + FLOAT ctemp9, ctemp10, ctemp11, ctemp12; + FLOAT ctemp13, ctemp14, ctemp15, ctemp16; + + a_offset = a; + b_offset = b; + + j = (n >> 2); + if (j > 0){ + do{ + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset4 = a_offset3 + lda; + a_offset += 4 * lda; + + i = (m >> 2); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + ctemp3 = *(a_offset1 + 2); + ctemp4 = *(a_offset1 + 3); + + ctemp5 = *(a_offset2 + 0); + ctemp6 = *(a_offset2 + 1); + ctemp7 = *(a_offset2 + 2); + ctemp8 = *(a_offset2 + 3); + + ctemp9 = *(a_offset3 + 0); + ctemp10 = *(a_offset3 + 1); + ctemp11 = *(a_offset3 + 2); + ctemp12 = *(a_offset3 + 3); + + ctemp13 = *(a_offset4 + 0); + ctemp14 = *(a_offset4 + 1); + ctemp15 = *(a_offset4 + 2); + ctemp16 = *(a_offset4 + 3); + + *(b_offset + 0) = ctemp1; + *(b_offset + 1) = ctemp5; + *(b_offset + 2) = ctemp9; + *(b_offset + 3) = ctemp13; + + *(b_offset + 4) = ctemp2; + *(b_offset + 5) = ctemp6; + *(b_offset + 6) = ctemp10; + *(b_offset + 7) = ctemp14; + + *(b_offset + 8) = ctemp3; + *(b_offset + 9) = ctemp7; + *(b_offset + 10) = ctemp11; + *(b_offset + 11) = ctemp15; + + *(b_offset + 12) = ctemp4; + *(b_offset + 13) = ctemp8; + *(b_offset + 14) = ctemp12; + *(b_offset + 15) = ctemp16; + + a_offset1 += 4; + a_offset2 += 4; + a_offset3 += 4; + a_offset4 += 4; + + b_offset += 16; + i --; + }while(i > 0); + } + + i = (m & 3); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + ctemp5 = *(a_offset2 + 0); + ctemp9 = *(a_offset3 + 0); + ctemp13 = *(a_offset4 + 0); + + *(b_offset + 0) = ctemp1; + *(b_offset + 1) = ctemp5; + *(b_offset + 2) = ctemp9; + *(b_offset + 3) = ctemp13; + + a_offset1 ++; + a_offset2 ++; + a_offset3 ++; + a_offset4 ++; + + b_offset += 4; + i --; + }while(i > 0); + } + j--; + }while(j > 0); + } /* end of if(j > 0) */ + + if (n & 2){ + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset += 2 * lda; + + i = (m >> 2); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + ctemp3 = *(a_offset1 + 2); + ctemp4 = *(a_offset1 + 3); + + ctemp5 = *(a_offset2 + 0); + ctemp6 = *(a_offset2 + 1); + ctemp7 = *(a_offset2 + 2); + ctemp8 = *(a_offset2 + 3); + + *(b_offset + 0) = ctemp1; + *(b_offset + 1) = ctemp5; + *(b_offset + 2) = ctemp2; + *(b_offset + 3) = ctemp6; + + *(b_offset + 4) = ctemp3; + *(b_offset + 5) = ctemp7; + *(b_offset + 6) = ctemp4; + *(b_offset + 7) = ctemp8; + + a_offset1 += 4; + a_offset2 += 4; + b_offset += 8; + i --; + }while(i > 0); + } + + i = (m & 3); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + ctemp5 = *(a_offset2 + 0); + + *(b_offset + 0) = ctemp1; + *(b_offset + 1) = ctemp5; + + a_offset1 ++; + a_offset2 ++; + b_offset += 2; + i --; + }while(i > 0); + } + } /* end of if(j > 0) */ + + if (n & 1){ + a_offset1 = a_offset; + + i = (m >> 2); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + ctemp3 = *(a_offset1 + 2); + ctemp4 = *(a_offset1 + 3); + + *(b_offset + 0) = ctemp1; + *(b_offset + 1) = ctemp2; + *(b_offset + 2) = ctemp3; + *(b_offset + 3) = ctemp4; + + a_offset1 += 4; + b_offset += 4; + i --; + }while(i > 0); + } + + i = (m & 3); + if (i > 0){ + do{ 
+ ctemp1 = *(a_offset1 + 0); + *(b_offset + 0) = ctemp1; + a_offset1 ++; + b_offset += 1; + i --; + }while(i > 0); + } + } /* end of if(j > 0) */ + + return 0; +} diff --git a/kernel/generic/gemm_tcopy_6.c b/kernel/generic/gemm_tcopy_6.c new file mode 100644 index 000000000..bd32090e7 --- /dev/null +++ b/kernel/generic/gemm_tcopy_6.c @@ -0,0 +1,281 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
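NOTE: gemm_ncopy_6.c above packs a column block of the source matrix into contiguous panels: inside each panel the row index moves fastest and the panel's columns are interleaved, so a micro-kernel can stream the panel linearly. As written the routine uses panel widths of 4, 2 and 1 (j = n >> 2, then n & 2, then n & 1) even though the file name suggests a width-6 packing; whether that is intentional depends on how the unroll factors are configured for the target. A compact reference version of the same packing order, offered as a sketch (double precision assumed, not part of the patch):

    #include <stddef.h>

    /* Packs column-major a (m x n, leading dimension lda) in the same order
     * as the unrolled routine above: full panels of 4 columns, then a
     * 2-column and a 1-column remainder, rows walking fastest per panel.   */
    static void ncopy_ref(size_t m, size_t n, const double *a,
                          size_t lda, double *b)
    {
        size_t j = 0;
        while (j < n) {
            size_t w = (n - j >= 4) ? 4 : (n - j >= 2 ? 2 : 1); /* panel width */
            for (size_t i = 0; i < m; i++)          /* row moves fastest       */
                for (size_t jj = 0; jj < w; jj++)   /* then the panel columns  */
                    *b++ = a[i + (j + jj) * lda];
            j += w;
        }
    }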
*/ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + + BLASLONG i, j; + + FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4; + FLOAT *b_offset, *b_offset1, *b_offset2, *b_offset3; + FLOAT ctemp1, ctemp2, ctemp3, ctemp4; + FLOAT ctemp5, ctemp6, ctemp7, ctemp8; + FLOAT ctemp9, ctemp10, ctemp11, ctemp12; + FLOAT ctemp13, ctemp14, ctemp15, ctemp16; + + a_offset = a; + b_offset = b; + + b_offset2 = b + m * (n & ~3); + b_offset3 = b + m * (n & ~1); + + j = (m >> 2); + if (j > 0){ + do{ + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset4 = a_offset3 + lda; + a_offset += 4 * lda; + + b_offset1 = b_offset; + b_offset += 16; + + i = (n >> 2); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + ctemp3 = *(a_offset1 + 2); + ctemp4 = *(a_offset1 + 3); + + ctemp5 = *(a_offset2 + 0); + ctemp6 = *(a_offset2 + 1); + ctemp7 = *(a_offset2 + 2); + ctemp8 = *(a_offset2 + 3); + + ctemp9 = *(a_offset3 + 0); + ctemp10 = *(a_offset3 + 1); + ctemp11 = *(a_offset3 + 2); + ctemp12 = *(a_offset3 + 3); + + ctemp13 = *(a_offset4 + 0); + ctemp14 = *(a_offset4 + 1); + ctemp15 = *(a_offset4 + 2); + ctemp16 = *(a_offset4 + 3); + + a_offset1 += 4; + a_offset2 += 4; + a_offset3 += 4; + a_offset4 += 4; + + *(b_offset1 + 0) = ctemp1; + *(b_offset1 + 1) = ctemp2; + *(b_offset1 + 2) = ctemp3; + *(b_offset1 + 3) = ctemp4; + + *(b_offset1 + 4) = ctemp5; + *(b_offset1 + 5) = ctemp6; + *(b_offset1 + 6) = ctemp7; + *(b_offset1 + 7) = ctemp8; + + *(b_offset1 + 8) = ctemp9; + *(b_offset1 + 9) = ctemp10; + *(b_offset1 + 10) = ctemp11; + *(b_offset1 + 11) = ctemp12; + + *(b_offset1 + 12) = ctemp13; + *(b_offset1 + 13) = ctemp14; + *(b_offset1 + 14) = ctemp15; + *(b_offset1 + 15) = ctemp16; + + b_offset1 += m * 4; + i --; + }while(i > 0); + } + + if (n & 2) { + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + + ctemp3 = *(a_offset2 + 0); + ctemp4 = *(a_offset2 + 1); + + ctemp5 = *(a_offset3 + 0); + ctemp6 = *(a_offset3 + 1); + + ctemp7 = *(a_offset4 + 0); + ctemp8 = *(a_offset4 + 1); + + a_offset1 += 2; + a_offset2 += 2; + a_offset3 += 2; + a_offset4 += 2; + + *(b_offset2 + 0) = ctemp1; + *(b_offset2 + 1) = ctemp2; + *(b_offset2 + 2) = ctemp3; + *(b_offset2 + 3) = ctemp4; + + *(b_offset2 + 4) = ctemp5; + *(b_offset2 + 5) = ctemp6; + *(b_offset2 + 6) = ctemp7; + *(b_offset2 + 7) = ctemp8; + + b_offset2 += 8; + } + + if (n & 1) { + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset2 + 0); + ctemp3 = *(a_offset3 + 0); + ctemp4 = *(a_offset4 + 0); + + *(b_offset3 + 0) = ctemp1; + *(b_offset3 + 1) = ctemp2; + *(b_offset3 + 2) = ctemp3; + *(b_offset3 + 3) = ctemp4; + + b_offset3 += 4; + } + + j--; + }while(j > 0); + } + + if (m & 2){ + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset += 2 * lda; + + b_offset1 = b_offset; + b_offset += 8; + + i = (n >> 2); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + ctemp3 = *(a_offset1 + 2); + ctemp4 = *(a_offset1 + 3); + + ctemp5 = *(a_offset2 + 0); + ctemp6 = *(a_offset2 + 1); + ctemp7 = *(a_offset2 + 2); + ctemp8 = *(a_offset2 + 3); + + a_offset1 += 4; + a_offset2 += 4; + + *(b_offset1 + 0) = ctemp1; + *(b_offset1 + 1) = ctemp2; + *(b_offset1 + 2) = ctemp3; + *(b_offset1 + 3) = ctemp4; + + *(b_offset1 + 4) = ctemp5; + *(b_offset1 + 5) = ctemp6; + *(b_offset1 + 6) = ctemp7; + *(b_offset1 + 7) = ctemp8; + + b_offset1 += m * 4; + i --; + 
}while(i > 0); + } + + if (n & 2) { + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + + ctemp3 = *(a_offset2 + 0); + ctemp4 = *(a_offset2 + 1); + + a_offset1 += 2; + a_offset2 += 2; + + *(b_offset2 + 0) = ctemp1; + *(b_offset2 + 1) = ctemp2; + *(b_offset2 + 2) = ctemp3; + *(b_offset2 + 3) = ctemp4; + + b_offset2 += 4; + } + + if (n & 1) { + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset2 + 0); + + *(b_offset3 + 0) = ctemp1; + *(b_offset3 + 1) = ctemp2; + b_offset3 += 2; + } + } + + if (m & 1){ + a_offset1 = a_offset; + b_offset1 = b_offset; + + i = (n >> 2); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + ctemp3 = *(a_offset1 + 2); + ctemp4 = *(a_offset1 + 3); + + a_offset1 += 4; + + *(b_offset1 + 0) = ctemp1; + *(b_offset1 + 1) = ctemp2; + *(b_offset1 + 2) = ctemp3; + *(b_offset1 + 3) = ctemp4; + + b_offset1 += 4 * m; + + i --; + }while(i > 0); + } + + if (n & 2) { + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + a_offset1 += 2; + + *(b_offset2 + 0) = ctemp1; + *(b_offset2 + 1) = ctemp2; + } + + if (n & 1) { + ctemp1 = *(a_offset1 + 0); + *(b_offset3 + 0) = ctemp1; + } + } + + return 0; +} diff --git a/kernel/generic/symm_lcopy_6.c b/kernel/generic/symm_lcopy_6.c new file mode 100644 index 000000000..ac04943e2 --- /dev/null +++ b/kernel/generic/symm_lcopy_6.c @@ -0,0 +1,138 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
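NOTE: gemm_tcopy_6.c above packs the operand in transposed order and splits the destination into three regions: full 4-column blocks go to the front of b, the 2-column remainder starts at b + m * (n & ~3), and the final odd column starts at b + m * (n & ~1); within one row panel, blocks for successive column groups are m*4 apart. Like the ncopy above, the panel sizes are 4/2/1 rather than 6. A reference sketch of the destination layout follows (an illustration assuming double precision; the source element for packed row i, column j is read from a[j + i*lda], matching the pointer arithmetic above):

    #include <stddef.h>

    static void tcopy_ref(size_t m, size_t n, const double *a,
                          size_t lda, double *b)
    {
        double *b2 = b + m * (n & ~(size_t)3);  /* 2-column remainder region */
        double *b3 = b + m * (n & ~(size_t)1);  /* final odd column region   */

        for (size_t i = 0; i < m; ) {
            size_t h = (m - i >= 4) ? 4 : (m - i >= 2 ? 2 : 1); /* panel height */
            size_t j = 0;
            for (; j + 4 <= n; j += 4)          /* full 4-column blocks        */
                for (size_t ii = 0; ii < h; ii++)
                    for (size_t jj = 0; jj < 4; jj++)
                        b[4 * i + (j / 4) * 4 * m + 4 * ii + jj] =
                            a[(j + jj) + (i + ii) * lda];
            if (n & 2) {                        /* two leftover columns        */
                for (size_t ii = 0; ii < h; ii++)
                    for (size_t jj = 0; jj < 2; jj++)
                        *b2++ = a[(j + jj) + (i + ii) * lda];
                j += 2;
            }
            if (n & 1)                          /* last single column          */
                for (size_t ii = 0; ii < h; ii++)
                    *b3++ = a[j + (i + ii) * lda];
            i += h;
        }
    }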
*/ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04; + FLOAT *ao1, *ao2, *ao3, *ao4; + + js = (n >> 2); + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; + if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda; + if (offset > -2) ao3 = a + posX + 2 + posY * lda; else ao3 = a + posY + (posX + 2) * lda; + if (offset > -3) ao4 = a + posX + 3 + posY * lda; else ao4 = a + posY + (posX + 3) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + data03 = *(ao3 + 0); + data04 = *(ao4 + 0); + + if (offset > 0) ao1 += lda; else ao1 ++; + if (offset > -1) ao2 += lda; else ao2 ++; + if (offset > -2) ao3 += lda; else ao3 ++; + if (offset > -3) ao4 += lda; else ao4 ++; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b += 4; + + offset --; + i --; + } + + posX += 4; + js --; + } + + if (n & 2) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; + if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + + if (offset > 0) ao1 += lda; else ao1 ++; + if (offset > -1) ao2 += lda; else ao2 ++; + + b[ 0] = data01; + b[ 1] = data02; + + b += 2; + + offset --; + i --; + } + + posX += 2; + } + + if (n & 1) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + + if (offset > 0) ao1 += lda; else ao1 ++; + + b[ 0] = data01; + + b ++; + + offset --; + i --; + } + } + + return 0; +} diff --git a/kernel/generic/symm_ucopy_6.c b/kernel/generic/symm_ucopy_6.c new file mode 100644 index 000000000..9b9cff820 --- /dev/null +++ b/kernel/generic/symm_ucopy_6.c @@ -0,0 +1,136 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
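NOTE: symm_lcopy_6.c above expands a symmetric matrix stored in its lower triangle into a full dense block. The running offset = posX - posY tells, for the current element, whether it lies above the diagonal (fetch the mirrored entry and step by lda) or on/below it (fetch directly and step by 1). The symm_ucopy_6.c file that follows is the mirror image for upper storage, with the two branches swapped. The per-element rule, as a sketch (hypothetical helper, double precision assumed):

    #include <stddef.h>

    /* Element (i, j) of a symmetric matrix stored column-major in its lower
     * triangle: entries above the diagonal are read from the mirrored
     * position (j, i).  For upper storage the test flips to (i <= j).       */
    static double symm_lower_get(const double *a, size_t lda,
                                 size_t i, size_t j)
    {
        return (i >= j) ? a[i + j * lda] : a[j + i * lda];
    }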
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04; + FLOAT *ao1, *ao2, *ao3, *ao4; + + js = (n >> 2); + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; + if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda; + if (offset > -2) ao3 = a + posY + (posX + 2) * lda; else ao3 = a + posX + 2 + posY * lda; + if (offset > -3) ao4 = a + posY + (posX + 3) * lda; else ao4 = a + posX + 3 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + data03 = *(ao3 + 0); + data04 = *(ao4 + 0); + + if (offset > 0) ao1 ++; else ao1 += lda; + if (offset > -1) ao2 ++; else ao2 += lda; + if (offset > -2) ao3 ++; else ao3 += lda; + if (offset > -3) ao4 ++; else ao4 += lda; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b += 4; + + offset --; + i --; + } + + posX += 4; + js --; + } + + if (n & 2) { + offset = posX - posY; + + if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; + if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + + if (offset > 0) ao1 ++; else ao1 += lda; + if (offset > -1) ao2 ++; else ao2 += lda; + + b[ 0] = data01; + b[ 1] = data02; + + b += 2; + + offset --; + i --; + } + + posX += 2; + } + + if (n & 1) { + offset = posX - posY; + + if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + + if (offset > 0) ao1 ++; else ao1 += lda; + + b[ 0] = data01; + + b ++; + + offset --; + i --; + } + } + + return 0; +} diff --git a/kernel/generic/trmm_lncopy_6.c b/kernel/generic/trmm_lncopy_6.c new file mode 100644 index 000000000..6cd16673a --- /dev/null +++ b/kernel/generic/trmm_lncopy_6.c @@ -0,0 +1,484 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT *ao1, *ao2, *ao3, *ao4; + + js = (n >> 2); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + ao3 = a + posY + (posX + 2) * lda; + ao4 = a + posY + (posX + 3) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + ao3 = a + posX + (posY + 2) * lda; + ao4 = a + posX + (posY + 3) * lda; + } + + i = (m >> 2); + if (i > 0) { + do { + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + + data09 = *(ao3 + 0); + data10 = *(ao3 + 1); + data11 = *(ao3 + 2); + data12 = *(ao3 + 3); + + data13 = *(ao4 + 0); + data14 = *(ao4 + 1); + data15 = *(ao4 + 2); + data16 = *(ao4 + 3); + + b[ 0] = data01; + b[ 1] = data05; + b[ 2] = data09; + b[ 3] = data13; + b[ 4] = data02; + b[ 5] = data06; + b[ 6] = data10; + b[ 7] = data14; + + b[ 8] = data03; + b[ 9] = data07; + b[10] = data11; + b[11] = data15; + b[12] = data04; + b[13] = data08; + b[14] = data12; + b[15] = data16; + + ao1 += 4; + ao2 += 4; + ao3 += 4; + ao4 += 4; + b += 16; + + } else + if (X < posY) { + ao1 += 4 * lda; + ao2 += 4 * lda; + ao3 += 4 * lda; + ao4 += 4 * lda; + b += 16; + + } else { +#ifdef UNIT + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + + data12 = *(ao3 + 3); + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = data02; + b[ 5] = ONE; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = data03; + b[ 9] = data07; + b[10] = ONE; + b[11] = ZERO; + b[12] = data04; + b[13] = data08; + b[14] = data12; + b[15] = ONE; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 
1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data06 = *(ao2 + 1); + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + + data11 = *(ao3 + 2); + data12 = *(ao3 + 3); + + data16 = *(ao4 + 3); + + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = data02; + b[ 5] = data06; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = data03; + b[ 9] = data07; + b[10] = data11; + b[11] = ZERO; + b[12] = data04; + b[13] = data08; + b[14] = data12; + b[15] = data16; +#endif + ao1 += 4; + ao2 += 4; + ao3 += 4; + ao4 += 4; + b += 16; + } + + X += 4; + i --; + } while (i > 0); + } + + i = (m & 3); + if (i) { + + if (X > posY) { + + if (m & 2) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + + b[ 0] = data01; + b[ 1] = data03; + b[ 2] = data05; + b[ 3] = data07; + b[ 4] = data02; + b[ 5] = data04; + b[ 6] = data06; + b[ 7] = data08; + + ao1 += 2; + ao2 += 2; + ao3 += 2; + ao4 += 2; + b += 8; + } + + if (m & 1) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + data03 = *(ao3 + 0); + data04 = *(ao4 + 0); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + ao1 += 1; + ao2 += 1; + ao3 += 1; + ao4 += 1; + b += 4; + } + + } else + if (X < posY) { + if (m & 2) { + ao1 += 2 * lda; + ao2 += 2 * lda; + + b += 8; + } + + if (m & 1) { + ao1 += lda; + b += 4; + } + + } else { +#ifdef UNIT + data05 = *(ao2 + 0); + data09 = *(ao3 + 0); + data13 = *(ao4 + 0); + + if (i >= 2) { + data10 = *(ao3 + 1); + data14 = *(ao4 + 1); + } + + if (i >= 3) { + data15 = *(ao4 + 2); + } + + b[ 0] = ONE; + b[ 1] = data05; + b[ 2] = data09; + b[ 3] = data13; + b += 4; + + if(i >= 2) { + b[ 0] = ZERO; + b[ 1] = ONE; + b[ 2] = data10; + b[ 3] = data14; + b += 4; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ONE; + b[ 3] = data15; + b += 4; + } +#else + data01 = *(ao1 + 0); + data05 = *(ao2 + 0); + data09 = *(ao3 + 0); + data13 = *(ao4 + 0); + + if (i >= 2) { + data06 = *(ao2 + 1); + data10 = *(ao3 + 1); + data14 = *(ao4 + 1); + } + + if (i >= 3) { + data11 = *(ao3 + 2); + data15 = *(ao4 + 2); + } + + b[ 0] = data01; + b[ 1] = data05; + b[ 2] = data09; + b[ 3] = data13; + b += 4; + + if(i >= 2) { + b[ 0] = ZERO; + b[ 1] = data06; + b[ 2] = data10; + b[ 3] = data14; + b += 4; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = data11; + b[ 3] = data15; + b += 4; + } +#endif + } + } + + posY += 4; + js --; + } while (js > 0); + } /* End of main loop */ + + + if (n & 2){ + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data05; + b[ 2] = data02; + b[ 3] = data06; + + ao1 += 2; + ao2 += 2; + b += 4; + + } else + if (X < posY) { + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 4; + } else { +#ifdef UNIT + data02 = *(ao1 + 1); + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = data02; + b[ 3] = ONE; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data06 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data02; + b[ 3] = data06; +#endif + ao1 += 2; + ao2 += 2; + + b += 4; + } + + X += 2; + i --; + } while (i > 0); + } + + i = (m & 1); + if (i) { + + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + b[ 0] 
= data01; + b[ 1] = data02; + + ao1 += 1; + ao2 += 1; + b += 2; + } else + if (X < posY) { + ao1 += lda; + b += 2; + } else { +#ifdef UNIT + data05 = *(ao2 + 0); + + b[ 0] = ONE; + b[ 1] = data05; +#else + data01 = *(ao1 + 0); + data05 = *(ao2 + 0); + + b[ 0] = data01; + b[ 1] = data05; +#endif + b += 2; + } + } + posY += 2; + } + + if (n & 1){ + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + } + + i = m; + if (i > 0) { + do { + if (X > posY) { + data01 = *(ao1 + 0); + b[ 0] = data01; + b += 1; + ao1 += 1; + } else + if (X < posY) { + b += 1; + ao1 += lda; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + data01 = *(ao1 + 0); + b[ 0] = data01; +#endif + b += 1; + ao1 += 1; + } + + X ++; + i --; + } while (i > 0); + } + + posY += 1; + } + + return 0; +} diff --git a/kernel/generic/trmm_ltcopy_6.c b/kernel/generic/trmm_ltcopy_6.c new file mode 100644 index 000000000..69a233be6 --- /dev/null +++ b/kernel/generic/trmm_ltcopy_6.c @@ -0,0 +1,488 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
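NOTE: trmm_lncopy_6.c above packs a lower-triangular operand for TRMM. The block index X is compared against posY to decide, per unroll-sized block, which of the three branches runs: a plain copy when the block lies entirely inside the stored triangle, a skip (only the destination pointer advances) when it lies entirely in the zero part, and the ONE/ZERO diagonal fill when the block sits on the diagonal. A minimal sketch of that classification (lower, non-transposed case; the upper variants further down swap which side counts as stored):

    enum trmm_block { BLOCK_STORED, BLOCK_ZERO, BLOCK_DIAGONAL };

    /* Classify one unroll-sized block of a lower-triangular matrix. */
    static enum trmm_block classify_block(long X, long posY)
    {
        if (X > posY) return BLOCK_STORED;   /* whole block below the diagonal */
        if (X < posY) return BLOCK_ZERO;     /* whole block above the diagonal */
        return BLOCK_DIAGONAL;               /* block straddles the diagonal   */
    }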
*/ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT *ao1, *ao2, *ao3, *ao4; + + js = (n >> 2); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + ao3 = a + posY + (posX + 2) * lda; + ao4 = a + posY + (posX + 3) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + ao3 = a + posX + (posY + 2) * lda; + ao4 = a + posX + (posY + 3) * lda; + } + + i = (m >> 2); + if (i > 0) { + do { + if (X > posY) { + ao1 += 4; + ao2 += 4; + ao3 += 4; + ao4 += 4; + b += 16; + + } else + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + + data09 = *(ao3 + 0); + data10 = *(ao3 + 1); + data11 = *(ao3 + 2); + data12 = *(ao3 + 3); + + data13 = *(ao4 + 0); + data14 = *(ao4 + 1); + data15 = *(ao4 + 2); + data16 = *(ao4 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + + ao1 += 4 * lda; + ao2 += 4 * lda; + ao3 += 4 * lda; + ao4 += 4 * lda; + b += 16; + + } else { + +#ifdef UNIT + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + + data12 = *(ao3 + 3); + + b[ 0] = ONE; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b[ 4] = ZERO; + b[ 5] = ONE; + b[ 6] = data07; + b[ 7] = data08; + + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ONE; + b[11] = data12; + + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ONE; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data06 = *(ao2 + 1); + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + + data11 = *(ao3 + 2); + data12 = *(ao3 + 3); + + data16 = *(ao4 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = ZERO; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = data11; + b[11] = data12; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = data16; +#endif + ao1 += 4; + ao2 += 4; + ao3 += 4; + ao4 += 4; + b += 16; + } + + X += 4; + i --; + } while (i > 0); + } + + i = (m & 3); + if (i) { + + if (X > posY) { + + if (m & 2) { + ao1 += 2; + ao2 += 2; + ao3 += 2; + ao4 += 2; + b += 8; + } + + if (m & 1) { + ao1 += 1; + ao2 += 1; + ao3 += 1; + ao4 += 1; + b += 4; + } + + } else + if (X < posY) { + if (m & 2) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + ao1 += 2 * lda; + ao2 += 2 * lda; + + b += 8; + } + + if (m & 1) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + b[ 0] = data01; + 
b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + ao1 += lda; + b += 4; + } + + } else { + +#ifdef UNIT + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + if (i >= 2) { + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + } + + if (i >= 3) { + data12 = *(ao3 + 3); + } + + b[ 0] = ONE; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b += 4; + + if(i >= 2) { + b[ 0] = ZERO; + b[ 1] = ONE; + b[ 2] = data07; + b[ 3] = data08; + b += 4; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ONE; + b[ 3] = data12; + b += 4; + } +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + if (i >= 2) { + data06 = *(ao2 + 1); + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + } + + if (i >= 3) { + data11 = *(ao3 + 2); + data12 = *(ao3 + 3); + } + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b += 4; + + if(i >= 2) { + b[ 0] = ZERO; + b[ 1] = data06; + b[ 2] = data07; + b[ 3] = data08; + b += 4; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = data11; + b[ 3] = data12; + b += 4; + } +#endif + } + } + + posY += 4; + js --; + } while (js > 0); + } /* End of main loop */ + + + if (n & 2){ + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X > posY) { + ao1 += 2; + ao2 += 2; + b += 4; + + } else + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data05; + b[ 3] = data06; + + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 4; + } else { +#ifdef UNIT + data02 = *(ao1 + 1); + + b[ 0] = ONE; + b[ 1] = data02; + b[ 2] = ZERO; + b[ 3] = ONE; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data06 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = ZERO; + b[ 3] = data06; +#endif + ao1 += 2; + ao2 += 2; + b += 4; + } + + X += 2; + i --; + } while (i > 0); + } + + i = (m & 1); + if (i) { + + if (X > posY) { + ao1 += 1; + ao2 += 1; + + b += 2; + } else + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + b[ 0] = data01; + b[ 1] = data02; + ao1 += lda; + b += 2; + } else { +#ifdef UNIT + data02 = *(ao1 + 1); + + b[ 0] = ONE; + b[ 1] = data02; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + b[ 0] = data01; + b[ 1] = data02; +#endif + b += 2; + } + } + posY += 2; + } + + if (n & 1){ + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + } + + i = m; + if (i > 0) { + do { + if (X > posY) { + b += 1; + ao1 += 1; + } else + if (X < posY) { + data01 = *(ao1 + 0); + b[ 0] = data01; + ao1 += lda; + b += 1; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + data01 = *(ao1 + 0); + b[ 0] = data01; +#endif + ao1 += 1; + b += 1; + } + + X ++; + i --; + } while (i > 0); + } + + posY += 1; + } + + return 0; +} diff --git a/kernel/generic/trmm_uncopy_6.c b/kernel/generic/trmm_uncopy_6.c new file mode 100644 index 000000000..70945a246 --- /dev/null +++ b/kernel/generic/trmm_uncopy_6.c @@ -0,0 +1,785 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
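NOTE: trmm_ltcopy_6.c above packs the same lower triangle in transposed order: each group of four destination entries is one stored column written out as a row, so, relative to the top-left corner of a diagonal block, the explicit zeros end up below the block's diagonal rather than above it. Element-level sketch of the value stored (illustrative helper, not part of the patch; i and j are offsets within the diagonal block and `unit` mirrors the #ifdef UNIT behaviour):

    #include <stddef.h>

    /* Value packed at position (i, j) of a diagonal block when the source is
     * lower triangular and traversed in transposed order. */
    static double trmm_lower_trans_get(const double *a, size_t lda,
                                       size_t i, size_t j, int unit)
    {
        if (j < i)  return 0.0;                          /* padded with zero     */
        if (j == i) return unit ? 1.0 : a[i + i * lda];  /* (optionally unit) diag */
        return a[j + i * lda];                           /* stored entry a(j, i)  */
    }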
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X, mm; + + FLOAT data01, data02, data03, data04, data05, data06; + FLOAT data07, data08, data09, data10, data11, data12; + FLOAT data13, data14, data15, data16, data17, data18; + FLOAT data19, data20, data21, data22, data23, data24; + FLOAT data25, data26, data27, data28, data29, data30; + FLOAT data31, data32, data33, data34, data35, data36; + + FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6; + + //js = (n >> 2); + js = n/6; + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + ao3 = a + posX + (posY + 2) * lda; + ao4 = a + posX + (posY + 3) * lda; + ao5 = a + posX + (posY + 4) * lda; + ao6 = a + posX + (posY + 5) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + ao3 = a + posY + (posX + 2) * lda; + ao4 = a + posY + (posX + 3) * lda; + ao5 = a + posY + (posX + 4) * lda; + ao6 = a + posY + (posX + 5) * lda; + } + + i = m/6; + if (i > 0) { + do { + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao1 + 4); + data06 = *(ao1 + 5); + + data07 = *(ao2 + 0); + data08 = *(ao2 + 1); + data09 = *(ao2 + 2); + data10 = *(ao2 + 3); + data11 = *(ao2 + 4); + data12 = *(ao2 + 5); + + data13 = *(ao3 + 0); + data14 = *(ao3 + 1); + data15 = *(ao3 + 2); + data16 = *(ao3 + 3); + data17 = *(ao3 + 4); + data18 = *(ao3 + 5); + + data19 = *(ao4 + 0); + data20 = *(ao4 + 1); + data21 = *(ao4 + 2); + data22 = *(ao4 + 3); + data23 = *(ao4 + 4); + data24 = *(ao4 + 5); + + data25 = *(ao5 + 0); + data26 = *(ao5 + 1); + data27 = 
*(ao5 + 2); + data28 = *(ao5 + 3); + data29 = *(ao5 + 4); + data30 = *(ao5 + 5); + + data31 = *(ao6 + 0); + data32 = *(ao6 + 1); + data33 = *(ao6 + 2); + data34 = *(ao6 + 3); + data35 = *(ao6 + 4); + data36 = *(ao6 + 5); + + b[ 0] = data01; + b[ 1] = data07; + b[ 2] = data13; + b[ 3] = data19; + b[ 4] = data25; + b[ 5] = data31; + + b[ 6] = data02; + b[ 7] = data08; + b[ 8] = data14; + b[ 9] = data20; + b[10] = data26; + b[11] = data32; + + b[12] = data03; + b[13] = data09; + b[14] = data15; + b[15] = data21; + b[16] = data27; + b[17] = data33; + + b[18] = data04; + b[19] = data10; + b[20] = data16; + b[21] = data22; + b[22] = data28; + b[23] = data34; + + b[24] = data05; + b[25] = data11; + b[26] = data17; + b[27] = data23; + b[28] = data29; + b[29] = data35; + + b[30] = data06; + b[31] = data12; + b[32] = data18; + b[33] = data24; + b[34] = data30; + b[35] = data36; + + ao1 += 6; + ao2 += 6; + ao3 += 6; + ao4 += 6; + ao5 += 6; + ao6 += 6; + b += 36; + } else + if (X > posY) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b[16] = ZERO; + b[17] = ZERO; + b[18] = ZERO; + b[19] = ZERO; + b[20] = ZERO; + b[21] = ZERO; + b[22] = ZERO; + b[23] = ZERO; + b[24] = ZERO; + b[25] = ZERO; + b[26] = ZERO; + b[27] = ZERO; + b[28] = ZERO; + b[29] = ZERO; + b[30] = ZERO; + b[31] = ZERO; + b[32] = ZERO; + b[33] = ZERO; + b[34] = ZERO; + b[35] = ZERO; + + ao1 += 6 * lda; + ao2 += 6 * lda; + ao3 += 6 * lda; + ao4 += 6 * lda; + ao5 += 6 * lda; + ao6 += 6 * lda; + + b += 36; + } else { + data01 = *(ao1 + 0); + data07 = *(ao2 + 0); + data13 = *(ao3 + 0); + data19 = *(ao4 + 0); + data25 = *(ao5 + 0); + data31 = *(ao6 + 0); + + data08 = *(ao2 + 1); + data14 = *(ao3 + 1); + data20 = *(ao4 + 1); + data26 = *(ao5 + 1); + data32 = *(ao6 + 1); + + data15 = *(ao3 + 2); + data21 = *(ao4 + 2); + data27 = *(ao5 + 2); + data33 = *(ao6 + 2); + + data22 = *(ao4 + 3); + data28 = *(ao5 + 3); + data34 = *(ao6 + 3); + + data29 = *(ao5 + 4); + data35 = *(ao6 + 4); + + data36 = *(ao6 + 5); + +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = data07; + b[ 2] = data13; + b[ 3] = data19; + b[ 4] = data25; + b[ 5] = data31; + + b[ 6] = ZERO; + b[ 7] = ONE; + b[ 8] = data14; + b[ 9] = data20; + b[10] = data26; + b[11] = data32; + + b[12] = ZERO; + b[13] = ZERO; + b[14] = ONE; + b[15] = data21; + b[16] = data27; + b[17] = data33; + + b[18] = ZERO; + b[19] = ZERO; + b[20] = ZERO; + b[21] = ONE; + b[22] = data28; + b[23] = data34; + + b[24] = ZERO; + b[25] = ZERO; + b[26] = ZERO; + b[27] = ZERO; + b[28] = ONE; + b[29] = data35; + + b[30] = ZERO; + b[31] = ZERO; + b[32] = ZERO; + b[33] = ZERO; + b[34] = ZERO; + b[35] = ONE; +#else + b[ 0] = data01; + b[ 1] = data07; + b[ 2] = data13; + b[ 3] = data19; + b[ 4] = data25; + b[ 5] = data31; + + b[ 6] = ZERO; + b[ 7] = data08; + b[ 8] = data14; + b[ 9] = data20; + b[10] = data26; + b[11] = data32; + + b[12] = ZERO; + b[13] = ZERO; + b[14] = data15; + b[15] = data21; + b[16] = data27; + b[17] = data33; + + b[18] = ZERO; + b[19] = ZERO; + b[20] = ZERO; + b[21] = data22; + b[22] = data28; + b[23] = data34; + + b[24] = ZERO; + b[25] = ZERO; + b[26] = ZERO; + b[27] = ZERO; + b[28] = data29; + b[29] = data35; + + b[30] = ZERO; + b[31] = ZERO; + b[32] = ZERO; + b[33] = ZERO; + b[34] = ZERO; + b[35] = data36; +#endif + + ao1 += 6; + ao2 += 6; + ao3 += 6; + ao4 += 6; + ao5 += 6; + ao6 += 7; + + b += 36; + } 
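NOTE: trmm_uncopy_6.c is the first of these files that actually iterates in 6-wide panels (js = n/6, i = m/6, 36-element blocks). Three spots look like slips rather than intent and are worth a second look: the `ao6 += 7;` in the diagonal branch just above (the other five pointers advance by 6), the remainder count computed a few lines below as `mm = m - m/6` (the way mm is then tested with `mm & 4` and `mm & 3` suggests the leftover row count was meant), and the `posY += 4; js --;` closing the 6-column loop (a 6-wide panel step would advance posY by 6). A one-line sketch of the presumably intended remainder count (name illustrative):

    /* Rows not covered by the full 6-row panels, i.e. m % 6, in 0..5. */
    static long rows_left_after_6_panels(long m) { return m - (m / 6) * 6; }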
+ X += 6; + i --; + } while (i > 0); + } + mm = m - m/6; + if (mm & 4) { + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + + data09 = *(ao3 + 0); + data10 = *(ao3 + 1); + data11 = *(ao3 + 2); + data12 = *(ao3 + 3); + + data13 = *(ao4 + 0); + data14 = *(ao4 + 1); + data15 = *(ao4 + 2); + data16 = *(ao4 + 3); + + b[ 0] = data01; + b[ 1] = data05; + b[ 2] = data09; + b[ 3] = data13; + b[ 4] = data02; + b[ 5] = data06; + b[ 6] = data10; + b[ 7] = data14; + + b[ 8] = data03; + b[ 9] = data07; + b[10] = data11; + b[11] = data15; + b[12] = data04; + b[13] = data08; + b[14] = data12; + b[15] = data16; + + ao1 += 4; + ao2 += 4; + ao3 += 4; + ao4 += 4; + b += 16; + } else + if (X > posY) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b[16] = ZERO; + b[17] = ZERO; + b[18] = ZERO; + b[19] = ZERO; + b[20] = ZERO; + b[21] = ZERO; + b[22] = ZERO; + b[23] = ZERO; + + ao1 += 4 * lda; + ao2 += 4 * lda; + ao3 += 4 * lda; + ao4 += 4 * lda; + + b += 16; + } else { +#ifdef UNIT + data05 = *(ao2 + 0); + + data09 = *(ao3 + 0); + data10 = *(ao3 + 1); + + data13 = *(ao4 + 0); + data14 = *(ao4 + 1); + data15 = *(ao4 + 2); + + b[ 0] = ONE; + b[ 1] = data05; + b[ 2] = data09; + b[ 3] = data13; + + b[ 4] = ZERO; + b[ 5] = ONE; + b[ 6] = data10; + b[ 7] = data14; + + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ONE; + b[11] = data15; + + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ONE; +#else + data01 = *(ao1 + 0); + + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + + data09 = *(ao3 + 0); + data10 = *(ao3 + 1); + data11 = *(ao3 + 2); + + data13 = *(ao4 + 0); + data14 = *(ao4 + 1); + data15 = *(ao4 + 2); + data16 = *(ao4 + 3); + + b[ 0] = data01; + b[ 1] = data05; + b[ 2] = data09; + b[ 3] = data13; + + b[ 4] = ZERO; + b[ 5] = data06; + b[ 6] = data10; + b[ 7] = data14; + + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = data11; + b[11] = data15; + + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = data16; +#endif + ao1 += 4; + ao2 += 4; + ao3 += 4; + ao4 += 4; + + b += 16; + } + X += 4; + } + + if (mm & 3) { + if (X < posY) { + if (mm & 2) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + + b[ 0] = data01; + b[ 1] = data03; + b[ 2] = data05; + b[ 3] = data07; + b[ 4] = data02; + b[ 5] = data04; + b[ 6] = data06; + b[ 7] = data08; + + ao1 += 2; + ao2 += 2; + ao3 += 2; + ao4 += 2; + b += 8; + } + + if (mm & 1) { + data01 = *(ao1 + 0); + data03 = *(ao2 + 0); + data05 = *(ao3 + 0); + data07 = *(ao4 + 0); + + b[ 0] = data01; + b[ 1] = data03; + b[ 2] = data05; + b[ 3] = data07; + + ao1 += 1; + ao2 += 1; + ao3 += 1; + ao4 += 1; + b += 4; + } + + } else + if (X > posY) { + if (m & 2) { + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 8; + } + + if (m & 1) { + ao1 += lda; + b += 4; + } + + } else { +#ifdef UNIT + data05 = *(ao2 + 0); + data09 = *(ao3 + 0); + data13 = *(ao4 + 0); + + if (i >= 2) { + data10 = *(ao3 + 1); + data14 = *(ao4 + 1); + } + + if (i >= 3) { + data15 = *(ao4 + 2); + } + + b[ 0] = ONE; + b[ 1] = data05; + b[ 2] = data09; + b[ 3] = data13; + b += 4; + + if(i >= 2) { + b[ 0] = ZERO; + b[ 1] = ONE; + b[ 2] = 
data10; + b[ 3] = data14; + b += 4; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ONE; + b[ 3] = data15; + b += 4; + } +#else + data01 = *(ao1 + 0); + data05 = *(ao2 + 0); + data09 = *(ao3 + 0); + data13 = *(ao4 + 0); + + if (i >= 2) { + data06 = *(ao2 + 1); + data10 = *(ao3 + 1); + data14 = *(ao4 + 1); + } + + if (i >= 3) { + data11 = *(ao3 + 2); + data15 = *(ao4 + 2); + } + + b[ 0] = data01; + b[ 1] = data05; + b[ 2] = data09; + b[ 3] = data13; + b += 4; + + if(i >= 2) { + b[ 0] = ZERO; + b[ 1] = data06; + b[ 2] = data10; + b[ 3] = data14; + b += 4; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = data11; + b[ 3] = data15; + b += 4; + } +#endif + } + } + + posY += 4; + js --; + } while (js > 0); + } /* End of main loop */ + + if (n & 2){ + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data05; + b[ 2] = data02; + b[ 3] = data06; + + ao1 += 2; + ao2 += 2; + b += 4; + + } else + if (X > posY) { + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 4; + + } else { +#ifdef UNIT + data05 = *(ao2 + 0); + + b[ 0] = ONE; + b[ 1] = data05; + b[ 2] = ZERO; + b[ 3] = ONE; +#else + data01 = *(ao1 + 0); + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data05; + b[ 2] = ZERO; + b[ 3] = data06; +#endif + + ao1 += 2 * lda; + ao2 += 2 * lda; + + b += 4; + } + + X += 2; + i --; + } while (i > 0); + } + + i = (m & 1); + if (i) { + + if (X < posY) { + data01 = *(ao1 + 0); + data05 = *(ao2 + 0); + + b[ 0] = data01; + b[ 1] = data05; + ao1 += 1; + ao2 += 1; + b += 2; + } else + if (X > posY) { + ao1 += lda; + ao2 += lda; + b += 2; + } else { +#ifdef UNIT + data05 = *(ao2 + 0); + b[ 0] = ONE; + b[ 1] = data05; +#else + data01 = *(ao1 + 0); + data05 = *(ao2 + 0); + + b[ 0] = data01; + b[ 1] = data05; +#endif + ao1 += lda; + ao2 += lda; + b += 2; + } + } + + posY += 2; + } + + if (n & 1){ + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + } + + i = m; + if (m > 0) { + do { + if (X < posY) { + data01 = *(ao1 + 0); + b[ 0] = data01; + ao1 += 1; + b += 1; + } else + if (X > posY) { + ao1 += lda; + b += 1; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + data01 = *(ao1 + 0); + b[ 0] = data01; +#endif + ao1 += lda; + b += 1; + } + + X += 1; + i --; + } while (i > 0); + } + } + + return 0; +} diff --git a/kernel/generic/trmm_utcopy_6.c b/kernel/generic/trmm_utcopy_6.c new file mode 100644 index 000000000..7d4dba34b --- /dev/null +++ b/kernel/generic/trmm_utcopy_6.c @@ -0,0 +1,472 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT *ao1, *ao2, *ao3, *ao4; + + js = (n >> 2); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + ao3 = a + posX + (posY + 2) * lda; + ao4 = a + posX + (posY + 3) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + ao3 = a + posY + (posX + 2) * lda; + ao4 = a + posY + (posX + 3) * lda; + } + + i = (m >> 2); + if (i > 0) { + do { + if (X < posY) { + ao1 += 4; + ao2 += 4; + ao3 += 4; + ao4 += 4; + b += 16; + } else + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + + data09 = *(ao3 + 0); + data10 = *(ao3 + 1); + data11 = *(ao3 + 2); + data12 = *(ao3 + 3); + + data13 = *(ao4 + 0); + data14 = *(ao4 + 1); + data15 = *(ao4 + 2); + data16 = *(ao4 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + + ao1 += 4 * lda; + ao2 += 4 * lda; + ao3 += 4 * lda; + ao4 += 4 * lda; + b += 16; + + } else { +#ifdef UNIT + data05 = *(ao2 + 0); + data09 = *(ao3 + 0); + data10 = *(ao3 + 1); + data13 = *(ao4 + 0); + data14 = *(ao4 + 1); + data15 = *(ao4 + 2); + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + + b[ 4] = data05; + b[ 5] = ONE; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = data09; + b[ 9] = data10; + b[10] = ONE; + b[11] = ZERO; + + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = ONE; +#else + data01 = *(ao1 + 0); + data05 = *(ao2 + 0); 
+ data06 = *(ao2 + 1); + data09 = *(ao3 + 0); + data10 = *(ao3 + 1); + data11 = *(ao3 + 2); + data13 = *(ao4 + 0); + data14 = *(ao4 + 1); + data15 = *(ao4 + 2); + data16 = *(ao4 + 3); + + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = ZERO; + + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; +#endif + + ao1 += 4 * lda; + ao2 += 4 * lda; + ao3 += 4 * lda; + ao4 += 4 * lda; + + b += 16; + } + + X += 4; + i --; + } while (i > 0); + } + + i = (m & 3); + if (i) { + + if (X < posY) { + + if (m & 2) { + ao1 += 2; + ao2 += 2; + ao3 += 2; + ao4 += 2; + b += 8; + } + + if (m & 1) { + ao1 += 1; + ao2 += 1; + ao3 += 1; + ao4 += 1; + b += 4; + } + + } else + if (X > posY) { + if (m & 2) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 8; + } + + if (m & 1) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + ao1 += lda; + b += 4; + } + + } else { + +#ifdef UNIT + if (i >= 2) { + data05 = *(ao2 + 0); + } + + if (i >= 3) { + data09 = *(ao3 + 0); + data10 = *(ao3 + 1); + } + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b += 4; + + if(i >= 2) { + b[ 0] = data05; + b[ 1] = ONE; + b[ 2] = ZERO; + b[ 3] = ZERO; + b += 4; + } + + if (i >= 3) { + b[ 0] = data09; + b[ 1] = data10; + b[ 2] = ONE; + b[ 3] = ZERO; + b += 4; + } +#else + data01 = *(ao1 + 0); + + if (i >= 2) { + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + } + + if (i >= 3) { + data09 = *(ao3 + 0); + data10 = *(ao3 + 1); + data11 = *(ao3 + 2); + } + + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b += 4; + + if(i >= 2) { + b[ 0] = data05; + b[ 1] = data06; + b[ 2] = ZERO; + b[ 3] = ZERO; + b += 4; + } + + if (i >= 3) { + b[ 0] = data09; + b[ 1] = data10; + b[ 2] = data11; + b[ 3] = ZERO; + b += 4; + } +#endif + } + } + + posY += 4; + js --; + } while (js > 0); + } /* End of main loop */ + + if (n & 2){ + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X < posY) { + ao1 += 2; + ao2 += 2; + b += 4; + + } else + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data05; + b[ 3] = data06; + + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 4; + } else { +#ifdef UNIT + data05 = *(ao2 + 0); + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = data05; + b[ 3] = ONE; +#else + data01 = *(ao1 + 0); + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data05; + b[ 3] = data06; + +#endif + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 4; + } + + X += 2; + i --; + } while (i > 0); + } + + i = (m & 1); + if (i) { + + if (X < posY) { + ao1 += 2; + b += 2; + } else + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + b[ 0] = data01; + b[ 1] = data02; + + ao1 += lda; + b += 2; + } else { 
+#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + data01 = *(ao1 + 0); + + b[ 0] = data01; + b[ 1] = ZERO; +#endif + b += 2; + } + } + posY += 2; + } + + if (n & 1){ + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + } + + i = m; + if (m > 0) { + do { + + if (X < posY) { + b += 1; + ao1 += 1; + } else + if (X > posY) { + data01 = *(ao1 + 0); + b[ 0] = data01; + ao1 += lda; + b += 1; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + data01 = *(ao1 + 0); + b[ 0] = data01; +#endif + ao1 += lda; + b += 1; + } + + X += 1; + i --; + } while (i > 0); + } + } + + return 0; +} diff --git a/kernel/generic/trsm_kernel_LN.c b/kernel/generic/trsm_kernel_LN.c index 068a202b8..931cba377 100644 --- a/kernel/generic/trsm_kernel_LN.c +++ b/kernel/generic/trsm_kernel_LN.c @@ -58,6 +58,10 @@ static FLOAT dm1 = -1.; #define GEMM_UNROLL_M_SHIFT 2 #endif +#if GEMM_DEFAULT_UNROLL_M == 6 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + #if GEMM_DEFAULT_UNROLL_M == 8 #define GEMM_UNROLL_M_SHIFT 3 #endif diff --git a/kernel/generic/trsm_kernel_LT.c b/kernel/generic/trsm_kernel_LT.c index 300fdd483..099624252 100644 --- a/kernel/generic/trsm_kernel_LT.c +++ b/kernel/generic/trsm_kernel_LT.c @@ -58,6 +58,10 @@ static FLOAT dm1 = -1.; #define GEMM_UNROLL_M_SHIFT 2 #endif +#if GEMM_DEFAULT_UNROLL_M == 6 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + #if GEMM_DEFAULT_UNROLL_M == 8 #define GEMM_UNROLL_M_SHIFT 3 #endif diff --git a/kernel/generic/trsm_kernel_RN.c b/kernel/generic/trsm_kernel_RN.c index b85c3c1e9..d7e650e0c 100644 --- a/kernel/generic/trsm_kernel_RN.c +++ b/kernel/generic/trsm_kernel_RN.c @@ -58,6 +58,10 @@ static FLOAT dm1 = -1.; #define GEMM_UNROLL_M_SHIFT 2 #endif +#if GEMM_DEFAULT_UNROLL_M == 6 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + #if GEMM_DEFAULT_UNROLL_M == 8 #define GEMM_UNROLL_M_SHIFT 3 #endif diff --git a/kernel/generic/trsm_kernel_RT.c b/kernel/generic/trsm_kernel_RT.c index 2adb3a4f7..a46945330 100644 --- a/kernel/generic/trsm_kernel_RT.c +++ b/kernel/generic/trsm_kernel_RT.c @@ -58,6 +58,11 @@ static FLOAT dm1 = -1.; #define GEMM_UNROLL_M_SHIFT 2 #endif +#if GEMM_DEFAULT_UNROLL_M == 6 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + + #if GEMM_DEFAULT_UNROLL_M == 8 #define GEMM_UNROLL_M_SHIFT 3 #endif diff --git a/kernel/generic/trsm_lncopy_6.c b/kernel/generic/trsm_lncopy_6.c new file mode 100644 index 000000000..9f7bcc2dd --- /dev/null +++ b/kernel/generic/trsm_lncopy_6.c @@ -0,0 +1,326 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT *a1, *a2, *a3, *a4; + + jj = offset; + + j = (n >> 2); + while (j > 0){ + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + + i = (m >> 2); + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + +#ifndef UNIT + data06 = *(a2 + 1); +#endif + data07 = *(a2 + 2); + data08 = *(a2 + 3); + +#ifndef UNIT + data11 = *(a3 + 2); +#endif + data12 = *(a3 + 3); + +#ifndef UNIT + data16 = *(a4 + 3); +#endif + + *(b + 0) = INV(data01); + + *(b + 4) = data02; + *(b + 5) = INV(data06); + + *(b + 8) = data03; + *(b + 9) = data07; + *(b + 10) = INV(data11); + + *(b + 12) = data04; + *(b + 13) = data08; + *(b + 14) = data12; + *(b + 15) = INV(data16); + } + + if (ii > jj) { + + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + data05 = *(a2 + 0); + data06 = *(a2 + 1); + data07 = *(a2 + 2); + data08 = *(a2 + 3); + + data09 = *(a3 + 0); + data10 = *(a3 + 1); + data11 = *(a3 + 2); + data12 = *(a3 + 3); + + data13 = *(a4 + 0); + data14 = *(a4 + 1); + data15 = *(a4 + 2); + data16 = *(a4 + 3); + + *(b + 0) = data01; + *(b + 1) = data05; + *(b + 2) = data09; + *(b + 3) = data13; + *(b + 4) = data02; + *(b + 5) = data06; + *(b + 6) = data10; + *(b + 7) = data14; + + *(b + 8) = data03; + *(b + 9) = data07; + *(b + 10) = data11; + *(b + 11) = data15; + *(b + 12) = data04; + *(b + 13) = data08; + *(b + 14) = data12; + *(b + 15) = data16; + } + + a1 += 4; + a2 += 4; + a3 += 4; + a4 += 4; + b += 16; + + i --; + ii += 4; + } + + if ((m & 2) != 0) { + + if (ii== jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + +#ifndef UNIT + data06 = *(a2 + 1); +#endif + + *(b + 0) = INV(data01); + + *(b + 4) = data02; + *(b + 5) = INV(data06); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a2 + 0); + data04 = *(a2 + 1); + data05 = *(a3 + 0); + data06 = *(a3 + 1); + data07 = *(a4 + 0); + data08 = *(a4 + 1); + + *(b + 0) = data01; + *(b + 1) = data03; + *(b + 2) = data05; + *(b + 3) = data07; + *(b + 4) = data02; + *(b + 5) = data04; + *(b + 6) = data06; + *(b + 7) = data08; + } + + a1 += 2; + a2 += 2; + a3 += 2; + a4 += 
2; + b += 8; + + ii += 2; + } + + if ((m & 1) != 0) { + + if (ii== jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a2 + 0); + data03 = *(a3 + 0); + data04 = *(a4 + 0); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + } + b += 4; + } + + a += 4 * lda; + jj += 4; + j --; + } + + if (n & 2) { + a1 = a + 0 * lda; + a2 = a + 1 * lda; + + i = (m >> 1); + ii = 0; + while (i > 0) { + + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + +#ifndef UNIT + data04 = *(a2 + 1); +#endif + + *(b + 0) = INV(data01); + *(b + 2) = data02; + *(b + 3) = INV(data04); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a2 + 0); + data04 = *(a2 + 1); + + *(b + 0) = data01; + *(b + 1) = data03; + *(b + 2) = data02; + *(b + 3) = data04; + } + + a1 += 2; + a2 += 2; + b += 4; + + i --; + ii += 2; + } + + if ((m & 1) != 0) { + + if (ii== jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a2 + 0); + *(b + 0) = data01; + *(b + 1) = data02; + } + b += 2; + } + a += 2 * lda; + jj += 2; + } + + if (n & 1) { + a1 = a + 0 * lda; + + i = m; + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii > jj) { + data01 = *(a1 + 0); + *(b + 0) = data01; + } + + a1+= 1; + b += 1; + i --; + ii += 1; + } + } + + return 0; +} diff --git a/kernel/generic/trsm_ltcopy_6.c b/kernel/generic/trsm_ltcopy_6.c new file mode 100644 index 000000000..d891468a4 --- /dev/null +++ b/kernel/generic/trsm_ltcopy_6.c @@ -0,0 +1,346 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT *a1, *a2, *a3, *a4; + + jj = offset; + + j = (n >> 2); + while (j > 0){ + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + + i = (m >> 2); + ii = 0; + while (i > 0) { + + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + +#ifndef UNIT + data06 = *(a2 + 1); +#endif + data07 = *(a2 + 2); + data08 = *(a2 + 3); + +#ifndef UNIT + data11 = *(a3 + 2); +#endif + data12 = *(a3 + 3); + +#ifndef UNIT + data16 = *(a4 + 3); +#endif + + *(b + 0) = INV(data01); + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + + *(b + 5) = INV(data06); + *(b + 6) = data07; + *(b + 7) = data08; + + *(b + 10) = INV(data11); + *(b + 11) = data12; + + *(b + 15) = INV(data16); + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + data05 = *(a2 + 0); + data06 = *(a2 + 1); + data07 = *(a2 + 2); + data08 = *(a2 + 3); + + data09 = *(a3 + 0); + data10 = *(a3 + 1); + data11 = *(a3 + 2); + data12 = *(a3 + 3); + + data13 = *(a4 + 0); + data14 = *(a4 + 1); + data15 = *(a4 + 2); + data16 = *(a4 + 3); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data07; + *(b + 7) = data08; + + *(b + 8) = data09; + *(b + 9) = data10; + *(b + 10) = data11; + *(b + 11) = data12; + *(b + 12) = data13; + *(b + 13) = data14; + *(b + 14) = data15; + *(b + 15) = data16; + } + + a1 += 4 * lda; + a2 += 4 * lda; + a3 += 4 * lda; + a4 += 4 * lda; + b += 16; + + i --; + ii += 4; + } + + if ((m & 2) != 0) { + + if (ii== jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + +#ifndef UNIT + data06 = *(a2 + 1); +#endif + data07 = *(a2 + 2); + data08 = *(a2 + 3); + + *(b + 0) = INV(data01); + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + + *(b + 5) = INV(data06); + *(b + 6) = data07; + *(b + 7) = data08; + + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + data05 = *(a2 + 0); + data06 = *(a2 + 1); + data07 = *(a2 + 2); + data08 = *(a2 + 3); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data07; + *(b + 7) = data08; + } + + a1 += 2 * lda; + a2 += 2 * lda; + b += 8; + + ii += 2; + } + + if ((m & 1) != 0) { + + if (ii== jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + *(b + 0) = INV(data01); + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + *(b + 0) = 
data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + } + b += 4; + } + + a += 4; + jj += 4; + j --; + } + + if (n & 2) { + a1 = a + 0 * lda; + a2 = a + 1 * lda; + + i = (m >> 1); + ii = 0; + while (i > 0) { + + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + +#ifndef UNIT + data04 = *(a2 + 1); +#endif + + *(b + 0) = INV(data01); + *(b + 1) = data02; + + *(b + 3) = INV(data04); + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a2 + 0); + data04 = *(a2 + 1); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + } + + a1 += 2 * lda; + a2 += 2 * lda; + b += 4; + + i --; + ii += 2; + } + + if ((m & 1) != 0) { + + if (ii== jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + *(b + 0) = data01; + *(b + 1) = data02; + } + b += 2; + } + a += 2; + jj += 2; + } + + if (n & 1) { + a1 = a + 0 * lda; + + i = m; + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii < jj) { + data01 = *(a1 + 0); + *(b + 0) = data01; + } + + a1 += 1 * lda; + b += 1; + + i --; + ii += 1; + } + } + + return 0; +} diff --git a/kernel/generic/trsm_uncopy_6.c b/kernel/generic/trsm_uncopy_6.c new file mode 100644 index 000000000..837a25019 --- /dev/null +++ b/kernel/generic/trsm_uncopy_6.c @@ -0,0 +1,350 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT *a1, *a2, *a3, *a4; + + jj = offset; + + j = (n >> 2); + while (j > 0){ + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + + i = (m >> 2); + ii = 0; + while (i > 0) { + + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + + data05 = *(a2 + 0); +#ifndef UNIT + data06 = *(a2 + 1); +#endif + + data09 = *(a3 + 0); + data10 = *(a3 + 1); +#ifndef UNIT + data11 = *(a3 + 2); +#endif + + data13 = *(a4 + 0); + data14 = *(a4 + 1); + data15 = *(a4 + 2); +#ifndef UNIT + data16 = *(a4 + 3); +#endif + + *(b + 0) = INV(data01); + *(b + 1) = data05; + *(b + 2) = data09; + *(b + 3) = data13; + + *(b + 5) = INV(data06); + *(b + 6) = data10; + *(b + 7) = data14; + + *(b + 10) = INV(data11); + *(b + 11) = data15; + + *(b + 15) = INV(data16); + } + + if (ii < jj) { + + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + data05 = *(a2 + 0); + data06 = *(a2 + 1); + data07 = *(a2 + 2); + data08 = *(a2 + 3); + + data09 = *(a3 + 0); + data10 = *(a3 + 1); + data11 = *(a3 + 2); + data12 = *(a3 + 3); + + data13 = *(a4 + 0); + data14 = *(a4 + 1); + data15 = *(a4 + 2); + data16 = *(a4 + 3); + + *(b + 0) = data01; + *(b + 1) = data05; + *(b + 2) = data09; + *(b + 3) = data13; + *(b + 4) = data02; + *(b + 5) = data06; + *(b + 6) = data10; + *(b + 7) = data14; + + *(b + 8) = data03; + *(b + 9) = data07; + *(b + 10) = data11; + *(b + 11) = data15; + *(b + 12) = data04; + *(b + 13) = data08; + *(b + 14) = data12; + *(b + 15) = data16; + } + + a1 += 4; + a2 += 4; + a3 += 4; + a4 += 4; + b += 16; + + i --; + ii += 4; + } + + if ((m & 2) != 0) { + + if (ii== jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + + data05 = *(a2 + 0); +#ifndef UNIT + data06 = *(a2 + 1); +#endif + + data09 = *(a3 + 0); + data10 = *(a3 + 1); + + data13 = *(a4 + 0); + data14 = *(a4 + 1); + + *(b + 0) = INV(data01); + *(b + 1) = data05; + *(b + 2) = data09; + *(b + 3) = data13; + + *(b + 5) = INV(data06); + *(b + 6) = data10; + *(b + 7) = data14; + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a2 + 0); + data04 = *(a2 + 1); + data05 = *(a3 + 0); + data06 = *(a3 + 1); + data07 = *(a4 + 0); + data08 = *(a4 + 1); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data07; + *(b + 7) = data08; + } + + a1 += 2; + a2 += 2; + b += 8; + + ii += 2; + } + + if ((m & 1) != 0) { + + if (ii== jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + + data05 = *(a2 + 0); + data09 = *(a3 + 0); + data13 = *(a4 + 0); + + *(b + 0) = INV(data01); + *(b + 1) = data05; + *(b + 2) = data09; + *(b + 3) = data13; + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a2 + 0); + data03 = *(a3 + 0); + data04 = *(a4 + 0); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + } + b += 4; + } + + a += 4 * lda; + jj += 4; + j --; + } + + if (n & 2) { + a1 = a + 0 * lda; + a2 = a + 1 * lda; + + i = (m >> 1); + ii = 0; + while (i > 0) { + + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); 
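+	  /* ii == jj: diagonal block.  INV() (ONE / a, or just ONE when UNIT
+	     is defined) packs the reciprocal of each diagonal entry, presumably
+	     so the TRSM kernel can multiply by it rather than divide. */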
+#endif + + data03 = *(a2 + 0); +#ifndef UNIT + data04 = *(a2 + 1); +#endif + + *(b + 0) = INV(data01); + *(b + 1) = data03; + *(b + 3) = INV(data04); + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a2 + 0); + data04 = *(a2 + 1); + + *(b + 0) = data01; + *(b + 1) = data03; + *(b + 2) = data02; + *(b + 3) = data04; + } + + a1 += 2; + a2 += 2; + b += 4; + + i --; + ii += 2; + } + + if ((m & 1) != 0) { + + if (ii== jj) { + + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + + data03 = *(a2 + 0); + + *(b + 0) = INV(data01); + *(b + 1) = data03; + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a2 + 0); + *(b + 0) = data01; + *(b + 1) = data02; + } + b += 2; + } + a += 2 * lda; + jj += 2; + } + + if (n & 1) { + a1 = a + 0 * lda; + + i = m; + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii < jj) { + data01 = *(a1 + 0); + *(b + 0) = data01; + } + + a1+= 1; + b += 1; + i --; + ii += 1; + } + } + + return 0; +} diff --git a/kernel/generic/trsm_utcopy_6.c b/kernel/generic/trsm_utcopy_6.c new file mode 100644 index 000000000..bbba78d53 --- /dev/null +++ b/kernel/generic/trsm_utcopy_6.c @@ -0,0 +1,322 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT *a1, *a2, *a3, *a4; + + jj = offset; + + j = (n >> 2); + while (j > 0){ + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + + i = (m >> 2); + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + + data05 = *(a2 + 0); +#ifndef UNIT + data06 = *(a2 + 1); +#endif + + data09 = *(a3 + 0); + data10 = *(a3 + 1); +#ifndef UNIT + data11 = *(a3 + 2); +#endif + + data13 = *(a4 + 0); + data14 = *(a4 + 1); + data15 = *(a4 + 2); +#ifndef UNIT + data16 = *(a4 + 3); +#endif + + *(b + 0) = INV(data01); + + *(b + 4) = data05; + *(b + 5) = INV(data06); + + *(b + 8) = data09; + *(b + 9) = data10; + *(b + 10) = INV(data11); + + *(b + 12) = data13; + *(b + 13) = data14; + *(b + 14) = data15; + *(b + 15) = INV(data16); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + data05 = *(a2 + 0); + data06 = *(a2 + 1); + data07 = *(a2 + 2); + data08 = *(a2 + 3); + + data09 = *(a3 + 0); + data10 = *(a3 + 1); + data11 = *(a3 + 2); + data12 = *(a3 + 3); + + data13 = *(a4 + 0); + data14 = *(a4 + 1); + data15 = *(a4 + 2); + data16 = *(a4 + 3); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data07; + *(b + 7) = data08; + + *(b + 8) = data09; + *(b + 9) = data10; + *(b + 10) = data11; + *(b + 11) = data12; + *(b + 12) = data13; + *(b + 13) = data14; + *(b + 14) = data15; + *(b + 15) = data16; + } + + a1 += 4 * lda; + a2 += 4 * lda; + a3 += 4 * lda; + a4 += 4 * lda; + b += 16; + + i --; + ii += 4; + } + + if ((m & 2) != 0) { + + if (ii== jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data05 = *(a2 + 0); +#ifndef UNIT + data06 = *(a2 + 1); +#endif + + *(b + 0) = INV(data01); + + *(b + 4) = data05; + *(b + 5) = INV(data06); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + data05 = *(a2 + 0); + data06 = *(a2 + 1); + data07 = *(a2 + 2); + data08 = *(a2 + 3); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data07; + *(b + 7) = data08; + } + + a1 += 2 * lda; + a2 += 2 * lda; + b += 8; + + ii += 2; + } + + if ((m & 1) != 0) { + + if (ii== jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + } + b += 4; + } + + a += 4; + jj += 4; + j --; + } + + if (n & 2) { + a1 = a + 0 * lda; + a2 = a + 1 * lda; + + i = (m >> 1); + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data03 = *(a2 + 0); +#ifndef UNIT + data04 = *(a2 + 1); +#endif + + *(b + 0) = INV(data01); + *(b + 2) = data03; + *(b + 3) = INV(data04); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a2 + 0); + data04 = *(a2 + 1); + + *(b + 0) = data01; + *(b + 
1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + } + + a1 += 2 * lda; + a2 += 2 * lda; + b += 4; + + i --; + ii += 2; + } + + if ((m & 1) != 0) { + + if (ii== jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + *(b + 0) = data01; + *(b + 1) = data02; + } + b += 2; + } + a += 2; + jj += 2; + } + + if (n & 1) { + a1 = a + 0 * lda; + + i = m; + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii > jj) { + data01 = *(a1 + 0); + *(b + 0) = data01; + } + + a1 += 1 * lda; + b += 1; + + i --; + ii += 1; + } + } + + return 0; +} diff --git a/kernel/x86_64/KERNEL.PILEDRIVER b/kernel/x86_64/KERNEL.PILEDRIVER index 8ebd42244..bbec23ccf 100644 --- a/kernel/x86_64/KERNEL.PILEDRIVER +++ b/kernel/x86_64/KERNEL.PILEDRIVER @@ -16,15 +16,17 @@ SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) -DGEMMKERNEL = dgemm_kernel_8x2_bulldozer.S -DGEMMINCOPY = dgemm_ncopy_8_bulldozer.S -DGEMMITCOPY = dgemm_tcopy_8_bulldozer.S -DGEMMONCOPY = gemm_ncopy_2_bulldozer.S -DGEMMOTCOPY = gemm_tcopy_2_bulldozer.S + +DGEMMKERNEL = dgemm_kernel_6x4_piledriver.S +DGEMMINCOPY = ../generic/gemm_ncopy_6.c +DGEMMITCOPY = ../generic/gemm_tcopy_6.c +DGEMMONCOPY = ../generic/gemm_ncopy_4.c +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + CGEMMKERNEL = cgemm_kernel_4x2_bulldozer.S CGEMMINCOPY = ../generic/zgemm_ncopy_4.c CGEMMITCOPY = ../generic/zgemm_tcopy_4.c diff --git a/kernel/x86_64/dgemm_kernel_6x4_piledriver.S b/kernel/x86_64/dgemm_kernel_6x4_piledriver.S new file mode 100644 index 000000000..7b5dd1587 --- /dev/null +++ b/kernel/x86_64/dgemm_kernel_6x4_piledriver.S @@ -0,0 +1,1734 @@ +/**************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +// register blocking= 6x4. unloop k = 4. +// Use FMA3 on piledriver. +// Todo: 1) deal with the edge. 2) Add windows abi. + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 128 +#define oldbk_i %rdi +#define oldbk_j %rsi +#define oldbk_l %rdx + +#define _bk_i %r13 +#define _bk_j %r14 +#define _bk_l %r15 + +#define ALPHA %xmm0 +#define _ptr_A %rcx +#define _ptr_B %r8 +#define _ptr_C %r9 +#define LDC %r10 + +#define i %r11 +#define k %rax +#define _pre_B %r12 +#define _ptr__A_0 %rdi +#define _ptr__B_0 %rsi +#define _ptr__C_0 %rbx +#define _ptr__C_1 %rbp + +#define old_ldc 8+STACKSIZE(%rsp) +#define alpha 48(%rsp) +#define j 56(%rsp) + +#define MOVQ2560(s,d) movq s,d +#define LEAQ2560(s,d) leaq s,d +#define SARQ2560(imm,n) sarq imm,n +#define ADDQ2560(off,addr) addq off,addr +#define SUBQ2560(off,addr) subq off,addr +#define DIVQ2560(off,addr) divq off,addr +#define MULQ2560(s,d) mulq s,d +#define DECQ2560(addr) decq addr +#define NEGQ2560(s) negq s +#define TESTQ2560(n,addr) testq n,addr +#define SALQ2560(imm,n) salq imm,n + +#define MOVQ1280(s,d) movq s,d +#define LEAQ1280(s,d) leaq s,d +#define SARQ1280(imm,n) sarq imm,n +#define ADDQ1280(off,addr) addq off,addr +#define SUBQ1280(off,addr) subq off,addr +#define DIVQ1280(off,addr) divq off,addr +#define CMPQ1280(off,addr) cmpq off,addr +#define MULQ1280(s,d) mulq s,d +#define DECQ1280(addr) decq addr +#define NEGQ1280(s) negq s +#define TESTQ1280(n,addr) testq n,addr +#define SALQ1280(imm,n) salq imm,n + +#define JG jg +#define JLE jle + +#define VLD2560(addr,reg) vmovapd addr,reg +#define VST2560(reg,addr) vmovapd reg,addr +#define VMUL2560(a,b,c) vmulpd a,b,c +#define MVMUL2560(a,b,c) vmulpd b,a,c +#define VADD2560(a,b,c) vaddpd a,b,c +#define MVADD2560(a,b,c) vaddpd b,a,c +#define VSHUF2560(imm,s,d) vpermilpd imm,s,d +#define VSHUF2F2560(imm,s1,s2,d) vperm2f128 imm,s1,s2,d +#define BROAD2560(addr,reg) vbroadcastsd addr,reg +#define MOVRR2560(a,b) vmovapd a,b +#define REVS2560(imm,s1,s2,d) vshufpd imm,s1,s2,d +#define EXTR2561(imm,a,b) vextractf128 imm,a,b +#define LDL2561(addr,reg) vmovlpd addr,reg,reg +#define LDH2561(addr,reg) vmovhpd addr,reg,reg +#define STL2561(reg,addr) vmovlpd reg,addr +#define STH2561(reg,addr) vmovhpd reg,addr +#define VADD2561(a,b,c) vaddpd a,b,c +#define VXOR2560(a,b,c) vxorpd a,b,c +#define PREFETCH02560(addr,b) prefetcht0 addr +#define PREFETCH12560(addr,b) prefetcht0 addr +#define PREFETCH22560(addr,b) prefetcht2 addr +#define PREFETCHW2560(addr,b) prefetchw addr +#define PREFETCHN2560(addr,b) prefetchnta addr +#define VMA2560(a,b,c,d) vfmaddpd d,a,b,c +#define MVMA2560(a,b,c,d) vfmaddpd d,a,b,c + +#define VLD1280(addr,reg) vmovapd addr,reg +#define VLD1282(addr,reg) vmovapd addr,reg +#define VLD1281(addr,reg) movsd addr,reg +#define VST1280(reg,addr) vmovapd reg,addr +#define VST1282(reg,addr) vmovapd reg,addr +#define VST1281(reg,addr) movsd reg,addr +#define VLDU1282(addr,reg) vmovupd addr,reg 
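+// Note on the macro naming: the numeric suffix appears to encode the operand
+// width, the *2560 forms being the 256-bit variants and the *128x forms the
+// 128-bit ones, with a trailing 2 or 1 selecting the two-double or the
+// single-double flavour.  The loops below use the 128-bit set.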
+#define VLDU1281(addr,reg) movsd addr,reg +#define VSTU1282(reg,addr) vmovupd reg,addr +#define VSTU1281(reg,addr) movsd reg,addr +#define VMUL1280(a,b,c) vmulpd a,b,c +#define VMUL1282(a,b,c) vmulpd a,b,c +#define VMUL1281(a,b,c) vmulpd a,b,c +#define MVMUL1280(a,b,c) vmulpd b,a,c +#define VADD1280(a,b,c) vaddpd a,b,c +#define MVADD1280(a,b,c) vaddpd b,a,c +#define VSHUF1280(imm,s,d) vpermilpd imm,s,d +#define VSHUF2F1280(imm,s1,s2,d) vperm2f128 imm,s1,s2,d +#define BROAD1280(addr,reg) vmovddup addr,reg +#define BROAD1282(addr,reg) vmovddup addr,reg +#define BROAD1281(addr,reg) movddup addr,reg +#define MOVRR1280(a,b) vmovapd a,b +#define REVS1280(imm,s1,s2,d) vshufpd imm,s1,s2,d +#define EXTR1281(imm,a,b) vextractf128 imm,a,b +#define LDL1281(addr,reg) vmovlpd addr,reg,reg +#define LDH1281(addr,reg) vmovhpd addr,reg,reg +#define STL1281(reg,addr) vmovlpd reg,addr +#define STH1281(reg,addr) vmovhpd reg,addr +#define VADD1281(a,b,c) vaddpd a,b,c +#define VXOR1280(a,b,c) vxorpd a,b,c +#define VXOR1282(a,b,c) vxorpd a,b,c +#define VXOR1281(a,b,c) vxorpd a,b,c +#define PREFETCH01280(addr,b) prefetcht0 addr +#define PREFETCH11280(addr,b) prefetcht0 addr +#define PREFETCH21280(addr,b) prefetcht2 addr +#define PREFETCHW1280(addr,b) prefetchw addr +#define PREFETCHN1280(addr,b) prefetchnta addr +#define VMA1280(a,b,c,d) vfmaddpd d,a,b,c +#define VMA1282(a,b,c,d) vfmadd231pd a,b,c +#define VMA1281(a,b,c,d) vfmadd231pd a,b,c +#define VMA21282(a,b,c,d) vfmadd231pd a,b,c +#define VMA21281(a,b,c,d) vfmadd231pd a,b,c +//#define VMA1282(a,b,c,d) nop +//#define VMA1281(a,b,c,d) nop +//#define VMA21282(a,b,c,d) nop +//#define VMA21281(a,b,c,d) nop +#define MVMA1280(a,b,c,d) vfmaddpd d,a,b,c + +#define imm1 $0x05 +#define imm3 $0x05 +#define imm100 $0x05 +#define imm200 $0x0a + +#define XMM0 %xmm0 +#define XMM1 %xmm1 +#define XMM2 %xmm2 +#define XMM3 %xmm3 +#define XMM4 %xmm4 +#define XMM5 %xmm5 +#define XMM6 %xmm6 +#define XMM7 %xmm7 +#define XMM8 %xmm8 +#define XMM9 %xmm9 +#define XMM10 %xmm10 +#define XMM11 %xmm11 +#define XMM12 %xmm12 +#define XMM13 %xmm13 +#define XMM14 %xmm14 +#define XMM15 %xmm15 + +#define YMM0 %ymm0 +#define YMM1 %ymm1 +#define YMM2 %ymm2 +#define YMM3 %ymm3 +#define YMM4 %ymm4 +#define YMM5 %ymm5 +#define YMM6 %ymm6 +#define YMM7 %ymm7 +#define YMM8 %ymm8 +#define YMM9 %ymm9 +#define YMM10 %ymm10 +#define YMM11 %ymm11 +#define YMM12 %ymm12 +#define YMM13 %ymm13 +#define YMM14 %ymm14 +#define YMM15 %ymm15 +PROLOGUE + +subq $STACKSIZE, %rsp; +movq %rbx, 0(%rsp); +movq %rbp, 8(%rsp); +movq %r12, 16(%rsp); +movq %r13, 24(%rsp); +movq %r14, 32(%rsp); +movq %r15, 40(%rsp); +vzeroupper +movl old_ldc, %eax +movq %rax, LDC +movlps ALPHA, alpha +movq oldbk_i, _bk_i +movq oldbk_j, _bk_j +movq oldbk_l, _bk_l +leaq (, LDC, SIZE), LDC + +MOVQ1280(_bk_j,j); +SARQ1280($2,j); +JLE ._L_0_loopE; +ALIGN_4; +._L_0_bodyB:; +MOVQ1280(_ptr_A,_ptr__A_0); +MOVQ1280(_ptr_C,_ptr__C_0); +LEAQ1280((_ptr_C,LDC,2),_ptr__C_1); +MOVQ1280(_bk_l,%rax); +SALQ1280($5,%rax); +ADDQ1280(%rax,_pre_B); +MOVQ1280(_bk_i,i); +CMPQ1280($6,i); +JL ._L_1_loopE; +._L_1_bodyB:; +MOVQ1280(_ptr_B,_ptr__B_0); +VXOR1282(XMM0,XMM0,XMM0); +VXOR1282(XMM1,XMM1,XMM1); +VXOR1282(XMM2,XMM2,XMM2); +VXOR1282(XMM3,XMM3,XMM3); +VXOR1282(XMM4,XMM4,XMM4); +VXOR1282(XMM5,XMM5,XMM5); +VXOR1282(XMM6,XMM6,XMM6); +VXOR1282(XMM7,XMM7,XMM7); +VXOR1282(XMM8,XMM8,XMM8); +VXOR1282(XMM9,XMM9,XMM9); +VXOR1282(XMM10,XMM10,XMM10); +VXOR1282(XMM11,XMM11,XMM11); +PREFETCHN1280(3*SIZE(_ptr__C_0),N); +PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N); 
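+// The k loop that follows is the main 6x4 FMA micro-kernel: XMM0-XMM11
+// accumulate the 6x4 tile of C (three two-double registers per column of
+// the tile), XMM12-XMM14 hold six packed values of A, and XMM15 broadcasts
+// one value of B at a time, so each k step issues 12 vfmadd231pd.
+// Rough C-style sketch of a single k step (names are illustrative only):
+//   for (jj = 0; jj < 4; jj++)        /* BROAD1282 of B[jj]       */
+//     for (ii = 0; ii < 6; ii += 2)   /* VLD1282 of A[ii..ii+1]   */
+//       acc[jj][ii/2] += A[ii..ii+1] * B[jj];    /* VMA1282       */
+// The loop body below is unrolled four times over k.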
+PREFETCHN1280(3*SIZE(_ptr__C_1),N); +PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N); +MOVQ1280(_bk_l,k); +SARQ1280($2,k); +JLE ._L_2_loopE; +ALIGN_4; +._L_2_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(2*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(4*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM3,XMM3); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +BROAD1282(2*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM6,XMM6); +VMA1282(XMM13,XMM15,XMM7,XMM7); +VMA1282(XMM14,XMM15,XMM8,XMM8); +BROAD1282(3*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM9,XMM9); +VMA1282(XMM13,XMM15,XMM10,XMM10); +VMA1282(XMM14,XMM15,XMM11,XMM11); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1282(4*SIZE(_ptr__B_0),XMM15); +VLD1282(6*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(8*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(10*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(5*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM3,XMM3); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +BROAD1282(6*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM6,XMM6); +VMA1282(XMM13,XMM15,XMM7,XMM7); +VMA1282(XMM14,XMM15,XMM8,XMM8); +BROAD1282(7*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM9,XMM9); +VMA1282(XMM13,XMM15,XMM10,XMM10); +VMA1282(XMM14,XMM15,XMM11,XMM11); +PREFETCH01280(176*SIZE(_ptr__A_0),0); +BROAD1282(8*SIZE(_ptr__B_0),XMM15); +VLD1282(12*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(14*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(16*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(9*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM3,XMM3); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +BROAD1282(10*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM6,XMM6); +VMA1282(XMM13,XMM15,XMM7,XMM7); +VMA1282(XMM14,XMM15,XMM8,XMM8); +BROAD1282(11*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM9,XMM9); +VMA1282(XMM13,XMM15,XMM10,XMM10); +VMA1282(XMM14,XMM15,XMM11,XMM11); +PREFETCH21280(184*SIZE(_ptr__A_0),2); +BROAD1282(12*SIZE(_ptr__B_0),XMM15); +VLD1282(18*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(20*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(22*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(13*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM3,XMM3); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +BROAD1282(14*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM6,XMM6); +VMA1282(XMM13,XMM15,XMM7,XMM7); +VMA1282(XMM14,XMM15,XMM8,XMM8); +BROAD1282(15*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM9,XMM9); +VMA1282(XMM13,XMM15,XMM10,XMM10); +VMA1282(XMM14,XMM15,XMM11,XMM11); +ADDQ1280($24*SIZE,_ptr__A_0); +ADDQ1280($16*SIZE,_ptr__B_0); +._L_2_bodyE:; +DECQ1280(k); +JG ._L_2_bodyB; +ALIGN_4; +._L_2_loopE:; +TESTQ1280($2,_bk_l); +JLE ._L_3_loopE; +ALIGN_4; +._L_3_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(2*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(4*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM3,XMM3); +VMA1282(XMM13,XMM15,XMM4,XMM4); 
+VMA1282(XMM14,XMM15,XMM5,XMM5); +BROAD1282(2*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM6,XMM6); +VMA1282(XMM13,XMM15,XMM7,XMM7); +VMA1282(XMM14,XMM15,XMM8,XMM8); +BROAD1282(3*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM9,XMM9); +VMA1282(XMM13,XMM15,XMM10,XMM10); +VMA1282(XMM14,XMM15,XMM11,XMM11); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1282(4*SIZE(_ptr__B_0),XMM15); +VLD1282(6*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(8*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(10*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(5*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM3,XMM3); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +BROAD1282(6*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM6,XMM6); +VMA1282(XMM13,XMM15,XMM7,XMM7); +VMA1282(XMM14,XMM15,XMM8,XMM8); +BROAD1282(7*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM9,XMM9); +VMA1282(XMM13,XMM15,XMM10,XMM10); +VMA1282(XMM14,XMM15,XMM11,XMM11); +ADDQ1280($12*SIZE,_ptr__A_0); +ADDQ1280($8*SIZE,_ptr__B_0); +._L_3_loopE:; +TESTQ1280($1,_bk_l); +JLE ._L_4_loopE; +ALIGN_4; +._L_4_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(2*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(4*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM3,XMM3); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +BROAD1282(2*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM6,XMM6); +VMA1282(XMM13,XMM15,XMM7,XMM7); +VMA1282(XMM14,XMM15,XMM8,XMM8); +BROAD1282(3*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM9,XMM9); +VMA1282(XMM13,XMM15,XMM10,XMM10); +VMA1282(XMM14,XMM15,XMM11,XMM11); +ADDQ1280($6*SIZE,_ptr__A_0); +ADDQ1280($4*SIZE,_ptr__B_0); +._L_4_loopE:; +BROAD1282(alpha,XMM12); +VLDU1282(0*SIZE(_ptr__C_0),XMM13); +VMA21282(XMM12,XMM0,XMM13,XMM0); +VSTU1282(XMM13,0*SIZE(_ptr__C_0)); +VLDU1282(2*SIZE(_ptr__C_0),XMM14); +VMA21282(XMM12,XMM1,XMM14,XMM1); +VSTU1282(XMM14,2*SIZE(_ptr__C_0)); +VLDU1282(4*SIZE(_ptr__C_0),XMM15); +VMA21282(XMM12,XMM2,XMM15,XMM2); +VSTU1282(XMM15,4*SIZE(_ptr__C_0)); +VLDU1282(0*SIZE(_ptr__C_0,LDC,1),XMM13); +VMA21282(XMM12,XMM3,XMM13,XMM3); +VSTU1282(XMM13,0*SIZE(_ptr__C_0,LDC,1)); +VLDU1282(2*SIZE(_ptr__C_0,LDC,1),XMM14); +VMA21282(XMM12,XMM4,XMM14,XMM4); +VSTU1282(XMM14,2*SIZE(_ptr__C_0,LDC,1)); +VLDU1282(4*SIZE(_ptr__C_0,LDC,1),XMM15); +VMA21282(XMM12,XMM5,XMM15,XMM5); +VSTU1282(XMM15,4*SIZE(_ptr__C_0,LDC,1)); +VLDU1282(0*SIZE(_ptr__C_1),XMM13); +VMA21282(XMM12,XMM6,XMM13,XMM6); +VSTU1282(XMM13,0*SIZE(_ptr__C_1)); +VLDU1282(2*SIZE(_ptr__C_1),XMM14); +VMA21282(XMM12,XMM7,XMM14,XMM7); +VSTU1282(XMM14,2*SIZE(_ptr__C_1)); +VLDU1282(4*SIZE(_ptr__C_1),XMM15); +VMA21282(XMM12,XMM8,XMM15,XMM8); +VSTU1282(XMM15,4*SIZE(_ptr__C_1)); +VLDU1282(0*SIZE(_ptr__C_1,LDC,1),XMM13); +VMA21282(XMM12,XMM9,XMM13,XMM9); +VSTU1282(XMM13,0*SIZE(_ptr__C_1,LDC,1)); +VLDU1282(2*SIZE(_ptr__C_1,LDC,1),XMM14); +VMA21282(XMM12,XMM10,XMM14,XMM10); +VSTU1282(XMM14,2*SIZE(_ptr__C_1,LDC,1)); +VLDU1282(4*SIZE(_ptr__C_1,LDC,1),XMM15); +VMA21282(XMM12,XMM11,XMM15,XMM11); +VSTU1282(XMM15,4*SIZE(_ptr__C_1,LDC,1)); +ADDQ1280($6*SIZE,_ptr__C_0); +ADDQ1280($6*SIZE,_ptr__C_1); +._L_1_bodyE:; +SUBQ1280($6,i); +JG ._L_1_bodyB; +ALIGN_4; +._L_1_loopE:; +TESTQ1280($4,i); +JLE ._L_5_loopE; +ALIGN_4; +._L_5_bodyB:; +MOVQ1280(_ptr_B,_ptr__B_0); +VXOR1282(XMM0,XMM0,XMM0); 
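+// m & 4 remainder: four rows of A per step and eight accumulators
+// (XMM0-XMM7, two registers per column of the 4x4 C tile); the m & 2 and
+// m & 1 cases further below follow the same scheme with fewer registers.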
+VXOR1282(XMM1,XMM1,XMM1); +VXOR1282(XMM2,XMM2,XMM2); +VXOR1282(XMM3,XMM3,XMM3); +VXOR1282(XMM4,XMM4,XMM4); +VXOR1282(XMM5,XMM5,XMM5); +VXOR1282(XMM6,XMM6,XMM6); +VXOR1282(XMM7,XMM7,XMM7); +PREFETCHN1280(3*SIZE(_ptr__C_0),N); +PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N); +PREFETCHN1280(3*SIZE(_ptr__C_1),N); +PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N); +MOVQ1280(_bk_l,k); +SARQ1280($2,k); +JLE ._L_6_loopE; +ALIGN_4; +._L_6_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(2*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM2,XMM2); +VMA1282(XMM14,XMM15,XMM3,XMM3); +BROAD1282(2*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +BROAD1282(3*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM6,XMM6); +VMA1282(XMM14,XMM15,XMM7,XMM7); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1282(4*SIZE(_ptr__B_0),XMM15); +VLD1282(4*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(6*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(5*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM2,XMM2); +VMA1282(XMM14,XMM15,XMM3,XMM3); +BROAD1282(6*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +BROAD1282(7*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM6,XMM6); +VMA1282(XMM14,XMM15,XMM7,XMM7); +PREFETCH01280(176*SIZE(_ptr__A_0),0); +BROAD1282(8*SIZE(_ptr__B_0),XMM15); +VLD1282(8*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(10*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(9*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM2,XMM2); +VMA1282(XMM14,XMM15,XMM3,XMM3); +BROAD1282(10*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +BROAD1282(11*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM6,XMM6); +VMA1282(XMM14,XMM15,XMM7,XMM7); +PREFETCH21280(184*SIZE(_ptr__A_0),2); +BROAD1282(12*SIZE(_ptr__B_0),XMM15); +VLD1282(12*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(14*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(13*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM2,XMM2); +VMA1282(XMM14,XMM15,XMM3,XMM3); +BROAD1282(14*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +BROAD1282(15*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM6,XMM6); +VMA1282(XMM14,XMM15,XMM7,XMM7); +ADDQ1280($16*SIZE,_ptr__A_0); +ADDQ1280($16*SIZE,_ptr__B_0); +._L_6_bodyE:; +DECQ1280(k); +JG ._L_6_bodyB; +ALIGN_4; +._L_6_loopE:; +TESTQ1280($2,_bk_l); +JLE ._L_7_loopE; +ALIGN_4; +._L_7_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(2*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM2,XMM2); +VMA1282(XMM14,XMM15,XMM3,XMM3); +BROAD1282(2*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +BROAD1282(3*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM6,XMM6); +VMA1282(XMM14,XMM15,XMM7,XMM7); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1282(4*SIZE(_ptr__B_0),XMM15); +VLD1282(4*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(6*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(5*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM2,XMM2); 
+VMA1282(XMM14,XMM15,XMM3,XMM3); +BROAD1282(6*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +BROAD1282(7*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM6,XMM6); +VMA1282(XMM14,XMM15,XMM7,XMM7); +ADDQ1280($8*SIZE,_ptr__A_0); +ADDQ1280($8*SIZE,_ptr__B_0); +._L_7_loopE:; +TESTQ1280($1,_bk_l); +JLE ._L_8_loopE; +ALIGN_4; +._L_8_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(2*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM2,XMM2); +VMA1282(XMM14,XMM15,XMM3,XMM3); +BROAD1282(2*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +BROAD1282(3*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM6,XMM6); +VMA1282(XMM14,XMM15,XMM7,XMM7); +ADDQ1280($4*SIZE,_ptr__A_0); +ADDQ1280($4*SIZE,_ptr__B_0); +._L_8_loopE:; +BROAD1282(alpha,XMM8); +VLDU1282(0*SIZE(_ptr__C_0),XMM9); +VMA21282(XMM8,XMM0,XMM9,XMM0); +VSTU1282(XMM9,0*SIZE(_ptr__C_0)); +VLDU1282(2*SIZE(_ptr__C_0),XMM10); +VMA21282(XMM8,XMM1,XMM10,XMM1); +VSTU1282(XMM10,2*SIZE(_ptr__C_0)); +VLDU1282(0*SIZE(_ptr__C_0,LDC,1),XMM11); +VMA21282(XMM8,XMM2,XMM11,XMM2); +VSTU1282(XMM11,0*SIZE(_ptr__C_0,LDC,1)); +VLDU1282(2*SIZE(_ptr__C_0,LDC,1),XMM12); +VMA21282(XMM8,XMM3,XMM12,XMM3); +VSTU1282(XMM12,2*SIZE(_ptr__C_0,LDC,1)); +VLDU1282(0*SIZE(_ptr__C_1),XMM13); +VMA21282(XMM8,XMM4,XMM13,XMM4); +VSTU1282(XMM13,0*SIZE(_ptr__C_1)); +VLDU1282(2*SIZE(_ptr__C_1),XMM14); +VMA21282(XMM8,XMM5,XMM14,XMM5); +VSTU1282(XMM14,2*SIZE(_ptr__C_1)); +VLDU1282(0*SIZE(_ptr__C_1,LDC,1),XMM15); +VMA21282(XMM8,XMM6,XMM15,XMM6); +VSTU1282(XMM15,0*SIZE(_ptr__C_1,LDC,1)); +VLDU1282(2*SIZE(_ptr__C_1,LDC,1),XMM9); +VMA21282(XMM8,XMM7,XMM9,XMM7); +VSTU1282(XMM9,2*SIZE(_ptr__C_1,LDC,1)); +ADDQ1280($4*SIZE,_ptr__C_0); +ADDQ1280($4*SIZE,_ptr__C_1); +._L_5_loopE:; +TESTQ1280($2,i); +JLE ._L_9_loopE; +ALIGN_4; +._L_9_bodyB:; +MOVQ1280(_ptr_B,_ptr__B_0); +VXOR1282(XMM0,XMM0,XMM0); +VXOR1282(XMM1,XMM1,XMM1); +VXOR1282(XMM2,XMM2,XMM2); +VXOR1282(XMM3,XMM3,XMM3); +PREFETCHN1280(3*SIZE(_ptr__C_0),N); +PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N); +PREFETCHN1280(3*SIZE(_ptr__C_1),N); +PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N); +MOVQ1280(_bk_l,k); +SARQ1280($2,k); +JLE ._L_10_loopE; +ALIGN_4; +._L_10_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(2*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(3*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM3,XMM3); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1282(4*SIZE(_ptr__B_0),XMM15); +VLD1282(2*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +BROAD1282(5*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(6*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(7*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM3,XMM3); +PREFETCH01280(176*SIZE(_ptr__A_0),0); +BROAD1282(8*SIZE(_ptr__B_0),XMM15); +VLD1282(4*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +BROAD1282(9*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(10*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(11*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM3,XMM3); +PREFETCH21280(184*SIZE(_ptr__A_0),2); +BROAD1282(12*SIZE(_ptr__B_0),XMM15); 
+VLD1282(6*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +BROAD1282(13*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(14*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(15*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM3,XMM3); +ADDQ1280($8*SIZE,_ptr__A_0); +ADDQ1280($16*SIZE,_ptr__B_0); +._L_10_bodyE:; +DECQ1280(k); +JG ._L_10_bodyB; +ALIGN_4; +._L_10_loopE:; +TESTQ1280($2,_bk_l); +JLE ._L_11_loopE; +ALIGN_4; +._L_11_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(2*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(3*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM3,XMM3); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1282(4*SIZE(_ptr__B_0),XMM15); +VLD1282(2*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +BROAD1282(5*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(6*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(7*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM3,XMM3); +ADDQ1280($4*SIZE,_ptr__A_0); +ADDQ1280($8*SIZE,_ptr__B_0); +._L_11_loopE:; +TESTQ1280($1,_bk_l); +JLE ._L_12_loopE; +ALIGN_4; +._L_12_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(2*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(3*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM3,XMM3); +ADDQ1280($2*SIZE,_ptr__A_0); +ADDQ1280($4*SIZE,_ptr__B_0); +._L_12_loopE:; +BROAD1282(alpha,XMM4); +VLDU1282(0*SIZE(_ptr__C_0),XMM5); +VMA21282(XMM4,XMM0,XMM5,XMM0); +VSTU1282(XMM5,0*SIZE(_ptr__C_0)); +VLDU1282(0*SIZE(_ptr__C_0,LDC,1),XMM6); +VMA21282(XMM4,XMM1,XMM6,XMM1); +VSTU1282(XMM6,0*SIZE(_ptr__C_0,LDC,1)); +VLDU1282(0*SIZE(_ptr__C_1),XMM7); +VMA21282(XMM4,XMM2,XMM7,XMM2); +VSTU1282(XMM7,0*SIZE(_ptr__C_1)); +VLDU1282(0*SIZE(_ptr__C_1,LDC,1),XMM8); +VMA21282(XMM4,XMM3,XMM8,XMM3); +VSTU1282(XMM8,0*SIZE(_ptr__C_1,LDC,1)); +ADDQ1280($2*SIZE,_ptr__C_0); +ADDQ1280($2*SIZE,_ptr__C_1); +._L_9_loopE:; +TESTQ1280($1,i); +JLE ._L_13_loopE; +ALIGN_4; +._L_13_bodyB:; +MOVQ1280(_ptr_B,_ptr__B_0); +VXOR1281(XMM0,XMM0,XMM0); +VXOR1281(XMM1,XMM1,XMM1); +VXOR1281(XMM2,XMM2,XMM2); +VXOR1281(XMM3,XMM3,XMM3); +PREFETCHN1280(3*SIZE(_ptr__C_0),N); +PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N); +PREFETCHN1280(3*SIZE(_ptr__C_1),N); +PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N); +MOVQ1280(_bk_l,k); +SARQ1280($2,k); +JLE ._L_14_loopE; +ALIGN_4; +._L_14_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1281(0*SIZE(_ptr__B_0),XMM15); +VLD1281(0*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +BROAD1281(1*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM1,XMM1); +BROAD1281(2*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM2,XMM2); +BROAD1281(3*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM3,XMM3); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1281(4*SIZE(_ptr__B_0),XMM15); +VLD1281(1*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +BROAD1281(5*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM1,XMM1); +BROAD1281(6*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM2,XMM2); +BROAD1281(7*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM3,XMM3); +PREFETCH01280(176*SIZE(_ptr__A_0),0); +BROAD1281(8*SIZE(_ptr__B_0),XMM15); 
+VLD1281(2*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +BROAD1281(9*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM1,XMM1); +BROAD1281(10*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM2,XMM2); +BROAD1281(11*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM3,XMM3); +PREFETCH21280(184*SIZE(_ptr__A_0),2); +BROAD1281(12*SIZE(_ptr__B_0),XMM15); +VLD1281(3*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +BROAD1281(13*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM1,XMM1); +BROAD1281(14*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM2,XMM2); +BROAD1281(15*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM3,XMM3); +ADDQ1280($4*SIZE,_ptr__A_0); +ADDQ1280($16*SIZE,_ptr__B_0); +._L_14_bodyE:; +DECQ1280(k); +JG ._L_14_bodyB; +ALIGN_4; +._L_14_loopE:; +TESTQ1280($2,_bk_l); +JLE ._L_15_loopE; +ALIGN_4; +._L_15_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1281(0*SIZE(_ptr__B_0),XMM15); +VLD1281(0*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +BROAD1281(1*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM1,XMM1); +BROAD1281(2*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM2,XMM2); +BROAD1281(3*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM3,XMM3); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1281(4*SIZE(_ptr__B_0),XMM15); +VLD1281(1*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +BROAD1281(5*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM1,XMM1); +BROAD1281(6*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM2,XMM2); +BROAD1281(7*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM3,XMM3); +ADDQ1280($2*SIZE,_ptr__A_0); +ADDQ1280($8*SIZE,_ptr__B_0); +._L_15_loopE:; +TESTQ1280($1,_bk_l); +JLE ._L_16_loopE; +ALIGN_4; +._L_16_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1281(0*SIZE(_ptr__B_0),XMM15); +VLD1281(0*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +BROAD1281(1*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM1,XMM1); +BROAD1281(2*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM2,XMM2); +BROAD1281(3*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM3,XMM3); +ADDQ1280($1*SIZE,_ptr__A_0); +ADDQ1280($4*SIZE,_ptr__B_0); +._L_16_loopE:; +BROAD1281(alpha,XMM4); +VLDU1281(0*SIZE(_ptr__C_0),XMM5); +VMA21281(XMM4,XMM0,XMM5,XMM0); +VSTU1281(XMM5,0*SIZE(_ptr__C_0)); +VLDU1281(0*SIZE(_ptr__C_0,LDC,1),XMM6); +VMA21281(XMM4,XMM1,XMM6,XMM1); +VSTU1281(XMM6,0*SIZE(_ptr__C_0,LDC,1)); +VLDU1281(0*SIZE(_ptr__C_1),XMM7); +VMA21281(XMM4,XMM2,XMM7,XMM2); +VSTU1281(XMM7,0*SIZE(_ptr__C_1)); +VLDU1281(0*SIZE(_ptr__C_1,LDC,1),XMM8); +VMA21281(XMM4,XMM3,XMM8,XMM3); +VSTU1281(XMM8,0*SIZE(_ptr__C_1,LDC,1)); +ADDQ1280($1*SIZE,_ptr__C_0); +ADDQ1280($1*SIZE,_ptr__C_1); +._L_13_loopE:; +MOVQ1280(LDC,%rax); +SALQ1280($2,%rax); +ADDQ1280(%rax,_ptr_C); +MOVQ1280(_bk_l,%rax); +SALQ1280($5,%rax); +ADDQ1280(%rax,_ptr_B); +._L_0_bodyE:; +DECQ1280(j); +JG ._L_0_bodyB; +ALIGN_4; +._L_0_loopE:; +TESTQ1280($2,_bk_j); +JLE ._L_17_loopE; +ALIGN_4; +._L_17_bodyB:; +MOVQ1280(_ptr_A,_ptr__A_0); +MOVQ1280(_ptr_C,_ptr__C_0); +LEAQ1280((_ptr_C,LDC,1),_ptr__C_1); +MOVQ1280(_bk_l,%rax); +SALQ1280($4,%rax); +ADDQ1280(%rax,_pre_B); +MOVQ1280(_bk_i,i); +CMPQ1280($6,i); +JL ._L_18_loopE; +._L_18_bodyB:; +MOVQ1280(_ptr_B,_ptr__B_0); +VXOR1282(XMM0,XMM0,XMM0); +VXOR1282(XMM1,XMM1,XMM1); +VXOR1282(XMM2,XMM2,XMM2); +VXOR1282(XMM3,XMM3,XMM3); +VXOR1282(XMM4,XMM4,XMM4); +VXOR1282(XMM5,XMM5,XMM5); +PREFETCHN1280(3*SIZE(_ptr__C_0),N); +PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N); +PREFETCHN1280(3*SIZE(_ptr__C_1),N); +PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N); +MOVQ1280(_bk_l,k); +SARQ1280($2,k); +JLE 
._L_19_loopE; +ALIGN_4; +._L_19_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(2*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(4*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM3,XMM3); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1282(2*SIZE(_ptr__B_0),XMM15); +VLD1282(6*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(8*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(10*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(3*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM3,XMM3); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +PREFETCH01280(176*SIZE(_ptr__A_0),0); +BROAD1282(4*SIZE(_ptr__B_0),XMM15); +VLD1282(12*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(14*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(16*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(5*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM3,XMM3); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +PREFETCH21280(184*SIZE(_ptr__A_0),2); +BROAD1282(6*SIZE(_ptr__B_0),XMM15); +VLD1282(18*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(20*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(22*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(7*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM3,XMM3); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +ADDQ1280($24*SIZE,_ptr__A_0); +ADDQ1280($8*SIZE,_ptr__B_0); +._L_19_bodyE:; +DECQ1280(k); +JG ._L_19_bodyB; +ALIGN_4; +._L_19_loopE:; +TESTQ1280($2,_bk_l); +JLE ._L_20_loopE; +ALIGN_4; +._L_20_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(2*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(4*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM3,XMM3); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1282(2*SIZE(_ptr__B_0),XMM15); +VLD1282(6*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(8*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(10*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(3*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM3,XMM3); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +ADDQ1280($12*SIZE,_ptr__A_0); +ADDQ1280($4*SIZE,_ptr__B_0); +._L_20_loopE:; +TESTQ1280($1,_bk_l); +JLE ._L_21_loopE; +ALIGN_4; +._L_21_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(2*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(4*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM12,XMM15,XMM3,XMM3); +VMA1282(XMM13,XMM15,XMM4,XMM4); +VMA1282(XMM14,XMM15,XMM5,XMM5); +ADDQ1280($6*SIZE,_ptr__A_0); +ADDQ1280($2*SIZE,_ptr__B_0); +._L_21_loopE:; +BROAD1282(alpha,XMM6); +VLDU1282(0*SIZE(_ptr__C_0),XMM7); +VMA21282(XMM6,XMM0,XMM7,XMM0); +VSTU1282(XMM7,0*SIZE(_ptr__C_0)); +VLDU1282(2*SIZE(_ptr__C_0),XMM8); 
+VMA21282(XMM6,XMM1,XMM8,XMM1); +VSTU1282(XMM8,2*SIZE(_ptr__C_0)); +VLDU1282(4*SIZE(_ptr__C_0),XMM9); +VMA21282(XMM6,XMM2,XMM9,XMM2); +VSTU1282(XMM9,4*SIZE(_ptr__C_0)); +VLDU1282(0*SIZE(_ptr__C_0,LDC,1),XMM10); +VMA21282(XMM6,XMM3,XMM10,XMM3); +VSTU1282(XMM10,0*SIZE(_ptr__C_0,LDC,1)); +VLDU1282(2*SIZE(_ptr__C_0,LDC,1),XMM11); +VMA21282(XMM6,XMM4,XMM11,XMM4); +VSTU1282(XMM11,2*SIZE(_ptr__C_0,LDC,1)); +VLDU1282(4*SIZE(_ptr__C_0,LDC,1),XMM12); +VMA21282(XMM6,XMM5,XMM12,XMM5); +VSTU1282(XMM12,4*SIZE(_ptr__C_0,LDC,1)); +ADDQ1280($6*SIZE,_ptr__C_0); +ADDQ1280($6*SIZE,_ptr__C_1); +._L_18_bodyE:; +SUBQ1280($6,i); +JG ._L_18_bodyB; +ALIGN_4; +._L_18_loopE:; +TESTQ1280($4,i); +JLE ._L_22_loopE; +ALIGN_4; +._L_22_bodyB:; +MOVQ1280(_ptr_B,_ptr__B_0); +VXOR1282(XMM0,XMM0,XMM0); +VXOR1282(XMM1,XMM1,XMM1); +VXOR1282(XMM2,XMM2,XMM2); +VXOR1282(XMM3,XMM3,XMM3); +PREFETCHN1280(3*SIZE(_ptr__C_0),N); +PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N); +PREFETCHN1280(3*SIZE(_ptr__C_1),N); +PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N); +MOVQ1280(_bk_l,k); +SARQ1280($2,k); +JLE ._L_23_loopE; +ALIGN_4; +._L_23_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(2*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM2,XMM2); +VMA1282(XMM14,XMM15,XMM3,XMM3); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1282(2*SIZE(_ptr__B_0),XMM15); +VLD1282(4*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(6*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(3*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM2,XMM2); +VMA1282(XMM14,XMM15,XMM3,XMM3); +PREFETCH01280(176*SIZE(_ptr__A_0),0); +BROAD1282(4*SIZE(_ptr__B_0),XMM15); +VLD1282(8*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(10*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(5*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM2,XMM2); +VMA1282(XMM14,XMM15,XMM3,XMM3); +PREFETCH21280(184*SIZE(_ptr__A_0),2); +BROAD1282(6*SIZE(_ptr__B_0),XMM15); +VLD1282(12*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(14*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(7*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM2,XMM2); +VMA1282(XMM14,XMM15,XMM3,XMM3); +ADDQ1280($16*SIZE,_ptr__A_0); +ADDQ1280($8*SIZE,_ptr__B_0); +._L_23_bodyE:; +DECQ1280(k); +JG ._L_23_bodyB; +ALIGN_4; +._L_23_loopE:; +TESTQ1280($2,_bk_l); +JLE ._L_24_loopE; +ALIGN_4; +._L_24_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(2*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM2,XMM2); +VMA1282(XMM14,XMM15,XMM3,XMM3); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1282(2*SIZE(_ptr__B_0),XMM15); +VLD1282(4*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(6*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +BROAD1282(3*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM2,XMM2); +VMA1282(XMM14,XMM15,XMM3,XMM3); +ADDQ1280($8*SIZE,_ptr__A_0); +ADDQ1280($4*SIZE,_ptr__B_0); +._L_24_loopE:; +TESTQ1280($1,_bk_l); +JLE ._L_25_loopE; +ALIGN_4; +._L_25_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(2*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); 
+BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM13,XMM15,XMM2,XMM2); +VMA1282(XMM14,XMM15,XMM3,XMM3); +ADDQ1280($4*SIZE,_ptr__A_0); +ADDQ1280($2*SIZE,_ptr__B_0); +._L_25_loopE:; +BROAD1282(alpha,XMM4); +VLDU1282(0*SIZE(_ptr__C_0),XMM5); +VMA21282(XMM4,XMM0,XMM5,XMM0); +VSTU1282(XMM5,0*SIZE(_ptr__C_0)); +VLDU1282(2*SIZE(_ptr__C_0),XMM6); +VMA21282(XMM4,XMM1,XMM6,XMM1); +VSTU1282(XMM6,2*SIZE(_ptr__C_0)); +VLDU1282(0*SIZE(_ptr__C_0,LDC,1),XMM7); +VMA21282(XMM4,XMM2,XMM7,XMM2); +VSTU1282(XMM7,0*SIZE(_ptr__C_0,LDC,1)); +VLDU1282(2*SIZE(_ptr__C_0,LDC,1),XMM8); +VMA21282(XMM4,XMM3,XMM8,XMM3); +VSTU1282(XMM8,2*SIZE(_ptr__C_0,LDC,1)); +ADDQ1280($4*SIZE,_ptr__C_0); +ADDQ1280($4*SIZE,_ptr__C_1); +._L_22_loopE:; +TESTQ1280($2,i); +JLE ._L_26_loopE; +ALIGN_4; +._L_26_bodyB:; +MOVQ1280(_ptr_B,_ptr__B_0); +VXOR1282(XMM0,XMM0,XMM0); +VXOR1282(XMM1,XMM1,XMM1); +PREFETCHN1280(3*SIZE(_ptr__C_0),N); +PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N); +PREFETCHN1280(3*SIZE(_ptr__C_1),N); +PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N); +MOVQ1280(_bk_l,k); +SARQ1280($2,k); +JLE ._L_27_loopE; +ALIGN_4; +._L_27_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM1,XMM1); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1282(2*SIZE(_ptr__B_0),XMM15); +VLD1282(2*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +BROAD1282(3*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM1,XMM1); +PREFETCH01280(176*SIZE(_ptr__A_0),0); +BROAD1282(4*SIZE(_ptr__B_0),XMM15); +VLD1282(4*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +BROAD1282(5*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM1,XMM1); +PREFETCH21280(184*SIZE(_ptr__A_0),2); +BROAD1282(6*SIZE(_ptr__B_0),XMM15); +VLD1282(6*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +BROAD1282(7*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM1,XMM1); +ADDQ1280($8*SIZE,_ptr__A_0); +ADDQ1280($8*SIZE,_ptr__B_0); +._L_27_bodyE:; +DECQ1280(k); +JG ._L_27_bodyB; +ALIGN_4; +._L_27_loopE:; +TESTQ1280($2,_bk_l); +JLE ._L_28_loopE; +ALIGN_4; +._L_28_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM1,XMM1); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1282(2*SIZE(_ptr__B_0),XMM15); +VLD1282(2*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +BROAD1282(3*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM1,XMM1); +ADDQ1280($4*SIZE,_ptr__A_0); +ADDQ1280($4*SIZE,_ptr__B_0); +._L_28_loopE:; +TESTQ1280($1,_bk_l); +JLE ._L_29_loopE; +ALIGN_4; +._L_29_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VMA1282(XMM14,XMM15,XMM1,XMM1); +ADDQ1280($2*SIZE,_ptr__A_0); +ADDQ1280($2*SIZE,_ptr__B_0); +._L_29_loopE:; +BROAD1282(alpha,XMM2); +VLDU1282(0*SIZE(_ptr__C_0),XMM3); +VMA21282(XMM2,XMM0,XMM3,XMM0); +VSTU1282(XMM3,0*SIZE(_ptr__C_0)); +VLDU1282(0*SIZE(_ptr__C_0,LDC,1),XMM4); +VMA21282(XMM2,XMM1,XMM4,XMM1); +VSTU1282(XMM4,0*SIZE(_ptr__C_0,LDC,1)); +ADDQ1280($2*SIZE,_ptr__C_0); +ADDQ1280($2*SIZE,_ptr__C_1); +._L_26_loopE:; +TESTQ1280($1,i); +JLE ._L_30_loopE; +ALIGN_4; +._L_30_bodyB:; +MOVQ1280(_ptr_B,_ptr__B_0); +VXOR1281(XMM0,XMM0,XMM0); +VXOR1281(XMM1,XMM1,XMM1); +PREFETCHN1280(3*SIZE(_ptr__C_0),N); 
+PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N); +PREFETCHN1280(3*SIZE(_ptr__C_1),N); +PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N); +MOVQ1280(_bk_l,k); +SARQ1280($2,k); +JLE ._L_31_loopE; +ALIGN_4; +._L_31_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1281(0*SIZE(_ptr__B_0),XMM15); +VLD1281(0*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +BROAD1281(1*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM1,XMM1); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1281(2*SIZE(_ptr__B_0),XMM15); +VLD1281(1*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +BROAD1281(3*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM1,XMM1); +PREFETCH01280(176*SIZE(_ptr__A_0),0); +BROAD1281(4*SIZE(_ptr__B_0),XMM15); +VLD1281(2*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +BROAD1281(5*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM1,XMM1); +PREFETCH21280(184*SIZE(_ptr__A_0),2); +BROAD1281(6*SIZE(_ptr__B_0),XMM15); +VLD1281(3*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +BROAD1281(7*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM1,XMM1); +ADDQ1280($4*SIZE,_ptr__A_0); +ADDQ1280($8*SIZE,_ptr__B_0); +._L_31_bodyE:; +DECQ1280(k); +JG ._L_31_bodyB; +ALIGN_4; +._L_31_loopE:; +TESTQ1280($2,_bk_l); +JLE ._L_32_loopE; +ALIGN_4; +._L_32_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1281(0*SIZE(_ptr__B_0),XMM15); +VLD1281(0*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +BROAD1281(1*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM1,XMM1); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1281(2*SIZE(_ptr__B_0),XMM15); +VLD1281(1*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +BROAD1281(3*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM1,XMM1); +ADDQ1280($2*SIZE,_ptr__A_0); +ADDQ1280($4*SIZE,_ptr__B_0); +._L_32_loopE:; +TESTQ1280($1,_bk_l); +JLE ._L_33_loopE; +ALIGN_4; +._L_33_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1281(0*SIZE(_ptr__B_0),XMM15); +VLD1281(0*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +BROAD1281(1*SIZE(_ptr__B_0),XMM15); +VMA1281(XMM14,XMM15,XMM1,XMM1); +ADDQ1280($1*SIZE,_ptr__A_0); +ADDQ1280($2*SIZE,_ptr__B_0); +._L_33_loopE:; +BROAD1281(alpha,XMM2); +VLDU1281(0*SIZE(_ptr__C_0),XMM3); +VMA21281(XMM2,XMM0,XMM3,XMM0); +VSTU1281(XMM3,0*SIZE(_ptr__C_0)); +VLDU1281(0*SIZE(_ptr__C_0,LDC,1),XMM4); +VMA21281(XMM2,XMM1,XMM4,XMM1); +VSTU1281(XMM4,0*SIZE(_ptr__C_0,LDC,1)); +ADDQ1280($1*SIZE,_ptr__C_0); +ADDQ1280($1*SIZE,_ptr__C_1); +._L_30_loopE:; +MOVQ1280(LDC,%rax); +SALQ1280($1,%rax); +ADDQ1280(%rax,_ptr_C); +MOVQ1280(_bk_l,%rax); +SALQ1280($4,%rax); +ADDQ1280(%rax,_ptr_B); +._L_17_loopE:; +TESTQ1280($1,_bk_j); +JLE ._L_34_loopE; +ALIGN_4; +._L_34_bodyB:; +MOVQ1280(_ptr_A,_ptr__A_0); +MOVQ1280(_ptr_C,_ptr__C_0); +MOVQ1280(_bk_l,%rax); +SALQ1280($3,%rax); +ADDQ1280(%rax,_pre_B); +MOVQ1280(_bk_i,i); +CMPQ1280($6,i); +JL ._L_35_loopE; +._L_35_bodyB:; +MOVQ1280(_ptr_B,_ptr__B_0); +VXOR1282(XMM0,XMM0,XMM0); +VXOR1282(XMM1,XMM1,XMM1); +VXOR1282(XMM2,XMM2,XMM2); +PREFETCHN1280(3*SIZE(_ptr__C_0),N); +PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N); +PREFETCHN1280(3*SIZE(_ptr__C_1),N); +PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N); +MOVQ1280(_bk_l,k); +SARQ1280($2,k); +JLE ._L_36_loopE; +ALIGN_4; +._L_36_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(2*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(4*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +PREFETCH21280(168*SIZE(_ptr__A_0),2); 
+BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VLD1282(6*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(8*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(10*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +PREFETCH01280(176*SIZE(_ptr__A_0),0); +BROAD1282(2*SIZE(_ptr__B_0),XMM15); +VLD1282(12*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(14*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(16*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +PREFETCH21280(184*SIZE(_ptr__A_0),2); +BROAD1282(3*SIZE(_ptr__B_0),XMM15); +VLD1282(18*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(20*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(22*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +ADDQ1280($24*SIZE,_ptr__A_0); +ADDQ1280($4*SIZE,_ptr__B_0); +._L_36_bodyE:; +DECQ1280(k); +JG ._L_36_bodyB; +ALIGN_4; +._L_36_loopE:; +TESTQ1280($2,_bk_l); +JLE ._L_37_loopE; +ALIGN_4; +._L_37_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(2*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(4*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VLD1282(6*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(8*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(10*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +ADDQ1280($12*SIZE,_ptr__A_0); +ADDQ1280($2*SIZE,_ptr__B_0); +._L_37_loopE:; +TESTQ1280($1,_bk_l); +JLE ._L_38_loopE; +ALIGN_4; +._L_38_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM12); +VMA1282(XMM12,XMM15,XMM0,XMM0); +VLD1282(2*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM1,XMM1); +VLD1282(4*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM2,XMM2); +ADDQ1280($6*SIZE,_ptr__A_0); +ADDQ1280($1*SIZE,_ptr__B_0); +._L_38_loopE:; +BROAD1282(alpha,XMM3); +VLDU1282(0*SIZE(_ptr__C_0),XMM4); +VMA21282(XMM3,XMM0,XMM4,XMM0); +VSTU1282(XMM4,0*SIZE(_ptr__C_0)); +VLDU1282(2*SIZE(_ptr__C_0),XMM5); +VMA21282(XMM3,XMM1,XMM5,XMM1); +VSTU1282(XMM5,2*SIZE(_ptr__C_0)); +VLDU1282(4*SIZE(_ptr__C_0),XMM6); +VMA21282(XMM3,XMM2,XMM6,XMM2); +VSTU1282(XMM6,4*SIZE(_ptr__C_0)); +ADDQ1280($6*SIZE,_ptr__C_0); +ADDQ1280($6*SIZE,_ptr__C_1); +._L_35_bodyE:; +SUBQ1280($6,i); +JG ._L_35_bodyB; +ALIGN_4; +._L_35_loopE:; +TESTQ1280($4,i); +JLE ._L_39_loopE; +ALIGN_4; +._L_39_bodyB:; +MOVQ1280(_ptr_B,_ptr__B_0); +VXOR1282(XMM0,XMM0,XMM0); +VXOR1282(XMM1,XMM1,XMM1); +PREFETCHN1280(3*SIZE(_ptr__C_0),N); +PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N); +PREFETCHN1280(3*SIZE(_ptr__C_1),N); +PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N); +MOVQ1280(_bk_l,k); +SARQ1280($2,k); +JLE ._L_40_loopE; +ALIGN_4; +._L_40_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(2*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VLD1282(4*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(6*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +PREFETCH01280(176*SIZE(_ptr__A_0),0); +BROAD1282(2*SIZE(_ptr__B_0),XMM15); +VLD1282(8*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(10*SIZE(_ptr__A_0),XMM14); 
+VMA1282(XMM14,XMM15,XMM1,XMM1); +PREFETCH21280(184*SIZE(_ptr__A_0),2); +BROAD1282(3*SIZE(_ptr__B_0),XMM15); +VLD1282(12*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(14*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +ADDQ1280($16*SIZE,_ptr__A_0); +ADDQ1280($4*SIZE,_ptr__B_0); +._L_40_bodyE:; +DECQ1280(k); +JG ._L_40_bodyB; +ALIGN_4; +._L_40_loopE:; +TESTQ1280($2,_bk_l); +JLE ._L_41_loopE; +ALIGN_4; +._L_41_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(2*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VLD1282(4*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(6*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +ADDQ1280($8*SIZE,_ptr__A_0); +ADDQ1280($2*SIZE,_ptr__B_0); +._L_41_loopE:; +TESTQ1280($1,_bk_l); +JLE ._L_42_loopE; +ALIGN_4; +._L_42_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM13); +VMA1282(XMM13,XMM15,XMM0,XMM0); +VLD1282(2*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM1,XMM1); +ADDQ1280($4*SIZE,_ptr__A_0); +ADDQ1280($1*SIZE,_ptr__B_0); +._L_42_loopE:; +BROAD1282(alpha,XMM2); +VLDU1282(0*SIZE(_ptr__C_0),XMM3); +VMA21282(XMM2,XMM0,XMM3,XMM0); +VSTU1282(XMM3,0*SIZE(_ptr__C_0)); +VLDU1282(2*SIZE(_ptr__C_0),XMM4); +VMA21282(XMM2,XMM1,XMM4,XMM1); +VSTU1282(XMM4,2*SIZE(_ptr__C_0)); +ADDQ1280($4*SIZE,_ptr__C_0); +ADDQ1280($4*SIZE,_ptr__C_1); +._L_39_loopE:; +TESTQ1280($2,i); +JLE ._L_43_loopE; +ALIGN_4; +._L_43_bodyB:; +MOVQ1280(_ptr_B,_ptr__B_0); +VXOR1282(XMM0,XMM0,XMM0); +PREFETCHN1280(3*SIZE(_ptr__C_0),N); +PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N); +PREFETCHN1280(3*SIZE(_ptr__C_1),N); +PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N); +MOVQ1280(_bk_l,k); +SARQ1280($2,k); +JLE ._L_44_loopE; +ALIGN_4; +._L_44_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VLD1282(2*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +PREFETCH01280(176*SIZE(_ptr__A_0),0); +BROAD1282(2*SIZE(_ptr__B_0),XMM15); +VLD1282(4*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +PREFETCH21280(184*SIZE(_ptr__A_0),2); +BROAD1282(3*SIZE(_ptr__B_0),XMM15); +VLD1282(6*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +ADDQ1280($8*SIZE,_ptr__A_0); +ADDQ1280($4*SIZE,_ptr__B_0); +._L_44_bodyE:; +DECQ1280(k); +JG ._L_44_bodyB; +ALIGN_4; +._L_44_loopE:; +TESTQ1280($2,_bk_l); +JLE ._L_45_loopE; +ALIGN_4; +._L_45_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1282(1*SIZE(_ptr__B_0),XMM15); +VLD1282(2*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +ADDQ1280($4*SIZE,_ptr__A_0); +ADDQ1280($2*SIZE,_ptr__B_0); +._L_45_loopE:; +TESTQ1280($1,_bk_l); +JLE ._L_46_loopE; +ALIGN_4; +._L_46_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1282(0*SIZE(_ptr__B_0),XMM15); +VLD1282(0*SIZE(_ptr__A_0),XMM14); +VMA1282(XMM14,XMM15,XMM0,XMM0); +ADDQ1280($2*SIZE,_ptr__A_0); +ADDQ1280($1*SIZE,_ptr__B_0); +._L_46_loopE:; +BROAD1282(alpha,XMM1); +VLDU1282(0*SIZE(_ptr__C_0),XMM2); +VMA21282(XMM1,XMM0,XMM2,XMM0); +VSTU1282(XMM2,0*SIZE(_ptr__C_0)); +ADDQ1280($2*SIZE,_ptr__C_0); 
+ADDQ1280($2*SIZE,_ptr__C_1); +._L_43_loopE:; +TESTQ1280($1,i); +JLE ._L_47_loopE; +ALIGN_4; +._L_47_bodyB:; +MOVQ1280(_ptr_B,_ptr__B_0); +VXOR1281(XMM0,XMM0,XMM0); +PREFETCHN1280(3*SIZE(_ptr__C_0),N); +PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N); +PREFETCHN1280(3*SIZE(_ptr__C_1),N); +PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N); +MOVQ1280(_bk_l,k); +SARQ1280($2,k); +JLE ._L_48_loopE; +ALIGN_4; +._L_48_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1281(0*SIZE(_ptr__B_0),XMM15); +VLD1281(0*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1281(1*SIZE(_ptr__B_0),XMM15); +VLD1281(1*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +PREFETCH01280(176*SIZE(_ptr__A_0),0); +BROAD1281(2*SIZE(_ptr__B_0),XMM15); +VLD1281(2*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +PREFETCH21280(184*SIZE(_ptr__A_0),2); +BROAD1281(3*SIZE(_ptr__B_0),XMM15); +VLD1281(3*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +ADDQ1280($4*SIZE,_ptr__A_0); +ADDQ1280($4*SIZE,_ptr__B_0); +._L_48_bodyE:; +DECQ1280(k); +JG ._L_48_bodyB; +ALIGN_4; +._L_48_loopE:; +TESTQ1280($2,_bk_l); +JLE ._L_49_loopE; +ALIGN_4; +._L_49_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1281(0*SIZE(_ptr__B_0),XMM15); +VLD1281(0*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +PREFETCH21280(168*SIZE(_ptr__A_0),2); +BROAD1281(1*SIZE(_ptr__B_0),XMM15); +VLD1281(1*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +ADDQ1280($2*SIZE,_ptr__A_0); +ADDQ1280($2*SIZE,_ptr__B_0); +._L_49_loopE:; +TESTQ1280($1,_bk_l); +JLE ._L_50_loopE; +ALIGN_4; +._L_50_bodyB:; +PREFETCH01280(160*SIZE(_ptr__A_0),0); +BROAD1281(0*SIZE(_ptr__B_0),XMM15); +VLD1281(0*SIZE(_ptr__A_0),XMM14); +VMA1281(XMM14,XMM15,XMM0,XMM0); +ADDQ1280($1*SIZE,_ptr__A_0); +ADDQ1280($1*SIZE,_ptr__B_0); +._L_50_loopE:; +BROAD1281(alpha,XMM1); +VLDU1281(0*SIZE(_ptr__C_0),XMM2); +VMA21281(XMM1,XMM0,XMM2,XMM0); +VSTU1281(XMM2,0*SIZE(_ptr__C_0)); +ADDQ1280($1*SIZE,_ptr__C_0); +ADDQ1280($1*SIZE,_ptr__C_1); +._L_47_loopE:; +MOVQ1280(LDC,%rax); +ADDQ1280(%rax,_ptr_C); +MOVQ1280(_bk_l,%rax); +SALQ1280($3,%rax); +ADDQ1280(%rax,_ptr_B); +._L_34_loopE:; +vzeroupper +movq 0(%rsp), %rbx; +movq 8(%rsp), %rbp; +movq 16(%rsp), %r12; +movq 24(%rsp), %r13; +movq 32(%rsp), %r14; +movq 40(%rsp), %r15; +addq $STACKSIZE, %rsp; +ret + +EPILOGUE diff --git a/param.h b/param.h index 0c3df6951..c7a9635a6 100644 --- a/param.h +++ b/param.h @@ -330,9 +330,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define XGEMM_DEFAULT_UNROLL_M 1 #else #define SGEMM_DEFAULT_UNROLL_N 2 -#define DGEMM_DEFAULT_UNROLL_N 2 +#define DGEMM_DEFAULT_UNROLL_N 4 #define SGEMM_DEFAULT_UNROLL_M 16 -#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_M 6 #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 4 #define ZGEMM_DEFAULT_UNROLL_M 2 @@ -347,10 +347,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(ARCH_X86_64) #define SGEMM_DEFAULT_P 768 -#define DGEMM_DEFAULT_P 384 +#define DGEMM_DEFAULT_P 480 #else #define SGEMM_DEFAULT_P 448 -#define DGEMM_DEFAULT_P 224 +#define DGEMM_DEFAULT_P 480 #endif #define QGEMM_DEFAULT_P 112 #define CGEMM_DEFAULT_P 224 @@ -359,7 +359,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(ARCH_X86_64) #define SGEMM_DEFAULT_Q 168 -#define DGEMM_DEFAULT_Q 168 +#define DGEMM_DEFAULT_Q 128 #else #define SGEMM_DEFAULT_Q 224 #define DGEMM_DEFAULT_Q 224
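
For reference, the param.h hunks above retune the double-precision blocking to match the new kernel: a 6x4 register tile (DGEMM_DEFAULT_UNROLL_M = 6, DGEMM_DEFAULT_UNROLL_N = 4) with panel sizes DGEMM_DEFAULT_P = 480 and DGEMM_DEFAULT_Q = 128. The plain-C sketch below is not part of the patch; it only models what a 6x4 micro-kernel of this kind computes for one tile of C, assuming A has been packed into 6-row panels (as by a gemm_ncopy_6-style copy routine) and B into 4-column panels, with C stored column-major. Function and parameter names are illustrative assumptions.

/* Minimal model of a 6x4 DGEMM micro-kernel: C(6x4) += alpha * A(6xK) * B(Kx4),
 * with A and B already packed so that consecutive K-steps are contiguous. */
static void dgemm_micro_6x4(long K, double alpha,
                            const double *packed_a,  /* 6*K doubles, 6 per K-step */
                            const double *packed_b,  /* 4*K doubles, 4 per K-step */
                            double *C, long ldc)     /* column-major, leading dim ldc */
{
    double acc[6][4] = {{0.0}};          /* register-blocked accumulators */

    for (long k = 0; k < K; k++)
        for (int i = 0; i < 6; i++)      /* one column of the packed A panel */
            for (int j = 0; j < 4; j++)  /* one row of the packed B panel    */
                acc[i][j] += packed_a[6*k + i] * packed_b[4*k + j];

    for (int j = 0; j < 4; j++)          /* scale by alpha and update C      */
        for (int i = 0; i < 6; i++)
            C[i + j*ldc] += alpha * acc[i][j];
}

In the assembly above, the same accumulators live in XMM registers (two doubles each) and the K loop is unrolled by four with software prefetching; the C model is only meant to make the 6x4 tile shape behind the unroll and P/Q changes concrete.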