Replace naive omatcopy_rt with 4x4 blocked implementation

as suggested by MigMuc in issue 2532
This commit is contained in:
Martin Kroeker 2021-02-22 21:35:42 +01:00 committed by GitHub
parent 86a5f98e4a
commit b1eed27a54
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed files with 198 additions and 26 deletions

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -27,36 +27,208 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
/*****************************************************
* 2014/06/09 Saar
*
* Order rowMajor
* Trans
*
******************************************************/
/*
 * Scaled out-of-place transpose, row-major storage:
 *
 *     b[j * ldb + i] = alpha * a[i * lda + j]    for 0 <= i < rows, 0 <= j < cols
 *
 * rows, cols : dimensions of the source matrix a (b receives cols x rows).
 * alpha      : scale factor applied to every element.
 * a, lda     : source matrix and the element distance between its consecutive rows.
 * b, ldb     : destination matrix and the element distance between its consecutive rows.
 *
 * Returns 0 in all cases; nothing is written when rows <= 0 or cols <= 0.
 *
 * The source is walked in panels of four rows, each panel in tiles of four
 * columns, so that every pass stores four contiguous elements into each of
 * four destination rows.  Remainders of two and one rows/columns are handled
 * by the progressively smaller tails below.
 */
int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb)
{
    BLASLONG i, j;
    FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4;
    FLOAT *b_offset, *b_offset1, *b_offset2, *b_offset3, *b_offset4;

    if (rows <= 0) return 0;
    if (cols <= 0) return 0;

    a_offset = a;
    b_offset = b;

    /* Panels of four source rows -> four destination columns. */
    for (i = (rows >> 2); i > 0; i--) {
        a_offset1 = a_offset;
        a_offset2 = a_offset1 + lda;
        a_offset3 = a_offset2 + lda;
        a_offset4 = a_offset3 + lda;
        a_offset += 4 * lda;

        b_offset1 = b_offset;
        b_offset2 = b_offset1 + ldb;
        b_offset3 = b_offset2 + ldb;
        b_offset4 = b_offset3 + ldb;
        b_offset += 4;

        /* Full 4x4 tiles. */
        for (j = (cols >> 2); j > 0; j--) {
            /* Row 1 of a -> column 1 of b */
            b_offset1[0] = a_offset1[0] * alpha;
            b_offset2[0] = a_offset1[1] * alpha;
            b_offset3[0] = a_offset1[2] * alpha;
            b_offset4[0] = a_offset1[3] * alpha;

            /* Row 2 of a -> column 2 of b */
            b_offset1[1] = a_offset2[0] * alpha;
            b_offset2[1] = a_offset2[1] * alpha;
            b_offset3[1] = a_offset2[2] * alpha;
            b_offset4[1] = a_offset2[3] * alpha;

            /* Row 3 of a -> column 3 of b */
            b_offset1[2] = a_offset3[0] * alpha;
            b_offset2[2] = a_offset3[1] * alpha;
            b_offset3[2] = a_offset3[2] * alpha;
            b_offset4[2] = a_offset3[3] * alpha;

            /* Row 4 of a -> column 4 of b */
            b_offset1[3] = a_offset4[0] * alpha;
            b_offset2[3] = a_offset4[1] * alpha;
            b_offset3[3] = a_offset4[2] * alpha;
            b_offset4[3] = a_offset4[3] * alpha;

            a_offset1 += 4;
            a_offset2 += 4;
            a_offset3 += 4;
            a_offset4 += 4;

            b_offset1 += ldb * 4;
            b_offset2 += ldb * 4;
            b_offset3 += ldb * 4;
            b_offset4 += ldb * 4;
        }

        /* Two leftover source columns: a 4x2 tile. */
        if (cols & 2) {
            b_offset1[0] = a_offset1[0] * alpha;
            b_offset2[0] = a_offset1[1] * alpha;

            b_offset1[1] = a_offset2[0] * alpha;
            b_offset2[1] = a_offset2[1] * alpha;

            b_offset1[2] = a_offset3[0] * alpha;
            b_offset2[2] = a_offset3[1] * alpha;

            b_offset1[3] = a_offset4[0] * alpha;
            b_offset2[3] = a_offset4[1] * alpha;

            a_offset1 += 2;
            a_offset2 += 2;
            a_offset3 += 2;
            a_offset4 += 2;

            /* Only b_offset1 is read by the one-column tail below. */
            b_offset1 += ldb * 2;
        }

        /* One leftover source column: a 4x1 tile. */
        if (cols & 1) {
            b_offset1[0] = a_offset1[0] * alpha;
            b_offset1[1] = a_offset2[0] * alpha;
            b_offset1[2] = a_offset3[0] * alpha;
            b_offset1[3] = a_offset4[0] * alpha;
        }
    }

    /* Two leftover source rows. */
    if (rows & 2) {
        a_offset1 = a_offset;
        a_offset2 = a_offset1 + lda;
        a_offset += 2 * lda;

        b_offset1 = b_offset;
        b_offset2 = b_offset1 + ldb;
        b_offset3 = b_offset2 + ldb;
        b_offset4 = b_offset3 + ldb;
        b_offset += 2;

        for (j = (cols >> 2); j > 0; j--) {
            b_offset1[0] = a_offset1[0] * alpha;
            b_offset2[0] = a_offset1[1] * alpha;
            b_offset3[0] = a_offset1[2] * alpha;
            b_offset4[0] = a_offset1[3] * alpha;

            b_offset1[1] = a_offset2[0] * alpha;
            b_offset2[1] = a_offset2[1] * alpha;
            b_offset3[1] = a_offset2[2] * alpha;
            b_offset4[1] = a_offset2[3] * alpha;

            a_offset1 += 4;
            a_offset2 += 4;

            b_offset1 += ldb * 4;
            b_offset2 += ldb * 4;
            b_offset3 += ldb * 4;
            b_offset4 += ldb * 4;
        }

        if (cols & 2) {
            b_offset1[0] = a_offset1[0] * alpha;
            b_offset2[0] = a_offset1[1] * alpha;

            b_offset1[1] = a_offset2[0] * alpha;
            b_offset2[1] = a_offset2[1] * alpha;

            a_offset1 += 2;
            a_offset2 += 2;

            b_offset1 += ldb * 2;
        }

        if (cols & 1) {
            b_offset1[0] = a_offset1[0] * alpha;
            b_offset1[1] = a_offset2[0] * alpha;
        }
    }

    /* One leftover source row. */
    if (rows & 1) {
        a_offset1 = a_offset;

        b_offset1 = b_offset;
        b_offset2 = b_offset1 + ldb;
        b_offset3 = b_offset2 + ldb;
        b_offset4 = b_offset3 + ldb;

        for (j = (cols >> 2); j > 0; j--) {
            b_offset1[0] = a_offset1[0] * alpha;
            b_offset2[0] = a_offset1[1] * alpha;
            b_offset3[0] = a_offset1[2] * alpha;
            b_offset4[0] = a_offset1[3] * alpha;

            a_offset1 += 4;

            b_offset1 += ldb * 4;
            b_offset2 += ldb * 4;
            b_offset3 += ldb * 4;
            b_offset4 += ldb * 4;
        }

        if (cols & 2) {
            b_offset1[0] = a_offset1[0] * alpha;
            b_offset2[0] = a_offset1[1] * alpha;

            a_offset1 += 2;

            b_offset1 += ldb * 2;
        }

        if (cols & 1) {
            b_offset1[0] = a_offset1[0] * alpha;
        }
    }

    return 0;
}