From b1eed27a542019a102f97647aa77a219a5124783 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 22 Feb 2021 21:35:42 +0100 Subject: [PATCH] Replace naive omatcopy_rt with 4x4 blocked implementation as suggested by MigMuc in issue 2532 --- kernel/arm/omatcopy_rt.c | 224 ++++++++++++++++++++++++++++++++++----- 1 file changed, 198 insertions(+), 26 deletions(-) diff --git a/kernel/arm/omatcopy_rt.c b/kernel/arm/omatcopy_rt.c index 9d58350d5..d6a3df619 100644 --- a/kernel/arm/omatcopy_rt.c +++ b/kernel/arm/omatcopy_rt.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013, The OpenBLAS Project +Copyright (c) 2021, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,36 +27,208 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -/***************************************************** - * 2014/06/09 Saar - * - * Order rowMajor - * Trans - * -******************************************************/ - int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb) -{ - BLASLONG i,j; - FLOAT *aptr,*bptr; - if ( rows <= 0 ) return(0); - if ( cols <= 0 ) return(0); + BLASLONG i, j; + FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4; + FLOAT *b_offset, *b_offset1, *b_offset2, *b_offset3, *b_offset4; - aptr = a; + if (rows <= 0) return 0; + if (cols <= 0) return 0; - for ( i=0; i> 2); + if (i > 0) { + do { + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset4 = a_offset3 + lda; + a_offset += 4 * lda; + b_offset1 = b_offset; + b_offset2 = b_offset1 + ldb; + b_offset3 = b_offset2 + ldb; + b_offset4 = b_offset3 + ldb; + b_offset += 4; + + j = (cols >> 2); + if (j > 0) { + do { + /* Column 1 of MAT_B */ + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; // Row 1 of MAT_A + *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; + *(b_offset3 + 0) = *(a_offset1 + 2)*alpha; + *(b_offset4 + 0) = *(a_offset1 + 3)*alpha; + + /* Column 2 of MAT_B */ + *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; // Row 2 of MAT_A + *(b_offset2 + 1) = *(a_offset2 + 1)*alpha; + *(b_offset3 + 1) = *(a_offset2 + 2)*alpha; + *(b_offset4 + 1) = *(a_offset2 + 3)*alpha; + + /* Column 3 of MAT_B */ + *(b_offset1 + 2) = *(a_offset3 + 0)*alpha; // Row 3 of MAT_A + *(b_offset2 + 2) = *(a_offset3 + 1)*alpha; + *(b_offset3 + 2) = *(a_offset3 + 2)*alpha; + *(b_offset4 + 2) = *(a_offset3 + 3)*alpha; + + /* Column 4 of MAT_B */ + *(b_offset1 + 3) = *(a_offset4 + 0)*alpha; // Row 4 of MAT_A + *(b_offset2 + 3) = *(a_offset4 + 1)*alpha; + *(b_offset3 + 3) = *(a_offset4 + 2)*alpha; + *(b_offset4 + 3) = *(a_offset4 + 3)*alpha; + + a_offset1 += 4; + a_offset2 += 4; + a_offset3 += 4; + a_offset4 += 4; + b_offset1 += ldb * 4; + b_offset2 += ldb * 4; + b_offset3 += ldb * 4; + b_offset4 += ldb * 4; + + j--; + } while (j > 0); + } // if(j > 0) + + + if (cols & 2) { + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; + + *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; + *(b_offset2 + 1) = *(a_offset2 + 1)*alpha; + + *(b_offset1 + 2) = *(a_offset3 + 0)*alpha; + *(b_offset2 + 2) = *(a_offset3 + 1)*alpha; + + *(b_offset1 + 3) = *(a_offset4 + 0)*alpha; + *(b_offset2 + 3) = *(a_offset4 + 1)*alpha; + + a_offset1 += 2; + a_offset2 += 2; + a_offset3 += 2; + a_offset4 += 2; + + b_offset1 += ldb*2; + + } + + if (cols & 1) { + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + + *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; + + *(b_offset1 + 2) = *(a_offset3 + 0)*alpha; + + *(b_offset1 + 3) = *(a_offset4 + 0)*alpha; + } + + i--; + } while (i > 0); + } + + + if (rows & 2) { + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset += 2 * lda; + + b_offset1 = b_offset; + b_offset2 = b_offset1 + ldb; + b_offset3 = b_offset2 + ldb; + b_offset4 = b_offset3 + ldb; + b_offset += 2; + + j = (cols >> 2); + if (j > 0){ + do { + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; + *(b_offset3 + 0) = *(a_offset1 + 2)*alpha; + *(b_offset4 + 0) = *(a_offset1 + 3)*alpha; + + *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; + *(b_offset2 + 1) = *(a_offset2 + 1)*alpha; + *(b_offset3 + 1) = *(a_offset2 + 2)*alpha; + *(b_offset4 + 1) = *(a_offset2 + 3)*alpha; + + a_offset1 += 4; + a_offset2 += 4; + b_offset1 += ldb * 4; + b_offset2 += ldb * 4; + b_offset3 += ldb * 4; + b_offset4 += ldb * 4; + + j--; + } while (j > 0); + } + + + if (cols & 2){ + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; + + *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; + *(b_offset2 + 1) = *(a_offset2 + 1)*alpha; + + a_offset1 += 2; + a_offset2 += 2; + b_offset1 += ldb*2; + + } + + + if (cols & 1){ + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; + } + } // if (rows & 2) + + + if (rows & 1) { + a_offset1 = a_offset; + a_offset += lda; + + b_offset1 = b_offset; + b_offset2 = b_offset1 + ldb; + b_offset3 = b_offset2 + ldb; + b_offset4 = b_offset3 + ldb; + + j = (cols >> 2); + if (j > 0){ + do { + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; + *(b_offset3 + 0) = *(a_offset1 + 2)*alpha; + *(b_offset4 + 0) = *(a_offset1 + 3)*alpha; + + a_offset1 += 4; + b_offset1 += ldb * 4; + b_offset2 += ldb * 4; + b_offset3 += ldb * 4; + b_offset4 += ldb * 4; + + j--; + } while (j > 0); + } + + if (cols & 2){ + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; + + a_offset1 += 2; + b_offset1 += ldb * 2; + } + + if (cols & 1){ + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + } + } + + return 0; } -