diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 18a218cec..02d15b7f3 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -117,5 +117,9 @@ In chronological order: * Isaac Dunham * [2014-08-03] Fixed link error on Linux/musl +* Dave Nuechterlein + * [2014-10-10] trmm and sgemm kernels (optimized for APM's X-Gene 1). + ARMv8 support. + * [Your name or handle] <[email or website]> * [Date] [Brief summary of your changes] diff --git a/common_arm64.h b/common_arm64.h index 8a66a1702..4855493da 100644 --- a/common_arm64.h +++ b/common_arm64.h @@ -119,9 +119,9 @@ static inline int blas_quickdivide(blasint x, blasint y){ } #if defined(DOUBLE) -#define GET_IMAGE(res) __asm__ __volatile__("vstr.f64 d1, %0" : "=m"(res) : : "memory") +#define GET_IMAGE(res) __asm__ __volatile__("str d1, %0" : "=m"(res) : : "memory") #else -#define GET_IMAGE(res) __asm__ __volatile__("vstr.f32 s1, %0" : "=m"(res) : : "memory") +#define GET_IMAGE(res) __asm__ __volatile__("str s1, %0" : "=m"(res) : : "memory") #endif #define GET_IMAGE_CANCEL @@ -138,7 +138,6 @@ static inline int blas_quickdivide(blasint x, blasint y){ #if defined(ASSEMBLER) && !defined(NEEDPARAM) #define PROLOGUE \ - .arm ;\ .global REALNAME ;\ .func REALNAME ;\ REALNAME: diff --git a/cpuid_arm64.c b/cpuid_arm64.c new file mode 100644 index 000000000..c7a27f891 --- /dev/null +++ b/cpuid_arm64.c @@ -0,0 +1,217 @@ +/************************************************************************** + Copyright (c) 2013, The OpenBLAS Project + All rights reserved. + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + 2. 
Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ + +#include + +#define CPU_UNKNOWN 0 +#define CPU_ARMV8 1 + +static char *cpuname[] = { + "UNKOWN", + "ARMV8" +}; + + +int get_feature(char *search) +{ + +#ifdef linux + FILE *infile; + char buffer[2048], *p,*t; + p = (char *) NULL ; + + infile = fopen("/proc/cpuinfo", "r"); + + while (fgets(buffer, sizeof(buffer), infile)) + { + + if (!strncmp("Features", buffer, 8)) + { + p = strchr(buffer, ':') + 2; + break; + } + } + + fclose(infile); + + + if( p == NULL ) return; + + t = strtok(p," "); + while( t = strtok(NULL," ")) + { + if (!strcmp(t, search)) { return(1); } + } + +#endif + return(0); +} + + +int detect(void) +{ + +#ifdef linux + + FILE *infile; + char buffer[512], *p; + p = (char *) NULL ; + + infile = fopen("/proc/cpuinfo", "r"); + + while (fgets(buffer, sizeof(buffer), infile)) + { + + if ((!strncmp("model name", buffer, 10)) || (!strncmp("Processor", buffer, 9))) + { + p = strchr(buffer, ':') + 2; + break; + } + } + + fclose(infile); + + if(p != NULL) + { + + if (strstr(p, "AArch64")) + { + return CPU_ARMV8; + + } + + + } +#endif + + return CPU_UNKNOWN; +} + +char *get_corename(void) +{ + return cpuname[detect()]; +} + +void get_architecture(void) +{ + printf("ARM"); +} + +void get_subarchitecture(void) +{ + int d = detect(); + switch (d) + { + + case CPU_ARMV8: + printf("ARMV8"); + break; + + default: + printf("UNKNOWN"); + break; + } +} + +void get_subdirname(void) +{ + printf("arm64"); +} + +void get_cpuconfig(void) +{ + + int d = detect(); + switch (d) + { + + case CPU_ARMV8: + printf("#define ARMV8\n"); + printf("#define L1_DATA_SIZE 32768\n"); + printf("#define L1_DATA_LINESIZE 64\n"); + printf("#define L2_SIZE 262144\n"); + printf("#define L2_LINESIZE 64\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + printf("#define L2_ASSOCIATIVE 4\n"); + break; + + + } +} + + +void get_libname(void) +{ + + int d = detect(); + switch (d) + { 
+ + case CPU_ARMV8: + printf("armv8\n"); + break; + + } +} + + +void get_features(void) +{ + +#ifdef linux + FILE *infile; + char buffer[2048], *p,*t; + p = (char *) NULL ; + + infile = fopen("/proc/cpuinfo", "r"); + + while (fgets(buffer, sizeof(buffer), infile)) + { + + if (!strncmp("Features", buffer, 8)) + { + p = strchr(buffer, ':') + 2; + break; + } + } + + fclose(infile); + + + if( p == NULL ) return; + + t = strtok(p," "); + while( t = strtok(NULL," ")) + { + } + +#endif + return; +} + + diff --git a/getarch.c b/getarch.c index 3e9914259..ded347ecc 100644 --- a/getarch.c +++ b/getarch.c @@ -746,12 +746,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SUBARCHITECTURE "ARMV8" #define SUBDIRNAME "arm64" #define ARCHCONFIG "-DARMV8 " \ - "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ - "-DL2_SIZE=512488 -DL2_LINESIZE=32 " \ - "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " \ - "-DHAVE_VFP -DHAVE_VFPV3 -DHAVE_VFPV4" + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " #define LIBNAME "armv8" -#define CORENAME "ARMV8" +#define CORENAME "XGENE1" #else #endif @@ -801,6 +800,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define OPENBLAS_SUPPORTED #endif +#ifdef __aarch64__ +#include "cpuid_arm64.c" +#define OPENBLAS_SUPPORTED +#endif + #ifndef OPENBLAS_SUPPORTED #error "This arch/CPU is not supported by OpenBLAS." 
@@ -856,7 +860,7 @@ int main(int argc, char *argv[]){ #ifdef FORCE printf("CORE=%s\n", CORENAME); #else -#if defined(__i386__) || defined(__x86_64__) || defined(POWER) || defined(__mips__) || defined(__arm__) +#if defined(__i386__) || defined(__x86_64__) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) printf("CORE=%s\n", get_corename()); #endif #endif @@ -956,7 +960,7 @@ int main(int argc, char *argv[]){ #ifdef FORCE printf("#define CHAR_CORENAME \"%s\"\n", CORENAME); #else -#if defined(__i386__) || defined(__x86_64__) || defined(POWER) || defined(__mips__) || defined(__arm__) +#if defined(__i386__) || defined(__x86_64__) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) printf("#define CHAR_CORENAME \"%s\"\n", get_corename()); #endif #endif diff --git a/kernel/arm64/KERNEL.ARMV8 b/kernel/arm64/KERNEL.ARMV8 index 27157dad1..4fc0968cd 100644 --- a/kernel/arm64/KERNEL.ARMV8 +++ b/kernel/arm64/KERNEL.ARMV8 @@ -80,14 +80,14 @@ DGEMVTKERNEL = ../arm/gemv_t.c CGEMVTKERNEL = ../arm/zgemv_t.c ZGEMVTKERNEL = ../arm/zgemv_t.c -STRMMKERNEL = ../generic/trmmkernel_2x2.c +STRMMKERNEL = ../generic/trmmkernel_4x4.c DTRMMKERNEL = ../generic/trmmkernel_2x2.c CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c -SGEMMKERNEL = ../generic/gemmkernel_2x2.c -SGEMMONCOPY = ../generic/gemm_ncopy_2.c -SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SGEMMKERNEL = sgemm_kernel_4x4.S +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c SGEMMONCOPYOBJ = sgemm_oncopy.o SGEMMOTCOPYOBJ = sgemm_otcopy.o diff --git a/kernel/arm64/sgemm_kernel_4x4.S b/kernel/arm64/sgemm_kernel_4x4.S new file mode 100644 index 000000000..78633297f --- /dev/null +++ b/kernel/arm64/sgemm_kernel_4x4.S @@ -0,0 +1,1327 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/11/23 Saar
+* 	BLASTEST 		: OK
+* 	CTEST			: OK
+* 	TEST			: OK
+*
+*
+* 2013/11/02 Saar
+*	UNROLL_N		4
+*	UNROLL_M		4
+*	DGEMM_P			128
+*	DGEMM_Q			240
+*	DGEMM_R			12288
+*	A_PRE			128
+*	B_PRE			128
+*	C_PRE			32
+*
+*	Performance on Odroid U2:
+*
+*		3072x3072	1 Core:		2.62 GFLOPS	ATLAS: 2.69	GFLOPS
+*		3072x3072	2 Cores:	5.23 GFLOPS	ATLAS: 5.27	GFLOPS
+*		3072x3072	3 Cores:	7.78 GFLOPS	ATLAS: 7.87	GFLOPS
+*		3072x3072	4 Cores:       10.10 GFLOPS	ATLAS: 9.98	GFLOPS
+**************************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+/*                   X0          X1          X2          s0        X3        x4       x5           x6*/
+/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc*/
+// Adds alpha * (A panel * B panel) to C. Works on 4x4 tiles, with 2- and 1-wide
+// tails in both the M and the N direction, and returns 0 in x0.
+#define origM		x0
+#define origN		x1
+#define origK		x2
+#define origPA		x3
+#define origPB		x4
+#define pC		x5
+#define LDC		x6
+#define offset		x7
+#define counterL	x8
+#define counterI	x9
+#define pB		x10
+#define counterJ	x11
+#define tempALPHA	x12
+#define pCRow0		x13
+#define pCRow1		x14
+#define pCRow2		x15
+#define pA		x16
+
+// 00 origM
+// 01 origN
+// 02 origK
+// 03 origPA
+// 04 origPB
+// 05 pC
+// 06 origLDC -> LDC
+// 07 offset
+// 08 counterL
+// 09 counterI
+// 10 pB
+// 11 counterJ
+// 12 tempALPHA
+// 13 pCRow0
+// 14 pCRow1
+// 15 pCRow2
+// 16 pA
+// 17
+// 18 must save
+// 19 must save
+// 20 must save
+// 21 must save
+// 22 must save
+// 23 must save
+// 24 must save
+// 25 must save
+// 26 must save
+// 27 must save
+// 28 must save
+// 29 frame
+// 30 link
+// 31 sp
+
+//v00 orig ALPHA -> a00
+//v01 a01
+//v02 a02
+//v03 a03
+//v04 a10
+//v05 a11
+//v06 a12
+//v07 a13
+//v08 must save b00
+//v09 must save b01
+//v10 must save b02
+//v11 must save b03
+//v12 must save b10
+//v13 must save b11
+//v14 must save b12
+//v15 must save b13
+//v16 must save C00
+//v17 must save C01
+//v18 C02
+//v19 C03
+//v20 C10
+//v21 C11
+//v22 C12
+//v23 C13
+//v24 C20
+//v25 C21
+//v26 C22
+//v27 C23
+//v28 C30
+//v29 C31
+//v30 C32
+//v31 C33
+
+// add sp,sp,#-(6*16)
+// stp x18,x19,[sp,#(0*16)]
+// stp x20,x21,[sp,#(1*16)]
+
+
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+
+.macro INIT4x4
+	// zero the accumulators with movi: x - x would give NaN if the register held NaN or Inf
+	movi		v16.4s, #0
+	movi		v20.4s, #0
+	movi		v24.4s, #0
+	movi		v28.4s, #0
+
+.endm
+
+.macro KERNEL4x4_I
+	// first K step: start the accumulators with a plain multiply, then preload the next operands
+	ld1	{v8.2s},[pB],#8
+	ld1	{v10.2s},[pB],#8
+	ld1	{v0.4s},[pA],#16
+
+	fmul	v16.4s, v0.4s, v8.s[0]
+	fmul	v20.4s, v0.4s, v8.s[1]
+	fmul	v24.4s, v0.4s, v10.s[0]
+	fmul	v28.4s, v0.4s, v10.s[1]
+
+	ld1	{v12.2s},[pB],#8	// for next round
+	ld1	{v14.2s},[pB],#8	// for next round
+	ld1	{v4.4s},[pA],#16	// for next round
+
+
+.endm
+
+
+.macro KERNEL4x4_M2
+	// accumulate with v4/v12/v14 loaded by the previous step, then load v0/v8/v10 for the next one
+	fmla	v16.4s, v4.4s, v12.s[0]
+	fmla	v20.4s, v4.4s, v12.s[1]
+	fmla	v24.4s, v4.4s, v14.s[0]
+	fmla	v28.4s, v4.4s, v14.s[1]
+
+	ld1	{v8.2s},[pB],#8
+	ld1	{v10.2s},[pB],#8
+	ld1	{v0.4s},[pA],#16
+
+.endm
+
+
+.macro KERNEL4x4_M1
+	// accumulate with v0/v8/v10 loaded by the previous step, then load v4/v12/v14 for the next one
+	fmla	v16.4s, v0.4s, v8.s[0]
+	fmla	v20.4s, v0.4s, v8.s[1]
+	fmla	v24.4s, v0.4s, v10.s[0]
+	fmla	v28.4s, v0.4s, v10.s[1]
+
+	ld1	{v12.2s},[pB],#8
+	ld1	{v14.2s},[pB],#8
+	ld1	{v4.4s},[pA],#16
+
+.endm
+
+
+
+.macro KERNEL4x4_E
+	// last step of the pipelined sequence: accumulate only, no further loads
+	fmla	v16.4s, v4.4s, v12.s[0]
+	fmla	v20.4s, v4.4s, v12.s[1]
+	fmla	v24.4s, v4.4s, v14.s[0]
+	fmla	v28.4s, v4.4s, v14.s[1]
+
+.endm
+
+
+
+
+.macro KERNEL4x4_SUB
+	// one self-contained K step: load, then accumulate
+	ld1	{v8.2s},[pB],#8
+	ld1	{v10.2s},[pB],#8
+	ld1	{v0.4s} , [pA],#16
+
+	fmla	v16.4s, v0.4s, v8.s[0]
+	fmla	v20.4s, v0.4s, v8.s[1]
+	fmla	v24.4s, v0.4s, v10.s[0]
+	fmla	v28.4s, v0.4s, v10.s[1]
+
+.endm
+
+
+
+
+.macro SAVE4x4
+	// C += alpha * accumulators for four lines of C that are LDC bytes apart
+	add	pCRow1, pCRow0, LDC	// create a second row pointer from the first row pointer
+	mov	v0.d[0], tempALPHA
+
+	ld1	{v8.4s},[pCRow0]	// load 4 values of C from first row
+	fmla	v8.4s ,v16.4s,v0.s[0]
+	st1	{v8.4s},[pCRow0],#16	// store C from first row
+
+	ld1	{v12.4s},[pCRow1]	// load 4 values of C from second row
+	fmla	v12.4s ,v20.4s,v0.s[0]
+	st1	{v12.4s},[pCRow1]	// store C from second row
+
+	add	pCRow2, pCRow1, LDC	// Row2 points to third row
+
+	ld1	{v8.4s},[pCRow2]	// load 4 values of C from third row
+	fmla	v8.4s ,v24.4s,v0.s[0]
+	st1	{v8.4s} ,[pCRow2]	// store C from third row
+
+	add	pCRow1, pCRow2 , LDC	// row1 points to fourth row
+
+	ld1	{v12.4s},[pCRow1]	// load 4 values of C from fourth row
+	fmla	v12.4s ,v28.4s,v0.s[0]
+	st1	{v12.4s},[pCRow1]	// store fourth row
+
+.endm
+
+/******************************************************************************/
+
+.macro INIT2x4
+	// clear s16 from wzr (s16 - s16 is NaN when s16 holds NaN or Inf), then copy the zero
+	fmov		s16, wzr
+	fmov		s17, s16
+	fmov		s20, s16
+	fmov		s21, s16
+	fmov		s24, s16
+	fmov		s25, s16
+	fmov		s28, s16
+	fmov		s29, s16
+
+.endm
+
+
+
+.macro KERNEL2x4_SUB
+
+	ldr	s8 , [ pB ]
+	ldr	s9 , [ pB, #4 ]
+	ldr	s10, [ pB, #8 ]
+	ldr	s11, [ pB, #12 ]
+
+	ldr	s0 , [ pA ]
+	ldr	s1 , [ pA, #4 ]
+
+	fmadd	s16 , s0,  s8, s16
+	fmadd	s17 , s1,  s8, s17
+
+	fmadd	s20 , s0,  s9, s20
+	fmadd	s21 , s1,  s9, s21
+
+	fmadd	s24 , s0,  s10, s24
+	fmadd	s25 , s1,  s10, s25
+
+	fmadd	s28 , s0,  s11, s28
+	fmadd	s29 , s1,  s11, s29
+	add	pA , pA, #8
+	add	pB , pB, #16
+
+.endm
+
+	#define F1ST( op1, op2, op3) fmadd op1, op2, op3, op1
+	#define L1ST( op1, op2, op3) ldr op1, [op2, op3]
+
+.macro SAVE2x4
+
+	add	pCRow1 , pCRow0, LDC
+	add	pCRow2 , pCRow1, LDC
+	mov	v0.d[0], tempALPHA
+
+	L1ST (	s8,pCRow0, #0)
+	L1ST (	s9,pCRow0, #4 )
+
+	F1ST (	s8 , s0 , s16)
+	F1ST (	s9 , s0 , s17)
+
+	str	s8 , [pCRow0, #0]
+	str	s9 , [pCRow0, #4 ]
+
+	ldr	s12, [pCRow1, #0]
+	ldr	s13, [pCRow1, #4 ]
+
+	F1ST (	s12, s0 , s20)
+	F1ST (	s13, s0 , s21)
+
+	str	s12, [pCRow1, #0]
+	str	s13, [pCRow1, #4 ]
+
+	L1ST (	s8,pCRow2 , #0)
+	L1ST (	s9,pCRow2 , #4 )
+
+	F1ST (	s8 , s0 , s24)
+	F1ST (	s9 , s0 , s25)
+
+	str	s8 , [pCRow2 , #0]
+	str	s9 , [pCRow2 , #4 ]
+
+	add	pCRow1, pCRow2 , LDC
+
+	ldr	s12, [pCRow1, #0]
+	ldr	s13, [pCRow1, #4 ]
+
+	F1ST (	s12, s0 , s28)
+	F1ST (	s13, s0 , s29)
+
+	str	s12, [pCRow1, #0]
+	str	s13, [pCRow1, #4 ]
+
+	add	pCRow0, pCRow0, #8
+
+.endm
+
+
+/******************************************************************************/
+
+.macro INIT1x4
+	// clear s16 from wzr (s16 - s16 is NaN when s16 holds NaN or Inf), then copy the zero
+	fmov		s16, wzr
+	fmov		s20, s16
+	fmov		s24, s16
+	fmov		s28, s16
+
+.endm
+
+
+
+.macro KERNEL1x4_SUB
+
+	ldr	s8 , [ pB ]
+	ldr	s9 , [ pB, #4 ]
+	ldr	s10, [ pB, #8 ]
+	ldr	s11, [ pB, #12 ]
+
+	ldr	s0 , [ pA ]
+
+	fmadd	s16 , s0,  s8, s16
+	fmadd	s20 , s0,  s9, s20
+	fmadd	s24 , s0,  s10, s24
+	fmadd	s28 , s0,  s11, s28
+
+	add	pA , pA, #4
+	add	pB , pB, #16
+
+.endm
+
+.macro SAVE1x4
+
+	add	pCRow1 , pCRow0, LDC
+	add	pCRow2 , pCRow1, LDC
+
+	mov	v0.d[0], tempALPHA
+
+	L1ST (	s8,pCRow0, #0)
+	F1ST (	s8 , s0 , s16)
+	str	s8 , [pCRow0, #0]
+
+	L1ST (	s12,pCRow1, #0)
+	F1ST (	s12, s0 , s20)
+	str	s12, [pCRow1, #0]
+
+	L1ST (	s8,pCRow2 , #0)
+	F1ST (	s8 , s0 , s24)
+	str	s8 , [pCRow2 , #0]
+
+	add	pCRow1, pCRow2 , LDC
+
+	L1ST (	s12,pCRow1, #0)
+	F1ST (	s12, s0 , s28)
+	str	s12, [pCRow1, #0]
+
+	add	pCRow0, pCRow0, #4
+
+.endm
+
+/******************************************************************************/
+/******************************************************************************/
+
+.macro INIT4x2
+	// clear s16 from wzr (s16 - s16 is NaN when s16 holds NaN or Inf), then copy the zero
+	fmov		s16, wzr
+	fmov		s17, s16
+	fmov		s18, s16
+	fmov		s19, s16
+	fmov		s20, s16
+	fmov		s21, s16
+	fmov		s22, s16
+	fmov		s23, s16
+
+.endm
+
+
+
+.macro KERNEL4x2_SUB
+
+	ldr	s8 , [ pB ]
+	ldr	s9 , [ pB, #4 ]
+
+	ldr	s0 , [ pA ]
+	ldr	s1 , [ pA, #4 ]
+	ldr	s2 , [ pA, #8 ]
+	ldr	s3 , [ pA, #12 ]
+
+	fmadd	s16 , s0,  s8, s16
+	fmadd	s17 , s1,  s8, s17
+	fmadd	s18 , s2,  s8, s18
+	fmadd	s19 , s3,  s8, s19
+
+	fmadd	s20 , s0,  s9, s20
+	fmadd	s21 , s1,  s9, s21
+	fmadd	s22 , s2,  s9, s22
+	fmadd	s23 , s3,  s9, s23
+
+	add	pA , pA, #16
+	add	pB , pB, #8
+
+.endm
+
+.macro SAVE4x2
+
+	add	pCRow1 , pCRow0, LDC
+
+	mov	v0.d[0], tempALPHA
+
+	L1ST (	s8,pCRow0, #0)
+	L1ST (	s9,pCRow0, #4 )
+	L1ST (	s10,pCRow0, #8 )
+	L1ST (	s11,pCRow0, #12 )
+
+	F1ST (	s8 , s0 , s16)
+	F1ST (	s9 , s0 , s17)
+	F1ST (	s10, s0 , s18)
+	F1ST (	s11, s0 , s19)
+
+	str	s8 , [pCRow0]
+	str	s9 , [pCRow0, #4 ]
+	str	s10, [pCRow0, #8 ]
+	str	s11, [pCRow0, #12 ]
+
+	L1ST (	s12,pCRow1, #0)
+	L1ST (	s13,pCRow1, #4 )
+	L1ST (	s14,pCRow1, #8 )
+	L1ST (	s15,pCRow1, #12 )
+
+	F1ST (	s12, s0 , s20)
+	F1ST (	s13, s0 , s21)
+	F1ST (	s14, s0 , s22)
+	F1ST (	s15, s0 , s23)
+
+	str	s12, [pCRow1]
+	str	s13, [pCRow1, #4 ]
+	str	s14, [pCRow1, #8 ]
+	str	s15, [pCRow1, #12 ]
+
+	add	pCRow0, pCRow0, #16
+
+.endm
+
+
+/******************************************************************************/
+
+.macro INIT2x2
+	// clear s16 from wzr (s16 - s16 is NaN when s16 holds NaN or Inf), then copy the zero
+	fmov		s16, wzr
+	fmov		s17, s16
+	fmov		s20, s16
+	fmov		s21, s16
+
+.endm
+
+
+
+.macro KERNEL2x2_SUB
+
+	ldr	s8 , [ pB ]
+	ldr	s9 , [ pB, #4 ]
+
+	ldr	s0 , [ pA ]
+	ldr	s1 , [ pA, #4 ]
+
+	fmadd	s16 , s0,  s8, s16
+	fmadd	s17 , s1,  s8, s17
+
+	fmadd	s20 , s0,  s9, s20
+	fmadd	s21 , s1,  s9, s21
+
+	add	pA , pA, #8
+	add	pB , pB, #8
+
+.endm
+
+.macro SAVE2x2
+
+	add	pCRow1 , pCRow0, LDC
+
+	mov	v0.d[0], tempALPHA
+
+	L1ST (	s8,pCRow0, #0 )
+	L1ST (	s9,pCRow0, #4 )
+
+	F1ST (	s8 , s0 , s16)
+	F1ST (	s9 , s0 , s17)
+
+	str	s8 , [pCRow0]
+	str	s9 , [pCRow0, #4 ]
+
+	L1ST (	s12,pCRow1, #0 )
+	L1ST (	s13,pCRow1, #4 )
+
+	F1ST (	s12, s0 , s20)
+	F1ST (	s13, s0 , s21)
+
+	str	s12, [pCRow1]
+	str	s13, [pCRow1, #4 ]
+
+	add	pCRow0, pCRow0, #8
+
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x2
+	// clear s16 from wzr (s16 - s16 is NaN when s16 holds NaN or Inf), then copy the zero
+	fmov		s16, wzr
+	fmov		s20, s16
+
+.endm
+
+
+
+.macro KERNEL1x2_SUB
+
+	ldr	s8 , [ pB ]
+	ldr	s9 , [ pB, #4 ]
+
+	ldr	s0 , [ pA ]
+	fmadd	s16 , s0,  s8, s16
+	fmadd	s20 , s0,  s9, s20
+
+	add	pA , pA, #4
+	add	pB , pB, #8
+
+.endm
+
+.macro SAVE1x2
+
+	add	pCRow1 , pCRow0, LDC
+
+	mov	v0.d[0], tempALPHA
+
+	L1ST (	s8,pCRow0, #0)
+	F1ST (	s8 , s0 , s16)
+	str	s8 , [pCRow0]
+
+	L1ST (	s12,pCRow1, #0)
+	F1ST (	s12, s0 , s20)
+	str	s12, [pCRow1]
+
+	add	pCRow0, pCRow0, #4
+
+.endm
+
+/******************************************************************************/
+/******************************************************************************/
+
+.macro INIT4x1
+	// clear s16 from wzr (s16 - s16 is NaN when s16 holds NaN or Inf), then copy the zero
+	fmov		s16, wzr
+	fmov		s17, s16
+	fmov		s18, s16
+	fmov		s19, s16
+
+.endm
+
+
+
+.macro KERNEL4x1_SUB
+
+	ldr	s8 , [ pB ]
+
+	ldr	s0 , [ pA ]
+	ldr	s1 , [ pA, #4 ]
+	ldr	s2 , [ pA, #8 ]
+	ldr	s3 , [ pA, #12 ]
+
+	fmadd	s16 , s0,  s8, s16
+	fmadd	s17 , s1,  s8, s17
+	fmadd	s18 , s2,  s8, s18
+	fmadd	s19 , s3,  s8, s19
+
+	add	pA , pA, #16
+	add	pB , pB, #4
+
+.endm
+
+.macro SAVE4x1
+
+
+	mov	v0.d[0], tempALPHA
+
+	L1ST (	s8,pCRow0, #0 )
+	L1ST (	s9,pCRow0, #4 )
+	L1ST (	s10,pCRow0, #8 )
+	L1ST (	s11,pCRow0, #12 )
+
+	F1ST (	s8 , s0 , s16)
+	F1ST (	s9 , s0 , s17)
+	F1ST (	s10, s0 , s18)
+	F1ST (	s11, s0 , s19)
+
+	str	s8 , [pCRow0]
+	str	s9 , [pCRow0, #4 ]
+	str	s10, [pCRow0, #8 ]
+	str	s11, [pCRow0, #12 ]
+
+	add	pCRow0, pCRow0, #16
+
+.endm
+
+
+
+
+/******************************************************************************/
+
+.macro INIT2x1
+	// clear s16 from wzr (s16 - s16 is NaN when s16 holds NaN or Inf), then copy the zero
+	fmov		s16, wzr
+	fmov		s17, s16
+
+.endm
+
+
+
+.macro KERNEL2x1_SUB
+
+	ldr	s8 , [ pB ]
+
+	ldr	s0 , [ pA ]
+	ldr	s1 , [ pA, #4 ]
+
+	fmadd	s16 , s0,  s8, s16
+	fmadd	s17 , s1,  s8, s17
+
+	add	pA , pA, #8
+	add	pB , pB, #4
+
+.endm
+
+.macro SAVE2x1
+
+
+	mov	v0.d[0], tempALPHA
+
+	L1ST (	s8,pCRow0, #0 )
+	L1ST (	s9,pCRow0, #4 )
+
+	F1ST (	s8 , s0 , s16)
+	F1ST (	s9 , s0 , s17)
+
+	str	s8 , [pCRow0]
+	str	s9 , [pCRow0, #4 ]
+
+	add	pCRow0, pCRow0, #8
+
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x1
+	// clear s16 from wzr (s16 - s16 is NaN when s16 holds NaN or Inf)
+	fmov		s16, wzr
+
+.endm
+
+
+
+.macro KERNEL1x1_SUB
+
+	ldr	s8 , [ pB ]
+
+	ldr	s0 , [ pA ]
+
+	fmadd	s16 , s0,  s8, s16
+
+	add	pA , pA, #4
+	add	pB , pB, #4
+
+.endm
+
+.macro SAVE1x1
+
+
+	mov	v0.d[0], tempALPHA
+
+	L1ST (	s8,pCRow0, #0 )
+	F1ST (	s8 , s0 , s16)
+	str	s8 , [pCRow0]
+
+	add	pCRow0, pCRow0, #4
+
+.endm
+
+
+
+
+
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
+	PROLOGUE
+
+	.align 5
+	add	sp,sp,#-(5*16)
+	stp	d8,d9,[sp,#(0*16)]
+	stp	d10,d11,[sp,#(1*16)]
+	stp	d12,d13,[sp,#(2*16)]
+	stp	d14,d15,[sp,#(3*16)]
+	stp	d16,d17,[sp,#(4*16)]
+
+	mov	tempALPHA, v0.d[0]			// keep alpha in a general register, v0 is reused for A values
+	lsl	LDC, LDC, #2				// ldc = ldc * 4
+
+	mov	pB, origPB
+
+	mov	counterJ, origN
+	asr 	counterJ, counterJ, #2			// J = J / 4
+	cmp 	counterJ, #0
+	ble	sgemm_kernel_L2_BEGIN
+
+sgemm_kernel_L4_BEGIN:
+
+	mov	pCRow0, pC				// pCRow0 = C
+	add	pC,pC,LDC, lsl #2
+
+	mov	pA, origPA				// pA = start of A array
+
+
+
+sgemm_kernel_L4_M4_BEGIN:
+
+	mov	counterI, origM
+	asr 	counterI, counterI, #2			// counterI = counterI / 4
+	cmp 	counterI, #0
+	ble	sgemm_kernel_L4_M2_BEGIN
+
+sgemm_kernel_L4_M4_20:
+
+	mov	pB, origPB
+	asr 	counterL , origK, #1			// L = K / 2
+	cmp	counterL , #2				// is there at least 4 to do?
+	blt	sgemm_kernel_L4_M4_32
+
+
+
+	KERNEL4x4_I					//do one in the K
+	KERNEL4x4_M2					//do another in the K
+
+	subs	counterL, counterL, #2			// subtract 2, since one is always done at the tail
+	ble	sgemm_kernel_L4_M4_22a
+	.align 5
+
+sgemm_kernel_L4_M4_22:
+
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+
+	subs	counterL, counterL, #1
+	bgt	sgemm_kernel_L4_M4_22
+
+sgemm_kernel_L4_M4_22a:
+
+	KERNEL4x4_M1
+	KERNEL4x4_E
+
+	b	 sgemm_kernel_L4_M4_44
+
+sgemm_kernel_L4_M4_32:					// less than 4 to do in the K direction
+
+	tst	counterL, #1
+	ble	sgemm_kernel_L4_M4_40
+
+	KERNEL4x4_I
+
+	KERNEL4x4_E
+
+	b	 sgemm_kernel_L4_M4_44
+
+
+sgemm_kernel_L4_M4_40:
+
+	INIT4x4
+
+
+sgemm_kernel_L4_M4_44:
+
+	ands	counterL , origK, #1
+	ble	sgemm_kernel_L4_M4_100
+
+sgemm_kernel_L4_M4_46:
+
+	KERNEL4x4_SUB
+
+	subs	counterL, counterL, #1
+	bne	sgemm_kernel_L4_M4_46
+
+sgemm_kernel_L4_M4_100:
+
+	SAVE4x4
+
+sgemm_kernel_L4_M4_END:
+
+	subs	counterI, counterI, #1
+	bne	sgemm_kernel_L4_M4_20
+
+
+sgemm_kernel_L4_M2_BEGIN:
+
+	mov	counterI, origM
+	tst	counterI , #3
+	ble	sgemm_kernel_L4_END
+
+	tst	counterI, #2				// counterI = counterI / 2
+	ble	sgemm_kernel_L4_M1_BEGIN
+
+sgemm_kernel_L4_M2_20:
+
+	INIT2x4
+
+	mov	pB, origPB
+	asr 	counterL , origK, #3			// counterL = counterL / 8
+	cmp	counterL , #0
+	ble	sgemm_kernel_L4_M2_40
+
+sgemm_kernel_L4_M2_22:
+
+	KERNEL2x4_SUB
+	KERNEL2x4_SUB
+	KERNEL2x4_SUB
+	KERNEL2x4_SUB
+
+	KERNEL2x4_SUB
+	KERNEL2x4_SUB
+	KERNEL2x4_SUB
+	KERNEL2x4_SUB
+
+	subs	counterL, counterL, #1
+	bgt	sgemm_kernel_L4_M2_22
+
+
+sgemm_kernel_L4_M2_40:
+
+	ands	counterL , origK, #7			// counterL = counterL % 8
+	ble	sgemm_kernel_L4_M2_100
+
+sgemm_kernel_L4_M2_42:
+
+	KERNEL2x4_SUB
+
+	subs	counterL, counterL, #1
+	bgt	sgemm_kernel_L4_M2_42
+
+sgemm_kernel_L4_M2_100:
+
+	SAVE2x4
+
+sgemm_kernel_L4_M2_END:
+
+
+sgemm_kernel_L4_M1_BEGIN:
+
+	tst	counterI, #1				// counterI = counterI % 2
+	ble	sgemm_kernel_L4_END
+
+sgemm_kernel_L4_M1_20:
+
+	INIT1x4
+
+	mov	pB, origPB
+	asr 	counterL , origK, #3			// counterL = counterL / 8
+	cmp	counterL , #0
+	ble	sgemm_kernel_L4_M1_40
+
+sgemm_kernel_L4_M1_22:
+	KERNEL1x4_SUB
+	KERNEL1x4_SUB
+	KERNEL1x4_SUB
+	KERNEL1x4_SUB
+
+	KERNEL1x4_SUB
+	KERNEL1x4_SUB
+	KERNEL1x4_SUB
+	KERNEL1x4_SUB
+
+	subs	counterL, counterL, #1
+	bgt	sgemm_kernel_L4_M1_22
+
+
+sgemm_kernel_L4_M1_40:
+
+	ands	counterL , origK, #7			// counterL = counterL % 8
+	ble	sgemm_kernel_L4_M1_100
+
+sgemm_kernel_L4_M1_42:
+
+	KERNEL1x4_SUB
+
+	subs	counterL, counterL, #1
+	bgt	sgemm_kernel_L4_M1_42
+
+sgemm_kernel_L4_M1_100:
+
+	SAVE1x4
+
+
+sgemm_kernel_L4_END:
+
+	add	origPB, origPB, origK, lsl #4		// B = B + K * 4 * 4
+
+	subs	counterJ, counterJ , #1			// j--
+	bgt	sgemm_kernel_L4_BEGIN
+
+
+
+/*********************************************************************************************/
+
+sgemm_kernel_L2_BEGIN:   // less than 2 left in N direction
+
+	mov	counterJ , origN
+	tst	counterJ , #3
+	ble	sgemm_kernel_L999   // error, N was less than 4?
+
+	tst	counterJ , #2
+	ble	sgemm_kernel_L1_BEGIN
+
+	mov	pCRow0, pC				// pCRow0 = pC
+	add	pC , pC, LDC, lsl #1
+
+	mov	pA, origPA				// pA = A
+
+
+
+sgemm_kernel_L2_M4_BEGIN:
+
+	mov	counterI, origM
+	asr 	counterI, counterI, #2			// counterI = counterI / 4
+	cmp	counterI,#0
+	ble	sgemm_kernel_L2_M2_BEGIN
+
+sgemm_kernel_L2_M4_20:
+
+	INIT4x2
+
+	mov	pB, origPB
+	asr	counterL , origK, #3			// counterL = counterL / 8
+	cmp	counterL,#0
+	ble	sgemm_kernel_L2_M4_40
+	.align 5
+
+sgemm_kernel_L2_M4_22:
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+
+	subs	counterL, counterL, #1
+	bgt	sgemm_kernel_L2_M4_22
+
+
+sgemm_kernel_L2_M4_40:
+
+	ands	counterL , origK, #7			// counterL = counterL % 8
+	ble	sgemm_kernel_L2_M4_100
+
+sgemm_kernel_L2_M4_42:
+
+	KERNEL4x2_SUB
+
+	subs	counterL, counterL, #1
+	bgt	sgemm_kernel_L2_M4_42
+
+sgemm_kernel_L2_M4_100:
+
+	SAVE4x2
+
+sgemm_kernel_L2_M4_END:
+
+	subs	counterI, counterI, #1
+	bgt	sgemm_kernel_L2_M4_20
+
+
+sgemm_kernel_L2_M2_BEGIN:
+
+	mov	counterI, origM
+	tst	counterI , #3
+	ble	sgemm_kernel_L2_END
+
+	tst	counterI, #2				// counterI = counterI / 2
+	ble	sgemm_kernel_L2_M1_BEGIN
+
+sgemm_kernel_L2_M2_20:
+
+	INIT2x2
+
+	mov	pB, origPB
+	asr	counterL , origK, #3			// counterL = counterL / 8
+	cmp	counterL,#0
+	ble	sgemm_kernel_L2_M2_40
+
+sgemm_kernel_L2_M2_22:
+
+	KERNEL2x2_SUB
+	KERNEL2x2_SUB
+	KERNEL2x2_SUB
+	KERNEL2x2_SUB
+
+	KERNEL2x2_SUB
+	KERNEL2x2_SUB
+	KERNEL2x2_SUB
+	KERNEL2x2_SUB
+
+	subs	counterL, counterL, #1
+	bgt	sgemm_kernel_L2_M2_22
+
+
+sgemm_kernel_L2_M2_40:
+
+	ands	counterL , origK, #7			// counterL = counterL % 8
+	ble	sgemm_kernel_L2_M2_100
+
+sgemm_kernel_L2_M2_42:
+
+	KERNEL2x2_SUB
+
+	subs	counterL, counterL, #1
+	bgt	sgemm_kernel_L2_M2_42
+
+sgemm_kernel_L2_M2_100:
+
+	SAVE2x2
+
+sgemm_kernel_L2_M2_END:
+
+
+sgemm_kernel_L2_M1_BEGIN:
+
+	tst	counterI, #1				// counterI = counterI % 2
+	ble	sgemm_kernel_L2_END
+
+sgemm_kernel_L2_M1_20:
+
+	INIT1x2
+
+	mov	pB, origPB
+	asr 	counterL , origK, #3			// counterL = counterL / 8
+	cmp     counterL, #0
+	ble	sgemm_kernel_L2_M1_40
+
+sgemm_kernel_L2_M1_22:
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+
+	subs	counterL, counterL, #1
+	bgt	sgemm_kernel_L2_M1_22
+
+
+sgemm_kernel_L2_M1_40:
+
+	ands	counterL , origK, #7			// counterL = counterL % 8
+	ble	sgemm_kernel_L2_M1_100
+
+sgemm_kernel_L2_M1_42:
+
+	KERNEL1x2_SUB
+
+	subs	counterL, counterL, #1
+	bgt	sgemm_kernel_L2_M1_42
+
+sgemm_kernel_L2_M1_100:
+
+	SAVE1x2
+
+
+sgemm_kernel_L2_END:
+	add	origPB, origPB, origK, lsl #3		// B = B + K * 2 * 4
+
+/*********************************************************************************************/
+
+sgemm_kernel_L1_BEGIN:
+
+	mov	counterJ , origN
+	tst	counterJ , #1
+	ble	sgemm_kernel_L999 // done
+
+
+	mov	pCRow0, pC				// pCRow0 = C
+	add	pC , pCRow0 , LDC			// C01 is the current line, update pC to point to next
+
+	mov	pA, origPA				// pA = A
+
+
+
+sgemm_kernel_L1_M4_BEGIN:
+
+	mov	counterI, origM
+	asr 	counterI, counterI, #2			// counterI = counterI / 4
+	cmp	counterI, #0
+	ble	sgemm_kernel_L1_M2_BEGIN
+
+sgemm_kernel_L1_M4_20:
+
+	INIT4x1
+
+	mov	pB, origPB
+	asr	counterL , origK, #3			// counterL = counterL / 8
+	cmp	counterL , #0
+	ble	sgemm_kernel_L1_M4_40
+	.align 5
+
+sgemm_kernel_L1_M4_22:
+	KERNEL4x1_SUB
+	KERNEL4x1_SUB
+	KERNEL4x1_SUB
+	KERNEL4x1_SUB
+
+	KERNEL4x1_SUB
+	KERNEL4x1_SUB
+	KERNEL4x1_SUB
+	KERNEL4x1_SUB
+
+	subs	counterL, counterL, #1
+	bgt	sgemm_kernel_L1_M4_22
+
+
+sgemm_kernel_L1_M4_40:
+
+	ands	counterL , origK, #7			// counterL = counterL % 8
+	ble	sgemm_kernel_L1_M4_100
+
+sgemm_kernel_L1_M4_42:
+
+	KERNEL4x1_SUB
+
+	subs	counterL, counterL, #1
+	bgt	sgemm_kernel_L1_M4_42
+
+sgemm_kernel_L1_M4_100:
+
+	SAVE4x1
+
+sgemm_kernel_L1_M4_END:
+
+	subs	counterI, counterI, #1
+	bgt	sgemm_kernel_L1_M4_20
+
+
+sgemm_kernel_L1_M2_BEGIN:
+
+	mov	counterI, origM
+	tst	counterI , #3
+	ble	sgemm_kernel_L1_END
+
+	tst	counterI, #2				// counterI = counterI / 2
+	ble	sgemm_kernel_L1_M1_BEGIN
+
+sgemm_kernel_L1_M2_20:
+
+	INIT2x1
+
+	mov	pB, origPB
+	asr 	counterL , origK, #3			// counterL = counterL / 8
+	cmp	counterL , #0
+	ble	sgemm_kernel_L1_M2_40
+
+sgemm_kernel_L1_M2_22:
+
+	KERNEL2x1_SUB
+	KERNEL2x1_SUB
+	KERNEL2x1_SUB
+	KERNEL2x1_SUB
+
+	KERNEL2x1_SUB
+	KERNEL2x1_SUB
+	KERNEL2x1_SUB
+	KERNEL2x1_SUB
+
+	subs	counterL, counterL, #1
+	bgt	sgemm_kernel_L1_M2_22
+
+
+sgemm_kernel_L1_M2_40:
+
+	ands	counterL , origK, #7			// counterL = counterL % 8
+	ble	sgemm_kernel_L1_M2_100
+
+sgemm_kernel_L1_M2_42:
+
+	KERNEL2x1_SUB
+
+	subs	counterL, counterL, #1
+	bgt	sgemm_kernel_L1_M2_42
+
+sgemm_kernel_L1_M2_100:
+
+	SAVE2x1
+
+sgemm_kernel_L1_M2_END:
+
+
+sgemm_kernel_L1_M1_BEGIN:
+
+	tst	counterI, #1				// counterI = counterI % 2
+	ble	sgemm_kernel_L1_END
+
+sgemm_kernel_L1_M1_20:
+
+	INIT1x1
+
+	mov	pB, origPB
+	asr 	counterL , origK, #3			// counterL = counterL / 8
+	cmp	counterL , #0
+	ble	sgemm_kernel_L1_M1_40
+
+sgemm_kernel_L1_M1_22:
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+
+	subs	counterL, counterL, #1
+	bgt	sgemm_kernel_L1_M1_22
+
+
+sgemm_kernel_L1_M1_40:
+
+	ands	counterL , origK, #7			// counterL = counterL % 8
+	ble	sgemm_kernel_L1_M1_100
+
+sgemm_kernel_L1_M1_42:
+
+	KERNEL1x1_SUB
+
+	subs	counterL, counterL, #1
+	bgt	sgemm_kernel_L1_M1_42
+
+sgemm_kernel_L1_M1_100:
+
+	SAVE1x1
+
+
+sgemm_kernel_L1_END:
+
+
+sgemm_kernel_L999:
+	mov	x0, #0					// set return value
+	ldp	d8,d9,[sp,#(0*16)]
+	ldp	d10,d11,[sp,#(1*16)]
+	ldp	d12,d13,[sp,#(2*16)]
+	ldp	d14,d15,[sp,#(3*16)]
+	ldp	d16,d17,[sp,#(4*16)]
+	add	sp,sp,#(5*16)
+	ret
+
+	EPILOGUE
+
diff --git a/kernel/generic/trmmkernel_4x4.c b/kernel/generic/trmmkernel_4x4.c
new file mode 100644
index 000000000..a85828cad
--- /dev/null
+++ b/kernel/generic/trmmkernel_4x4.c
@@ -0,0 +1,875 @@
+#include "common.h"
+#include <stdbool.h>
+
+int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset)
+{
+
+	BLASLONG i,j,k;
+	FLOAT *C0,*C1,*C2,*C3,*ptrba,*ptrbb;
+
+	FLOAT res0_0;
+	FLOAT res0_1;
+	FLOAT res0_2;
+	FLOAT res0_3;
+
+	FLOAT res1_0;
+	FLOAT res1_1;
+	FLOAT res1_2;
+	FLOAT res1_3;
+
+	FLOAT res2_0;
+	FLOAT res2_1;
+	FLOAT res2_2;
+	FLOAT res2_3;
+
+	FLOAT res3_0;
+	FLOAT res3_1;
+	FLOAT res3_2;
+	FLOAT res3_3;
+
+	FLOAT a0;
+	FLOAT a1;
+
+	FLOAT b0;
+	FLOAT b1;
+	FLOAT b2;
+	FLOAT b3;
+
+	BLASLONG off, temp;
+
+	bool left;
+	bool transposed;
+	bool backwards;
+
+#ifdef LEFT
+	left = true;
+#else
+	left = false;
+#endif
+
+#ifdef TRANSA
+	transposed = true;
+#else
+	transposed = false;
+#endif
+
+	backwards = left != transposed;
+
+	if (!left) {
+		off = -offset;
+	}
+
+
+	for (j=0; j