diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index ec692d5c2..e30261698 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -115,7 +115,7 @@ CGEMMONCOPYOBJ = cgemm_oncopy.o CGEMMOTCOPYOBJ = cgemm_otcopy.o ZGEMMKERNEL = zgemm_kernel_2x2_vfpv3.S -ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMONCOPY = zgemm_ncopy_2_vfpv3.S ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ZGEMMONCOPYOBJ = zgemm_oncopy.o ZGEMMOTCOPYOBJ = zgemm_otcopy.o diff --git a/kernel/arm/dgemm_kernel_4x4_vfpv3.S b/kernel/arm/dgemm_kernel_4x4_vfpv3.S index dfe3e3634..7d83def94 100644 --- a/kernel/arm/dgemm_kernel_4x4_vfpv3.S +++ b/kernel/arm/dgemm_kernel_4x4_vfpv3.S @@ -77,7 +77,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define B [fp, #4 ] #define C [fp, #8 ] -#define OLD_LDC [fp, #12 ] +#define OLDdgemm_kernel_LDC [fp, #12 ] #define I r0 #define J r1 @@ -883,7 +883,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. sub r3, fp, #128 vstm r3, { d8 - d15} // store floating point registers - ldr r3, OLD_LDC + ldr r3, OLDdgemm_kernel_LDC lsl r3, r3, #3 // ldc = ldc * 8 str r3, LDC @@ -892,9 +892,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldr J, N asrs J, J, #2 // J = J / 4 - ble _L2_BEGIN + ble dgemm_kernel_L2_BEGIN -_L4_BEGIN: +dgemm_kernel_L4_BEGIN: ldr CO1, C // CO1 = C ldr r4 , LDC @@ -908,21 +908,19 @@ _L4_BEGIN: -_L4_M4_BEGIN: +dgemm_kernel_L4_M4_BEGIN: ldr I, M asrs I, I, #2 // I = I / 4 - ble _L4_M2_BEGIN + ble dgemm_kernel_L4_M2_BEGIN -_L4_M4_20: +dgemm_kernel_L4_M4_20: mov BO, BC asrs L , K1, #3 // L = L / 8 - cmp L , #3 - blt _L4_M4_30 - .align 5 - + cmp L , #2 + blt dgemm_kernel_L4_M4_32 KERNEL4x4_I @@ -935,9 +933,11 @@ _L4_M4_20: KERNEL4x4_M1 KERNEL4x4_M2 - sub L, L, #2 + subs L, L, #2 + ble dgemm_kernel_L4_M4_22a + .align 5 -_L4_M4_22: +dgemm_kernel_L4_M4_22: KERNEL4x4_M1 KERNEL4x4_M2 @@ -950,7 +950,9 @@ _L4_M4_22: KERNEL4x4_M2 subs L, L, #1 - bgt _L4_M4_22 + bgt dgemm_kernel_L4_M4_22 + +dgemm_kernel_L4_M4_22a: KERNEL4x4_M1 KERNEL4x4_M2 @@ -962,43 +964,12 @@ _L4_M4_22: KERNEL4x4_M1 KERNEL4x4_E - b _L4_M4_44 + b dgemm_kernel_L4_M4_44 - -_L4_M4_30: - tst L, #3 - ble _L4_M4_40 - - tst L, #2 - ble _L4_M4_32 - - KERNEL4x4_I - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_E - - b _L4_M4_44 - -_L4_M4_32: +dgemm_kernel_L4_M4_32: tst L, #1 - ble _L4_M4_40 + ble dgemm_kernel_L4_M4_40 KERNEL4x4_I KERNEL4x4_M2 @@ -1010,54 +981,54 @@ _L4_M4_32: KERNEL4x4_M1 KERNEL4x4_E - b _L4_M4_44 + b dgemm_kernel_L4_M4_44 -_L4_M4_40: +dgemm_kernel_L4_M4_40: INIT4x4 -_L4_M4_44: +dgemm_kernel_L4_M4_44: ands L , K1, #7 // L = L % 8 - ble _L4_M4_100 + ble dgemm_kernel_L4_M4_100 -_L4_M4_46: +dgemm_kernel_L4_M4_46: KERNEL4x4_SUB subs L, L, #1 - bne _L4_M4_46 + bne dgemm_kernel_L4_M4_46 -_L4_M4_100: +dgemm_kernel_L4_M4_100: SAVE4x4 -_L4_M4_END: +dgemm_kernel_L4_M4_END: subs I, I, #1 - bne _L4_M4_20 + bne dgemm_kernel_L4_M4_20 -_L4_M2_BEGIN: +dgemm_kernel_L4_M2_BEGIN: ldr I, M tst I , #3 - ble _L4_END + ble dgemm_kernel_L4_END tst I, #2 // I = I / 2 - ble _L4_M1_BEGIN + ble dgemm_kernel_L4_M1_BEGIN -_L4_M2_20: +dgemm_kernel_L4_M2_20: INIT2x4 mov BO, BC asrs L , K1, #3 // L = L / 8 - ble _L4_M2_40 + ble dgemm_kernel_L4_M2_40 -_L4_M2_22: +dgemm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB @@ -1070,42 +1041,42 @@ _L4_M2_22: KERNEL2x4_SUB subs L, L, 
#1 - bgt _L4_M2_22 + bgt dgemm_kernel_L4_M2_22 -_L4_M2_40: +dgemm_kernel_L4_M2_40: ands L , K1, #7 // L = L % 8 - ble _L4_M2_100 + ble dgemm_kernel_L4_M2_100 -_L4_M2_42: +dgemm_kernel_L4_M2_42: KERNEL2x4_SUB subs L, L, #1 - bgt _L4_M2_42 + bgt dgemm_kernel_L4_M2_42 -_L4_M2_100: +dgemm_kernel_L4_M2_100: SAVE2x4 -_L4_M2_END: +dgemm_kernel_L4_M2_END: -_L4_M1_BEGIN: +dgemm_kernel_L4_M1_BEGIN: tst I, #1 // I = I % 2 - ble _L4_END + ble dgemm_kernel_L4_END -_L4_M1_20: +dgemm_kernel_L4_M1_20: INIT1x4 mov BO, BC asrs L , K1, #3 // L = L / 8 - ble _L4_M1_40 + ble dgemm_kernel_L4_M1_40 -_L4_M1_22: +dgemm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB @@ -1117,27 +1088,27 @@ _L4_M1_22: KERNEL1x4_SUB subs L, L, #1 - bgt _L4_M1_22 + bgt dgemm_kernel_L4_M1_22 -_L4_M1_40: +dgemm_kernel_L4_M1_40: ands L , K1, #7 // L = L % 8 - ble _L4_M1_100 + ble dgemm_kernel_L4_M1_100 -_L4_M1_42: +dgemm_kernel_L4_M1_42: KERNEL1x4_SUB subs L, L, #1 - bgt _L4_M1_42 + bgt dgemm_kernel_L4_M1_42 -_L4_M1_100: +dgemm_kernel_L4_M1_100: SAVE1x4 -_L4_END: +dgemm_kernel_L4_END: mov r3, BC mov r4, K1 @@ -1146,20 +1117,20 @@ _L4_END: mov BC, r3 subs J , #1 // j-- - bgt _L4_BEGIN + bgt dgemm_kernel_L4_BEGIN /*********************************************************************************************/ -_L2_BEGIN: +dgemm_kernel_L2_BEGIN: ldr J , N tst J , #3 - ble _L999 + ble dgemm_kernel_L999 tst J , #2 - ble _L1_BEGIN + ble dgemm_kernel_L1_BEGIN ldr CO1, C // CO1 = C ldr r4 , LDC @@ -1168,28 +1139,25 @@ _L2_BEGIN: str r3 , C // store C ldr AO, A // AO = A - //pld [AO , #A_PRE-96] - //pld [AO , #A_PRE-64] - //pld [AO , #A_PRE-32] -_L2_M4_BEGIN: +dgemm_kernel_L2_M4_BEGIN: ldr I, M asrs I, I, #2 // I = I / 4 - ble _L2_M2_BEGIN + ble dgemm_kernel_L2_M2_BEGIN -_L2_M4_20: +dgemm_kernel_L2_M4_20: INIT4x2 mov BO, BC asrs L , K1, #3 // L = L / 8 - ble _L2_M4_40 + ble dgemm_kernel_L2_M4_40 .align 5 -_L2_M4_22: +dgemm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB @@ -1201,49 +1169,49 @@ _L2_M4_22: KERNEL4x2_SUB subs L, L, #1 - bgt _L2_M4_22 + bgt dgemm_kernel_L2_M4_22 -_L2_M4_40: +dgemm_kernel_L2_M4_40: ands L , K1, #7 // L = L % 8 - ble _L2_M4_100 + ble dgemm_kernel_L2_M4_100 -_L2_M4_42: +dgemm_kernel_L2_M4_42: KERNEL4x2_SUB subs L, L, #1 - bgt _L2_M4_42 + bgt dgemm_kernel_L2_M4_42 -_L2_M4_100: +dgemm_kernel_L2_M4_100: SAVE4x2 -_L2_M4_END: +dgemm_kernel_L2_M4_END: subs I, I, #1 - bgt _L2_M4_20 + bgt dgemm_kernel_L2_M4_20 -_L2_M2_BEGIN: +dgemm_kernel_L2_M2_BEGIN: ldr I, M tst I , #3 - ble _L2_END + ble dgemm_kernel_L2_END tst I, #2 // I = I / 2 - ble _L2_M1_BEGIN + ble dgemm_kernel_L2_M1_BEGIN -_L2_M2_20: +dgemm_kernel_L2_M2_20: INIT2x2 mov BO, BC asrs L , K1, #3 // L = L / 8 - ble _L2_M2_40 + ble dgemm_kernel_L2_M2_40 -_L2_M2_22: +dgemm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB @@ -1256,42 +1224,42 @@ _L2_M2_22: KERNEL2x2_SUB subs L, L, #1 - bgt _L2_M2_22 + bgt dgemm_kernel_L2_M2_22 -_L2_M2_40: +dgemm_kernel_L2_M2_40: ands L , K1, #7 // L = L % 8 - ble _L2_M2_100 + ble dgemm_kernel_L2_M2_100 -_L2_M2_42: +dgemm_kernel_L2_M2_42: KERNEL2x2_SUB subs L, L, #1 - bgt _L2_M2_42 + bgt dgemm_kernel_L2_M2_42 -_L2_M2_100: +dgemm_kernel_L2_M2_100: SAVE2x2 -_L2_M2_END: +dgemm_kernel_L2_M2_END: -_L2_M1_BEGIN: +dgemm_kernel_L2_M1_BEGIN: tst I, #1 // I = I % 2 - ble _L2_END + ble dgemm_kernel_L2_END -_L2_M1_20: +dgemm_kernel_L2_M1_20: INIT1x2 mov BO, BC asrs L , K1, #3 // L = L / 8 - ble _L2_M1_40 + ble dgemm_kernel_L2_M1_40 -_L2_M1_22: +dgemm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB @@ -1303,27 +1271,27 @@ 
_L2_M1_22: KERNEL1x2_SUB subs L, L, #1 - bgt _L2_M1_22 + bgt dgemm_kernel_L2_M1_22 -_L2_M1_40: +dgemm_kernel_L2_M1_40: ands L , K1, #7 // L = L % 8 - ble _L2_M1_100 + ble dgemm_kernel_L2_M1_100 -_L2_M1_42: +dgemm_kernel_L2_M1_42: KERNEL1x2_SUB subs L, L, #1 - bgt _L2_M1_42 + bgt dgemm_kernel_L2_M1_42 -_L2_M1_100: +dgemm_kernel_L2_M1_100: SAVE1x2 -_L2_END: +dgemm_kernel_L2_END: mov r3, BC mov r4, K1 @@ -1333,11 +1301,11 @@ _L2_END: /*********************************************************************************************/ -_L1_BEGIN: +dgemm_kernel_L1_BEGIN: ldr J , N tst J , #1 - ble _L999 + ble dgemm_kernel_L999 ldr CO1, C // CO1 = C @@ -1346,28 +1314,25 @@ _L1_BEGIN: str r3 , C // store C ldr AO, A // AO = A - //pld [AO , #A_PRE-96] - //pld [AO , #A_PRE-64] - //pld [AO , #A_PRE-32] -_L1_M4_BEGIN: +dgemm_kernel_L1_M4_BEGIN: ldr I, M asrs I, I, #2 // I = I / 4 - ble _L1_M2_BEGIN + ble dgemm_kernel_L1_M2_BEGIN -_L1_M4_20: +dgemm_kernel_L1_M4_20: INIT4x1 mov BO, BC asrs L , K1, #3 // L = L / 8 - ble _L1_M4_40 + ble dgemm_kernel_L1_M4_40 .align 5 -_L1_M4_22: +dgemm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB @@ -1379,49 +1344,49 @@ _L1_M4_22: KERNEL4x1_SUB subs L, L, #1 - bgt _L1_M4_22 + bgt dgemm_kernel_L1_M4_22 -_L1_M4_40: +dgemm_kernel_L1_M4_40: ands L , K1, #7 // L = L % 8 - ble _L1_M4_100 + ble dgemm_kernel_L1_M4_100 -_L1_M4_42: +dgemm_kernel_L1_M4_42: KERNEL4x1_SUB subs L, L, #1 - bgt _L1_M4_42 + bgt dgemm_kernel_L1_M4_42 -_L1_M4_100: +dgemm_kernel_L1_M4_100: SAVE4x1 -_L1_M4_END: +dgemm_kernel_L1_M4_END: subs I, I, #1 - bgt _L1_M4_20 + bgt dgemm_kernel_L1_M4_20 -_L1_M2_BEGIN: +dgemm_kernel_L1_M2_BEGIN: ldr I, M tst I , #3 - ble _L1_END + ble dgemm_kernel_L1_END tst I, #2 // I = I / 2 - ble _L1_M1_BEGIN + ble dgemm_kernel_L1_M1_BEGIN -_L1_M2_20: +dgemm_kernel_L1_M2_20: INIT2x1 mov BO, BC asrs L , K1, #3 // L = L / 8 - ble _L1_M2_40 + ble dgemm_kernel_L1_M2_40 -_L1_M2_22: +dgemm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB @@ -1434,42 +1399,42 @@ _L1_M2_22: KERNEL2x1_SUB subs L, L, #1 - bgt _L1_M2_22 + bgt dgemm_kernel_L1_M2_22 -_L1_M2_40: +dgemm_kernel_L1_M2_40: ands L , K1, #7 // L = L % 8 - ble _L1_M2_100 + ble dgemm_kernel_L1_M2_100 -_L1_M2_42: +dgemm_kernel_L1_M2_42: KERNEL2x1_SUB subs L, L, #1 - bgt _L1_M2_42 + bgt dgemm_kernel_L1_M2_42 -_L1_M2_100: +dgemm_kernel_L1_M2_100: SAVE2x1 -_L1_M2_END: +dgemm_kernel_L1_M2_END: -_L1_M1_BEGIN: +dgemm_kernel_L1_M1_BEGIN: tst I, #1 // I = I % 2 - ble _L1_END + ble dgemm_kernel_L1_END -_L1_M1_20: +dgemm_kernel_L1_M1_20: INIT1x1 mov BO, BC asrs L , K1, #3 // L = L / 8 - ble _L1_M1_40 + ble dgemm_kernel_L1_M1_40 -_L1_M1_22: +dgemm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB @@ -1481,30 +1446,30 @@ _L1_M1_22: KERNEL1x1_SUB subs L, L, #1 - bgt _L1_M1_22 + bgt dgemm_kernel_L1_M1_22 -_L1_M1_40: +dgemm_kernel_L1_M1_40: ands L , K1, #7 // L = L % 8 - ble _L1_M1_100 + ble dgemm_kernel_L1_M1_100 -_L1_M1_42: +dgemm_kernel_L1_M1_42: KERNEL1x1_SUB subs L, L, #1 - bgt _L1_M1_42 + bgt dgemm_kernel_L1_M1_42 -_L1_M1_100: +dgemm_kernel_L1_M1_100: SAVE1x1 -_L1_END: +dgemm_kernel_L1_END: -_L999: +dgemm_kernel_L999: sub r3, fp, #128 vldm r3, { d8 - d15} // restore floating point registers diff --git a/kernel/arm/dgemm_ncopy_4_vfpv3.S b/kernel/arm/dgemm_ncopy_4_vfpv3.S index bdb63bfdd..ad6692e50 100644 --- a/kernel/arm/dgemm_ncopy_4_vfpv3.S +++ b/kernel/arm/dgemm_ncopy_4_vfpv3.S @@ -26,10 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /************************************************************************************** -* 2013/10/11 Saar -* BLASTEST : xOK -* CTEST : xOK -* TEST : xOK +* 2013/11/05 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK * **************************************************************************************/ @@ -68,7 +68,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define I r3 #define J r12 -#define A_PRE 96 +#define A_PRE 256 /************************************************************************************** * Macro definitions @@ -76,6 +76,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY4x4 + pld [ AO1, #A_PRE ] + pld [ AO2, #A_PRE ] + pld [ AO3, #A_PRE ] + pld [ AO4, #A_PRE ] + fldd d0 , [ AO1, #0 ] fldd d1 , [ AO2, #0 ] fldd d2 , [ AO3, #0 ] @@ -199,12 +204,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldr BO, B -_L4_BEGIN: +dgemm_ncopy_L4_BEGIN: asrs J, N, #2 // J = N / 4 - ble _L2_BEGIN + ble dgemm_ncopy_L2_BEGIN -_L4_M4_BEGIN: +dgemm_ncopy_L4_M4_BEGIN: mov AO1, A // AO1 = A ldr r4 , LDA @@ -214,47 +219,47 @@ _L4_M4_BEGIN: add A , AO4, r4 // A = A + 4 * LDA asrs I, M, #2 // I = M / 4 - ble _L4_M4_40 + ble dgemm_ncopy_L4_M4_40 -_L4_M4_20: +dgemm_ncopy_L4_M4_20: COPY4x4 subs I , I , #1 - bne _L4_M4_20 + bne dgemm_ncopy_L4_M4_20 -_L4_M4_40: +dgemm_ncopy_L4_M4_40: ands I, M , #3 - ble _L4_M4_END + ble dgemm_ncopy_L4_M4_END -_L4_M4_60: +dgemm_ncopy_L4_M4_60: COPY1x4 subs I , I , #1 - bne _L4_M4_60 + bne dgemm_ncopy_L4_M4_60 -_L4_M4_END: +dgemm_ncopy_L4_M4_END: subs J , J, #1 // j-- - bne _L4_M4_BEGIN + bne dgemm_ncopy_L4_M4_BEGIN /*********************************************************************************************/ -_L2_BEGIN: +dgemm_ncopy_L2_BEGIN: tst N, #3 - ble _L999 + ble dgemm_ncopy_L999 tst N, #2 - ble _L1_BEGIN + ble dgemm_ncopy_L1_BEGIN -_L2_M4_BEGIN: +dgemm_ncopy_L2_M4_BEGIN: mov AO1, A // AO1 = A ldr r4 , LDA @@ -262,75 +267,75 @@ _L2_M4_BEGIN: add A , AO2, r4 // A = A + 2 * LDA asrs I, M, #2 // I = M / 4 - ble _L2_M4_40 + ble dgemm_ncopy_L2_M4_40 -_L2_M4_20: +dgemm_ncopy_L2_M4_20: COPY4x2 subs I , I , #1 - bne _L2_M4_20 + bne dgemm_ncopy_L2_M4_20 -_L2_M4_40: +dgemm_ncopy_L2_M4_40: ands I, M , #3 - ble _L2_M4_END + ble dgemm_ncopy_L2_M4_END -_L2_M4_60: +dgemm_ncopy_L2_M4_60: COPY1x2 subs I , I , #1 - bne _L2_M4_60 + bne dgemm_ncopy_L2_M4_60 -_L2_M4_END: +dgemm_ncopy_L2_M4_END: /*********************************************************************************************/ -_L1_BEGIN: +dgemm_ncopy_L1_BEGIN: tst N, #1 - ble _L999 + ble dgemm_ncopy_L999 -_L1_M4_BEGIN: +dgemm_ncopy_L1_M4_BEGIN: mov AO1, A // AO1 = A ldr r4 , LDA add A , AO1, r4 // A = A + 1 * LDA asrs I, M, #2 // I = M / 4 - ble _L1_M4_40 + ble dgemm_ncopy_L1_M4_40 -_L1_M4_20: +dgemm_ncopy_L1_M4_20: COPY4x1 subs I , I , #1 - bne _L1_M4_20 + bne dgemm_ncopy_L1_M4_20 -_L1_M4_40: +dgemm_ncopy_L1_M4_40: ands I, M , #3 - ble _L1_M4_END + ble dgemm_ncopy_L1_M4_END -_L1_M4_60: +dgemm_ncopy_L1_M4_60: COPY1x1 subs I , I , #1 - bne _L1_M4_60 + bne dgemm_ncopy_L1_M4_60 -_L1_M4_END: +dgemm_ncopy_L1_M4_END: -_L999: +dgemm_ncopy_L999: sub r3, fp, #128 vldm r3, { d8 - d15} // restore floating point registers diff --git a/kernel/arm/zgemm_kernel_2x2_vfpv3.S b/kernel/arm/zgemm_kernel_2x2_vfpv3.S index 9c14aec10..2d35028a2 100644 --- a/kernel/arm/zgemm_kernel_2x2_vfpv3.S +++ b/kernel/arm/zgemm_kernel_2x2_vfpv3.S @@ -26,7 +26,7 @@ USE OF 
THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2013/11/02 Saar +* 2013/11/05 Saar * BLASTEST : OK * CTEST : OK * TEST : OK @@ -924,9 +924,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldr J, N asrs J, J, #1 // J = J / 2 - ble _L1_BEGIN + ble zgemm_kernel_L1_BEGIN -_L2_BEGIN: +zgemm_kernel_L2_BEGIN: ldr CO1, C // CO1 = C ldr r4 , LDC @@ -940,19 +940,19 @@ _L2_BEGIN: -_L2_M2_BEGIN: +zgemm_kernel_L2_M2_BEGIN: ldr I, M asrs I, I, #1 // I = I / 2 - ble _L2_M1_BEGIN + ble zgemm_kernel_L2_M1_BEGIN -_L2_M2_20: +zgemm_kernel_L2_M2_20: mov BO, BC asrs L , K1, #3 // L = L / 8 cmp L , #3 - blt _L2_M2_30 + blt zgemm_kernel_L2_M2_30 .align 5 @@ -969,7 +969,7 @@ _L2_M2_20: sub L, L, #2 -_L2_M2_22: +zgemm_kernel_L2_M2_22: KERNEL2x2_M1 KERNEL2x2_M2 @@ -982,7 +982,7 @@ _L2_M2_22: KERNEL2x2_M2 subs L, L, #1 - bgt _L2_M2_22 + bgt zgemm_kernel_L2_M2_22 KERNEL2x2_M1 KERNEL2x2_M2 @@ -994,15 +994,15 @@ _L2_M2_22: KERNEL2x2_M1 KERNEL2x2_E - b _L2_M2_44 + b zgemm_kernel_L2_M2_44 -_L2_M2_30: +zgemm_kernel_L2_M2_30: tst L, #3 - ble _L2_M2_40 + ble zgemm_kernel_L2_M2_40 tst L, #2 - ble _L2_M2_32 + ble zgemm_kernel_L2_M2_32 KERNEL2x2_I KERNEL2x2_M2 @@ -1025,12 +1025,12 @@ _L2_M2_30: KERNEL2x2_M1 KERNEL2x2_E - b _L2_M2_44 + b zgemm_kernel_L2_M2_44 -_L2_M2_32: +zgemm_kernel_L2_M2_32: tst L, #1 - ble _L2_M2_40 + ble zgemm_kernel_L2_M2_40 KERNEL2x2_I KERNEL2x2_M2 @@ -1042,51 +1042,51 @@ _L2_M2_32: KERNEL2x2_M1 KERNEL2x2_E - b _L2_M2_44 + b zgemm_kernel_L2_M2_44 -_L2_M2_40: +zgemm_kernel_L2_M2_40: INIT2x2 -_L2_M2_44: +zgemm_kernel_L2_M2_44: ands L , K1, #7 // L = L % 8 - ble _L2_M2_100 + ble zgemm_kernel_L2_M2_100 -_L2_M2_46: +zgemm_kernel_L2_M2_46: KERNEL2x2_SUB subs L, L, #1 - bne _L2_M2_46 + bne zgemm_kernel_L2_M2_46 -_L2_M2_100: +zgemm_kernel_L2_M2_100: SAVE2x2 -_L2_M2_END: +zgemm_kernel_L2_M2_END: subs I, I, #1 - bne _L2_M2_20 + bne zgemm_kernel_L2_M2_20 -_L2_M1_BEGIN: +zgemm_kernel_L2_M1_BEGIN: ldr I, M tst I, #1 // I = I % 2 - ble _L2_END + ble zgemm_kernel_L2_END -_L2_M1_20: +zgemm_kernel_L2_M1_20: INIT1x2 mov BO, BC asrs L , K1, #3 // L = L / 8 - ble _L2_M1_40 + ble zgemm_kernel_L2_M1_40 -_L2_M1_22: +zgemm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB @@ -1099,27 +1099,27 @@ _L2_M1_22: KERNEL1x2_SUB subs L, L, #1 - bgt _L2_M1_22 + bgt zgemm_kernel_L2_M1_22 -_L2_M1_40: +zgemm_kernel_L2_M1_40: ands L , K1, #7 // L = L % 8 - ble _L2_M1_100 + ble zgemm_kernel_L2_M1_100 -_L2_M1_42: +zgemm_kernel_L2_M1_42: KERNEL1x2_SUB subs L, L, #1 - bgt _L2_M1_42 + bgt zgemm_kernel_L2_M1_42 -_L2_M1_100: +zgemm_kernel_L2_M1_100: SAVE1x2 -_L2_END: +zgemm_kernel_L2_END: mov r3, BC mov r4, K1 @@ -1128,17 +1128,17 @@ _L2_END: mov BC, r3 subs J , #1 // j-- - bgt _L2_BEGIN + bgt zgemm_kernel_L2_BEGIN /*********************************************************************************************/ -_L1_BEGIN: +zgemm_kernel_L1_BEGIN: ldr J , N tst J , #1 - ble _L999 + ble zgemm_kernel_L999 ldr CO1, C // CO1 = C @@ -1148,19 +1148,19 @@ _L1_BEGIN: ldr AO, A // AO = A -_L1_M2_BEGIN: +zgemm_kernel_L1_M2_BEGIN: ldr I, M asrs I, I, #1 // I = I / 2 - ble _L1_M1_BEGIN + ble zgemm_kernel_L1_M1_BEGIN -_L1_M2_20: +zgemm_kernel_L1_M2_20: mov BO, BC asrs L , K1, #3 // L = L / 8 cmp L , #3 - blt _L1_M2_30 + blt zgemm_kernel_L1_M2_30 .align 5 @@ -1177,7 +1177,7 @@ _L1_M2_20: sub L, L, #2 -_L1_M2_22: +zgemm_kernel_L1_M2_22: KERNEL2x1_M1 
KERNEL2x1_M2 @@ -1190,7 +1190,7 @@ _L1_M2_22: KERNEL2x1_M2 subs L, L, #1 - bgt _L1_M2_22 + bgt zgemm_kernel_L1_M2_22 KERNEL2x1_M1 KERNEL2x1_M2 @@ -1202,15 +1202,15 @@ _L1_M2_22: KERNEL2x1_M1 KERNEL2x1_E - b _L1_M2_44 + b zgemm_kernel_L1_M2_44 -_L1_M2_30: +zgemm_kernel_L1_M2_30: tst L, #3 - ble _L1_M2_40 + ble zgemm_kernel_L1_M2_40 tst L, #2 - ble _L1_M2_32 + ble zgemm_kernel_L1_M2_32 KERNEL2x1_I KERNEL2x1_M2 @@ -1233,12 +1233,12 @@ _L1_M2_30: KERNEL2x1_M1 KERNEL2x1_E - b _L1_M2_44 + b zgemm_kernel_L1_M2_44 -_L1_M2_32: +zgemm_kernel_L1_M2_32: tst L, #1 - ble _L1_M2_40 + ble zgemm_kernel_L1_M2_40 KERNEL2x1_I KERNEL2x1_M2 @@ -1250,51 +1250,51 @@ _L1_M2_32: KERNEL2x1_M1 KERNEL2x1_E - b _L1_M2_44 + b zgemm_kernel_L1_M2_44 -_L1_M2_40: +zgemm_kernel_L1_M2_40: INIT2x1 -_L1_M2_44: +zgemm_kernel_L1_M2_44: ands L , K1, #7 // L = L % 8 - ble _L1_M2_100 + ble zgemm_kernel_L1_M2_100 -_L1_M2_46: +zgemm_kernel_L1_M2_46: KERNEL2x1_SUB subs L, L, #1 - bne _L1_M2_46 + bne zgemm_kernel_L1_M2_46 -_L1_M2_100: +zgemm_kernel_L1_M2_100: SAVE2x1 -_L1_M2_END: +zgemm_kernel_L1_M2_END: subs I, I, #1 - bne _L1_M2_20 + bne zgemm_kernel_L1_M2_20 -_L1_M1_BEGIN: +zgemm_kernel_L1_M1_BEGIN: ldr I, M tst I, #1 // I = I % 2 - ble _L1_END + ble zgemm_kernel_L1_END -_L1_M1_20: +zgemm_kernel_L1_M1_20: INIT1x1 mov BO, BC asrs L , K1, #3 // L = L / 8 - ble _L1_M1_40 + ble zgemm_kernel_L1_M1_40 -_L1_M1_22: +zgemm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB @@ -1307,31 +1307,31 @@ _L1_M1_22: KERNEL1x1_SUB subs L, L, #1 - bgt _L1_M1_22 + bgt zgemm_kernel_L1_M1_22 -_L1_M1_40: +zgemm_kernel_L1_M1_40: ands L , K1, #7 // L = L % 8 - ble _L1_M1_100 + ble zgemm_kernel_L1_M1_100 -_L1_M1_42: +zgemm_kernel_L1_M1_42: KERNEL1x1_SUB subs L, L, #1 - bgt _L1_M1_42 + bgt zgemm_kernel_L1_M1_42 -_L1_M1_100: +zgemm_kernel_L1_M1_100: SAVE1x1 -_L1_END: +zgemm_kernel_L1_END: -_L999: +zgemm_kernel_L999: sub r3, fp, #128 vldm r3, { d8 - d15} // restore floating point registers diff --git a/kernel/arm/zgemm_ncopy_2_vfpv3.S b/kernel/arm/zgemm_ncopy_2_vfpv3.S new file mode 100644 index 000000000..5ff8ee299 --- /dev/null +++ b/kernel/arm/zgemm_ncopy_2_vfpv3.S @@ -0,0 +1,254 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/05 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_A r2 +#define OLD_LDA r3 + + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define LDA [fp, #-260 ] + +#define B [fp, #4 ] + +#define M r0 +#define N r1 +#define A r2 + +#define BO r5 + +#define AO1 r6 +#define AO2 r7 + +#define I r3 +#define J r12 + +#define A_PRE 256 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro COPY2x2 + + pld [ AO1, #A_PRE ] + pld [ AO2, #A_PRE ] + fldd d0 , [ AO1, #0 ] + fldd d1 , [ AO1, #8 ] + fldd d4 , [ AO1, #16 ] + fldd d5 , [ AO1, #24 ] + + fldd d2 , [ AO2, #0 ] + fldd d3 , [ AO2, #8 ] + add AO1, AO1, #32 + fldd d6 , [ AO2, #16 ] + fldd d7 , [ AO2, #24 ] + + fstmiad BO!, { d0 - d7 } + add AO2, AO2, #32 + +.endm + + +.macro COPY1x2 + + fldd d0 , [ AO1, #0 ] + fldd d1 , [ AO1, #8 ] + fldd d2 , [ AO2, #0 ] + fldd d3 , [ AO2, #8 ] + + add AO1, AO1, #16 + fstmiad BO!, { d0 - d3 } + add AO2, AO2, #16 + +.endm + +.macro COPY2x1 + + fldd d0 , [ AO1, #0 ] + fldd d1 , [ AO1, #8 ] + fldd d2 , [ AO1, #16 ] + fldd d3 , [ AO1, #24 ] + + fstmiad BO!, { d0 - d3 } + add AO1, AO1, #32 + +.endm + + +.macro COPY1x1 + + fldd d0 , [ AO1, #0 ] + fldd d1 , [ AO1, #8 ] + + fstmiad BO!, { d0 - d1 } + add AO1, AO1, #16 + +.endm + + + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + + lsl r3, r3, #4 // lda = lda * 8 * 2 + str r3, LDA + + sub r4, fp, #128 + vstm r4, { d8 - d15} // store floating point registers + + ldr BO, B + +/*********************************************************************************************/ + +zgemm_ncopy_L2_BEGIN: + + asrs J, N, #1 // J = N / 2 + ble zgemm_ncopy_L1_BEGIN + +zgemm_ncopy_L2_M2_BEGIN: + + mov AO1, A // AO1 = A + ldr r4 , LDA + add AO2, AO1, r4 + add A , AO2, r4 // A = A + 2 * LDA + + asrs I, M, #1 // I = M / 2 + ble zgemm_ncopy_L2_M2_40 + +zgemm_ncopy_L2_M2_20: + + COPY2x2 + + subs I , I , #1 + bne zgemm_ncopy_L2_M2_20 + + +zgemm_ncopy_L2_M2_40: + + ands I, M , #1 + ble zgemm_ncopy_L2_M2_END + +zgemm_ncopy_L2_M2_60: + + COPY1x2 + + subs I , I , #1 + bne zgemm_ncopy_L2_M2_60 + + +zgemm_ncopy_L2_M2_END: + 
+ subs J , J, #1 // j-- + bne zgemm_ncopy_L2_M2_BEGIN + + +/*********************************************************************************************/ + +zgemm_ncopy_L1_BEGIN: + + tst N, #1 + ble zgemm_ncopy_L999 + + +zgemm_ncopy_L1_M2_BEGIN: + + mov AO1, A // AO1 = A + ldr r4 , LDA + add A , AO1, r4 // A = A + 1 * LDA + + asrs I, M, #1 // I = M / 2 + ble zgemm_ncopy_L1_M2_40 + +zgemm_ncopy_L1_M2_20: + + COPY2x1 + + subs I , I , #1 + bne zgemm_ncopy_L1_M2_20 + + +zgemm_ncopy_L1_M2_40: + + ands I, M , #1 + ble zgemm_ncopy_L1_M2_END + +zgemm_ncopy_L1_M2_60: + + COPY1x1 + + subs I , I , #1 + bne zgemm_ncopy_L1_M2_60 + + +zgemm_ncopy_L1_M2_END: + + + +zgemm_ncopy_L999: + + sub r3, fp, #128 + vldm r3, { d8 - d15} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE +
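
Note on the new kernel/arm/zgemm_ncopy_2_vfpv3.S: it replaces the generic C copy routine (../generic/zgemm_ncopy_2.c) previously selected by KERNEL.ARMV7, keeping the same packing layout while adding pld prefetches (A_PRE = 256) and VFPv3 fldd/fstmiad sequences. The C sketch below is illustrative only and not part of the patch: zgemm_ncopy_2_ref and its parameter conventions are assumptions (lda counted in complex elements, double precision as used by ZGEMM). It shows the order in which the COPY2x2/COPY1x2/COPY2x1/COPY1x1 macros write the packed buffer B.

/* Illustrative reference for the packing order implemented by
   zgemm_ncopy_2_vfpv3.S.  A is column-major, lda in complex elements,
   each complex number stored as (re, im) pairs of doubles. */
static void zgemm_ncopy_2_ref(int m, int n, const double *a, int lda, double *b)
{
    int i, j;

    /* two columns per pass (COPY2x2 / COPY1x2 in the assembly) */
    for (j = 0; j + 1 < n; j += 2) {
        const double *a1 = a + 2 * j * lda;   /* column j   */
        const double *a2 = a1 + 2 * lda;      /* column j+1 */
        for (i = 0; i < m; i++) {
            *b++ = a1[2 * i];     *b++ = a1[2 * i + 1];   /* A(i, j)   */
            *b++ = a2[2 * i];     *b++ = a2[2 * i + 1];   /* A(i, j+1) */
        }
    }

    /* leftover single column (COPY2x1 / COPY1x1) */
    if (n & 1) {
        const double *a1 = a + 2 * j * lda;
        for (i = 0; i < m; i++) {
            *b++ = a1[2 * i];     *b++ = a1[2 * i + 1];
        }
    }
}

The assembly unrolls the row loop two rows at a time (COPY2x2 loads d0-d7 before a single fstmiad), but the element order written to B is the same as in this row-by-row sketch.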