Commit e8b571d245 (parent 0f521ece25)
LoongArch64: Add dgemv_t_8_lasx.S and dgemv_n_8_lasx.S, V2
				|  | @ -8,6 +8,9 @@ DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | ||||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | ||||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | ||||
| 
 | ||||
| DGEMVNKERNEL = dgemv_n_8_lasx.S | ||||
| DGEMVTKERNEL = dgemv_t_8_lasx.S | ||||
| endif | ||||
| 
 | ||||
| DTRSMKERNEL_LN  = ../generic/trsm_kernel_LN.c | ||||
|  |  | |||
|  | @ -132,12 +132,16 @@ CSWAPKERNEL  = ../arm/zswap.c | |||
| ZSWAPKERNEL  = ../arm/zswap.c | ||||
| 
 | ||||
| SGEMVNKERNEL = ../arm/gemv_n.c | ||||
| ifndef DGEMVNKERNEL | ||||
| DGEMVNKERNEL = ../arm/gemv_n.c | ||||
| endif | ||||
| CGEMVNKERNEL = ../arm/zgemv_n.c | ||||
| ZGEMVNKERNEL = ../arm/zgemv_n.c | ||||
| 
 | ||||
| SGEMVTKERNEL = ../arm/gemv_t.c | ||||
| ifndef DGEMVTKERNEL | ||||
| DGEMVTKERNEL = ../arm/gemv_t.c | ||||
| endif | ||||
| CGEMVTKERNEL = ../arm/zgemv_t.c | ||||
| ZGEMVTKERNEL = ../arm/zgemv_t.c | ||||
| 
 | ||||
|  |  | |||
|  | @ -0,0 +1,546 @@ | |||
| /******************************************************************************* | ||||
| Copyright (c) 2023, The OpenBLAS Project | ||||
| All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are | ||||
| met: | ||||
| 1. Redistributions of source code must retain the above copyright | ||||
| notice, this list of conditions and the following disclaimer. | ||||
| 2. Redistributions in binary form must reproduce the above copyright | ||||
| notice, this list of conditions and the following disclaimer in | ||||
| the documentation and/or other materials provided with the | ||||
| distribution. | ||||
| 3. Neither the name of the OpenBLAS project nor the names of | ||||
| its contributors may be used to endorse or promote products | ||||
| derived from this software without specific prior written permission. | ||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | ||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | ||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | ||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | ||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | ||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 | ||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | ||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | ||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||||
| *******************************************************************************/ | ||||
| #define ASSEMBLER | ||||
| 
 | ||||
| #include "common.h" | ||||
| #include "loongarch64_asm.S" | ||||
| 
 | ||||
| /********************************************************************* | ||||
| * 2023/07/14 guxiwei | ||||
| *        UTEST                  : OK | ||||
| *        CTEST                  : OK | ||||
| *        TEST                   : OK | ||||
| * | ||||
| * | ||||
| *********************************************************************/ | ||||
| 
 | ||||
| /* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, | ||||
|  * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | ||||
|  */ | ||||
| #define M       $r4 | ||||
| #define N       $r5 | ||||
| #define ALPHA   $f0 | ||||
| #define A       $r7 | ||||
| #define LDA     $r8 | ||||
| #define X       $r9 | ||||
| #define INC_X   $r10 | ||||
| #define Y       $r11 | ||||
| #define INC_Y   $r6 | ||||
| 
 | ||||
| #define J       $r12 | ||||
| #define I       $r13 | ||||
| #define K       $r14 | ||||
| #define Y_ORG   $r15 | ||||
| #define OFFSET  $r16 | ||||
| #define K_LDA   $r17 | ||||
| #define M8      $r18 | ||||
| #define T0      $r19 | ||||
| #define PA0     $r20 | ||||
| #define PA1     $r23 | ||||
| #define PA2     $r24 | ||||
| #define PA3     $r25 | ||||
| #define PA4     $r26 | ||||
| #define PA5     $r27 | ||||
| #define PA6     $r28 | ||||
| #define PA7     $r29 | ||||
| 
 | ||||
| #define VALPHA  $xr1 | ||||
| #define X0      $xr2 | ||||
| #define X1      $xr3 | ||||
| #define X2      $xr4 | ||||
| #define X3      $xr5 | ||||
| #define X4      $xr6 | ||||
| #define X5      $xr7 | ||||
| #define X6      $xr8 | ||||
| #define X7      $xr9 | ||||
| #define Y0      $xr10 | ||||
| #define Y1      $xr11 | ||||
| #define A0      $xr12 | ||||
| #define A1      $xr13 | ||||
| #define A2      $xr14 | ||||
| #define A3      $xr15 | ||||
| #define A4      $xr16 | ||||
| #define A5      $xr17 | ||||
| #define A6      $xr18 | ||||
| #define A7      $xr19 | ||||
| #define A8      $xr20 | ||||
| #define A9      $xr21 | ||||
| #define A10     $xr22 | ||||
| #define A11     $xr23 | ||||
| #define A12     $xr24 | ||||
| #define A13     $xr25 | ||||
| #define A14     $xr26 | ||||
| #define A15     $xr27 | ||||
| 
 | ||||
| .macro DLOAD_X_8
 | ||||
|     GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18, \ | ||||
|                    X4, X, 0x20, X5, X, 0x28, X6, X, 0x30, X7, X, 0x38 | ||||
|     GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA, \ | ||||
|                  X4, X4, VALPHA, X5, X5, VALPHA, X6, X6, VALPHA, X7, X7, VALPHA | ||||
| .endm | ||||
| 
 | ||||
| .macro DLOAD_X_4
 | ||||
|     GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18 | ||||
|     GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA | ||||
| .endm | ||||
| 
 | ||||
| .macro DLOAD_X_2
 | ||||
|     GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08 | ||||
|     GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA | ||||
| .endm | ||||
| 
 | ||||
| .macro DLOAD_X_1
 | ||||
|     GLDREPL xv, d, X0, X, 0x00 | ||||
|     GMUL xvf, d, X0, X0, VALPHA | ||||
| .endm | ||||
| 
 | ||||
| .macro DLOAD_Y_8
 | ||||
|     GLD xv, , Y0, Y, 0, Y1, Y, 0x20 | ||||
| .endm | ||||
| 
 | ||||
| .macro DLOAD_Y_4
 | ||||
|     GLD xv, , Y0, Y, 0 | ||||
| .endm | ||||
| 
 | ||||
| .macro DLOAD_Y_1
 | ||||
|     fld.d  $f10,   Y,  0 | ||||
| .endm | ||||
| 
 | ||||
| .macro DSTORE_Y_8
 | ||||
|     GST xv, , Y0, Y, 0, Y1, Y, 0x20 | ||||
| .endm | ||||
| 
 | ||||
| .macro DSTORE_Y_4
 | ||||
|     GST xv, , Y0, Y, 0 | ||||
| .endm | ||||
| 
 | ||||
| .macro DSTORE_Y_1
 | ||||
|     fst.d   $f10,   Y,  0 | ||||
| .endm | ||||
| 
 | ||||
| // Unable to use vector load/store ins | ||||
| .macro DLOAD_Y_8_GAP
 | ||||
|     fld.d   $f10,   Y,  0 | ||||
|     fldx.d  $f13,   Y,  INC_Y | ||||
|     PTR_ALSL  T0,   INC_Y,  Y,  1 | ||||
|     fld.d   $f14,   T0, 0 | ||||
|     fldx.d  $f15,   T0, INC_Y | ||||
|     PTR_ALSL  T0,   INC_Y,  Y,  2 | ||||
|     fld.d   $f11,   T0, 0 | ||||
|     fldx.d  $f17,   T0, INC_Y | ||||
|     PTR_ADD   T0,   T0, INC_Y | ||||
|     PTR_ADD   T0,   T0, INC_Y | ||||
|     fld.d   $f18,   T0, 0 | ||||
|     fldx.d  $f19,   T0, INC_Y | ||||
|     GINSVE0 xv, d, Y0, A1, 1, Y0, A2, 2, Y0, A3, 3, Y1, A5, 1, Y1, A6, 2, Y1, A7, 3 | ||||
| .endm | ||||
| 
 | ||||
| .macro DLOAD_Y_4_GAP
 | ||||
|     fld.d   $f10,   Y,  0 | ||||
|     fldx.d  $f13,   Y,  INC_Y | ||||
|     PTR_ALSL  T0,   INC_Y,  Y,  1 | ||||
|     fld.d   $f14,   T0, 0 | ||||
|     fldx.d  $f15,   T0, INC_Y | ||||
|     GINSVE0 xv, d, Y0, A1, 1, Y0, A2, 2, Y0, A3, 3 | ||||
| .endm | ||||
| 
 | ||||
| .macro DSTORE_Y_8_GAP
 | ||||
|     xvstelm.d   Y0,     Y,      0,      0 | ||||
|     PTR_ADD     T0,     Y,      INC_Y | ||||
|     xvstelm.d   Y0,     T0,     0,      1 | ||||
|     PTR_ADD     T0,     T0,     INC_Y | ||||
|     xvstelm.d   Y0,     T0,     0,      2 | ||||
|     PTR_ADD     T0,     T0,     INC_Y | ||||
|     xvstelm.d   Y0,     T0,     0,      3 | ||||
| 
 | ||||
|     PTR_ADD     T0,     T0,     INC_Y | ||||
|     xvstelm.d   Y1,     T0,     0,      0 | ||||
|     PTR_ADD     T0,     T0,     INC_Y | ||||
|     xvstelm.d   Y1,     T0,     0,      1 | ||||
|     PTR_ADD     T0,     T0,     INC_Y | ||||
|     xvstelm.d   Y1,     T0,     0,      2 | ||||
|     PTR_ADD     T0,     T0,     INC_Y | ||||
|     xvstelm.d   Y1,     T0,     0,      3 | ||||
| .endm | ||||
| 
 | ||||
| .macro DSTORE_Y_4_GAP
 | ||||
|     xvstelm.d   Y0,     Y,      0,      0 | ||||
|     PTR_ADD     T0,     Y,      INC_Y | ||||
|     xvstelm.d   Y0,     T0,     0,      1 | ||||
|     PTR_ADD     T0,     T0,     INC_Y | ||||
|     xvstelm.d   Y0,     T0,     0,      2 | ||||
|     PTR_ADD     T0,     T0,     INC_Y | ||||
|     xvstelm.d   Y0,     T0,     0,      3 | ||||
| .endm | ||||
| 
 | ||||
| .macro DLOAD_X_8_GAP
 | ||||
|     xvldrepl.d  X0,     X,      0x00 | ||||
|     PTR_ADD     T0,     X,      INC_X | ||||
|     xvldrepl.d  X1,     T0,     0x00 | ||||
|     PTR_ADD     T0,     T0,     INC_X | ||||
|     xvldrepl.d  X2,     T0,     0x00 | ||||
|     PTR_ADD     T0,     T0,     INC_X | ||||
|     xvldrepl.d  X3,     T0,     0x00 | ||||
|     PTR_ADD     T0,     T0,     INC_X | ||||
|     xvldrepl.d  X4,     T0,     0x00 | ||||
|     PTR_ADD     T0,     T0,     INC_X | ||||
|     xvldrepl.d  X5,     T0,     0x00 | ||||
|     PTR_ADD     T0,     T0,     INC_X | ||||
|     xvldrepl.d  X6,     T0,     0x00 | ||||
|     PTR_ADD     T0,     T0,     INC_X | ||||
|     xvldrepl.d  X7,     T0,     0x00 | ||||
|     GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA, \ | ||||
|                  X4, X4, VALPHA, X5, X5, VALPHA, X6, X6, VALPHA, X7, X7, VALPHA | ||||
| .endm | ||||
| 
 | ||||
| .macro DLOAD_X_4_GAP
 | ||||
|     xvldrepl.d  X0,     X,      0x00 | ||||
|     PTR_ADD     T0,     X,      INC_X | ||||
|     xvldrepl.d  X1,     T0,     0x00 | ||||
|     PTR_ADD     T0,     T0,     INC_X | ||||
|     xvldrepl.d  X2,     T0,     0x00 | ||||
|     PTR_ADD     T0,     T0,     INC_X | ||||
|     xvldrepl.d  X3,     T0,     0x00 | ||||
|     GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA | ||||
| .endm | ||||
| 
 | ||||
| .macro DLOAD_X_2_GAP
 | ||||
|     xvldrepl.d  X0,     X,      0x00 | ||||
|     PTR_ADD     T0,     X,      INC_X | ||||
|     xvldrepl.d  X1,     T0,     0x00 | ||||
|     GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA | ||||
| .endm | ||||
| 
 | ||||
| .macro DGEMV_N_8x8
 | ||||
|     GLD_INC xv, , 0x20,       \ | ||||
|     A0,  PA0, 0, A1,  PA0, 0, \ | ||||
|     A2,  PA1, 0, A3,  PA1, 0, \ | ||||
|     A4,  PA2, 0, A5,  PA2, 0, \ | ||||
|     A6,  PA3, 0, A7,  PA3, 0, \ | ||||
|     A8,  PA4, 0, A9,  PA4, 0, \ | ||||
|     A10, PA5, 0, A11, PA5, 0, \ | ||||
|     A12, PA6, 0, A13, PA6, 0, \ | ||||
|     A14, PA7, 0, A15, PA7, 0 | ||||
| 
 | ||||
|     GMADD xvf, d, Y0, A0,  X0, Y0, Y1, A1,  X0, Y1, \ | ||||
|                   Y0, A2,  X1, Y0, Y1, A3,  X1, Y1, \ | ||||
|                   Y0, A4,  X2, Y0, Y1, A5,  X2, Y1, \ | ||||
|                   Y0, A6,  X3, Y0, Y1, A7,  X3, Y1, \ | ||||
|                   Y0, A8,  X4, Y0, Y1, A9,  X4, Y1, \ | ||||
|                   Y0, A10, X5, Y0, Y1, A11, X5, Y1, \ | ||||
|                   Y0, A12, X6, Y0, Y1, A13, X6, Y1, \ | ||||
|                   Y0, A14, X7, Y0, Y1, A15, X7, Y1 | ||||
| .endm | ||||
| 
 | ||||
| .macro DGEMV_N_4x8
 | ||||
|     GLD_INC xv, , 0x20, A0,  PA0, 0, \ | ||||
|                         A2,  PA1, 0, \ | ||||
|                         A4,  PA2, 0, \ | ||||
|                         A6,  PA3, 0, \ | ||||
|                         A8,  PA4, 0, \ | ||||
|                         A10, PA5, 0, \ | ||||
|                         A12, PA6, 0, \ | ||||
|                         A14, PA7, 0 | ||||
| 
 | ||||
|     GMADD xvf, d, Y0, A0,  X0, Y0, \ | ||||
|                   Y0, A2,  X1, Y0, \ | ||||
|                   Y0, A4,  X2, Y0, \ | ||||
|                   Y0, A6,  X3, Y0, \ | ||||
|                   Y0, A8,  X4, Y0, \ | ||||
|                   Y0, A10, X5, Y0, \ | ||||
|                   Y0, A12, X6, Y0, \ | ||||
|                   Y0, A14, X7, Y0 | ||||
| .endm | ||||
| 
 | ||||
| .macro DGEMV_N_1x8
 | ||||
|     GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0, $f16, PA2, 0, $f18, PA3, 0, \ | ||||
|                         $f20, PA4, 0, $f22, PA5, 0, $f24, PA6, 0, $f26, PA7, 0 | ||||
|     GMADD f, d, $f10, $f12, $f2, $f10, \ | ||||
|                 $f10, $f14, $f3, $f10, \ | ||||
|                 $f10, $f16, $f4, $f10, \ | ||||
|                 $f10, $f18, $f5, $f10, \ | ||||
|                 $f10, $f20, $f6, $f10, \ | ||||
|                 $f10, $f22, $f7, $f10, \ | ||||
|                 $f10, $f24, $f8, $f10, \ | ||||
|                 $f10, $f26, $f9, $f10, | ||||
| .endm | ||||
| 
 | ||||
| .macro DGEMV_N_8x4
 | ||||
|     GLD_INC xv, , 0x20,       \ | ||||
|     A0,  PA0, 0, A1,  PA0, 0, \ | ||||
|     A2,  PA1, 0, A3,  PA1, 0, \ | ||||
|     A4,  PA2, 0, A5,  PA2, 0, \ | ||||
|     A6,  PA3, 0, A7,  PA3, 0 | ||||
| 
 | ||||
|     GMADD xvf, d, Y0, A0,  X0, Y0, Y1, A1,  X0, Y1, \ | ||||
|                   Y0, A2,  X1, Y0, Y1, A3,  X1, Y1, \ | ||||
|                   Y0, A4,  X2, Y0, Y1, A5,  X2, Y1, \ | ||||
|                   Y0, A6,  X3, Y0, Y1, A7,  X3, Y1 | ||||
| .endm | ||||
| 
 | ||||
| .macro DGEMV_N_4x4
 | ||||
|     GLD_INC xv, , 0x20, A0, PA0, 0, A2, PA1, 0, A4, PA2, 0, A6, PA3, 0 | ||||
| 
 | ||||
|     GMADD xvf, d, Y0, A0,  X0, Y0, Y0, A2,  X1, Y0, \ | ||||
|                   Y0, A4,  X2, Y0, Y0, A6,  X3, Y0 | ||||
| .endm | ||||
| 
 | ||||
| .macro DGEMV_N_1x4
 | ||||
|     GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0, $f16, PA2, 0, $f18, PA3, 0 | ||||
|     GMADD f, d, $f10, $f12, $f2, $f10, $f10, $f14, $f3, $f10, \ | ||||
|                 $f10, $f16, $f4, $f10, $f10, $f18, $f5, $f10 | ||||
| .endm | ||||
| 
 | ||||
| .macro DGEMV_N_8x2
 | ||||
|     GLD_INC xv, , 0x20,       \ | ||||
|     A0,  PA0, 0, A1,  PA0, 0, \ | ||||
|     A2,  PA1, 0, A3,  PA1, 0 | ||||
|     GMADD xvf, d, Y0, A0,  X0, Y0, Y1, A1,  X0, Y1, \ | ||||
|                   Y0, A2,  X1, Y0, Y1, A3,  X1, Y1 | ||||
| .endm | ||||
| 
 | ||||
| .macro DGEMV_N_4x2
 | ||||
|     GLD_INC xv, , 0x20, A0,  PA0, 0, A2,  PA1, 0 | ||||
|     GMADD xvf, d, Y0, A0,  X0, Y0, \ | ||||
|                   Y0, A2,  X1, Y0 | ||||
| .endm | ||||
| 
 | ||||
| .macro DGEMV_N_1x2
 | ||||
|     GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0 | ||||
|     GMADD f, d, $f10, $f12, $f2, $f10, \ | ||||
|                 $f10, $f14, $f3, $f10 | ||||
| .endm | ||||
| 
 | ||||
| .macro DGEMV_N_1x1
 | ||||
|     fld.d   $f12,    PA0,    0 | ||||
|     PTR_ADDI PA0,   PA0,    0x08 | ||||
|     fmadd.d $f10,   $f12,   $f2,    $f10 | ||||
| .endm | ||||
| 
 | ||||
| .macro DGEMV_N XW:req, X_8:req, X_4:req, X_2:req, X_1:req, Y_8:req, Y_4:req, Y_1:req | ||||
|     PTR_SRLI  J,      N,      3 | ||||
|     beqz      J,      .L_\XW\()_N_7 | ||||
|     PTR_SLLI  K_LDA,  LDA,    3 | ||||
|     PTR_SUB   K_LDA,  K_LDA,  M8 | ||||
| .L_\XW\()_N_L8: | ||||
|     DLOAD_\X_8 | ||||
|     xor     K,      K,      K | ||||
|     move    Y,      Y_ORG | ||||
|     PTR_SRLI  I,      M,       3 | ||||
|     beqz      I,      .L_\XW\()_M_7 | ||||
| .align 5
 | ||||
| .L_\XW\()_M_L8: | ||||
|     DLOAD_\Y_8 | ||||
|     DGEMV_N_8x8 | ||||
|     DSTORE_\Y_8 | ||||
|     PTR_ADDI    I,      I,      -1 | ||||
|     PTR_ALSL    Y,      INC_Y,  Y,  3 | ||||
|     PTR_ADDI    K,      K,      8 | ||||
|     bnez        I,      .L_\XW\()_M_L8 | ||||
| .L_\XW\()_M_7: | ||||
|     andi        I,      M,      4 | ||||
|     beqz        I,      .L_\XW\()_M_3 | ||||
|     DLOAD_\Y_4 | ||||
|     DGEMV_N_4x8 | ||||
|     DSTORE_\Y_4 | ||||
|     PTR_ALSL    Y,      INC_Y,  Y,  2 | ||||
|     PTR_ADDI    K,      K,      4 | ||||
| .L_\XW\()_M_3: | ||||
|     andi        I,      M,      3 | ||||
|     beqz        I,      .L_\XW\()_M_END | ||||
| .align 5
 | ||||
| .L_\XW\()_M_L1: | ||||
|     DLOAD_\Y_1 | ||||
|     DGEMV_N_1x8 | ||||
|     DSTORE_\Y_1 | ||||
|     PTR_ADDI    I,      I,      -1 | ||||
|     PTR_ADD     Y,      Y,      INC_Y | ||||
|     PTR_ADDI    K,      K,      1 | ||||
|     bnez        I,      .L_\XW\()_M_L1 | ||||
| .L_\XW\()_M_END: | ||||
|     PTR_ADDI    J,      J,      -1 | ||||
| #if __loongarch_grlen == 64 | ||||
|     GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ | ||||
|               PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA | ||||
| #else | ||||
|     GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ | ||||
|               PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA | ||||
| #endif | ||||
|     PTR_ALSL    X,      INC_X,  X,  3 | ||||
|     bnez        J,      .L_\XW\()_N_L8 | ||||
| .L_\XW\()_N_7: | ||||
|     andi        J,      N,      4 | ||||
|     beqz        J,      .L_\XW\()_N_3 | ||||
|     DLOAD_\X_4 | ||||
|     xor         K,      K,      K | ||||
|     move        Y,      Y_ORG | ||||
| 
 | ||||
|     PTR_SRLI  I,      M,       3 | ||||
|     beqz      I,      .L_\XW\()_N_4_M_7 | ||||
| .align 5
 | ||||
| .L_\XW\()_N_4_M_L8: | ||||
|     DLOAD_\Y_8 | ||||
|     DGEMV_N_8x4 | ||||
|     DSTORE_\Y_8 | ||||
|     PTR_ADDI  I,      I,      -1 | ||||
|     PTR_ADDI  K,      K,      8 | ||||
|     PTR_ALSL  Y,      INC_Y,  Y,  3 | ||||
|     bnez    I,      .L_\XW\()_N_4_M_L8 | ||||
| .L_\XW\()_N_4_M_7: | ||||
|     andi    I,      M,      4 | ||||
|     beqz    I,      .L_\XW\()_N_4_M_3 | ||||
|     DLOAD_\Y_4 | ||||
|     DGEMV_N_4x4 | ||||
|     DSTORE_\Y_4 | ||||
|     PTR_ALSL  Y,    INC_Y,  Y,  2 | ||||
|     PTR_ADDI  K,    K,      4 | ||||
| .L_\XW\()_N_4_M_3: | ||||
|     andi        I,      M,      3 | ||||
|     beqz        I,      .L_\XW\()_N_4_M_END | ||||
| .align 5
 | ||||
| .L_\XW\()_N_4_M_L1: | ||||
|     DLOAD_\Y_1 | ||||
|     DGEMV_N_1x4 | ||||
|     DSTORE_\Y_1 | ||||
|     PTR_ADDI    I,      I,      -1 | ||||
|     PTR_ADD     Y,      Y,      INC_Y | ||||
|     PTR_ADDI    K,      K,      1 | ||||
|     bnez        I,      .L_\XW\()_N_4_M_L1 | ||||
| .L_\XW\()_N_4_M_END: | ||||
|     PTR_SLLI    K_LDA,  LDA,    2 | ||||
|     PTR_SUB     K_LDA,  K_LDA,  M8 | ||||
| #if __loongarch_grlen == 64 | ||||
|     GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA | ||||
| #else | ||||
|     GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA | ||||
| #endif | ||||
|     PTR_ALSL    X,      INC_X,  X,  2 | ||||
| .L_\XW\()_N_3: | ||||
|     andi        J,      N,      2 | ||||
|     beqz        J,      .L_\XW\()_N_1 | ||||
|     DLOAD_\X_2 | ||||
|     xor         K,      K,      K | ||||
|     move        Y,      Y_ORG | ||||
|     PTR_SRLI    I,      M,       3 | ||||
|     beqz    I,      .L_\XW\()_N_2_M_7 | ||||
| .align 5
 | ||||
| .L_\XW\()_N_2_M_L8: | ||||
|     DLOAD_\Y_8 | ||||
|     DGEMV_N_8x2 | ||||
|     DSTORE_\Y_8 | ||||
|     PTR_ADDI  I,      I,      -1 | ||||
|     PTR_ADDI  K,      K,      8 | ||||
|     PTR_ALSL  Y,      INC_Y,  Y,  3 | ||||
|     bnez    I,      .L_\XW\()_N_2_M_L8 | ||||
| .L_\XW\()_N_2_M_7: | ||||
|     andi    I,      M,      4 | ||||
|     beqz    I,      .L_\XW\()_N_2_M_3 | ||||
|     DLOAD_\Y_4 | ||||
|     DGEMV_N_4x2 | ||||
|     DSTORE_\Y_4 | ||||
|     PTR_ALSL  Y,        INC_Y,      Y,      2 | ||||
|     PTR_ADDI  K,        K,          4 | ||||
| .L_\XW\()_N_2_M_3: | ||||
|     andi    I,      M,      3 | ||||
|     beqz    I,      .L_\XW\()_N_2_M_END | ||||
| .align 5
 | ||||
| .L_\XW\()_N_2_M_L1: | ||||
|     DLOAD_\Y_1 | ||||
|     DGEMV_N_1x2 | ||||
|     DSTORE_\Y_1 | ||||
|     PTR_ADDI  I,      I,      -1 | ||||
|     PTR_ADD   Y,      Y,      INC_Y | ||||
|     PTR_ADDI  K,      K,      1 | ||||
|     bnez    I,      .L_\XW\()_N_2_M_L1 | ||||
| .L_\XW\()_N_2_M_END: | ||||
|     PTR_SLLI    K_LDA,  LDA,    1 | ||||
|     PTR_SUB     K_LDA,  K_LDA,  M8 | ||||
|     PTR_ADD     PA0,    PA0,    K_LDA | ||||
|     PTR_ADD     PA1,    PA1,    K_LDA | ||||
|     PTR_ALSL    X,      INC_X,  X,  1 | ||||
| .L_\XW\()_N_1: | ||||
|     andi    J,      N,      1 | ||||
|     beqz    J,      .L_END | ||||
|     DLOAD_\X_1 | ||||
|     xor     K,      K,      K | ||||
|     move    Y,      Y_ORG | ||||
|     move    I,      M | ||||
|     beqz    I,      .L_END | ||||
| .align 5
 | ||||
| .L_\XW\()_N_1_M_L1: | ||||
|     DLOAD_\Y_1 | ||||
|     DGEMV_N_1x1 | ||||
|     DSTORE_\Y_1 | ||||
|     PTR_ADDI  I,      I,      -1 | ||||
|     PTR_ADD   Y,      Y,      INC_Y | ||||
|     PTR_ADDI  K,      K,      1 | ||||
|     bnez    I,      .L_\XW\()_N_1_M_L1 | ||||
|     b .L_END | ||||
| .endm | ||||
| 
 | ||||
|     PROLOGUE | ||||
|     PTR_LD     INC_Y,  $sp,    0 | ||||
|     push_if_used 17 + 7, 24 + 4 | ||||
|     PTR_ADDI   K,      $r0,     0x01 | ||||
|     PTR_SUB    I,      INC_X,   K | ||||
|     PTR_SUB    J,      INC_Y,   K | ||||
|     maskeqz    I,      K,       I  /* if(inc_x == 1) I = 0; else I = 1; */
 | ||||
|     maskeqz    J,      K,       J  /* if(inc_y == 1) j = 0; else j = 1; */
 | ||||
|     PTR_ALSL   I,      I,       J,      1 | ||||
|     GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3 | ||||
|     xvreplve0.d     VALPHA, $xr0 | ||||
|     move     Y_ORG,  Y | ||||
|     move     PA0,    A | ||||
| #if __loongarch_grlen == 64 | ||||
|     GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ | ||||
|               PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA | ||||
| #else | ||||
|     GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ | ||||
|               PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA | ||||
| #endif | ||||
|     la.local    T0,     .L_GAP_TABLE | ||||
|     PTR_ALSL    I,      I,      T0,     1 | ||||
|     ld.h        K,      I,      0 | ||||
|     PTR_ADD     T0,     T0,     K | ||||
|     jirl        $r0,    T0,     0 | ||||
| .L_GAP_TABLE: | ||||
|     .hword  .L_GAP_0_0 - .L_GAP_TABLE | ||||
|     .hword  .L_GAP_0_1 - .L_GAP_TABLE | ||||
|     .hword  .L_GAP_1_0 - .L_GAP_TABLE | ||||
|     .hword  .L_GAP_1_1 - .L_GAP_TABLE | ||||
| .L_GAP_0_0: /* if (inc_x == 1) && (incy == 1) */ | ||||
|     DGEMV_N GAP_0_0, X_8, X_4, X_2, X_1, Y_8, Y_4, Y_1 | ||||
| .L_GAP_0_1: /* if (inc_x == 1) && (incy != 1) */ | ||||
|     DGEMV_N GAP_0_1, X_8, X_4, X_2, X_1, Y_8_GAP, Y_4_GAP, Y_1 | ||||
| .L_GAP_1_0: /* if (inc_x != 1) && (incy == 1) */ | ||||
|     DGEMV_N GAP_1_0, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8, Y_4, Y_1 | ||||
| .L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */ | ||||
|     DGEMV_N GAP_1_1, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8_GAP, Y_4_GAP, Y_1 | ||||
| .L_END: | ||||
|     pop_if_used 17 + 7, 24 + 4 | ||||
|     jirl    $r0, $r1, 0x0 | ||||
|     EPILOGUE | ||||
|  | @ -0,0 +1,468 @@ | |||
| /******************************************************************************* | ||||
| Copyright (c) 2023, The OpenBLAS Project | ||||
| All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are | ||||
| met: | ||||
| 1. Redistributions of source code must retain the above copyright | ||||
| notice, this list of conditions and the following disclaimer. | ||||
| 2. Redistributions in binary form must reproduce the above copyright | ||||
| notice, this list of conditions and the following disclaimer in | ||||
| the documentation and/or other materials provided with the | ||||
| distribution. | ||||
| 3. Neither the name of the OpenBLAS project nor the names of | ||||
| its contributors may be used to endorse or promote products | ||||
| derived from this software without specific prior written permission. | ||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | ||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | ||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | ||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | ||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | ||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 | ||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | ||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | ||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||||
| *******************************************************************************/ | ||||
| #define ASSEMBLER | ||||
| 
 | ||||
| #include "common.h" | ||||
| #include "loongarch64_asm.S" | ||||
| 
 | ||||
| /********************************************************************* | ||||
| * 2023/07/17 guxiwei | ||||
| *        UTEST                  : OK | ||||
| *        CTEST                  : OK | ||||
| *        TEST                   : OK | ||||
| * | ||||
| * | ||||
| *********************************************************************/ | ||||
| 
 | ||||
| /* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, | ||||
|  * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | ||||
|  */ | ||||
| #define M       $r4 | ||||
| #define N       $r5 | ||||
| #define ALPHA   $f0 | ||||
| #define A       $r7 | ||||
| #define LDA     $r8 | ||||
| #define X       $r9 | ||||
| #define INC_X   $r10 | ||||
| #define Y       $r11 | ||||
| #define INC_Y   $r6 | ||||
| 
 | ||||
| #define J       $r12 | ||||
| #define I       $r13 | ||||
| #define K       $r14 | ||||
| #define PY0     $r14 | ||||
| #define X_ORG   $r15 | ||||
| #define PY1     $r16 | ||||
| #define K_LDA   $r17 | ||||
| #define PY2     $r18 | ||||
| #define T0      $r19 | ||||
| #define PA0     $r20 | ||||
| #define PA1     $r23 | ||||
| #define PA2     $r24 | ||||
| #define PA3     $r25 | ||||
| #define PA4     $r26 | ||||
| #define PA5     $r27 | ||||
| #define PA6     $r28 | ||||
| #define PA7     $r29 | ||||
| #define M8      $r30 | ||||
| 
 | ||||
| #define VALPHA  $xr0 | ||||
| #define X0      $xr1 | ||||
| #define X1      $xr2 | ||||
| #define A0      $xr3 | ||||
| #define A1      $xr4 | ||||
| #define A2      $xr5 | ||||
| #define A3      $xr6 | ||||
| #define A4      $xr7 | ||||
| #define A5      $xr8 | ||||
| #define A6      $xr9 | ||||
| #define A7      $xr10 | ||||
| #define A8      $xr11 | ||||
| #define A9      $xr12 | ||||
| #define A10     $xr13 | ||||
| #define A11     $xr14 | ||||
| #define A12     $xr15 | ||||
| #define A13     $xr16 | ||||
| #define A14     $xr17 | ||||
| #define A15     $xr18 | ||||
| #define TP0     $xr19 | ||||
| #define TP1     $xr20 | ||||
| #define TP2     $xr21 | ||||
| #define TP3     $xr22 | ||||
| #define TP4     $xr23 | ||||
| #define TP5     $xr24 | ||||
| #define TP6     $xr25 | ||||
| #define TP7     $xr26 | ||||
| #define Y0      $xr3 | ||||
| #define Y1      $xr4 | ||||
| #define Y2      $xr5 | ||||
| #define Y3      $xr6 | ||||
| #define Y4      $xr7 | ||||
| #define Y5      $xr8 | ||||
| #define Y6      $xr9 | ||||
| #define Y7      $xr10 | ||||
| 
 | ||||
| .macro ZERO_Y8
 | ||||
|     GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3, \ | ||||
|                 TP4, TP4, TP4, TP5, TP5, TP5, TP6, TP6, TP6, TP7, TP7, TP7 | ||||
| .endm | ||||
| 
 | ||||
| .macro ZERO_Y4
 | ||||
|     GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3 | ||||
| .endm | ||||
| 
 | ||||
| .macro ZERO_Y2
 | ||||
|     GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1 | ||||
| .endm | ||||
| 
 | ||||
| .macro ZERO_Y1
 | ||||
|     GXOR xv, v, TP0, TP0, TP0 | ||||
| .endm | ||||
| 
 | ||||
| .macro DLOAD_X8
 | ||||
|     GLD xv, , X0, X, 0x00, X1, X, 0x20 | ||||
| .endm | ||||
| 
 | ||||
| .macro DLOAD_X4
 | ||||
|     GLD xv, , X0, X, 0x00 | ||||
| .endm | ||||
| 
 | ||||
| .macro DLOAD_X8_GAP
 | ||||
|     fld.d       $f1,    X,    0x00 | ||||
|     fldx.d      $f2,    X,    INC_X | ||||
|     PTR_ALSL    T0,     INC_X,      X,      1 | ||||
|     fld.d       $f3,    T0,   0x00 | ||||
|     fldx.d      $f4,    T0,   INC_X | ||||
|     GINSVE0 xv, d, X0, X1, 1, X0, A0, 2, X0, A1, 3 | ||||
|     PTR_ALSL    T0,     INC_X,      X,      2 | ||||
|     fld.d       $f2,    T0,   0x00 | ||||
|     fldx.d      $f3,    T0,   INC_X | ||||
|     PTR_ALSL    T0,     INC_X,      T0,     1 | ||||
|     fld.d       $f4,    T0,   0x00 | ||||
|     fldx.d      $f5,    T0,   INC_X | ||||
|     GINSVE0 xv, d, X1, A0, 1, X1, A1, 2, X1, A2, 3 | ||||
| .endm | ||||
| 
 | ||||
| .macro DLOAD_X4_GAP
 | ||||
|     fld.d       $f1,    X,    0x00 | ||||
|     fldx.d      $f2,    X,    INC_X | ||||
|     PTR_ALSL    T0,     INC_X,      X,      1 | ||||
|     fld.d       $f3,    T0,   0x00 | ||||
|     fldx.d      $f4,    T0,   INC_X | ||||
|     GINSVE0 xv, d, X0, X1, 1, X0, A0, 2, X0, A1, 3 | ||||
| .endm | ||||
| 
 | ||||
| .macro DGEMV_T_8x8
 | ||||
|     GLD_INC xv, , 0x20,       \ | ||||
|     A0,  PA0, 0, A1,  PA0, 0, \ | ||||
|     A2,  PA1, 0, A3,  PA1, 0, \ | ||||
|     A4,  PA2, 0, A5,  PA2, 0, \ | ||||
|     A6,  PA3, 0, A7,  PA3, 0, \ | ||||
|     A8,  PA4, 0, A9,  PA4, 0, \ | ||||
|     A10, PA5, 0, A11, PA5, 0, \ | ||||
|     A12, PA6, 0, A13, PA6, 0, \ | ||||
|     A14, PA7, 0, A15, PA7, 0 | ||||
| 
 | ||||
|     GMADD xvf, d, TP0, A0,  X0, TP0, TP0, A1,  X1, TP0, \ | ||||
|                   TP1, A2,  X0, TP1, TP1, A3,  X1, TP1, \ | ||||
|                   TP2, A4,  X0, TP2, TP2, A5,  X1, TP2, \ | ||||
|                   TP3, A6,  X0, TP3, TP3, A7,  X1, TP3, \ | ||||
|                   TP4, A8,  X0, TP4, TP4, A9,  X1, TP4, \ | ||||
|                   TP5, A10, X0, TP5, TP5, A11, X1, TP5, \ | ||||
|                   TP6, A12, X0, TP6, TP6, A13, X1, TP6, \ | ||||
|                   TP7, A14, X0, TP7, TP7, A15, X1, TP7 | ||||
| .endm | ||||
| 
 | ||||
| .macro DGEMV_T_8x4
 | ||||
|     GLD_INC xv, , 0x20, A0,  PA0, 0, A2,  PA1, 0, A4,  PA2, 0, A6,  PA3, 0, \ | ||||
|                         A8,  PA4, 0, A10, PA5, 0, A12, PA6, 0, A14, PA7, 0 | ||||
| 
 | ||||
|     GMADD xvf, d, TP0, A0,  X0, TP0, TP1, A2,  X0, TP1, \ | ||||
|                   TP2, A4,  X0, TP2, TP3, A6,  X0, TP3, \ | ||||
|                   TP4, A8,  X0, TP4, TP5, A10, X0, TP5, \ | ||||
|                   TP6, A12, X0, TP6, TP7, A14, X0, TP7, | ||||
| .endm | ||||
| 
 | ||||
| .macro DGEMV_T_4x8
 | ||||
|     GLD_INC xv, , 0x20,       \ | ||||
|     A0,  PA0, 0, A1,  PA0, 0, \ | ||||
|     A2,  PA1, 0, A3,  PA1, 0, \ | ||||
|     A4,  PA2, 0, A5,  PA2, 0, \ | ||||
|     A6,  PA3, 0, A7,  PA3, 0 | ||||
| 
 | ||||
|     GMADD xvf, d, TP0, A0,  X0, TP0, TP0, A1,  X1, TP0, \ | ||||
|                   TP1, A2,  X0, TP1, TP1, A3,  X1, TP1, \ | ||||
|                   TP2, A4,  X0, TP2, TP2, A5,  X1, TP2, \ | ||||
|                   TP3, A6,  X0, TP3, TP3, A7,  X1, TP3 | ||||
| .endm | ||||
| 
 | ||||
| .macro DGEMV_T_4x4
 | ||||
|     GLD_INC xv, , 0x20, A0,  PA0, 0, A2,  PA1, 0, A4,  PA2, 0, A6,  PA3, 0 | ||||
| 
 | ||||
|     GMADD xvf, d, TP0, A0,  X0, TP0, TP1, A2,  X0, TP1, \ | ||||
|                   TP2, A4,  X0, TP2, TP3, A6,  X0, TP3 | ||||
| .endm | ||||
| 
 | ||||
| .macro DGEMV_T_2x8
 | ||||
|     GLD_INC xv, , 0x20, A0,  PA0, 0, A1,  PA0, 0, A2, PA1, 0, A3,  PA1, 0 | ||||
| 
 | ||||
|     GMADD xvf, d, TP0, A0,  X0, TP0, TP0, A1,  X1, TP0, \ | ||||
|                   TP1, A2,  X0, TP1, TP1, A3,  X1, TP1 | ||||
| .endm | ||||
| 
 | ||||
// DGEMV_T_2x4: accumulate a 4-element chunk of x (X0) against 2 columns
// of A (PA0, PA1): TPn += A_col_n * X0.
.macro DGEMV_T_2x4
    GLD_INC xv, , 0x20, A0,  PA0, 0, A2,  PA1, 0

    GMADD xvf, d, TP0, A0,  X0, TP0, TP1, A2,  X0, TP1
.endm
| 
 | ||||
// DGEMV_T: y += alpha * A^T * x (double precision, LASX).
//   \XW     - label namespace for this expansion (GAP_0 when inc_x == 1,
//             GAP_1 for strided x), so the macro can be instantiated twice.
//   \X8/\X4 - names of the x-load macros for 8 / 4 elements (contiguous
//             vs. gather variants; defined earlier in this file).
// Columns of A (N) are processed in groups of 8, then 4, 2, 1; within a
// column group, rows (M) go 8 at a time, then 4, then a scalar tail of
// up to 3. LDA, INC_X, INC_Y and M8 are byte-scaled (<<3) by the caller.
.macro DGEMV_T XW:req X8:req, X4:req
    PTR_SRLI  J,      N,      3        // J = N / 8: number of 8-column groups
    beqz      J,      .L_\XW\()_N_7
    PTR_SLLI  K_LDA,  LDA,    3
    PTR_SUB   K_LDA,  K_LDA,  M8       // 8*lda minus the M*8 bytes the column
                                       // pointers advance while scanning rows
.L_\XW\()_N_L8:                        // ---- loop over groups of 8 columns ----
    ZERO_Y8                            // clear accumulators TP0..TP7
    move      X,      X_ORG
    PTR_SRLI  I,      M,       3       // I = M / 8
    beqz      I,      .L_\XW\()_M_7
.align 5

.L_\XW\()_M_L8:                        // 8 rows x 8 columns per iteration
    DLOAD_\X8
    DGEMV_T_8x8
    PTR_ADDI    I,  I,  -1
    PTR_ALSL    X,  INC_X,  X,  3      // x += 8 * inc_x (bytes)
    bnez        I,  .L_\XW\()_M_L8
.L_\XW\()_M_7:
    andi        I,      M,      4      // one more block of 4 rows?
    beqz        I,      .L_\XW\()_M_3
    DLOAD_\X4
    DGEMV_T_8x4
    PTR_ALSL    X,  INC_X,  X,  2      // x += 4 * inc_x (bytes)
.L_\XW\()_M_3:
    // Horizontally reduce each vector accumulator TPn into scalar Yn
    // (Yn presumably aliases $f(n+3), used by the scalar tail below —
    // the register defines are earlier in this file).
    GACC xvf, d, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3, Y4, TP4, \
                 Y5, TP5, Y6, TP6, Y7, TP7
    andi        I,      M,      3      // scalar tail: remaining M % 4 rows
    beqz        I,      .L_\XW\()_M_END
.align 5

.L_\XW\()_M_L1:                        // one row against all 8 columns
    fld.d   $f1,    X,      0x00       // x[i]
    fld.d   $f11,   PA0,    0x00
    fld.d   $f12,   PA1,    0x00
    fld.d   $f13,   PA2,    0x00
    fld.d   $f14,   PA3,    0x00
    fld.d   $f15,   PA4,    0x00
    fld.d   $f16,   PA5,    0x00
    fld.d   $f17,   PA6,    0x00
    fld.d   $f18,   PA7,    0x00
#if __loongarch_grlen == 64
    GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \
               PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08
#else
    GADDI , w, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \
               PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08
#endif
    GMADD f, d, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4, $f5, $f13, $f1, $f5, $f6, $f14, $f1, $f6, \
                $f7, $f15, $f1, $f7, $f8, $f16, $f1, $f8, $f9, $f17, $f1, $f9, $f10, $f18, $f1, $f10
    PTR_ADDI  I,      I,      -1
    PTR_ADD   X,      X,      INC_X
    bnez      I,      .L_\XW\()_M_L1
.L_\XW\()_M_END:
    // Load the 8 target elements of y (stride INC_Y bytes), via three
    // auxiliary pointers PY0/PY1/PY2 at y+2, y+4 and y+6 strides.
    fld.d   $f11,   Y,  0x00
    fldx.d  $f12,   Y,  INC_Y
    PTR_ALSL    PY0, INC_Y,  Y,  1
    fld.d   $f13,   PY0,    0x00
    fldx.d  $f14,   PY0,    INC_Y
    PTR_ALSL    PY1, INC_Y,  Y,  2
    fld.d   $f15,   PY1,    0x00
    fldx.d  $f16,   PY1,    INC_Y
    PTR_ALSL    PY2, INC_Y,  PY1, 1
    fld.d   $f17,   PY2,    0x00
    fldx.d  $f18,   PY2,    INC_Y

    // y[j] += alpha * sum[j] for the 8 columns of this group
    GMADD f, d, $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12, $f13, ALPHA, $f5, $f13, $f14, ALPHA, $f6, $f14, \
                $f15, ALPHA, $f7, $f15, $f16, ALPHA, $f8, $f16, $f17, ALPHA, $f9, $f17, $f18, ALPHA, $f10, $f18

    PTR_ADDI    J,      J,      -1
#if __loongarch_grlen == 64
    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
              PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
#else
    GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
              PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
#endif
    fst.d   $f11,   Y,      0x00
    fstx.d  $f12,   Y,      INC_Y
    fst.d   $f13,   PY0,    0x00
    fstx.d  $f14,   PY0,    INC_Y
    fst.d   $f15,   PY1,    0x00
    fstx.d  $f16,   PY1,    INC_Y
    fst.d   $f17,   PY2,    0x00
    fstx.d  $f18,   PY2,    INC_Y
    PTR_ALSL    Y,      INC_Y,  Y,  3  // y += 8 * inc_y (bytes)
    bnez        J,      .L_\XW\()_N_L8
.L_\XW\()_N_7:                         // ---- remaining group of 4 columns? ----
    andi        J,      N,      4
    beqz        J,      .L_\XW\()_N_3
    ZERO_Y4                            // clear accumulators TP0..TP3
    move        X,      X_ORG
    PTR_SRLI    I,      M,       3
    beqz        I,      .L_\XW\()_N_4_M_7
.align 5

.L_\XW\()_N_4_M_L8:                    // 8 rows x 4 columns
    DLOAD_\X8
    DGEMV_T_4x8
    PTR_ADDI  I,      I,      -1
    PTR_ALSL  X,      INC_X,  X,  3
    bnez      I,      .L_\XW\()_N_4_M_L8
.L_\XW\()_N_4_M_7:
    andi    I,      M,      4
    beqz    I,      .L_\XW\()_N_4_M_3
    DLOAD_\X4
    DGEMV_T_4x4
    PTR_ALSL  X,      INC_X,  X,  2
.L_\XW\()_N_4_M_3:
    // Horizontally reduce TP0..TP3 into Y0..Y3
    GACC xvf, d, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3
    andi        I,      M,      3
    beqz        I,      .L_\XW\()_N_4_M_END
.align 5

.L_\XW\()_N_4_M_L1:                    // scalar tail, 4 columns
    fld.d   $f1,    X,      0x00
    GLD_INC f, d, 0x08, $f11, PA0, 0x00, $f12, PA1, 0x00, $f13, PA2, 0x00, $f14, PA3, 0x00
    GMADD f, d, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4, $f5, $f13, $f1, $f5, $f6, $f14, $f1, $f6
    PTR_ADDI  I,      I,      -1
    PTR_ADD   X,      X,      INC_X
    bnez      I,      .L_\XW\()_N_4_M_L1
.L_\XW\()_N_4_M_END:
    fld.d   $f11,   Y,  0x00
    fldx.d  $f12,   Y,  INC_Y
    PTR_ALSL    PY0, INC_Y,  Y,  1
    fld.d   $f13,   PY0,    0x00
    fldx.d  $f14,   PY0,    INC_Y

    // y[j] += alpha * sum[j] for these 4 columns
    GMADD f, d, $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12, $f13, ALPHA, $f5, $f13, $f14, ALPHA, $f6, $f14

    PTR_SLLI    K_LDA,  LDA,    2      // K_LDA = 4*lda - M*8: step to next columns
    PTR_SUB     K_LDA,  K_LDA,  M8

#if __loongarch_grlen == 64
    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
#else
    GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
#endif
    fst.d   $f11,   Y,      0x00
    fstx.d  $f12,   Y,      INC_Y
    fst.d   $f13,   PY0,    0x00
    fstx.d  $f14,   PY0,    INC_Y
    PTR_ALSL    Y,      INC_Y,  Y,  2  // y += 4 * inc_y (bytes)
.L_\XW\()_N_3:                         // ---- remaining group of 2 columns? ----
    andi        J,      N,      2
    beqz        J,      .L_\XW\()_N_1
    ZERO_Y2                            // clear accumulators TP0..TP1
    move        X,      X_ORG
    PTR_SRLI    I,      M,       3
    beqz        I,      .L_\XW\()_N_2_M_7
.align 5

.L_\XW\()_N_2_M_L8:                    // 8 rows x 2 columns
    DLOAD_\X8
    DGEMV_T_2x8
    PTR_ADDI  I,      I,      -1
    PTR_ALSL  X,      INC_X,  X,  3
    bnez      I,      .L_\XW\()_N_2_M_L8
.L_\XW\()_N_2_M_7:
    andi    I,      M,      4
    beqz    I,      .L_\XW\()_N_2_M_3
    DLOAD_\X4
    DGEMV_T_2x4
    PTR_ALSL  X,        INC_X,      X,      2
.L_\XW\()_N_2_M_3:
    // Horizontally reduce TP0..TP1 into Y0..Y1
    GACC xvf, d, Y0, TP0, Y1, TP1
    andi        I,      M,      3
    beqz        I,      .L_\XW\()_N_2_M_END
.align 5

.L_\XW\()_N_2_M_L1:                    // scalar tail, 2 columns
    fld.d   $f1,    X,      0x00
    GLD_INC f, d, 0x08, $f11, PA0, 0x00, $f12, PA1, 0x00
    GMADD f, d, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4
    PTR_ADDI  I,      I,      -1
    PTR_ADD   X,      X,      INC_X
    bnez      I,      .L_\XW\()_N_2_M_L1
.L_\XW\()_N_2_M_END:
    fld.d   $f11,   Y,  0x00
    fldx.d  $f12,   Y,  INC_Y

    // y[j] += alpha * sum[j] for these 2 columns
    GMADD f, d, $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12

    PTR_SLLI    K_LDA,  LDA,    1      // K_LDA = 2*lda - M*8
    PTR_SUB     K_LDA,  K_LDA,  M8

#if __loongarch_grlen == 64
    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA
#else
    GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA
#endif
    fst.d   $f11,   Y,      0x00
    fstx.d  $f12,   Y,      INC_Y
    PTR_ALSL        Y,      INC_Y,  Y,  1
.L_\XW\()_N_1:                         // ---- last single column? ----
    andi    J,      N,      1
    beqz    J,      .L_END
    ZERO_Y1
    move    X,      X_ORG
    move    I,      M
    beqz    I,      .L_END
.align 5

.L_\XW\()_N_1_M_L1:                    // plain scalar dot product over column 0
    fld.d   $f3,    PA0,    0x00
    fld.d   $f1,    X,      0x00
    fmadd.d $f19,   $f3,    $f1,    $f19
    PTR_ADDI  I,      I,      -1
    PTR_ADD   X,      X,      INC_X
    PTR_ADDI  PA0,    PA0,    0x08
    bnez      I,      .L_\XW\()_N_1_M_L1
    fld.d     $f3,    Y,      0x00
    fmadd.d   $f3,    ALPHA,  $f19,  $f3   // y[0] += alpha * dot
    fst.d     $f3,    Y,      0x00
    b .L_END
.endm
| 
 | ||||
    PROLOGUE
    // inc_y does not fit in the argument registers and is passed on the
    // stack — NOTE(review): confirm the exact stack slot against the C
    // kernel signature.
    PTR_LD     INC_Y,  $sp,    0
    push_if_used 17 + 8, 24 + 3        // spill 8 callee-saved GPRs + 3 FPRs
    PTR_ADDI   K,      $r0,     0x01
    PTR_SUB    I,      INC_X,   K
    maskeqz    I,      K,       I  /* if(inc_x == 1) I = 0; else I = 1; */

    // Convert LDA / INC_X / INC_Y from element counts to byte strides,
    // and precompute M8 = M * sizeof(double)
    GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3
    xvreplve0.d     VALPHA, $xr0       // broadcast alpha (arrives in $f0/$xr0)
    move     X_ORG,  X
    move     PA0,    A
    // PA0..PA7 point at 8 consecutive columns of A
#if __loongarch_grlen == 64
    GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
              PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
#else
    GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
              PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
#endif
    // Dispatch through a 2-entry .hword jump table: entry 0 when
    // inc_x == 1 (contiguous loads), entry 1 for the strided variant.
    la.local    T0,     .L_GAP_TABLE
    PTR_ALSL    I,      I,      T0,     1
    ld.h        K,      I,      0
    PTR_ADD     T0,     T0,     K
    jirl        $r0,    T0,     0
.L_GAP_TABLE:
    .hword .L_GAP_0 - .L_GAP_TABLE
    .hword .L_GAP_1 - .L_GAP_TABLE
.L_GAP_0: /* if (incx == 1) */
    DGEMV_T GAP_0, X8, X4
.L_GAP_1: /* if (incx != 1) */
    DGEMV_T GAP_1, X8_GAP, X4_GAP
.L_END:
    pop_if_used 17 + 8, 24 + 3
    jirl    $r0, $r1, 0x0
    EPILOGUE
|  | @ -0,0 +1,313 @@ | |||
| /******************************************************************************* | ||||
| Copyright (c) 2023, The OpenBLAS Project | ||||
| All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are | ||||
| met: | ||||
| 1. Redistributions of source code must retain the above copyright | ||||
| notice, this list of conditions and the following disclaimer. | ||||
| 2. Redistributions in binary form must reproduce the above copyright | ||||
| notice, this list of conditions and the following disclaimer in | ||||
| the documentation and/or other materials provided with the | ||||
| distribution. | ||||
| 3. Neither the name of the OpenBLAS project nor the names of | ||||
| its contributors may be used to endorse or promote products | ||||
| derived from this software without specific prior written permission. | ||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | ||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | ||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | ||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | ||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | ||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | ||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||||
| *******************************************************************************/ | ||||
| 
 | ||||
// Pointer-width abstraction: select 64-bit (grlen == 64) or 32-bit GPR
// instruction variants so the same kernel source assembles for both ABIs.
#if __loongarch_grlen == 64
#define LA_REG    int64_t
#define REG_SIZE  8
#define REG_LOG   3
#define PTR_ADDI  addi.d
#define PTR_ADD   add.d
#define PTR_SUB   sub.d
#define PTR_LD    ld.d
#define PTR_ST    st.d
#define PTR_SLLI  slli.d
#define PTR_SRLI  srli.d
#define PTR_ALSL  alsl.d
#else
#define LA_REG    int32_t
#define REG_SIZE  4
#define REG_LOG   2
#define PTR_ADDI  addi.w
#define PTR_ADD   add.w
#define PTR_SUB   sub.w
#define PTR_LD    ld.w
#define PTR_ST    st.w
#define PTR_SLLI  slli.w
#define PTR_SRLI  srli.w
#define PTR_ALSL  alsl.w
#endif
| 
 | ||||
// FP-register-width abstraction: double-precision spills (frlen == 64)
// or single-precision spills for the 32-bit FP register file.
#if __loongarch_frlen == 64
#define FREG_SIZE 8
#define FREG_LOG  3
#define PTR_FLD   fld.d
#define PTR_FST   fst.d
#else
#define FREG_SIZE 4
#define FREG_LOG  2
#define PTR_FLD   fld.s
#define PTR_FST   fst.s
#endif
| 
 | ||||
// The max registers available to the user which
// do not need to be preserved across calls.
// Ref: https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-CN.html
#define MAX_INT_CALLER_SAVED 17
#define MAX_FP_CALLER_SAVED  24

.altmacro // Enable alternate macro mode: required for the %expr argument
          // evaluation used by the recursive push_regs/pop_regs macros below
| 
 | ||||
// Allocate stack space and spill callee-saved registers ($s*/$fs*) only
// when a kernel uses more than the caller-saved set:
//   \regs  - total integer registers the kernel needs
//   \fregs - total FP registers the kernel needs
.macro push_if_used regs, fregs
.if \regs > MAX_INT_CALLER_SAVED
    PTR_ADDI      $sp,    $sp,    -((\regs - MAX_INT_CALLER_SAVED) << REG_LOG)
    push_regs 0, \regs - MAX_INT_CALLER_SAVED - 1
.endif
.if \fregs > MAX_FP_CALLER_SAVED
    PTR_ADDI      $sp,    $sp,    -((\fregs - MAX_FP_CALLER_SAVED) << FREG_LOG)
    push_fregs 0, \fregs - MAX_FP_CALLER_SAVED - 1
.endif
.endm // End push_if_used
// Mirror of push_if_used: restore the spilled registers (FP first, then
// integer — the reverse of the push order) and release the stack space.
.macro pop_if_used regs, fregs
.if \fregs > MAX_FP_CALLER_SAVED
    pop_fregs 0, \fregs - MAX_FP_CALLER_SAVED - 1
    PTR_ADDI      $sp,    $sp,    (\fregs - MAX_FP_CALLER_SAVED) << FREG_LOG
.endif
.if \regs > MAX_INT_CALLER_SAVED
    pop_regs 0, \regs - MAX_INT_CALLER_SAVED - 1
    PTR_ADDI      $sp,    $sp,    (\regs - MAX_INT_CALLER_SAVED) << REG_LOG
.endif
.endm // End pop_if_used
// Recursively store $s<from> .. $s<to> at $sp + index*REG_SIZE.
// .altmacro's %expr evaluates the next index at assembly time.
.macro push_regs from, to
    PTR_ST    $s\()\from,     $sp,    \from << REG_LOG
.if \to - \from
    push_regs %from + 1, \to
.endif
.endm // End push_regs
// Recursively reload $s<from> .. $s<to> from $sp + index*REG_SIZE.
.macro pop_regs from, to
    PTR_LD    $s\()\from,     $sp,    \from << REG_LOG
.if \to - \from
    pop_regs %from + 1, \to
.endif
.endm // End pop_regs
// Recursively store $fs<from> .. $fs<to> at $sp + index*FREG_SIZE.
.macro push_fregs from, to
    PTR_FST   $fs\()\from,    $sp,    \from << FREG_LOG
.if \to - \from
    push_fregs %from + 1, \to
.endif
.endm // End push_fregs
// Recursively reload $fs<from> .. $fs<to> from $sp + index*FREG_SIZE.
.macro pop_fregs from, to
    PTR_FLD   $fs\()\from,    $sp,    \from << FREG_LOG
.if \to - \from
    pop_fregs %from + 1, \to
.endif
.endm // End pop_fregs
| 
 | ||||
| // | ||||
| // Instruction Related Macros | ||||
| // | ||||
| // GLD | ||||
| // | ||||
// Batched load: for each (out, src, imm_offset) triple in the argument
// list, emit "<pre_op>ld[.<suf_op>] out, src, offset". An empty \suf_op
// (the "0" default) omits the size suffix. Recurses over \more.
.macro GLD pre_op:req, suf_op=0, out:req, src:req, offset:req/* imm */, more:vararg
.ifeqs "\suf_op", "0"
    \pre_op\()ld    \out,   \src,   \offset
.else
    \pre_op\()ld.\suf_op    \out,   \src,   \offset
.endif
.ifnb \more
    GLD \pre_op, \suf_op, \more
.endif
.endm
| 
 | ||||
| // | ||||
| // GLD_INC | ||||
| // | ||||
// Batched load with pointer post-increment: like GLD, but after every
// load the source pointer is advanced by the common immediate \inc.
// Note: \src is modified for each triple.
.macro GLD_INC pre_op:req, suf_op=0, inc:req, out:req, src:req, offset:req/* imm */, more:vararg
.ifeqs "\suf_op", "0"
    \pre_op\()ld    \out,   \src,   \offset
.else
    \pre_op\()ld.\suf_op    \out,   \src,   \offset
.endif
    PTR_ADDI  \src,   \src,   \inc
.ifnb \more
    GLD_INC \pre_op, \suf_op, \inc, \more
.endif
.endm
| // | ||||
| // GLDX is same as GLD except the stride is a register | ||||
| // | ||||
// GLDX is same as GLD except the offset is a register (indexed load,
// "<pre_op>ldx[.<suf_op>]").
.macro GLDX pre_op:req, suf_op=0, out:req, src:req, offset:req/* reg */, more:vararg
.ifeqs "\suf_op", "0"
    \pre_op\()ldx    \out,   \src,   \offset
.else
    \pre_op\()ldx.\suf_op    \out,   \src,   \offset
.endif
.ifnb \more
    GLDX \pre_op, \suf_op, \more
.endif
.endm
| // | ||||
| // GLDREPL | ||||
| // | ||||
// Batched load-and-replicate: "<pre_op>ldrepl.<suf_op> out, src, offset"
// for each (out, src, imm_offset) triple; broadcasts the loaded element
// to every lane of the vector register.
.macro GLDREPL pre_op:req, suf_op:req, out:req, src:req, offset:req/* imm */, more:vararg
    \pre_op\()ldrepl.\suf_op    \out,   \src,   \offset
.ifnb \more
    GLDREPL  \pre_op, \suf_op, \more
.endif
.endm
| // | ||||
| // GST | ||||
| // | ||||
// Batched store: for each (src, dst, imm_offset) triple emit
// "<pre_op>st[.<suf_op>] src, dst, offset". Counterpart of GLD.
.macro GST pre_op:req, suf_op=0, src:req, dst:req, offset:req/* imm */, more:vararg
.ifeqs "\suf_op", "0"
    \pre_op\()st    \src,   \dst,   \offset
.else
    \pre_op\()st.\suf_op \src,  \dst, \offset
.endif
.ifnb \more
    GST \pre_op, \suf_op, \more
.endif
.endm
| // | ||||
| // GMUL | ||||
| // | ||||
// Batched multiply: "<pre_op>mul.<suf_op> out, in0, in1" per triple.
.macro GMUL pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg
    \pre_op\()mul.\suf_op   \out,   \in0,   \in1
.ifnb \more
    GMUL \pre_op, \suf_op, \more
.endif
.endm
| // | ||||
| // GMADD | ||||
| // | ||||
// Batched fused multiply-add: "<pre_op>madd.<suf_op> out, in0, in1, in2"
// (out = in0 * in1 + in2) per (out, in0, in1, in2) quadruple.
.macro GMADD pre_op, suf_op:req, out:req, in0:req, in1:req, in2:req, more:vararg
    \pre_op\()madd.\suf_op \out, \in0, \in1, \in2
.ifnb \more
    GMADD \pre_op, \suf_op, \more
.endif
.endm
| // | ||||
| // GADD | ||||
| // | ||||
// Batched add: "<pre_op>add.<suf_op> out, in0, in1" per triple.
.macro GADD pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg
    \pre_op\()add.\suf_op \out, \in0, \in1
.ifnb \more
    GADD \pre_op, \suf_op, \more
.endif
.endm
| // | ||||
| // GADDI | ||||
| // | ||||
// Batched add-immediate: "<pre_op>addi.<suf_op> out, in0, imm" per triple.
.macro GADDI pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg
    \pre_op\()addi.\suf_op  \out,   \in0,   \in1
.ifnb \more
    GADDI \pre_op, \suf_op, \more
.endif
.endm
| // | ||||
| // GSLLI | ||||
| // | ||||
// Batched shift-left-logical-immediate: "<pre_op>slli.<suf_op> out, in0, imm".
.macro GSLLI pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg
    \pre_op\()slli.\suf_op  \out,   \in0,   \in1
.ifnb \more
    GSLLI \pre_op, \suf_op, \more
.endif
.endm
| // | ||||
| // GINSVE0 | ||||
| // | ||||
// Batched element insert: "<pre_op>insve0.<suf_op> out, in0, idx" per triple.
.macro GINSVE0 pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg
    \pre_op\()insve0.\suf_op    \out,   \in0,   \in1
.ifnb \more
    GINSVE0 \pre_op, \suf_op, \more
.endif
.endm
| // | ||||
| // GXOR | ||||
| // | ||||
// Batched xor: "<pre_op>xor.<suf_op> out, in0, in1" per triple
// (commonly used as GXOR x, v, r, r, r to zero a register).
.macro GXOR pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg
    \pre_op\()xor.\suf_op    \out,   \in0,   \in1
.ifnb \more
    GXOR \pre_op, \suf_op, \more
.endif
.endm
| 
 | ||||
| // | ||||
| // Compound instructions | ||||
| // | ||||
| // GACC: Accumulate the values of vector registers | ||||
| // | ||||
// GACC: horizontal reduction — sum the lanes of vector \in, leaving the
// total in element 0 of \out (\in is clobbered as scratch).
// \pre_op selects domain and width: "xvf"/"vf" = float 256/128-bit,
// "xv"/"v" = integer 256/128-bit; \suf_op is the element-size suffix.
// Strategy: for 256-bit inputs first fold the high 128 bits onto the low
// half (xvpermi.q), then repeatedly pack odd elements against the vector
// and add, halving the element stride each step. Repeats over (out, in)
// pairs in \more.
.macro GACC pre_op:req, suf_op:req, out:req, in:req, more:vararg
.ifeqs "\pre_op", "xvf"
    xvpermi.q              \out,   \in,    0x01
    \pre_op\()add.\suf_op  \in,    \out,   \in
    xvpackod.d             \out,   \in,    \in
    \pre_op\()add.\suf_op  \out,   \out,   \in
.ifeqs "\suf_op", "s"
    // single precision needs one more word-level fold
    xvpackod.w             \in,    \out,   \out
    \pre_op\()add.\suf_op  \out,   \out,   \in
.endif
.endif

.ifeqs "\pre_op", "vf"
    vpackod.d              \out,   \in,    \in
    \pre_op\()add.\suf_op  \out,   \out,   \in
.ifeqs "\suf_op", "s"
    vpackod.w              \in,    \out,   \out
    \pre_op\()add.\suf_op  \out,   \out,   \in
.endif
.endif

.ifeqs "\pre_op", "xv"
    // integer 256-bit: keep folding down to the element width (d/w/h/b)
    xvpermi.q              \out,   \in,    0x01
    \pre_op\()add.\suf_op  \in,    \out,   \in
    xvpackod.d             \out,   \in,    \in
    \pre_op\()add.\suf_op  \out,   \out,   \in
.ifnc "\suf_op", "d"
    xvpackod.w             \in,    \out,   \out
    \pre_op\()add.\suf_op  \out,   \out,   \in
.ifnc "\suf_op", "w"
    xvpackod.h             \in,    \out,   \out
    \pre_op\()add.\suf_op  \out,   \out,   \in
.ifnc "\suf_op", "h"
    xvpackod.b             \in,    \out,   \out
    \pre_op\()add.\suf_op  \out,   \out,   \in
.endif
.endif
.endif
.endif

.ifeqs "\pre_op", "v"
    // integer 128-bit variant of the fold above
    vpackod.d              \out,   \in,    \in
    \pre_op\()add.\suf_op  \out,   \out,   \in
.ifnc "\suf_op", "d"
    vpackod.w              \in,    \out,   \out
    \pre_op\()add.\suf_op  \out,   \out,   \in
.ifnc "\suf_op", "w"
    vpackod.h              \in,    \out,   \out
    \pre_op\()add.\suf_op  \out,   \out,   \in
.ifnc "\suf_op", "h"
    vpackod.b              \in,    \out,   \out
    \pre_op\()add.\suf_op  \out,   \out,   \in
.endif
.endif
.endif
.endif

.ifnb \more
    GACC \pre_op, \suf_op, \more
.endif
.endm
		Loading…
	
		Reference in New Issue