loongarch: Add optimization for dsdot kernel.
This commit is contained in:
		
							parent
							
								
									3def6a8143
								
							
						
					
					
						commit
						13b8c44b44
					
				| 
						 | 
				
			
			@ -1,7 +1,8 @@
 | 
			
		|||
ifndef NO_LASX
 | 
			
		||||
 | 
			
		||||
SDOTKERNEL = dot_lasx.S
 | 
			
		||||
DDOTKERNEL = dot_lasx.S
 | 
			
		||||
SDOTKERNEL  = dot_lasx.S
 | 
			
		||||
DSDOTKERNEL = dot_lasx.S
 | 
			
		||||
DDOTKERNEL  = dot_lasx.S
 | 
			
		||||
 | 
			
		||||
DGEMMKERNEL    = dgemm_kernel_16x4.S
 | 
			
		||||
DGEMMINCOPY    = dgemm_ncopy_16.S
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -51,6 +51,8 @@ PROLOGUE
 | 
			
		|||
    LDINT     INCX,   0(INCX)
 | 
			
		||||
    LDINT     INCY,   0(INCY)
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
    /*
     * Zero the scalar accumulators s1 and s2 ($f8 and $f9).
     *
     * The zero is moved in from the integer zero register instead of
     * being computed as "reg - reg": the incoming register contents are
     * arbitrary, and a floating-point subtract of Inf or NaN from itself
     * yields NaN rather than 0.  An all-zero bit pattern is +0.0 in both
     * single and double precision.
     */
    movgr2fr.d s1,    $r0
    movgr2fr.d s2,    $r0
 | 
			
		||||
    slli.d    INCX,   INCX,    BASE_SHIFT
 | 
			
		||||
| 
						 | 
				
			
			@ -59,11 +61,8 @@ PROLOGUE
 | 
			
		|||
    bge       $r0,    N,       .L999
 | 
			
		||||
    bne       INCX,   TEMP,    .L20   /* inc_x=1 */
 | 
			
		||||
    bne       INCY,   TEMP,    .L20   /* inc_y=1 */
 | 
			
		||||
#ifdef DOUBLE
 | 
			
		||||
    srai.d    I,      N,       4
 | 
			
		||||
#else
 | 
			
		||||
    srai.d    I,      N,       5
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
    /* !((inc_x == 1) && (inc_y == 1)) */
 | 
			
		||||
 | 
			
		||||
    /* init $xr8 and $xr9 to zero */
 | 
			
		||||
#ifdef DOUBLE
 | 
			
		||||
| 
						 | 
				
			
			@ -71,13 +70,24 @@ PROLOGUE
 | 
			
		|||
#else
 | 
			
		||||
    xvldrepl.w $xr0,  X,       0
 | 
			
		||||
#endif
 | 
			
		||||
/*
 * Clear the vector accumulators $xr8 and $xr9.
 *
 * A bitwise XOR of a register with itself is used for every precision
 * (FLOAT, DOUBLE and DSDOT alike).  The previous form computed
 * x[0] - x[0] from the broadcast first element of X, which evaluates to
 * NaN when x[0] is Inf or NaN and therefore poisoned both accumulators
 * before any product had been added.  All-zero bits are +0.0 for both
 * element widths, so no per-precision variant is needed.
 *
 * NOTE(review): the broadcast load of x[0] into $xr0 just above only fed
 * the old subtract and looks removable now -- confirm before deleting it.
 */
    xvxor.v   $xr8,   $xr8,    $xr8
    xvxor.v   $xr9,   $xr9,    $xr9
 | 
			
		||||
 | 
			
		||||
/* (inc_x == 1) && (inc_y == 1) */
 | 
			
		||||
    bge       $r0,    I,       .L12   /* <32 */
 | 
			
		||||
#ifdef DOUBLE
 | 
			
		||||
    srai.d    I,      N,       4
 | 
			
		||||
#else
 | 
			
		||||
    srai.d    I,      N,       5
 | 
			
		||||
#endif
 | 
			
		||||
    bge       $r0,    I,       .L12   /* FLOAT: <32 ; DOUBLE: <16 */
 | 
			
		||||
    .align  3
 | 
			
		||||
.L11:
 | 
			
		||||
    /* case 32~ */
 | 
			
		||||
    /* FLOAT: 32~ ; DOUBLE: 16~ */
 | 
			
		||||
    xvld      $xr0,   X,       0
 | 
			
		||||
    xvld      $xr1,   X,       32
 | 
			
		||||
    xvld      $xr2,   X,       64
 | 
			
		||||
| 
						 | 
				
			
			@ -89,11 +99,39 @@ PROLOGUE
 | 
			
		|||
    addi.w    I,      I,       -1
 | 
			
		||||
    addi.d    X,      X,       128
 | 
			
		||||
    addi.d    Y,      Y,       128
 | 
			
		||||
#ifdef DSDOT
    /*
     * DSDOT: single-precision inputs, double-precision accumulation.
     * Each 256-bit vector of floats is widened in two steps
     * (xvfcvtl.d.s for the low part, xvfcvth.d.s for the high part), and
     * each widened part of the first operand ($xr0..$xr3) is multiplied
     * by the matching part of the second operand ($xr4..$xr7) -- the same
     * register pairing the non-DSDOT branch below uses.
     *
     *   $xr10 = low(a)    $xr11 = low(b)
     *   $xr12 = high(a)   $xr13 = high(b)
     *   $xr8 += low(a)  * low(b)
     *   $xr9 += high(a) * high(b)
     *
     * The earlier version multiplied $xr10 by $xr12 and $xr11 by $xr13,
     * i.e. low(a)*high(a) and low(b)*high(b), which is not a dot product.
     */
    /* pair 0: $xr0 . $xr4 */
    xvfcvtl.d.s       $xr10,   $xr0
    xvfcvtl.d.s       $xr11,   $xr4
    xvfcvth.d.s       $xr12,   $xr0
    xvfcvth.d.s       $xr13,   $xr4
    xvfmadd.d $xr8,   $xr10,   $xr11,  $xr8
    xvfmadd.d $xr9,   $xr12,   $xr13,  $xr9
    /* pair 1: $xr1 . $xr5 */
    xvfcvtl.d.s       $xr10,   $xr1
    xvfcvtl.d.s       $xr11,   $xr5
    xvfcvth.d.s       $xr12,   $xr1
    xvfcvth.d.s       $xr13,   $xr5
    xvfmadd.d $xr8,   $xr10,   $xr11,  $xr8
    xvfmadd.d $xr9,   $xr12,   $xr13,  $xr9
    /* pair 2: $xr2 . $xr6 */
    xvfcvtl.d.s       $xr10,   $xr2
    xvfcvtl.d.s       $xr11,   $xr6
    xvfcvth.d.s       $xr12,   $xr2
    xvfcvth.d.s       $xr13,   $xr6
    xvfmadd.d $xr8,   $xr10,   $xr11,  $xr8
    xvfmadd.d $xr9,   $xr12,   $xr13,  $xr9
    /* pair 3: $xr3 . $xr7 */
    xvfcvtl.d.s       $xr10,   $xr3
    xvfcvtl.d.s       $xr11,   $xr7
    xvfcvth.d.s       $xr12,   $xr3
    xvfcvth.d.s       $xr13,   $xr7
    xvfmadd.d $xr8,   $xr10,   $xr11,  $xr8
    xvfmadd.d $xr9,   $xr12,   $xr13,  $xr9
#else
    /* Same-precision path: four fused multiply-adds, alternating between
     * the two accumulators $xr8 and $xr9. */
    XVFMADD   $xr8,   $xr0,    $xr4,   $xr8
    XVFMADD   $xr9,   $xr1,    $xr5,   $xr9
    XVFMADD   $xr8,   $xr2,    $xr6,   $xr8
    XVFMADD   $xr9,   $xr3,    $xr7,   $xr9
#endif
 | 
			
		||||
    bnez      I,      .L11
 | 
			
		||||
    .align  3
 | 
			
		||||
.L12:
 | 
			
		||||
#ifdef DOUBLE
 | 
			
		||||
    andi      I,      N,       0xf
 | 
			
		||||
| 
						 | 
				
			
			@ -102,18 +140,37 @@ PROLOGUE
 | 
			
		|||
    andi      I,      N,       0x1f
 | 
			
		||||
    srai.d    I,      I,       3
 | 
			
		||||
#endif
 | 
			
		||||
    bge       $r0,    I,       .L14   /* <8 */
 | 
			
		||||
    bge       $r0,    I,       .L14   /* DOUBLE: <4 ; FLOAT: <8 */
 | 
			
		||||
    .align  3
 | 
			
		||||
.L13:
 | 
			
		||||
    /* case 8~31 */
 | 
			
		||||
    /* FLOAT: 8~31 ; DOUBLE: 4~15 */
 | 
			
		||||
    xvld      $xr0,   X,       0
 | 
			
		||||
    xvld      $xr4,   Y,       0
 | 
			
		||||
    addi.w    I,      I,       -1
 | 
			
		||||
    addi.d    X,      X,       32
 | 
			
		||||
    addi.d    Y,      Y,       32
 | 
			
		||||
#ifdef DSDOT
    /*
     * DSDOT: widen the single-precision vectors loaded from X ($xr0) and
     * Y ($xr4) to double precision, then accumulate x*y.
     *
     *   $xr10 = low(x)    $xr11 = low(y)
     *   $xr12 = high(x)   $xr13 = high(y)
     *   $xr8 += low(x)  * low(y)
     *   $xr9 += high(x) * high(y)
     *
     * The earlier version multiplied $xr10 by $xr12 and $xr11 by $xr13,
     * i.e. x by x and y by y, which is not a dot product.
     */
    xvfcvtl.d.s       $xr10,   $xr0
    xvfcvtl.d.s       $xr11,   $xr4
    xvfcvth.d.s       $xr12,   $xr0
    xvfcvth.d.s       $xr13,   $xr4
    xvfmadd.d $xr8,   $xr10,   $xr11,  $xr8
    xvfmadd.d $xr9,   $xr12,   $xr13,  $xr9
#else
    /* Same-precision path: one fused multiply-add into $xr8. */
    XVFMADD   $xr8,   $xr0,    $xr4,   $xr8
#endif
 | 
			
		||||
    bnez      I,      .L13
 | 
			
		||||
    .align  3
 | 
			
		||||
.L14:
 | 
			
		||||
    /* store dot in s1 $f8 */
 | 
			
		||||
#ifdef DSDOT
 | 
			
		||||
    /* Merge the two double-precision vector accumulators into $xr8. */
    xvfadd.d  $xr8,   $xr8,    $xr9
    /*
     * Set s2 to 0.0 now that $xr9 has been consumed.  The zero comes
     * from the integer zero register: a single-precision "s2 - s2" would
     * operate on leftover bits of a double-precision partial sum, which
     * can decode as a float Inf/NaN and give NaN instead of 0.  (The old
     * line also carried a stray comma after its last operand.)
     */
    movgr2fr.d s2,    $r0
    /* Fold the upper 128-bit half of $xr8 onto the lower half. */
    xvpermi.q $xr0,   $xr8,    0x1
    vfadd.d   $vr8,   $vr8,    $vr0
    /* Fold the odd double onto the even one; the total ends up in
     * element 0 of $vr8. */
    vpackod.d $vr0,   $vr8,    $vr8
    vfadd.d   $vr8,   $vr8,    $vr0
 | 
			
		||||
#else
 | 
			
		||||
    XVFADD    $xr8,   $xr8,    $xr9
 | 
			
		||||
    SUB       s2,     s2,      s2   /* set s2 to 0.0 */
 | 
			
		||||
    xvpermi.q $xr0,   $xr8,    0x1
 | 
			
		||||
| 
						 | 
				
			
			@ -125,7 +182,9 @@ PROLOGUE
 | 
			
		|||
    VFADD     $vr8,   $vr8,    $vr0
 | 
			
		||||
    vpackod.w $vr0,   $vr8,    $vr8
 | 
			
		||||
    VFADD     $vr8,   $vr8,    $vr0
 | 
			
		||||
#endif
 | 
			
		||||
#endif /* defined DOUBLE */
 | 
			
		||||
#endif /* defined DSDOT */
 | 
			
		||||
    .align  3
 | 
			
		||||
.L15:
 | 
			
		||||
#ifdef DOUBLE
 | 
			
		||||
    andi      I,      N,       0x3
 | 
			
		||||
| 
						 | 
				
			
			@ -135,7 +194,7 @@ PROLOGUE
 | 
			
		|||
    bge       $r0,    I,       .L999  /* =0 */
 | 
			
		||||
   .align  3
 | 
			
		||||
.L16:
 | 
			
		||||
    /* case 1~7 */
 | 
			
		||||
    /* FLOAT: 1~7 ; DOUBLE: 1~3 */
 | 
			
		||||
    LD        a1,     X,       0
 | 
			
		||||
    LD        b1,     Y,       0
 | 
			
		||||
#ifdef DSDOT
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in New Issue