369 lines
		
	
	
		
			11 KiB
		
	
	
	
		
			C
		
	
	
	
			
		
		
	
	
			369 lines
		
	
	
		
			11 KiB
		
	
	
	
		
			C
		
	
	
	
| /***************************************************************************
 | |
| Copyright (c) 2017, The OpenBLAS Project
 | |
| All rights reserved.
 | |
| Redistribution and use in source and binary forms, with or without
 | |
| modification, are permitted provided that the following conditions are
 | |
| met:
 | |
| 1. Redistributions of source code must retain the above copyright
 | |
| notice, this list of conditions and the following disclaimer.
 | |
| 2. Redistributions in binary form must reproduce the above copyright
 | |
| notice, this list of conditions and the following disclaimer in
 | |
| the documentation and/or other materials provided with the
 | |
| distribution.
 | |
| 3. Neither the name of the OpenBLAS project nor the names of
 | |
| its contributors may be used to endorse or promote products
 | |
| derived from this software without specific prior written permission.
 | |
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 | |
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 | |
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 | |
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 | |
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 | |
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 | |
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 | |
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 | |
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 | |
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 | |
| *****************************************************************************/
 | |
|  
 | |
| 
 | |
| #include "common.h"
 | |
| #include <math.h>
 | |
|  
 | |
/* Absolute value used by CABS1; maps to the double-precision fabs(). */
#define ABS fabs
/*
 * CABS1(x, i): ABS(x[i]) + ABS(x[i+1]), i.e. the sum of the absolute values
 * of two consecutive entries of x starting at offset i.
 * The expansion and both arguments are parenthesized so the macro behaves as
 * a single operand inside larger expressions (e.g. 2 * CABS1(x, i)) and
 * accepts expression arguments.  Note that i is still evaluated twice, so it
 * must not have side effects.
 */
#define CABS1(x,i)    (ABS((x)[(i)]) + ABS((x)[(i) + 1]))
 | |
| 
 | |
| 
 | |
| 
 | |
| #if defined(__VEC__) || defined(__ALTIVEC__)
 | |
|  
 | |
| /**
 | |
|  * VSX inline-assembly kernel that scans x for the entry with the largest
 | |
|  * sum of absolute values of its two consecutive doubles (xvabsdp followed
 | |
|  * by xvadddp on the merged halves below).
 | |
|  * Each pass consumes eight 16-byte vectors (128 bytes); the loop body is
 | |
|  * unrolled twice and n is decremented by 16 per trip, hence the
 | |
|  * requirement on n.
 | |
|  * Comparisons against the running maximum are strict (xvcmpgtdp), and when
 | |
|  * the two lanes end up with the same maximum the smaller index is chosen
 | |
|  * (label "three").
 | |
|  * Warning: requirements n>0  and n % 16 == 0
 | |
|  * @param n     number of entries to scan (see the warning above)
 | |
|  * @param x     pointer to the vector
 | |
|  * @param maxf  (out) maximum absolute value; only written, never read
 | |
|  * @return  index of the selected entry; the index vectors start at {1,0},
 | |
|  *          so the value is zero-based (the caller adds 1 to it)
 | |
|  */
 | |
| static BLASLONG   ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) { 
 | |
| 
 | |
|     BLASLONG index;
 | |
|     register __vector long long start = {1,0};
 | |
|     register __vector long long temp_add_index = {2, 2}; 
 | |
|    __asm__(
 | |
| 
 | |
|             "lxvd2x  44,      0,%[ptr_tmp] \n\t"
 | |
|             "lxvd2x  45, %[i16],%[ptr_tmp] \n\t"
 | |
|             "lxvd2x  46, %[i32],%[ptr_tmp] \n\t"
 | |
|             "lxvd2x  47, %[i48],%[ptr_tmp] \n\t"
 | |
|             "lxvd2x  48, %[i64],%[ptr_tmp] \n\t"
 | |
|             "lxvd2x  49, %[i80],%[ptr_tmp] \n\t"  
 | |
|             "lxvd2x  6, %[i96],%[ptr_tmp] \n\t"
 | |
|             "lxvd2x  7,%[i112],%[ptr_tmp] \n\t"      
 | |
| 
 | |
|             "xxlor  40,%x[start],%x[start]  \n\t" // vs40 (= v8) <- start = {1,0}: index pair for the first compared pair
 | |
|             "vaddudm  9,8,%[adder]   \n\t" // v9 (= vs41) = {3,2}
 | |
|             "xxlxor  37,37 ,37       \n\t" // v5 (= vs37): index_count, the running index offset, starts at 0
 | |
|             "vaddudm  10,9,%[adder]  \n\t" // v10 (= vs42) = {5,4}
 | |
|             "xxlxor  38 ,38 ,38      \n\t"  // v6 (= vs38): vec_max_index, starts at 0
 | |
|             "vaddudm  11,10,%[adder] \n\t" // v11 (= vs43) = {7,6}
 | |
|             "xxlxor  39,39,39        \n\t" // vs39: vec_max_value, starts at 0
 | |
|             "vaddudm 4,11,  %[adder] \n\t" // v4 (= vs36) = {9,8}, turned into {8,8} by the XXSPLTD_S below: per-pass step added to v5
 | |
|             XXSPLTD_S(36,36,0)
 | |
| 
 | |
| 
 | |
| 
 | |
|             "xvabsdp  44, 44 \n\t"
 | |
|             "xvabsdp  45, 45 \n\t" 
 | |
|             "xvabsdp  46, 46 \n\t"
 | |
|             "xvabsdp  47, 47 \n\t"   
 | |
|             "xvabsdp  48, 48 \n\t"
 | |
|             "xvabsdp  49, 49 \n\t"   
 | |
|             "xvabsdp  6, 6 \n\t"
 | |
|             "xvabsdp  7, 7 \n\t"
 | |
| 
 | |
|             // the first eight vectors are already loaded and made absolute, so enter the unrolled loop at its second half
 | |
|             "b two%=  \n\t"
 | |
| 
 | |
|             ".align   5            \n\t"
 | |
|             "one%=: \n\t"
 | |
| 
 | |
|  
 | |
|             XXMRGHD_S(0,44,45)
 | |
|             XXMRGLD_S(1,44,45)
 | |
|             XXMRGHD_S(2,46,47)
 | |
|             XXMRGLD_S(3,46,47)
 | |
|             XXMRGHD_S(4,48,49)
 | |
|             XXMRGLD_S(5,48,49)
 | |
|             XXMRGHD_S(44,6,7)
 | |
|             XXMRGLD_S(45,6,7)
 | |
| 
 | |
|             "xvadddp    46,  0,1 \n\t"
 | |
|             "xvadddp    47,  2,3 \n\t" 
 | |
|             "xvadddp    48,  4,5 \n\t"
 | |
|             "xvadddp    49,  44,45 \n\t"
 | |
| 
 | |
| 
 | |
| 
 | |
|             "xvcmpgtdp  6,47,46  \n\t "
 | |
|             "xvcmpgtdp  7,49,48  \n\t "
 | |
| 
 | |
|             "addi     %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t"   
 | |
| 
 | |
|             "xxsel    32,40,41,6 \n\t"
 | |
|             "xxsel     0,46,47,6 \n\t" 
 | |
|             "xxsel    33,42,43,7 \n\t"
 | |
|             "xxsel     1,48,49,7 \n\t"  
 | |
| 
 | |
|             "lxvd2x  44,      0,%[ptr_tmp] \n\t"
 | |
|             "lxvd2x  45, %[i16],%[ptr_tmp] \n\t"
 | |
| 
 | |
|             "xvcmpgtdp  2,1,0  \n\t "             
 | |
|             "lxvd2x  46, %[i32],%[ptr_tmp] \n\t"
 | |
|             "lxvd2x  47, %[i48],%[ptr_tmp] \n\t"
 | |
| 
 | |
| 
 | |
|             "xxsel    32,32,33,2 \n\t" 
 | |
|             "xxsel    3,0,1,2 \n\t" 
 | |
|      
 | |
|             "vaddudm   0,0,5       \n\t" // v0 (= vs32): add the running offset v5 to the in-pass indices
 | |
| 
 | |
|              // compare this pass's maximum (vs3) with the running maximum (vs39); the mask goes to vs4, not v4
 | |
| 
 | |
|             "xvcmpgtdp 4,3,39     \n\t "  
 | |
|             "vaddudm   5,5,4      \n\t"     // v5 += v4 (= vs36): advance the running offset for the next pass
 | |
| 
 | |
|             "lxvd2x  48, %[i64],%[ptr_tmp] \n\t"
 | |
|             "lxvd2x  49, %[i80],%[ptr_tmp] \n\t"  
 | |
|             "lxvd2x  6, %[i96],%[ptr_tmp] \n\t"
 | |
|             "lxvd2x  7,%[i112],%[ptr_tmp] \n\t"        
 | |
|              // keep the running index (vs38) and value (vs39) unless the new value is strictly greater
 | |
|             "xxsel     38,38,32,4 \n\t" 
 | |
|             "xxsel    39,39,3,4    \n\t" 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
|             "xvabsdp  44, 44 \n\t"
 | |
|             "xvabsdp  45, 45 \n\t" 
 | |
|             "xvabsdp  46, 46 \n\t"
 | |
|             "xvabsdp  47, 47 \n\t"   
 | |
|             "xvabsdp  48, 48 \n\t"
 | |
|             "xvabsdp  49, 49 \n\t"   
 | |
|             "xvabsdp  6, 6 \n\t"
 | |
|             "xvabsdp  7, 7 \n\t"
 | |
| 
 | |
| 
 | |
| // second half of the unrolled loop body; also the entry point on the very first pass
 | |
|             "two%=:   \n\t"
 | |
|             XXMRGHD_S(0,44,45)
 | |
|             XXMRGLD_S(1,44,45)
 | |
|             XXMRGHD_S(2,46,47)
 | |
|             XXMRGLD_S(3,46,47)
 | |
|             XXMRGHD_S(4,48,49)
 | |
|             XXMRGLD_S(5,48,49)
 | |
|             XXMRGHD_S(44,6,7)
 | |
|             XXMRGLD_S(45,6,7)
 | |
| 
 | |
|             "xvadddp    46,  0,1 \n\t"
 | |
|             "xvadddp    47,  2,3 \n\t" 
 | |
|             "xvadddp    48,  4,5 \n\t"
 | |
|             "xvadddp    49,  44,45 \n\t"
 | |
| 
 | |
|             "xvcmpgtdp  6,47,46  \n\t "
 | |
|             "xvcmpgtdp  7,49,48  \n\t "
 | |
| 
 | |
|             "addi     %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t"   
 | |
| 
 | |
|             "xxsel    32,40,41,6 \n\t"
 | |
|             "xxsel     0,46,47,6 \n\t" 
 | |
|             "xxsel    33,42,43,7 \n\t"
 | |
|             "xxsel     1,48,49,7 \n\t"  
 | |
| 
 | |
|             "lxvd2x  44,      0,%[ptr_tmp] \n\t"
 | |
|             "lxvd2x  45, %[i16],%[ptr_tmp] \n\t"
 | |
| 
 | |
|             "xvcmpgtdp  2,1,0  \n\t "             
 | |
|             "lxvd2x  46, %[i32],%[ptr_tmp] \n\t"
 | |
|             "lxvd2x  47, %[i48],%[ptr_tmp] \n\t"
 | |
| 
 | |
| 
 | |
|             "xxsel    32,32,33,2 \n\t" 
 | |
|             "xxsel    3,0,1,2 \n\t" 
 | |
|      
 | |
|             "vaddudm   0,0,5       \n\t" // v0 (= vs32): add the running offset v5 to the in-pass indices
 | |
| 
 | |
|              // compare this pass's maximum (vs3) with the running maximum (vs39); the mask goes to vs4, not v4
 | |
| 
 | |
|             "xvcmpgtdp 4,3,39     \n\t "  
 | |
|             "vaddudm   5,5,4      \n\t"     // v5 += v4 (= vs36): advance the running offset for the next pass
 | |
| 
 | |
|             "lxvd2x  48, %[i64],%[ptr_tmp] \n\t"
 | |
|             "lxvd2x  49, %[i80],%[ptr_tmp] \n\t"  
 | |
|             "lxvd2x  6, %[i96],%[ptr_tmp] \n\t"
 | |
|             "lxvd2x  7,%[i112],%[ptr_tmp] \n\t"        
 | |
|              // keep the running index (vs38) and value (vs39) unless the new value is strictly greater
 | |
|             "xxsel     38,38,32,4 \n\t" 
 | |
|             "xxsel    39,39,3,4    \n\t" 
 | |
|  
 | |
| 
 | |
|             "xvabsdp  44, 44 \n\t"
 | |
|             "xvabsdp  45, 45 \n\t" 
 | |
|             "xvabsdp  46, 46 \n\t"
 | |
|             "xvabsdp  47, 47 \n\t"   
 | |
|             "xvabsdp  48, 48 \n\t"
 | |
|             "xvabsdp  49, 49 \n\t"   
 | |
|             "xvabsdp  6, 6 \n\t"
 | |
|             "xvabsdp  7, 7 \n\t"
 | |
| 
 | |
| 
 | |
|             // decrement n by the 16 entries handled per full loop trip (the record form also sets CR0)
 | |
|             "addic.    %[n], %[n], -16 \n\t"
 | |
|             // loop back while n > 0; the eight vectors loaded last are handled by the tail below
 | |
|             "bgt+ one%=  \n\t"
 | |
| 
 | |
| 
 | |
|             XXMRGHD_S(0,44,45)
 | |
|             XXMRGLD_S(1,44,45)
 | |
|             XXMRGHD_S(2,46,47)
 | |
|             XXMRGLD_S(3,46,47)
 | |
|             XXMRGHD_S(4,48,49)
 | |
|             XXMRGLD_S(5,48,49)
 | |
|             XXMRGHD_S(44,6,7)
 | |
|             XXMRGLD_S(45,6,7)
 | |
| 
 | |
|             "xvadddp    46,  0,1 \n\t"
 | |
|             "xvadddp    47,  2,3 \n\t" 
 | |
|             "xvadddp    48,  4,5 \n\t"
 | |
|             "xvadddp    49,  44,45 \n\t"
 | |
| 
 | |
| 
 | |
| 
 | |
|             "xvcmpgtdp  6,47,46  \n\t "
 | |
|             "xvcmpgtdp  7,49,48  \n\t "
 | |
| 
 | |
|             "xxsel    32,40,41,6 \n\t"
 | |
|             "xxsel     0,46,47,6 \n\t" 
 | |
|             "xxsel    33,42,43,7 \n\t"
 | |
|             "xxsel     1,48,49,7 \n\t"  
 | |
| 
 | |
|             "xvcmpgtdp  2,1,0  \n\t " 
 | |
|             "xxsel    32,32,33,2 \n\t" 
 | |
|             "xxsel    3,0,1,2 \n\t" 
 | |
|      
 | |
|             "vaddudm   0,0,5       \n\t" // v0 (= vs32): add the running offset v5 to the in-pass indices
 | |
| 
 | |
|             "addi     %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t"            
 | |
|              // compare this pass's maximum (vs3) with the running maximum (vs39); the mask goes to vs4, not v4
 | |
| 
 | |
|             "xvcmpgtdp 4,3,39     \n\t "  
 | |
|             "vaddudm   5,5,4      \n\t"     
 | |
|             "xxsel     38,38,32,4 \n\t" 
 | |
|             "xxsel    39,39,3,4    \n\t" 
 | |
| 
 | |
| 
 | |
|             // reduce the two lanes of vs39 (values) and vs38 (indices) to a single value/index pair
 | |
| 
 | |
|             XXSPLTD_S(32,38,1)
 | |
|             XXSPLTD_S(40,39,1)
 | |
|             "xvcmpeqdp.  2, 40,39  \n\t"
 | |
|     
 | |
|             // the record form sets CR6 bit 0 (CR bit 4*6+0 = 24) when all lanes compare equal; BO = 0b01110 branches when that bit is 1, at=10 hints that this occurs rarely
 | |
|              // 0b01110 = 14
 | |
|             "bc 14,24, three%=  \n\t" 
 | |
|             "xvcmpgtdp  4, 40,39  \n\t"
 | |
|             "xxsel    0,39,40,4           \n\t"
 | |
|             "xxsel    1,38,32,4  \n\t"
 | |
|             "stxsdx    0,0,%[ptr_maxf]     \n\t" 
 | |
|             "b four%=    \n\t"
 | |
| 
 | |
|             "three%=:      \n\t" 
 | |
|                 // both lanes hold the same maximum value: choose the smaller of the two indices
 | |
|             XXSPLTD_S(0,40,0)
 | |
|             "vminud   0,0,6    \n\t"  // v0 = vs32, v6 = vs38: unsigned minimum of the candidate indices
 | |
|             "xxlor 1,32,32     \n\t"
 | |
|             "stxsdx   0,0,%[ptr_maxf]  \n\t"
 | |
|           
 | |
| 
 | |
|             "four%=:      \n\t"
 | |
|             "mfvsrd   %[index],1 \n\t"
 | |
| 
 | |
|             : [maxf] "=m"(*maxf),[ptr_tmp] "+&b"(x),[index] "=r"(index), [n] "+&r"(n) // outputs; [maxf] is not referenced in the template (the store goes through [ptr_maxf]), presumably it only declares the write to *maxf
 | |
|             : [mem] "m"(*(const double (*)[2*n])x), [ptr_x] "b"(x), [ptr_maxf] "b"(maxf) , // inputs; [mem] and [ptr_x] are not referenced in the template, [mem] presumably declares the 2*n doubles read through x
 | |
|             [i16] "b"(16), [i32] "b"(32), [i48] "b"(48),
 | |
|             [i64] "b"(64), [i80] "b"(80), [i96] "b"(96), [i112] "b"(112),
 | |
|             [start] "v"(start),  [adder] "v"(temp_add_index)
 | |
|             : "cc", "vs0", "vs1","vs2","vs3", "vs4","vs5","vs32", "vs33", "vs34", "vs35", "vs36",
 | |
|             "vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs6", "vs7"
 | |
|             );
 | |
|  
 | |
|     return index;
 | |
| 
 | |
| }
 | |
|  
 | |
| #endif  
 | |
| 
 | |
|  
 | |
|  
 | |
| 
 | |
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 | |
| {
 | |
|     BLASLONG i = 0;
 | |
|     BLASLONG ix = 0;
 | |
|     FLOAT maxf = 0;
 | |
|     BLASLONG max = 0;
 | |
|     BLASLONG inc_x2;
 | |
| 
 | |
|     if (n <= 0 || inc_x <= 0) return(max);
 | |
|      
 | |
|     if (inc_x == 1) {
 | |
| 
 | |
| #if defined(_CALL_ELF) && (_CALL_ELF == 2)
 | |
| #if defined(__VEC__) || defined(__ALTIVEC__)
 | |
| 
 | |
|       BLASLONG n1 = n & -16;
 | |
|       if (n1 > 0) {
 | |
| 
 | |
|             max = ziamax_kernel_16(n1, x, &maxf); 
 | |
|             i = n1;
 | |
|             ix = n1 << 1;
 | |
|       }
 | |
| #endif
 | |
| #endif
 | |
| 
 | |
|       while(i < n)
 | |
|     {
 | |
|         if( CABS1(x,ix) > maxf )
 | |
|         {
 | |
|             max = i;
 | |
|             maxf = CABS1(x,ix);
 | |
|         }
 | |
|         ix += 2;
 | |
|         i++;
 | |
|     }
 | |
|         return (max + 1);
 | |
| 
 | |
|     } else {
 | |
|  
 | |
|       inc_x2 = 2 * inc_x;
 | |
| 
 | |
|     maxf = CABS1(x,0);
 | |
|     ix += inc_x2;
 | |
|     i++;
 | |
| 
 | |
|     while(i < n)
 | |
|     {
 | |
|         if( CABS1(x,ix) > maxf )
 | |
|         {
 | |
|             max = i;
 | |
|             maxf = CABS1(x,ix);
 | |
|         }
 | |
|         ix += inc_x2;
 | |
|         i++;
 | |
|     }
 | |
|         return (max + 1);
 | |
|     }
 | |
|  
 | |
| }
 | |
| 
 | |
| 
 |