385 lines
		
	
	
		
			13 KiB
		
	
	
	
		
			C
		
	
	
	
			
		
		
	
	
			385 lines
		
	
	
		
			13 KiB
		
	
	
	
		
			C
		
	
	
	
| /***************************************************************************
 | |
| Copyright (c) 2013-2018, The OpenBLAS Project
 | |
| All rights reserved.
 | |
| Redistribution and use in source and binary forms, with or without
 | |
| modification, are permitted provided that the following conditions are
 | |
| met:
 | |
| 1. Redistributions of source code must retain the above copyright
 | |
| notice, this list of conditions and the following disclaimer.
 | |
| 2. Redistributions in binary form must reproduce the above copyright
 | |
| notice, this list of conditions and the following disclaimer in
 | |
| the documentation and/or other materials provided with the
 | |
| distribution.
 | |
| 3. Neither the name of the OpenBLAS project nor the names of
 | |
| its contributors may be used to endorse or promote products
 | |
| derived from this software without specific prior written permission.
 | |
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 | |
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 | |
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 | |
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 | |
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 | |
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 | |
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 | |
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 | |
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 | |
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 | |
|  *****************************************************************************/
 | |
| #include "common.h"
 | |
| #include <math.h>
 | |
| 
 | |
| #if defined(DOUBLE)
 | |
| 
 | |
| #define ABS fabs
 | |
| 
 | |
| #else
 | |
| 
 | |
| #define ABS fabsf
 | |
| 
 | |
| #endif
 | |
| 
 | |
| /**
 | |
|  * Find  minimum index 
 | |
|  * Warning: requirements n>0  and n % 32 == 0
 | |
|  * @param n     
 | |
|  * @param x     pointer to the vector
 | |
|  * @param minf  (out) minimum absolute value .( only for output )
 | |
|  * @return minimum index 
 | |
|  */
 | |
| static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
 | |
|     BLASLONG index;
 | |
|     register __vector long long start = {1,0};
 | |
|     register __vector long long temp_add_index = {2, 2}; 
 | |
|     __asm__(
 | |
|  
 | |
| 
 | |
|             "lxvd2x  44,      0,%[ptr_tmp] \n\t"
 | |
|             "lxvd2x  45, %[i16],%[ptr_tmp] \n\t"
 | |
|             "lxvd2x  46, %[i32],%[ptr_tmp] \n\t"
 | |
|             "lxvd2x  47, %[i48],%[ptr_tmp] \n\t" 
 | |
|             "lxvd2x  48, %[i64],%[ptr_tmp] \n\t"
 | |
|             "lxvd2x  49, %[i80],%[ptr_tmp] \n\t"
 | |
|             "lxvd2x  50, %[i96],%[ptr_tmp] \n\t"
 | |
|             "lxvd2x  51,%[i112],%[ptr_tmp] \n\t"
 | |
| 
 | |
|             "xxlor  40,%x[start],%x[start]  \n\t" //{ 1,0} vs40 | v8 
 | |
|             "vaddudm  9,8, %[adder]   \n\t" //{3,2} vs41 
 | |
|             "xxlxor  37,37 ,37        \n\t" //v5 v37 index_count
 | |
|             "vaddudm  10,9,%[adder]   \n\t" //{5,4} vs42
 | |
|             "xxlxor  38 ,38 ,38       \n\t"  // v6 | vs38 vec_min_index
 | |
|             "vaddudm  11,10,%[adder]  \n\t" //{7,6} vs43
 | |
|             "lxvdsx  39,0,%[ptr_minf] \n\t" //   vs39 vec_min_value
 | |
|             "vaddudm  4,11, %[adder]  \n\t" // {9,8} -{8;8} vs36 | v4
 | |
|             "xxspltd   36,36,0        \n\t"
 | |
|             "xvabsdp  39, 39 \n\t"             
 | |
|     
 | |
|             "xvabsdp  44, 44 \n\t"
 | |
|             "xvabsdp  45, 45 \n\t"
 | |
|             "xvabsdp  46, 46 \n\t"
 | |
|             "xvabsdp  47, 47 \n\t"                         
 | |
|             "xvabsdp  48, 48 \n\t"
 | |
|             "xvabsdp  49, 49 \n\t"
 | |
|             "xvabsdp  50, 50 \n\t"
 | |
|             "xvabsdp  51, 51 \n\t"
 | |
|   
 | |
|             //jump first half forward 
 | |
|             "b 2f  \n\t"
 | |
| 
 | |
| //===================================================================
 | |
| 
 | |
|             ".p2align   5            \n\t"
 | |
| 
 | |
|             "1: \n\t"
 | |
|             "xvcmpgtdp  2,44,45  \n\t "
 | |
|             "xvcmpgtdp  3,46,47  \n\t "
 | |
|             "xvcmpgtdp  4,48,49  \n\t "
 | |
|             "xvcmpgtdp  5,50,51  \n\t"
 | |
| 
 | |
|             "xxsel    32,40,41,2 \n\t"
 | |
|             "xxsel     0,44,45,2 \n\t" 
 | |
|             "xxsel    33,42,43,3 \n\t"
 | |
|             "xxsel     1,46,47,3 \n\t" 
 | |
|             "xxsel    34,40,41,4 \n\t"
 | |
|             "xxsel    45,48,49,4 \n\t"
 | |
|             "xxsel    35,42,43,5 \n\t"
 | |
|             "xxsel    47,50,51,5 \n\t"
 | |
| 
 | |
|             "xvcmpgtdp 2,0, 1     \n\t"
 | |
|             "xvcmpgtdp 3, 45,47   \n\t"
 | |
| 
 | |
|             "addi     %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t" 
 | |
|     
 | |
|             "xxsel     32,32,33,2 \n\t"
 | |
|             "xxsel       0 ,0,1,2 \n\t"
 | |
|             "xxsel     34,34,35,3 \n\t"
 | |
|             "xxsel     5,45,47,3 \n\t"
 | |
|  
 | |
|           
 | |
|             //load next 64
 | |
|             "lxvd2x  44,      0,%[ptr_tmp] \n\t"
 | |
|             "lxvd2x  45, %[i16],%[ptr_tmp] \n\t"
 | |
|     
 | |
|             // for {second 8 elements } we have to add 8 to each so that it became {from 8 to 16}
 | |
|             "vaddudm     2,2,4  \n\t" // vs34=vs34 + vs36{8,8} 
 | |
|     
 | |
|             "lxvd2x  46, %[i32],%[ptr_tmp] \n\t"
 | |
|             "lxvd2x  47, %[i48],%[ptr_tmp] \n\t" 
 | |
| 
 | |
|             //choose smaller from first and second part
 | |
|             "xvcmpgtdp 4, 0,5     \n\t" 
 | |
|             "xxsel     3, 0,5,4  \n\t"
 | |
|             "xxsel     33,32,34,4  \n\t"
 | |
| 
 | |
|             //load next 64
 | |
|             "lxvd2x  48, %[i64],%[ptr_tmp] \n\t"
 | |
|             "lxvd2x  49, %[i80],%[ptr_tmp] \n\t"
 | |
| 
 | |
|             "vaddudm  1,1,5  \n\t"  //  get real index for first smaller  
 | |
| 
 | |
|             "lxvd2x  50, %[i96],%[ptr_tmp] \n\t"
 | |
|             "lxvd2x  51,%[i112],%[ptr_tmp] \n\t"
 | |
| 
 | |
|             //compare with previous to get vec_min_index(v6 | vs38 ) and vec_min_value (vs39)   
 | |
|             "xvcmpgtdp 2,39, 3    \n\t"
 | |
|             "xxsel     39,39,3,2  \n\t"
 | |
|             "xxsel     38,38,33,2  \n\t"
 | |
|     
 | |
|             //update index += 8 
 | |
|             "vaddudm   5,5,4       \n\t"  
 | |
| 
 | |
|             "xvabsdp  44, 44 \n\t"
 | |
|             "xvabsdp  45, 45 \n\t"
 | |
|             "xvabsdp  46, 46 \n\t"
 | |
|             "xvabsdp  47, 47 \n\t"   
 | |
| 
 | |
|             //update index += 8 
 | |
|             "vaddudm   5,5,4       \n\t"  
 | |
| 
 | |
|             "xvabsdp  48, 48 \n\t"
 | |
|             "xvabsdp  49, 49 \n\t"
 | |
|             "xvabsdp  50, 50 \n\t"
 | |
|             "xvabsdp  51, 51 \n\t"
 | |
| 
 | |
| //<-----------jump here from first load
 | |
|              "2:                  \n\t"
 | |
|     
 | |
|             "xvcmpgtdp  2,44,45  \n\t "
 | |
|             "xvcmpgtdp  3,46,47  \n\t "
 | |
|             "xvcmpgtdp  4,48,49  \n\t "
 | |
|             "xvcmpgtdp  5,50,51  \n\t"
 | |
| 
 | |
|             "xxsel    32,40,41,2 \n\t"
 | |
|             "xxsel     0,44,45,2 \n\t" 
 | |
|             "xxsel    33,42,43,3 \n\t"
 | |
|             "xxsel     1,46,47,3 \n\t" 
 | |
|             "xxsel    34,40,41,4 \n\t"
 | |
|             "xxsel    45,48,49,4 \n\t"
 | |
|             "xxsel    35,42,43,5 \n\t"
 | |
|             "xxsel    47,50,51,5 \n\t"
 | |
| 
 | |
|             "xvcmpgtdp 2,0, 1     \n\t"
 | |
|             "xvcmpgtdp 3, 45,47   \n\t"
 | |
|             "xxsel     32,32,33,2 \n\t"
 | |
|             "xxsel       0 ,0,1,2 \n\t"
 | |
|             "xxsel     34,34,35,3 \n\t"
 | |
|             "xxsel     5,45,47,3 \n\t"
 | |
| 
 | |
|             "addi     %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t" 
 | |
|             // for {second 8 elements } we have to add 8 to each so that it became {from 8 to 16}
 | |
|             "vaddudm     2,2,4  \n\t" // vs34=vs34 + vs36{8,8} 
 | |
|           
 | |
|             //load next 64
 | |
|             "lxvd2x  44,      0,%[ptr_tmp] \n\t"
 | |
|             "lxvd2x  45, %[i16],%[ptr_tmp] \n\t"
 | |
|             "lxvd2x  46, %[i32],%[ptr_tmp] \n\t"
 | |
|             "lxvd2x  47, %[i48],%[ptr_tmp] \n\t" 
 | |
| 
 | |
|             //choose smaller from first and second part
 | |
|             "xvcmpgtdp 4, 0,5     \n\t" 
 | |
|             "xxsel     3, 0,5,4  \n\t"
 | |
|             "xxsel     33,32,34,4  \n\t"
 | |
| 
 | |
|             //load next 64
 | |
|             "lxvd2x  48, %[i64],%[ptr_tmp] \n\t"
 | |
|             "lxvd2x  49, %[i80],%[ptr_tmp] \n\t"
 | |
| 
 | |
|             "vaddudm  1,1,5  \n\t"  //  get real index for first smaller  
 | |
| 
 | |
|             "lxvd2x  50, %[i96],%[ptr_tmp] \n\t"
 | |
|             "lxvd2x  51,%[i112],%[ptr_tmp] \n\t"
 | |
| 
 | |
|  
 | |
| 
 | |
|             //compare with previous to get vec_min_index(v6 | vs38 ) and vec_min_value (vs39)   
 | |
|             "xvcmpgtdp 2,39, 3    \n\t"
 | |
|             "xxsel     39,39,3,2  \n\t"
 | |
|             "xxsel     38,38,33,2  \n\t"
 | |
|     
 | |
|             //update index += 8 
 | |
|             "vaddudm   5,5,4       \n\t"  
 | |
| 
 | |
|             "xvabsdp  44, 44 \n\t"
 | |
|             "xvabsdp  45, 45 \n\t"
 | |
|             "xvabsdp  46, 46 \n\t"
 | |
|             "xvabsdp  47, 47 \n\t"   
 | |
| 
 | |
|             //update index += 8 
 | |
|             "vaddudm   5,5,4       \n\t"  
 | |
| 
 | |
|             "xvabsdp  48, 48 \n\t"
 | |
|             "xvabsdp  49, 49 \n\t"
 | |
|             "xvabsdp  50, 50 \n\t"
 | |
|             "xvabsdp  51, 51 \n\t"
 | |
| 
 | |
|             //decrement n
 | |
|             "addic.    %[n], %[n], -32 \n\t"
 | |
|            
 | |
|             //Loop back if >0
 | |
|             "bgt+ 1b  \n\t"
 | |
| 
 | |
| //==============================================================================
 | |
| 
 | |
|             "xvcmpgtdp  2,44,45  \n\t "
 | |
|             "xvcmpgtdp  3,46,47  \n\t "
 | |
|             "xvcmpgtdp  4,48,49  \n\t "
 | |
|             "xvcmpgtdp  5,50,51  \n\t"
 | |
| 
 | |
|             "xxsel    32,40,41,2 \n\t"
 | |
|             "xxsel     0,44,45,2 \n\t" 
 | |
|             "xxsel    33,42,43,3 \n\t"
 | |
|             "xxsel     1,46,47,3 \n\t" 
 | |
|             "xxsel    34,40,41,4 \n\t"
 | |
|             "xxsel    45,48,49,4 \n\t"
 | |
|             "xxsel    35,42,43,5 \n\t"
 | |
|             "xxsel    47,50,51,5 \n\t"
 | |
| 
 | |
|             "xvcmpgtdp 2,0, 1     \n\t"
 | |
|             "xvcmpgtdp 3, 45,47   \n\t"
 | |
|   
 | |
|     
 | |
|             "xxsel     32,32,33,2 \n\t"
 | |
|             "xxsel       0 ,0,1,2 \n\t"
 | |
|             "xxsel     34,34,35,3 \n\t"
 | |
|             "xxsel     5,45,47,3 \n\t" 
 | |
|     
 | |
|             // for {second 8 elements } we have to add 8 to each so that it became {from 8 to 16}
 | |
|             "vaddudm     2,2,4  \n\t" // vs34=vs34 + vs36{8,8}  
 | |
|             //choose smaller from first and second part
 | |
|             "xvcmpgtdp 4, 0,5     \n\t" 
 | |
|             "xxsel     3, 0,5,4   \n\t"
 | |
|             "xxsel     33,32,34,4 \n\t" 
 | |
| 
 | |
|             "vaddudm  1,1,5  \n\t"  //  get real index for first smaller   
 | |
| 
 | |
|             //compare with previous to get vec_min_index(v6 | vs38 ) and vec_min_value (vs39)   
 | |
|             "xvcmpgtdp 2,39, 3    \n\t"
 | |
|             "xxsel     39,39,3,2  \n\t"
 | |
|             "xxsel     38,38,33,2  \n\t"
 | |
| 
 | |
|             ///////extract min value and min index from vector
 | |
| 
 | |
|             "xxspltd   32,38,1     \n\t"
 | |
|             "xxspltd   40,39,1     \n\t"
 | |
|             "xvcmpeqdp.  2, 40,39  \n\t"
 | |
|     
 | |
|             //cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely
 | |
|              //0b001110=14
 | |
|             "bc 14,24, 3f  \n\t" 
 | |
|             "xvcmpgtdp  4,39, 40  \n\t"
 | |
|             "xxsel    0,39,40,4           \n\t"
 | |
|             "xxsel    1,38,32,4  \n\t"
 | |
|             "stxsdx    0,0,%[ptr_minf]     \n\t" 
 | |
|             "b 4f    \n\t"
 | |
| 
 | |
|             "3:      \n\t" 
 | |
|                 //if elements value are equal then choose minimum index
 | |
|             "xxspltd  0,40,0          \n\t"
 | |
|             "vminud   0,0,6    \n\t"  //vs32 vs38
 | |
|             "xxlor 1,32,32     \n\t"
 | |
|             "stxsdx   0,0,%[ptr_minf]  \n\t"
 | |
|           
 | |
| 
 | |
|             "4:      \n\t"
 | |
|             "mfvsrd   %[index],1 \n\t"
 | |
| 
 | |
|             : [minf] "=m"(*minf),[ptr_tmp] "+&b"(x),[index] "=r"(index), [n] "+&r"(n)
 | |
|             : [mem] "m"(*(const double (*)[n])x), [ptr_x] "b"(x), [ptr_minf] "b"(minf) ,
 | |
|             [i16] "b"(16), [i32] "b"(32), [i48] "b"(48),
 | |
|             [i64] "b"(64), [i80] "b"(80), [i96] "b"(96), [i112] "b"(112),
 | |
|             [start] "v"(start),  [adder] "v"(temp_add_index)
 | |
|             : "cc", "vs0", "vs1","vs2","vs3", "vs4","vs5","vs32", "vs33", "vs34", "vs35", "vs36",
 | |
|             "vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs50", "vs51"
 | |
|             );
 | |
|  
 | |
|     return index;
 | |
| 
 | |
| }
 | |
| 
 | |
| 
 | |
| 
 | |
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
 | |
|     BLASLONG i = 0;
 | |
|     BLASLONG j = 0; 
 | |
|     BLASLONG min = 0;
 | |
|     FLOAT minf = 0.0;
 | |
|     
 | |
|     if (n <= 0 || inc_x <= 0) return (min);
 | |
|     minf = ABS(x[0]); //index's not incremented
 | |
|     if (inc_x == 1) {
 | |
| 
 | |
|         BLASLONG n1 = n & -32;
 | |
|         if (n1 > 0) {
 | |
| 
 | |
|             min = diamin_kernel_32(n1, x, &minf);
 | |
|             i = n1;
 | |
|         }
 | |
| 
 | |
|         while (i < n) {
 | |
|             if (ABS(x[i]) < minf) {
 | |
|                 min = i;
 | |
|                 minf = ABS(x[i]);
 | |
|             }
 | |
|             i++;
 | |
|         }
 | |
|         return (min + 1);
 | |
| 
 | |
|     } else {
 | |
| 
 | |
|         BLASLONG n1 = n & -4;
 | |
|         while (j < n1) {
 | |
| 
 | |
|             if (ABS(x[i]) < minf) {
 | |
|                 min = j;
 | |
|                 minf = ABS(x[i]);
 | |
|             }
 | |
|             if (ABS(x[i + inc_x]) < minf) {
 | |
|                 min = j + 1;
 | |
|                 minf = ABS(x[i + inc_x]);
 | |
|             }
 | |
|             if (ABS(x[i + 2 * inc_x]) < minf) {
 | |
|                 min = j + 2;
 | |
|                 minf = ABS(x[i + 2 * inc_x]);
 | |
|             }
 | |
|             if (ABS(x[i + 3 * inc_x]) < minf) {
 | |
|                 min = j + 3;
 | |
|                 minf = ABS(x[i + 3 * inc_x]);
 | |
|             }
 | |
| 
 | |
|             i += inc_x * 4;
 | |
| 
 | |
|             j += 4;
 | |
| 
 | |
|         }
 | |
| 
 | |
| 
 | |
|         while (j < n) {
 | |
|             if (ABS(x[i]) < minf) {
 | |
|                 min = j;
 | |
|                 minf = ABS(x[i]);
 | |
|             }
 | |
|             i += inc_x;
 | |
|             j++;
 | |
|         }
 | |
|         return (min + 1);
 | |
|     }
 | |
| }
 |