366 lines
11 KiB
C
366 lines
11 KiB
C
/***************************************************************************
|
|
Copyright (c) 2017, The OpenBLAS Project
|
|
All rights reserved.
|
|
Redistribution and use in source and binary forms, with or without
|
|
modification, are permitted provided that the following conditions are
|
|
met:
|
|
1. Redistributions of source code must retain the above copyright
|
|
notice, this list of conditions and the following disclaimer.
|
|
2. Redistributions in binary form must reproduce the above copyright
|
|
notice, this list of conditions and the following disclaimer in
|
|
the documentation and/or other materials provided with the
|
|
distribution.
|
|
3. Neither the name of the OpenBLAS project nor the names of
|
|
its contributors may be used to endorse or promote products
|
|
derived from this software without specific prior written permission.
|
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
*****************************************************************************/
|
|
|
|
#include "common.h"
|
|
#include <math.h>
|
|
|
|
/* Absolute value of one real component. */
#define ABS fabs

/*
 * Sum of the absolute values of the two consecutive entries x[i] and x[i+1]
 * (used in this file as |re| + |im| of one interleaved complex element).
 * Arguments and the whole expansion are parenthesized so the macro can be
 * used safely with expression arguments and inside larger expressions.
 */
#define CABS1(x,i) (ABS((x)[(i)]) + ABS((x)[(i) + 1]))
|
|
|
|
#if defined(__VEC__) || defined(__ALTIVEC__)
|
|
|
|
/**
|
|
* Find minimum index
|
|
* Warning: requirements n>0 and n % 16 == 0
|
|
* @param n
|
|
* @param x pointer to the vector
|
|
* @param minf (out) minimum absolute value .( only for output )
|
|
* @return minimum index
|
|
*/
|
|
static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
|
|
|
|
BLASLONG index;
|
|
register __vector long long start = {1,0};
|
|
register __vector long long temp_add_index = {2, 2};
|
|
__asm__(
|
|
|
|
"lxvd2x 44, 0,%[ptr_tmp] \n\t"
|
|
"lxvd2x 45, %[i16],%[ptr_tmp] \n\t"
|
|
"lxvd2x 46, %[i32],%[ptr_tmp] \n\t"
|
|
"lxvd2x 47, %[i48],%[ptr_tmp] \n\t"
|
|
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
|
|
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
|
|
"lxvd2x 6, %[i96],%[ptr_tmp] \n\t"
|
|
"lxvd2x 7,%[i112],%[ptr_tmp] \n\t"
|
|
|
|
"xxlor 40,%x[start],%x[start] \n\t" //{ 1,0} vs40 | v8
|
|
"vaddudm 9,8,%[adder] \n\t" //{3,2} vs41
|
|
"xxlxor 37,37 ,37 \n\t" //v5 v37 index_count
|
|
"vaddudm 10,9,%[adder] \n\t" //{5,4} vs42
|
|
"xxlxor 38 ,38 ,38 \n\t" // v6 | vs38 vec_min_index
|
|
"vaddudm 11,10,%[adder] \n\t" //{7,6} vs43
|
|
"lxvdsx 39,0,%[ptr_minf] \n\t" // vs39 vec_min_value
|
|
"vaddudm 4,11, %[adder] \n\t" // {9,8} -{8;8} vs36 | v4
|
|
XXSPLTD_S(36,36,0)
|
|
|
|
|
|
|
|
"xvabsdp 44, 44 \n\t"
|
|
"xvabsdp 45, 45 \n\t"
|
|
"xvabsdp 46, 46 \n\t"
|
|
"xvabsdp 47, 47 \n\t"
|
|
"xvabsdp 48, 48 \n\t"
|
|
"xvabsdp 49, 49 \n\t"
|
|
"xvabsdp 6, 6 \n\t"
|
|
"xvabsdp 7, 7 \n\t"
|
|
|
|
//jump first half forward
|
|
"b two%= \n\t"
|
|
|
|
".align 5 \n\t"
|
|
"one%=: \n\t"
|
|
|
|
|
|
XXMRGHD_S(0,44,45)
|
|
XXMRGLD_S(1,44,45)
|
|
XXMRGHD_S(2,46,47)
|
|
XXMRGLD_S(3,46,47)
|
|
XXMRGHD_S(4,48,49)
|
|
XXMRGLD_S(5,48,49)
|
|
XXMRGHD_S(44,6,7)
|
|
XXMRGLD_S(45,6,7)
|
|
|
|
"xvadddp 46, 0,1 \n\t"
|
|
"xvadddp 47, 2,3 \n\t"
|
|
"xvadddp 48, 4,5 \n\t"
|
|
"xvadddp 49, 44,45 \n\t"
|
|
|
|
|
|
|
|
"xvcmpgtdp 6,46,47 \n\t "
|
|
"xvcmpgtdp 7,48,49 \n\t "
|
|
|
|
"addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t"
|
|
|
|
"xxsel 32,40,41,6 \n\t"
|
|
"xxsel 0,46,47,6 \n\t"
|
|
"xxsel 33,42,43,7 \n\t"
|
|
"xxsel 1,48,49,7 \n\t"
|
|
|
|
"lxvd2x 44, 0,%[ptr_tmp] \n\t"
|
|
"lxvd2x 45, %[i16],%[ptr_tmp] \n\t"
|
|
|
|
"xvcmpgtdp 2,0,1 \n\t "
|
|
"lxvd2x 46, %[i32],%[ptr_tmp] \n\t"
|
|
"lxvd2x 47, %[i48],%[ptr_tmp] \n\t"
|
|
|
|
|
|
"xxsel 32,32,33,2 \n\t"
|
|
"xxsel 3,0,1,2 \n\t"
|
|
|
|
"vaddudm 0,0,5 \n\t"
|
|
|
|
//cmp with previous
|
|
|
|
"xvcmpgtdp 4,39,3 \n\t "
|
|
"vaddudm 5,5,4 \n\t"
|
|
|
|
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
|
|
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
|
|
"lxvd2x 6, %[i96],%[ptr_tmp] \n\t"
|
|
"lxvd2x 7,%[i112],%[ptr_tmp] \n\t"
|
|
//select with previous
|
|
"xxsel 38,38,32,4 \n\t"
|
|
"xxsel 39,39,3,4 \n\t"
|
|
|
|
|
|
|
|
|
|
"xvabsdp 44, 44 \n\t"
|
|
"xvabsdp 45, 45 \n\t"
|
|
"xvabsdp 46, 46 \n\t"
|
|
"xvabsdp 47, 47 \n\t"
|
|
"xvabsdp 48, 48 \n\t"
|
|
"xvabsdp 49, 49 \n\t"
|
|
"xvabsdp 6, 6 \n\t"
|
|
"xvabsdp 7, 7 \n\t"
|
|
|
|
|
|
//>>/////////////////////////////// half start
|
|
"two%=: \n\t"
|
|
XXMRGHD_S(0,44,45)
|
|
XXMRGLD_S(1,44,45)
|
|
XXMRGHD_S(2,46,47)
|
|
XXMRGLD_S(3,46,47)
|
|
XXMRGHD_S(4,48,49)
|
|
XXMRGLD_S(5,48,49)
|
|
XXMRGHD_S(44,6,7)
|
|
XXMRGLD_S(45,6,7)
|
|
|
|
"xvadddp 46, 0,1 \n\t"
|
|
"xvadddp 47, 2,3 \n\t"
|
|
"xvadddp 48, 4,5 \n\t"
|
|
"xvadddp 49, 44,45 \n\t"
|
|
|
|
"xvcmpgtdp 6,46,47 \n\t "
|
|
"xvcmpgtdp 7,48,49 \n\t "
|
|
|
|
"addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t"
|
|
|
|
"xxsel 32,40,41,6 \n\t"
|
|
"xxsel 0,46,47,6 \n\t"
|
|
"xxsel 33,42,43,7 \n\t"
|
|
"xxsel 1,48,49,7 \n\t"
|
|
|
|
"lxvd2x 44, 0,%[ptr_tmp] \n\t"
|
|
"lxvd2x 45, %[i16],%[ptr_tmp] \n\t"
|
|
|
|
"xvcmpgtdp 2,0,1 \n\t "
|
|
"lxvd2x 46, %[i32],%[ptr_tmp] \n\t"
|
|
"lxvd2x 47, %[i48],%[ptr_tmp] \n\t"
|
|
|
|
|
|
"xxsel 32,32,33,2 \n\t"
|
|
"xxsel 3,0,1,2 \n\t"
|
|
|
|
"vaddudm 0,0,5 \n\t"
|
|
|
|
//cmp with previous
|
|
|
|
"xvcmpgtdp 4,39,3 \n\t "
|
|
"vaddudm 5,5,4 \n\t"
|
|
|
|
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
|
|
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
|
|
"lxvd2x 6, %[i96],%[ptr_tmp] \n\t"
|
|
"lxvd2x 7,%[i112],%[ptr_tmp] \n\t"
|
|
//select with previous
|
|
"xxsel 38,38,32,4 \n\t"
|
|
"xxsel 39,39,3,4 \n\t"
|
|
|
|
|
|
"xvabsdp 44, 44 \n\t"
|
|
"xvabsdp 45, 45 \n\t"
|
|
"xvabsdp 46, 46 \n\t"
|
|
"xvabsdp 47, 47 \n\t"
|
|
"xvabsdp 48, 48 \n\t"
|
|
"xvabsdp 49, 49 \n\t"
|
|
"xvabsdp 6, 6 \n\t"
|
|
"xvabsdp 7, 7 \n\t"
|
|
|
|
|
|
//decrement n
|
|
"addic. %[n], %[n], -16 \n\t"
|
|
//Loop back if >0
|
|
"bgt+ one%= \n\t"
|
|
|
|
|
|
XXMRGHD_S(0,44,45)
|
|
XXMRGLD_S(1,44,45)
|
|
XXMRGHD_S(2,46,47)
|
|
XXMRGLD_S(3,46,47)
|
|
XXMRGHD_S(4,48,49)
|
|
XXMRGLD_S(5,48,49)
|
|
XXMRGHD_S(44,6,7)
|
|
XXMRGLD_S(45,6,7)
|
|
|
|
"xvadddp 46, 0,1 \n\t"
|
|
"xvadddp 47, 2,3 \n\t"
|
|
"xvadddp 48, 4,5 \n\t"
|
|
"xvadddp 49, 44,45 \n\t"
|
|
|
|
|
|
|
|
"xvcmpgtdp 6,46,47 \n\t "
|
|
"xvcmpgtdp 7,48,49 \n\t "
|
|
|
|
"xxsel 32,40,41,6 \n\t"
|
|
"xxsel 0,46,47,6 \n\t"
|
|
"xxsel 33,42,43,7 \n\t"
|
|
"xxsel 1,48,49,7 \n\t"
|
|
|
|
"xvcmpgtdp 2,0,1 \n\t "
|
|
"xxsel 32,32,33,2 \n\t"
|
|
"xxsel 3,0,1,2 \n\t"
|
|
|
|
"vaddudm 0,0,5 \n\t"
|
|
|
|
"addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t"
|
|
//cmp with previous
|
|
|
|
"xvcmpgtdp 4,39,3 \n\t "
|
|
"vaddudm 5,5,4 \n\t"
|
|
"xxsel 38,38,32,4 \n\t"
|
|
"xxsel 39,39,3,4 \n\t"
|
|
|
|
|
|
///////extract min value and min index from vector
|
|
|
|
XXSPLTD_S(32,38,1)
|
|
XXSPLTD_S(40,39,1)
|
|
"xvcmpeqdp. 2, 40,39 \n\t"
|
|
|
|
//cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely
|
|
//0b001110=14
|
|
"bc 14,24, three%= \n\t"
|
|
"xvcmpgtdp 4,39, 40 \n\t"
|
|
"xxsel 0,39,40,4 \n\t"
|
|
"xxsel 1,38,32,4 \n\t"
|
|
"stxsdx 0,0,%[ptr_minf] \n\t"
|
|
"b four%= \n\t"
|
|
|
|
"three%=: \n\t"
|
|
//if elements value are equal then choose minimum index
|
|
XXSPLTD_S(0,40,0)
|
|
"vminud 0,0,6 \n\t" //vs32 vs38
|
|
"xxlor 1,32,32 \n\t"
|
|
"stxsdx 0,0,%[ptr_minf] \n\t"
|
|
|
|
|
|
"four%=: \n\t"
|
|
"mfvsrd %[index],1 \n\t"
|
|
|
|
: [minf] "=m"(*minf),[ptr_tmp] "+&b"(x),[index] "=r"(index), [n] "+&r"(n)
|
|
: [mem] "m"(*(const double (*)[2*n])x), [ptr_x] "b"(x), [ptr_minf] "b"(minf) ,
|
|
[i16] "b"(16), [i32] "b"(32), [i48] "b"(48),
|
|
[i64] "b"(64), [i80] "b"(80), [i96] "b"(96), [i112] "b"(112),
|
|
[start] "v"(start), [adder] "v"(temp_add_index)
|
|
: "cc", "vs0", "vs1","vs2","vs3", "vs4","vs5","vs32", "vs33", "vs34", "vs35", "vs36",
|
|
"vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs6", "vs7"
|
|
);
|
|
|
|
return index;
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
|
|
/*
 * Return the 1-based position of the complex element of x whose
 * |re| + |im| (CABS1) is smallest, or 0 when n <= 0 or inc_x <= 0.
 *
 * n      number of complex elements
 * x      interleaved re/im values
 * inc_x  stride between consecutive elements, counted in complex elements
 *
 * The scalar scan only replaces the current best on a strictly smaller
 * value, so among equal values the earliest one it visits is kept.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    BLASLONG best_idx = 0;
    BLASLONG elem = 0;   /* next element to examine */
    BLASLONG pos = 0;    /* offset of that element's real part inside x */
    BLASLONG stride;     /* distance between elements, in FLOATs */
    FLOAT best_val;

    if (n <= 0 || inc_x <= 0) {
        return (best_idx);
    }

    /* Element 0 is the initial candidate on every path. */
    best_val = CABS1(x,0);

    if (inc_x == 1) {
        stride = 2;

#if defined(_CALL_ELF) && (_CALL_ELF == 2)
#if defined(__VEC__) || defined(__ALTIVEC__)

        /* Hand the largest multiple-of-16 prefix to the vector kernel. */
        BLASLONG vec_len = n & -16;
        if (vec_len > 0) {
            best_idx = ziamin_kernel_16_TUNED(vec_len, x, &best_val);
            elem = vec_len;
            pos = vec_len << 1;
        }
#endif
#endif
    } else {
        stride = 2 * inc_x;

        /* Element 0 is already accounted for; resume at element 1. */
        pos += stride;
        elem++;
    }

    /* Scalar scan over whatever has not been examined yet. */
    for (; elem < n; elem++, pos += stride) {
        if ( CABS1(x,pos) < best_val ) {
            best_idx = elem;
            best_val = CABS1(x,pos);
        }
    }

    return (best_idx + 1);
}
|