refactor: do some internal refactor.
This commit is contained in:
parent
f90fa07ea9
commit
c7560202f1
|
@ -265,7 +265,7 @@ int32_t tsDecompressINTImp(const char *const input, const int32_t nelements, cha
|
||||||
int64_t prev_value = 0;
|
int64_t prev_value = 0;
|
||||||
|
|
||||||
while (1) {
|
while (1) {
|
||||||
if (count == nelements) break;
|
if (_pos == nelements) break;
|
||||||
|
|
||||||
uint64_t w = 0;
|
uint64_t w = 0;
|
||||||
memcpy(&w, ip, LONG_BYTES);
|
memcpy(&w, ip, LONG_BYTES);
|
||||||
|
@ -284,8 +284,8 @@ int32_t tsDecompressINTImp(const char *const input, const int32_t nelements, cha
|
||||||
int64_t* p = (int64_t*) output;
|
int64_t* p = (int64_t*) output;
|
||||||
|
|
||||||
if (selector == 0 || selector == 1) {
|
if (selector == 0 || selector == 1) {
|
||||||
int32_t gRemainder = nelements - count;
|
int32_t gRemainder = nelements - _pos;
|
||||||
int32_t num = gRemainder > elems? elems:gRemainder;
|
int32_t num = gRemainder < elems? gRemainder:elems;
|
||||||
|
|
||||||
int32_t batch = num >> 2;
|
int32_t batch = num >> 2;
|
||||||
int32_t remainder = num & 0x03;
|
int32_t remainder = num & 0x03;
|
||||||
|
@ -302,100 +302,68 @@ int32_t tsDecompressINTImp(const char *const input, const int32_t nelements, cha
|
||||||
|
|
||||||
count += num;
|
count += num;
|
||||||
} else {
|
} else {
|
||||||
int32_t gRemainder = (nelements - count);
|
int32_t gRemainder = (nelements - _pos);
|
||||||
int32_t num = gRemainder > elems? elems:gRemainder;
|
int32_t num = (gRemainder > elems)? elems:gRemainder;
|
||||||
|
|
||||||
int32_t batch = num >> 2;
|
int32_t batch = num >> 2;
|
||||||
int32_t remain = num & 0x03;
|
int32_t remain = num & 0x03;
|
||||||
#if 1
|
|
||||||
#if 1
|
#if 1
|
||||||
__m256i base = _mm256_set1_epi64x(w);
|
__m256i base = _mm256_set1_epi64x(w);
|
||||||
__m256i mask_ = _mm256_set1_epi64x(mask);
|
__m256i maskVal = _mm256_set1_epi64x(mask);
|
||||||
|
|
||||||
__m256i shiftBits = _mm256_set_epi64x(bit * 3 + 4, bit * 2 + 4, bit + 4, 4);
|
__m256i shiftBits = _mm256_set_epi64x(bit * 3 + 4, bit * 2 + 4, bit + 4, 4);
|
||||||
__m256i inc = _mm256_set1_epi64x(bit << 2);
|
__m256i inc = _mm256_set1_epi64x(bit << 2);
|
||||||
|
|
||||||
for(int32_t i = 0; i < batch; ++i) {
|
for(int32_t i = 0; i < batch; ++i) {
|
||||||
__m256i after = _mm256_srlv_epi64(base, shiftBits);
|
__m256i after = _mm256_srlv_epi64(base, shiftBits);
|
||||||
__m256i zz = _mm256_and_si256(after, mask_);
|
__m256i zigzagVal= _mm256_and_si256(after, maskVal);
|
||||||
printf("1\n");
|
|
||||||
|
|
||||||
//#define ZIGZAG_DECODE(T, v) (((v) >> 1) ^ -((T)((v)&1))) // zigzag decode
|
// ZIGZAG_DECODE(T, v) (((v) >> 1) ^ -((T)((v)&1)))
|
||||||
__m256i signmask = _mm256_and_si256(_mm256_set_epi64x(1, 1, 1, 1), zz);
|
__m256i signmask = _mm256_and_si256(_mm256_set1_epi64x(1), zigzagVal);
|
||||||
signmask = _mm256_sub_epi64(_mm256_setzero_si256(), signmask);
|
signmask = _mm256_sub_epi64(_mm256_setzero_si256(), signmask);
|
||||||
|
// get the four zigzag values here
|
||||||
|
__m256i delta = _mm256_xor_si256(_mm256_srli_epi64(zigzagVal, 1), signmask);
|
||||||
|
|
||||||
// now here we get the four zigzag value
|
// calculate the cumulative sum (prefix sum) for each number
|
||||||
__m256i final = _mm256_xor_si256(_mm256_srli_epi64(zz, 1), signmask);
|
|
||||||
|
|
||||||
// calculate the cumulative sum (prefix sum)
|
|
||||||
// decode[0] = prev_value + final[0]
|
// decode[0] = prev_value + final[0]
|
||||||
// decode[1] = decode[0] + final[1] -----> prev_value + final[0] + final[1]
|
// decode[1] = decode[0] + final[1] -----> prev_value + final[0] + final[1]
|
||||||
// decode[2] = decode[1] + final[1] -----> prev_value + final[0] + final[1] + final[2]
|
// decode[2] = decode[1] + final[2] -----> prev_value + final[0] + final[1] + final[2]
|
||||||
// decode[3] = decode[2] + final[1] -----> prev_value + final[0] + final[1] + final[2] + final[3]
|
// decode[3] = decode[2] + final[3] -----> prev_value + final[0] + final[1] + final[2] + final[3]
|
||||||
|
|
||||||
printf("2\n");
|
// 1, 2, 3, 4
|
||||||
|
//+ 0, 1, 2, 3
|
||||||
|
// 1, 3, 5, 7
|
||||||
|
// shift and add for the first round
|
||||||
__m128i prev = _mm_set1_epi64x(prev_value);
|
__m128i prev = _mm_set1_epi64x(prev_value);
|
||||||
final = _mm256_add_epi64(final, _mm256_slli_si256(final, 8));
|
delta = _mm256_add_epi64(delta, _mm256_slli_si256(delta, 8));
|
||||||
// x = 1, 2, 3, 4
|
_mm256_storeu_si256((__m256i *)&p[_pos], delta);
|
||||||
// + 0, 1, 2, 3
|
|
||||||
// = 1, 3, 5, 7
|
|
||||||
_mm256_storeu_si256((__m256i *)&p[_pos], final);
|
|
||||||
|
|
||||||
__m128i first = _mm_loadu_si128((__m128i *)&p[_pos]);
|
// 1, 3, 5, 7
|
||||||
__m128i sec = _mm_add_epi64(_mm_loadu_si128((__m128i *)&p[_pos + 2]), first);
|
//+ 0, 0, 1, 3
|
||||||
sec = _mm_add_epi64(sec, prev);
|
// 1, 3, 6, 10
|
||||||
first = _mm_add_epi64(first, prev);
|
// shift and add operation for the second round
|
||||||
|
__m128i firstPart = _mm_loadu_si128((__m128i *)&p[_pos]);
|
||||||
|
__m128i secPart = _mm_add_epi64(_mm_loadu_si128((__m128i *)&p[_pos + 2]), firstPart);
|
||||||
|
firstPart = _mm_add_epi64(firstPart, prev);
|
||||||
|
secPart = _mm_add_epi64(secPart, prev);
|
||||||
|
|
||||||
_mm_storeu_si128((__m128i *)&p[_pos], first);
|
// save it in the memory
|
||||||
_mm_storeu_si128((__m128i *)&p[_pos + 2], sec);
|
_mm_storeu_si128((__m128i *)&p[_pos], firstPart);
|
||||||
|
_mm_storeu_si128((__m128i *)&p[_pos + 2], secPart);
|
||||||
|
|
||||||
shiftBits = _mm256_add_epi64(shiftBits, inc);
|
shiftBits = _mm256_add_epi64(shiftBits, inc);
|
||||||
prev_value = p[_pos + 3];
|
prev_value = p[_pos + 3];
|
||||||
_pos += 4;
|
_pos += 4;
|
||||||
|
|
||||||
printf("3\n");
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
// manual unrolling, to erase the hotspot
|
|
||||||
uint64_t zz[4];
|
|
||||||
|
|
||||||
for (int32_t i = 0; i < batch; ++i) {
|
|
||||||
zigzag_value = ((w >> v) & mask);
|
|
||||||
zz[0] = ZIGZAG_DECODE(int64_t, zigzag_value);
|
|
||||||
|
|
||||||
v += bit;
|
|
||||||
zigzag_value = ((w >> v) & mask);
|
|
||||||
zz[1] = ZIGZAG_DECODE(int64_t, zigzag_value);
|
|
||||||
|
|
||||||
v += bit;
|
|
||||||
zigzag_value = ((w >> v) & mask);
|
|
||||||
zz[2] = ZIGZAG_DECODE(int64_t, zigzag_value);
|
|
||||||
|
|
||||||
v += bit;
|
|
||||||
zigzag_value = ((w >> v) & mask);
|
|
||||||
zz[3] = ZIGZAG_DECODE(int64_t, zigzag_value);
|
|
||||||
|
|
||||||
p[_pos] = prev_value + zz[0];
|
|
||||||
p[_pos + 1] = p[_pos] + zz[1];
|
|
||||||
p[_pos + 2] = p[_pos + 1] + zz[2];
|
|
||||||
p[_pos + 3] = p[_pos + 2] + zz[3];
|
|
||||||
prev_value = p[_pos + 3];
|
|
||||||
v += bit;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// handle the remain
|
// handle the remain value
|
||||||
for (int32_t i = 0; i < remain; i++) {
|
for (int32_t i = 0; i < remain; i++) {
|
||||||
zigzag_value = ((w >> v) & mask);
|
zigzag_value = ((w >> (v + (batch * bit))) & mask);
|
||||||
prev_value += ZIGZAG_DECODE(int64_t, zigzag_value);
|
prev_value += ZIGZAG_DECODE(int64_t, zigzag_value);
|
||||||
|
|
||||||
p[_pos++] = prev_value;
|
p[_pos++] = prev_value;
|
||||||
v += bit;
|
v += bit;
|
||||||
}
|
}
|
||||||
|
|
||||||
count += num;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#else
|
#else
|
||||||
for (int32_t i = 0; i < elems && count < nelements; i++, count++) {
|
for (int32_t i = 0; i < elems && count < nelements; i++, count++) {
|
||||||
zigzag_value = ((w >> v) & mask);
|
zigzag_value = ((w >> v) & mask);
|
||||||
|
|
Loading…
Reference in New Issue