510 lines
16 KiB
C
510 lines
16 KiB
C
/*
|
|
* Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
|
|
*
|
|
* This program is free software: you can use, redistribute, and/or modify
|
|
* it under the terms of the GNU Affero General Public License, version 3
|
|
* or later ("AGPL"), as published by the Free Software Foundation.
|
|
*
|
|
* This program is distributed in the hope that it will be useful, but WITHOUT
|
|
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
|
* FITNESS FOR A PARTICULAR PURPOSE.
|
|
*
|
|
* You should have received a copy of the GNU Affero General Public License
|
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
*/
|
|
|
|
#include "os.h"
|
|
#include "ttypes.h"
|
|
#include "tcompression.h"
|
|
|
|
int32_t getWordLength(char type) {
|
|
int32_t wordLength = 0;
|
|
switch (type) {
|
|
case TSDB_DATA_TYPE_BIGINT:
|
|
wordLength = LONG_BYTES;
|
|
break;
|
|
case TSDB_DATA_TYPE_INT:
|
|
wordLength = INT_BYTES;
|
|
break;
|
|
case TSDB_DATA_TYPE_SMALLINT:
|
|
wordLength = SHORT_BYTES;
|
|
break;
|
|
case TSDB_DATA_TYPE_TINYINT:
|
|
wordLength = CHAR_BYTES;
|
|
break;
|
|
default:
|
|
uError("Invalid decompress integer type:%d", type);
|
|
return -1;
|
|
}
|
|
|
|
return wordLength;
|
|
}
|
|
|
|
int32_t tsDecompressIntImpl_Hw(const char *const input, const int32_t nelements, char *const output, const char type) {
|
|
int32_t word_length = getWordLength(type);
|
|
|
|
// Selector value: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
|
|
char bit_per_integer[] = {0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 15, 20, 30, 60};
|
|
int32_t selector_to_elems[] = {240, 120, 60, 30, 20, 15, 12, 10, 8, 7, 6, 5, 4, 3, 2, 1};
|
|
|
|
const char *ip = input + 1;
|
|
int32_t count = 0;
|
|
int32_t _pos = 0;
|
|
int64_t prevValue = 0;
|
|
|
|
#if __AVX2__
|
|
while (_pos < nelements) {
|
|
uint64_t w = *(uint64_t*) ip;
|
|
|
|
char selector = (char)(w & INT64MASK(4)); // selector = 4
|
|
char bit = bit_per_integer[(int32_t)selector]; // bit = 3
|
|
int32_t elems = selector_to_elems[(int32_t)selector];
|
|
|
|
// Optimize the performance, by remove the constantly switch operation.
|
|
int32_t v = 4;
|
|
uint64_t zigzag_value = 0;
|
|
uint64_t mask = INT64MASK(bit);
|
|
|
|
switch (type) {
|
|
case TSDB_DATA_TYPE_BIGINT: {
|
|
int64_t* p = (int64_t*) output;
|
|
|
|
int32_t gRemainder = (nelements - _pos);
|
|
int32_t num = (gRemainder > elems)? elems:gRemainder;
|
|
|
|
int32_t batch = num >> 2;
|
|
int32_t remain = num & 0x03;
|
|
if (selector == 0 || selector == 1) {
|
|
if (tsSIMDEnable && tsAVX2Enable) {
|
|
for (int32_t i = 0; i < batch; ++i) {
|
|
__m256i prev = _mm256_set1_epi64x(prevValue);
|
|
_mm256_storeu_si256((__m256i *)&p[_pos], prev);
|
|
_pos += 4;
|
|
}
|
|
|
|
for (int32_t i = 0; i < remain; ++i) {
|
|
p[_pos++] = prevValue;
|
|
}
|
|
} else if (tsSIMDEnable && tsAVX512Enable) {
|
|
#if __AVX512F__
|
|
// todo add avx512 impl
|
|
#endif
|
|
} else { // alternative implementation without SIMD instructions.
|
|
for (int32_t i = 0; i < elems && count < nelements; i++, count++) {
|
|
p[_pos++] = prevValue;
|
|
v += bit;
|
|
}
|
|
}
|
|
} else {
|
|
if (tsSIMDEnable && tsAVX2Enable) {
|
|
__m256i base = _mm256_set1_epi64x(w);
|
|
__m256i maskVal = _mm256_set1_epi64x(mask);
|
|
|
|
__m256i shiftBits = _mm256_set_epi64x(bit * 3 + 4, bit * 2 + 4, bit + 4, 4);
|
|
__m256i inc = _mm256_set1_epi64x(bit << 2);
|
|
|
|
for (int32_t i = 0; i < batch; ++i) {
|
|
__m256i after = _mm256_srlv_epi64(base, shiftBits);
|
|
__m256i zigzagVal = _mm256_and_si256(after, maskVal);
|
|
|
|
// ZIGZAG_DECODE(T, v) (((v) >> 1) ^ -((T)((v)&1)))
|
|
__m256i signmask = _mm256_and_si256(_mm256_set1_epi64x(1), zigzagVal);
|
|
signmask = _mm256_sub_epi64(_mm256_setzero_si256(), signmask);
|
|
|
|
// get four zigzag values here
|
|
__m256i delta = _mm256_xor_si256(_mm256_srli_epi64(zigzagVal, 1), signmask);
|
|
|
|
// calculate the cumulative sum (prefix sum) for each number
|
|
// decode[0] = prevValue + final[0]
|
|
// decode[1] = decode[0] + final[1] -----> prevValue + final[0] + final[1]
|
|
// decode[2] = decode[1] + final[2] -----> prevValue + final[0] + final[1] + final[2]
|
|
// decode[3] = decode[2] + final[3] -----> prevValue + final[0] + final[1] + final[2] + final[3]
|
|
|
|
// 1, 2, 3, 4
|
|
//+ 0, 1, 0, 3
|
|
// 1, 3, 3, 7
|
|
// shift and add for the first round
|
|
__m128i prev = _mm_set1_epi64x(prevValue);
|
|
__m256i x = _mm256_slli_si256(delta, 8);
|
|
|
|
delta = _mm256_add_epi64(delta, x);
|
|
_mm256_storeu_si256((__m256i *)&p[_pos], delta);
|
|
|
|
// 1, 3, 3, 7
|
|
//+ 0, 0, 3, 3
|
|
// 1, 3, 6, 10
|
|
// shift and add operation for the second round
|
|
__m128i firstPart = _mm_loadu_si128((__m128i *)&p[_pos]);
|
|
__m128i secondItem = _mm_set1_epi64x(p[_pos + 1]);
|
|
__m128i secPart = _mm_add_epi64(_mm_loadu_si128((__m128i *)&p[_pos + 2]), secondItem);
|
|
firstPart = _mm_add_epi64(firstPart, prev);
|
|
secPart = _mm_add_epi64(secPart, prev);
|
|
|
|
// save it in the memory
|
|
_mm_storeu_si128((__m128i *)&p[_pos], firstPart);
|
|
_mm_storeu_si128((__m128i *)&p[_pos + 2], secPart);
|
|
|
|
shiftBits = _mm256_add_epi64(shiftBits, inc);
|
|
prevValue = p[_pos + 3];
|
|
_pos += 4;
|
|
}
|
|
|
|
// handle the remain value
|
|
for (int32_t i = 0; i < remain; i++) {
|
|
zigzag_value = ((w >> (v + (batch * bit * 4))) & mask);
|
|
prevValue += ZIGZAG_DECODE(int64_t, zigzag_value);
|
|
|
|
p[_pos++] = prevValue;
|
|
v += bit;
|
|
}
|
|
} else if (tsSIMDEnable && tsAVX512Enable) {
|
|
#if __AVX512F__
|
|
// todo add avx512 impl
|
|
#endif
|
|
} else { // alternative implementation without SIMD instructions.
|
|
for (int32_t i = 0; i < elems && count < nelements; i++, count++) {
|
|
zigzag_value = ((w >> v) & mask);
|
|
prevValue += ZIGZAG_DECODE(int64_t, zigzag_value);
|
|
|
|
p[_pos++] = prevValue;
|
|
v += bit;
|
|
}
|
|
}
|
|
}
|
|
} break;
|
|
case TSDB_DATA_TYPE_INT: {
|
|
int32_t* p = (int32_t*) output;
|
|
|
|
if (selector == 0 || selector == 1) {
|
|
for (int32_t i = 0; i < elems && count < nelements; i++, count++) {
|
|
p[_pos++] = (int32_t)prevValue;
|
|
}
|
|
} else {
|
|
for (int32_t i = 0; i < elems && count < nelements; i++, count++) {
|
|
zigzag_value = ((w >> v) & mask);
|
|
prevValue += ZIGZAG_DECODE(int64_t, zigzag_value);
|
|
|
|
p[_pos++] = (int32_t)prevValue;
|
|
v += bit;
|
|
}
|
|
}
|
|
} break;
|
|
case TSDB_DATA_TYPE_SMALLINT: {
|
|
int16_t* p = (int16_t*) output;
|
|
|
|
if (selector == 0 || selector == 1) {
|
|
for (int32_t i = 0; i < elems && count < nelements; i++, count++) {
|
|
p[_pos++] = (int16_t)prevValue;
|
|
}
|
|
} else {
|
|
for (int32_t i = 0; i < elems && count < nelements; i++, count++) {
|
|
zigzag_value = ((w >> v) & mask);
|
|
prevValue += ZIGZAG_DECODE(int64_t, zigzag_value);
|
|
|
|
p[_pos++] = (int16_t)prevValue;
|
|
v += bit;
|
|
}
|
|
}
|
|
} break;
|
|
|
|
case TSDB_DATA_TYPE_TINYINT: {
|
|
int8_t *p = (int8_t *)output;
|
|
|
|
if (selector == 0 || selector == 1) {
|
|
for (int32_t i = 0; i < elems && count < nelements; i++, count++) {
|
|
p[_pos++] = (int8_t)prevValue;
|
|
}
|
|
} else {
|
|
for (int32_t i = 0; i < elems && count < nelements; i++, count++) {
|
|
zigzag_value = ((w >> v) & mask);
|
|
prevValue += ZIGZAG_DECODE(int64_t, zigzag_value);
|
|
|
|
p[_pos++] = (int8_t)prevValue;
|
|
v += bit;
|
|
}
|
|
}
|
|
} break;
|
|
}
|
|
|
|
ip += LONG_BYTES;
|
|
}
|
|
|
|
#endif
|
|
return nelements * word_length;
|
|
}
|
|
|
|
int32_t tsDecompressFloatImplAvx512(const char *const input, const int32_t nelements, char *const output) {
|
|
#if __AVX512F__
|
|
// todo add it
|
|
#endif
|
|
return 0;
|
|
}
|
|
|
|
// todo add later
|
|
int32_t tsDecompressFloatImplAvx2(const char *const input, const int32_t nelements, char *const output) {
|
|
#if __AVX2__
|
|
#endif
|
|
return 0;
|
|
}
|
|
|
|
int32_t tsDecompressTimestampAvx2(const char *const input, const int32_t nelements, char *const output,
|
|
bool bigEndian) {
|
|
#if 0
|
|
int64_t *ostream = (int64_t *)output;
|
|
int32_t ipos = 1, opos = 0;
|
|
__m128i prevVal = _mm_setzero_si128();
|
|
__m128i prevDelta = _mm_setzero_si128();
|
|
|
|
#if __AVX2__
|
|
int32_t batch = nelements >> 1;
|
|
int32_t remainder = nelements & 0x01;
|
|
__mmask16 mask2[16] = {0, 0x0001, 0x0003, 0x0007, 0x000f, 0x001f, 0x003f, 0x007f, 0x00ff};
|
|
|
|
int32_t i = 0;
|
|
if (batch > 1) {
|
|
// first loop
|
|
uint8_t flags = input[ipos++];
|
|
|
|
int8_t nbytes1 = flags & INT8MASK(4); // range of nbytes starts from 0 to 7
|
|
int8_t nbytes2 = (flags >> 4) & INT8MASK(4);
|
|
|
|
__m128i data1;
|
|
if (nbytes1 == 0) {
|
|
data1 = _mm_setzero_si128();
|
|
} else {
|
|
memcpy(&data1, (const void*) (input + ipos), nbytes1);
|
|
}
|
|
|
|
__m128i data2;
|
|
if (nbytes2 == 0) {
|
|
data2 = _mm_setzero_si128();
|
|
} else {
|
|
memcpy(&data2, (const void*) (input + ipos + nbytes1), nbytes2);
|
|
}
|
|
|
|
data2 = _mm_broadcastq_epi64(data2);
|
|
__m128i zzVal = _mm_blend_epi32(data2, data1, 0x03);
|
|
|
|
// ZIGZAG_DECODE(T, v) (((v) >> 1) ^ -((T)((v)&1)))
|
|
__m128i signmask = _mm_and_si128(_mm_set1_epi64x(1), zzVal);
|
|
signmask = _mm_sub_epi64(_mm_setzero_si128(), signmask);
|
|
|
|
// get two zigzag values here
|
|
__m128i deltaOfDelta = _mm_xor_si128(_mm_srli_epi64(zzVal, 1), signmask);
|
|
|
|
__m128i deltaCurrent = _mm_add_epi64(deltaOfDelta, prevDelta);
|
|
deltaCurrent = _mm_add_epi64(_mm_slli_si128(deltaCurrent, 8), deltaCurrent);
|
|
|
|
__m128i val = _mm_add_epi64(deltaCurrent, prevVal);
|
|
_mm_storeu_si128((__m128i *)&ostream[opos], val);
|
|
|
|
// keep the previous value
|
|
prevVal = _mm_shuffle_epi32 (val, 0xEE);
|
|
|
|
// keep the previous delta of delta, for the first item
|
|
prevDelta = _mm_shuffle_epi32(deltaOfDelta, 0xEE);
|
|
|
|
opos += 2;
|
|
ipos += nbytes1 + nbytes2;
|
|
i += 1;
|
|
}
|
|
|
|
// the remain
|
|
for(; i < batch; ++i) {
|
|
uint8_t flags = input[ipos++];
|
|
|
|
int8_t nbytes1 = flags & INT8MASK(4); // range of nbytes starts from 0 to 7
|
|
int8_t nbytes2 = (flags >> 4) & INT8MASK(4);
|
|
|
|
// __m128i data1 = _mm_maskz_loadu_epi8(mask2[nbytes1], (const void*)(input + ipos));
|
|
// __m128i data2 = _mm_maskz_loadu_epi8(mask2[nbytes2], (const void*)(input + ipos + nbytes1));
|
|
__m128i data1;
|
|
if (nbytes1 == 0) {
|
|
data1 = _mm_setzero_si128();
|
|
} else {
|
|
int64_t dd = 0;
|
|
memcpy(&dd, (const void*) (input + ipos), nbytes1);
|
|
data1 = _mm_loadu_si64(&dd);
|
|
}
|
|
|
|
__m128i data2;
|
|
if (nbytes2 == 0) {
|
|
data2 = _mm_setzero_si128();
|
|
} else {
|
|
int64_t dd = 0;
|
|
memcpy(&dd, (const void*) (input + ipos + nbytes1), nbytes2);
|
|
data2 = _mm_loadu_si64(&dd);
|
|
}
|
|
|
|
data2 = _mm_broadcastq_epi64(data2);
|
|
|
|
__m128i zzVal = _mm_blend_epi32(data2, data1, 0x03);
|
|
|
|
// ZIGZAG_DECODE(T, v) (((v) >> 1) ^ -((T)((v)&1)))
|
|
__m128i signmask = _mm_and_si128(_mm_set1_epi64x(1), zzVal);
|
|
signmask = _mm_sub_epi64(_mm_setzero_si128(), signmask);
|
|
|
|
// get two zigzag values here
|
|
__m128i deltaOfDelta = _mm_xor_si128(_mm_srli_epi64(zzVal, 1), signmask);
|
|
|
|
__m128i deltaCurrent = _mm_add_epi64(deltaOfDelta, prevDelta);
|
|
deltaCurrent = _mm_add_epi64(_mm_slli_si128(deltaCurrent, 8), deltaCurrent);
|
|
|
|
__m128i val = _mm_add_epi64(deltaCurrent, prevVal);
|
|
_mm_storeu_si128((__m128i *)&ostream[opos], val);
|
|
|
|
// keep the previous value
|
|
prevVal = _mm_shuffle_epi32 (val, 0xEE);
|
|
|
|
// keep the previous delta of delta
|
|
__m128i delta = _mm_add_epi64(_mm_slli_si128(deltaOfDelta, 8), deltaOfDelta);
|
|
prevDelta = _mm_shuffle_epi32(_mm_add_epi64(delta, prevDelta), 0xEE);
|
|
|
|
opos += 2;
|
|
ipos += nbytes1 + nbytes2;
|
|
}
|
|
|
|
if (remainder > 0) {
|
|
uint64_t dd = 0;
|
|
uint8_t flags = input[ipos++];
|
|
|
|
int32_t nbytes = flags & INT8MASK(4);
|
|
int64_t deltaOfDelta = 0;
|
|
if (nbytes == 0) {
|
|
deltaOfDelta = 0;
|
|
} else {
|
|
// if (is_bigendian()) {
|
|
// memcpy(((char *)(&dd1)) + longBytes - nbytes, input + ipos, nbytes);
|
|
// } else {
|
|
memcpy(&dd, input + ipos, nbytes);
|
|
// }
|
|
deltaOfDelta = ZIGZAG_DECODE(int64_t, dd);
|
|
}
|
|
|
|
ipos += nbytes;
|
|
if (opos == 0) {
|
|
ostream[opos++] = deltaOfDelta;
|
|
} else {
|
|
int64_t prevDeltaX = deltaOfDelta + prevDelta[1];
|
|
ostream[opos++] = prevVal[1] + prevDeltaX;
|
|
}
|
|
}
|
|
#endif
|
|
#endif
|
|
return 0;
|
|
}
|
|
|
|
int32_t tsDecompressTimestampAvx512(const char *const input, const int32_t nelements, char *const output,
|
|
bool UNUSED_PARAM(bigEndian)) {
|
|
int64_t *ostream = (int64_t *)output;
|
|
int32_t ipos = 1, opos = 0;
|
|
|
|
#if __AVX512VL__
|
|
|
|
__m128i prevVal = _mm_setzero_si128();
|
|
__m128i prevDelta = _mm_setzero_si128();
|
|
|
|
int32_t numOfBatch = nelements >> 1;
|
|
int32_t remainder = nelements & 0x01;
|
|
__mmask16 mask2[16] = {0, 0x0001, 0x0003, 0x0007, 0x000f, 0x001f, 0x003f, 0x007f, 0x00ff};
|
|
|
|
int32_t i = 0;
|
|
if (numOfBatch > 1) {
|
|
// first loop
|
|
uint8_t flags = input[ipos++];
|
|
|
|
int8_t nbytes1 = flags & INT8MASK(4); // range of nbytes starts from 0 to 7
|
|
int8_t nbytes2 = (flags >> 4) & INT8MASK(4);
|
|
|
|
__m128i data1 = _mm_maskz_loadu_epi8(mask2[nbytes1], (const void*)(input + ipos));
|
|
__m128i data2 = _mm_maskz_loadu_epi8(mask2[nbytes2], (const void*)(input + ipos + nbytes1));
|
|
data2 = _mm_broadcastq_epi64(data2);
|
|
|
|
__m128i zzVal = _mm_blend_epi32(data2, data1, 0x03);
|
|
|
|
// ZIGZAG_DECODE(T, v) (((v) >> 1) ^ -((T)((v)&1)))
|
|
__m128i signmask = _mm_and_si128(_mm_set1_epi64x(1), zzVal);
|
|
signmask = _mm_sub_epi64(_mm_setzero_si128(), signmask);
|
|
|
|
// get two zigzag values here
|
|
__m128i deltaOfDelta = _mm_xor_si128(_mm_srli_epi64(zzVal, 1), signmask);
|
|
|
|
__m128i deltaCurrent = _mm_add_epi64(deltaOfDelta, prevDelta);
|
|
deltaCurrent = _mm_add_epi64(_mm_slli_si128(deltaCurrent, 8), deltaCurrent);
|
|
|
|
__m128i val = _mm_add_epi64(deltaCurrent, prevVal);
|
|
_mm_storeu_si128((__m128i *)&ostream[opos], val);
|
|
|
|
// keep the previous value
|
|
prevVal = _mm_shuffle_epi32 (val, 0xEE);
|
|
|
|
// keep the previous delta of delta, for the first item
|
|
prevDelta = _mm_shuffle_epi32(deltaOfDelta, 0xEE);
|
|
|
|
opos += 2;
|
|
ipos += nbytes1 + nbytes2;
|
|
i += 1;
|
|
}
|
|
|
|
// the remain
|
|
for(; i < numOfBatch; ++i) {
|
|
uint8_t flags = input[ipos++];
|
|
|
|
int8_t nbytes1 = flags & INT8MASK(4); // range of nbytes starts from 0 to 7
|
|
int8_t nbytes2 = (flags >> 4) & INT8MASK(4);
|
|
|
|
__m128i data1 = _mm_maskz_loadu_epi8(mask2[nbytes1], (const void*)(input + ipos));
|
|
__m128i data2 = _mm_maskz_loadu_epi8(mask2[nbytes2], (const void*)(input + ipos + nbytes1));
|
|
data2 = _mm_broadcastq_epi64(data2);
|
|
|
|
__m128i zzVal = _mm_blend_epi32(data2, data1, 0x03);
|
|
|
|
// ZIGZAG_DECODE(T, v) (((v) >> 1) ^ -((T)((v)&1)))
|
|
__m128i signmask = _mm_and_si128(_mm_set1_epi64x(1), zzVal);
|
|
signmask = _mm_sub_epi64(_mm_setzero_si128(), signmask);
|
|
|
|
// get two zigzag values here
|
|
__m128i deltaOfDelta = _mm_xor_si128(_mm_srli_epi64(zzVal, 1), signmask);
|
|
|
|
__m128i deltaCurrent = _mm_add_epi64(deltaOfDelta, prevDelta);
|
|
deltaCurrent = _mm_add_epi64(_mm_slli_si128(deltaCurrent, 8), deltaCurrent);
|
|
|
|
__m128i val = _mm_add_epi64(deltaCurrent, prevVal);
|
|
_mm_storeu_si128((__m128i *)&ostream[opos], val);
|
|
|
|
// keep the previous value
|
|
prevVal = _mm_shuffle_epi32 (val, 0xEE);
|
|
|
|
// keep the previous delta of delta
|
|
__m128i delta = _mm_add_epi64(_mm_slli_si128(deltaOfDelta, 8), deltaOfDelta);
|
|
prevDelta = _mm_shuffle_epi32(_mm_add_epi64(delta, prevDelta), 0xEE);
|
|
|
|
opos += 2;
|
|
ipos += nbytes1 + nbytes2;
|
|
}
|
|
|
|
if (remainder > 0) {
|
|
uint64_t dd = 0;
|
|
uint8_t flags = input[ipos++];
|
|
|
|
int32_t nbytes = flags & INT8MASK(4);
|
|
int64_t deltaOfDelta = 0;
|
|
if (nbytes == 0) {
|
|
deltaOfDelta = 0;
|
|
} else {
|
|
memcpy(&dd, input + ipos, nbytes);
|
|
deltaOfDelta = ZIGZAG_DECODE(int64_t, dd);
|
|
}
|
|
|
|
ipos += nbytes;
|
|
if (opos == 0) {
|
|
ostream[opos++] = deltaOfDelta;
|
|
} else {
|
|
int64_t prevDeltaX = deltaOfDelta + prevDelta[1];
|
|
ostream[opos++] = prevVal[1] + prevDeltaX;
|
|
}
|
|
}
|
|
|
|
#endif
|
|
return 0;
|
|
} |