init add

2021-06-24 19:58:45 +08:00 · 2021-06-24 19:58:45 +08:00 · afec60f40d
parent f2fce02bc0
commit afec60f40d
240 changed files with 150680 additions and 250 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -2,7 +2,7 @@ CMAKE_MINIMUM_REQUIRED(VERSION 2.8)
 IF (CMAKE_VERSION VERSION_LESS 3.0)
  PROJECT(TDengine CXX)
  SET(PROJECT_VERSION_MAJOR "${LIB_MAJOR_VERSION}")
-  SET(PROJECT_VERSION_MINOR "${LIB_MINOR_VERSION}")
+  SET(PROJECT_VERSION_MINOR "${LIB_MINOR_VERSION}")
  SET(PROJECT_VERSION_PATCH "${LIB_PATCH_VERSION}")
  SET(PROJECT_VERSION "${LIB_VERSION_STRING}")
 ELSE ()
@ -43,11 +43,14 @@ INCLUDE(cmake/version.inc)
 INCLUDE(cmake/install.inc)

 IF (CMAKE_SYSTEM_NAME MATCHES "Linux")
-  SET(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS} -pipe -Wall -Wshadow -Werror")
-  SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pipe -Wall -Wshadow -Werror")
+  SET(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS} -pipe -Wall ")
+  SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pipe -Wall")
 ENDIF ()
 MESSAGE(STATUS "CMAKE_C_FLAGS:   ${CMAKE_C_FLAGS}")
 MESSAGE(STATUS "CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}")
+MESSAGE(STATUS "COMMON_FLAGS:    ${COMMON_FLAGS}")
+
+

 ADD_SUBDIRECTORY(deps)
 ADD_SUBDIRECTORY(src)
--- a/cmake/define.inc
+++ b/cmake/define.inc
@ -57,7 +57,7 @@ IF (TD_LINUX_64)
  ADD_DEFINITIONS(-D_M_X64)
  ADD_DEFINITIONS(-D_TD_LINUX_64)
  MESSAGE(STATUS "linux64 is defined")
-  SET(COMMON_FLAGS "-std=gnu99 -Wall -Werror -fPIC -gdwarf-2 -msse4.2 -D_FILE_OFFSET_BITS=64 -D_LARGE_FILE")
+  SET(COMMON_FLAGS "-std=gnu99 -Wall -fPIC -gdwarf-2 -msse4.2 -D_FILE_OFFSET_BITS=64 -D_LARGE_FILE")
  ADD_DEFINITIONS(-DUSE_LIBICONV)

  IF (JEMALLOC_ENABLED)
@ -70,7 +70,7 @@ IF (TD_LINUX_32)
  ADD_DEFINITIONS(-D_TD_LINUX_32)
  ADD_DEFINITIONS(-DUSE_LIBICONV)
  MESSAGE(STATUS "linux32 is defined")
-  SET(COMMON_FLAGS "-std=gnu99 -Wall -Werror -fPIC -fsigned-char -munaligned-access -fpack-struct=8 -D_FILE_OFFSET_BITS=64 -D_LARGE_FILE")
+  SET(COMMON_FLAGS "-std=gnu99 -Wall -fPIC -fsigned-char -munaligned-access -fpack-struct=8 -D_FILE_OFFSET_BITS=64 -D_LARGE_FILE")
 ENDIF ()

 IF (TD_ARM_64)
@ -78,7 +78,7 @@ IF (TD_ARM_64)
  ADD_DEFINITIONS(-D_TD_ARM_)
  ADD_DEFINITIONS(-DUSE_LIBICONV)
  MESSAGE(STATUS "arm64 is defined")
-  SET(COMMON_FLAGS "-std=gnu99 -Wall -Werror -fPIC -fsigned-char -fpack-struct=8 -D_FILE_OFFSET_BITS=64 -D_LARGE_FILE")
+  SET(COMMON_FLAGS "-std=gnu99 -Wall -fPIC -fsigned-char -fpack-struct=8 -D_FILE_OFFSET_BITS=64 -D_LARGE_FILE")
 ENDIF ()

 IF (TD_ARM_32)
@ -86,7 +86,7 @@ IF (TD_ARM_32)
  ADD_DEFINITIONS(-D_TD_ARM_)
  ADD_DEFINITIONS(-DUSE_LIBICONV)
  MESSAGE(STATUS "arm32 is defined")
-  SET(COMMON_FLAGS "-std=gnu99 -Wall -Werror -fPIC -fsigned-char -fpack-struct=8 -D_FILE_OFFSET_BITS=64 -D_LARGE_FILE -Wno-pointer-to-int-cast -Wno-int-to-pointer-cast -Wno-incompatible-pointer-types ")
+  SET(COMMON_FLAGS "-std=gnu99 -Wall -fPIC -fsigned-char -fpack-struct=8 -D_FILE_OFFSET_BITS=64 -D_LARGE_FILE -Wno-pointer-to-int-cast -Wno-int-to-pointer-cast -Wno-incompatible-pointer-types ")
 ENDIF ()

 IF (TD_MIPS_64)
@ -94,7 +94,7 @@ IF (TD_MIPS_64)
  ADD_DEFINITIONS(-D_TD_MIPS_64)
  ADD_DEFINITIONS(-DUSE_LIBICONV)
  MESSAGE(STATUS "mips64 is defined")
-  SET(COMMON_FLAGS "-std=gnu99 -Wall -Werror -fPIC -fsigned-char -fpack-struct=8 -D_FILE_OFFSET_BITS=64 -D_LARGE_FILE")
+  SET(COMMON_FLAGS "-std=gnu99 -Wall -fPIC -fsigned-char -fpack-struct=8 -D_FILE_OFFSET_BITS=64 -D_LARGE_FILE")
 ENDIF ()

 IF (TD_MIPS_32)
@ -102,7 +102,7 @@ IF (TD_MIPS_32)
  ADD_DEFINITIONS(-D_TD_MIPS_32)
  ADD_DEFINITIONS(-DUSE_LIBICONV)
  MESSAGE(STATUS "mips32 is defined")
-  SET(COMMON_FLAGS "-std=gnu99 -Wall -Werror -fPIC -D_FILE_OFFSET_BITS=64 -D_LARGE_FILE")
+  SET(COMMON_FLAGS "-std=gnu99 -Wall -fPIC -D_FILE_OFFSET_BITS=64 -D_LARGE_FILE")
 ENDIF ()

 IF (TD_APLHINE)
@ -147,7 +147,7 @@ IF (TD_DARWIN_64)
  ADD_DEFINITIONS(-D_REENTRANT -D__USE_POSIX -D_LIBC_REENTRANT)
  ADD_DEFINITIONS(-DUSE_LIBICONV)
  MESSAGE(STATUS "darwin64 is defined")
-  SET(COMMON_FLAGS "-std=gnu99 -Wall -Werror -Wno-missing-braces -fPIC -msse4.2 -D_FILE_OFFSET_BITS=64 -D_LARGE_FILE")
+  SET(COMMON_FLAGS "-std=gnu99 -Wall -Wno-missing-braces -fPIC -msse4.2 -D_FILE_OFFSET_BITS=64 -D_LARGE_FILE")
  IF (TD_MEMORY_SANITIZER)
    SET(DEBUG_FLAGS "-fsanitize=address -fsanitize=undefined -fno-sanitize-recover=all -fsanitize=float-divide-by-zero -fsanitize=float-cast-overflow -fno-sanitize=null -fno-sanitize=alignment -O0 -g3 -DDEBUG")
  ELSE ()
--- a/deps/CMakeLists.txt
+++ b/deps/CMakeLists.txt
@ -10,6 +10,8 @@ ADD_SUBDIRECTORY(cJson)
 ADD_SUBDIRECTORY(wepoll)
 ADD_SUBDIRECTORY(MsvcLibX)
 ADD_SUBDIRECTORY(rmonotonic)
+ADD_SUBDIRECTORY(SZ)
+

 IF (TD_LINUX AND TD_MQTT)
  ADD_SUBDIRECTORY(MQTT-C)
--- a/deps/SZ/.dockerignore
+++ b/deps/SZ/.dockerignore
@ -0,0 +1 @@
+build.*
--- a/deps/SZ/.gitignore
+++ b/deps/SZ/.gitignore
@ -0,0 +1,6 @@
+build
+compile_commands.json
+tags
+CMakeCache.txt
+cmake-build-debug/
+CMakeFiles/
--- a/deps/SZ/.travis.yml
+++ b/deps/SZ/.travis.yml
@ -0,0 +1,45 @@
+sudo: false
+
+language: c
+
+before_install:
+- cd test/travis-ci && ./getData.sh && cd -
+
+matrix:
+  include:
+    - dist: xenial
+      os: linux
+      addons:
+        apt:
+          sources:
+            - ubuntu-toolchain-r-test     # For gcc 4.9, 5 and 7
+          packages:
+            - gcc-7
+            - gfortran-7
+            - zstd
+            - libzstd1-dev
+            - exuberant-ctags
+            - libcunit1-dev 
+            - libnetcdf-dev 
+    - osx_image: xcode11
+      os: osx
+      env: PATH=/usr/local/bin:$PATH
+install:
+    - mkdir build
+    - cd build
+    - |
+      if [[ "${TRAVIS_OS_NAME}" != "linux" ]]; then
+        brew install ctags
+        brew install cunit
+        brew upgrade pkg-config
+      fi
+    - cmake -DCMAKE_INSTALL_PREFIX=$HOME -DBUILD_TESTS=ON -DBUILD_INTEGRATION_TESTS=ON ..
+    - make 
+    - make install
+    - make test
+
+script:
+- cd ..
+- ./configure && make
+- cd example && ./test.sh && cd -
+- cd test/travis-ci && ./test.sh && cd -
--- a/deps/SZ/CMakeLists.txt
+++ b/deps/SZ/CMakeLists.txt
@ -0,0 +1,23 @@
+cmake_minimum_required(VERSION 2.8)
+project(TDengine)
+
+# Header search paths for the bundled SZ, zlib and zstd sources
+# (directory-scoped, added in this order).
+include_directories(sz/include zlib/ zstd/)
+
+# Collect every source file found in each bundled component directory.
+aux_source_directory(sz/src           SZ_CORE_SRCS)
+aux_source_directory(zlib/            SZ_ZLIB_SRCS)
+aux_source_directory(zstd/common      SZ_ZSTD_COMMON_SRCS)
+aux_source_directory(zstd/compress    SZ_ZSTD_COMPRESS_SRCS)
+aux_source_directory(zstd/decompress  SZ_ZSTD_DECOMPRESS_SRCS)
+aux_source_directory(zstd/deprecated  SZ_ZSTD_DEPRECATED_SRCS)
+aux_source_directory(zstd/legacy      SZ_ZSTD_LEGACY_SRCS)
+aux_source_directory(zstd/dictBuilder SZ_ZSTD_DICTBUILDER_SRCS)
+
+# Single static archive built from all of the collected sources.
+add_library(SZ STATIC
+  ${SZ_CORE_SRCS}
+  ${SZ_ZLIB_SRCS}
+  ${SZ_ZSTD_COMMON_SRCS}
+  ${SZ_ZSTD_COMPRESS_SRCS}
+  ${SZ_ZSTD_DECOMPRESS_SRCS}
+  ${SZ_ZSTD_DEPRECATED_SRCS}
+  ${SZ_ZSTD_LEGACY_SRCS}
+  ${SZ_ZSTD_DICTBUILDER_SRCS}
+)
+
+
+
--- a/deps/SZ/sz/CMakeLists.txt
+++ b/deps/SZ/sz/CMakeLists.txt
@ -0,0 +1,91 @@
+add_library (SZ
+  src/ArithmeticCoding.c
+  src/ByteToolkit.c
+  src/CacheTable.c
+  src/callZlib.c
+  src/CompressElement.c
+  src/conf.c
+  src/dataCompression.c
+  src/dictionary.c
+  src/DynamicByteArray.c
+  src/DynamicDoubleArray.c
+  src/DynamicFloatArray.c
+  src/DynamicIntArray.c
+  src/Huffman.c
+  src/iniparser.c
+  src/MultiLevelCacheTable.c
+  src/MultiLevelCacheTableWideInterval.c
+  src/pastri.c
+  src/exafelSZ.c
+  src/rw.c
+  src/rwf.c
+  src/sz.c
+  src/szd_double.c
+  src/szd_double_pwr.c
+  src/szd_double_ts.c
+  src/szd_float.c
+  src/szd_float_pwr.c
+  src/szd_float_ts.c
+  src/szd_int16.c
+  src/szd_int32.c
+  src/szd_int64.c
+  src/szd_int8.c
+  src/sz_double.c
+  src/sz_double_pwr.c
+  src/sz_double_ts.c
+  src/szd_uint16.c
+  src/szd_uint32.c
+  src/szd_uint64.c
+  src/szd_uint8.c
+  src/szf.c
+  src/sz_float.c
+  src/sz_float_pwr.c
+  src/sz_float_ts.c
+  src/sz_int16.c
+  src/sz_int32.c
+  src/sz_int64.c
+  src/sz_int8.c
+  src/sz_omp.c
+  src/sz_uint16.c
+  src/sz_uint32.c
+  src/sz_uint64.c
+  src/sz_uint8.c
+  src/TightDataPointStorageD.c
+  src/TightDataPointStorageF.c
+  src/TightDataPointStorageI.c
+  src/TypeManager.c
+  src/utility.c
+  src/VarSet.c
+  src/sz_stats.c
+)
+
+target_include_directories(SZ 
+  PUBLIC 
+  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+  $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/sz>
+  )
+
+
+# Extra warnings for Debug builds only. The generator expression is quoted and
+# its options are ';'-separated so it is passed to CMake as one argument
+# instead of being split at the spaces.
+target_compile_options(SZ
+	PRIVATE "$<$<CONFIG:Debug>:-Wall;-Wextra;-Wpedantic;-Wno-unused-parameter>"
+	)
+
+if(BUILD_PASTRI)
+  target_compile_definitions(SZ PUBLIC HAVE_PASTRI)
+endif()
+if(BUILD_TIMECMPR)
+  target_compile_definitions(SZ PUBLIC HAVE_TIMECMPR)
+endif()
+if(BUILD_RANDOMACCESS)
+  target_compile_definitions(SZ PUBLIC HAVE_RANDOMACCESS)
+endif()
+if(BUILD_FORTRAN)
+  enable_language(Fortran)
+  target_sources(SZ PRIVATE
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/rw_interface.F90
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/sz_interface.F90
+  )
+endif()
+if(BUILD_STATS)
+  target_compile_definitions(SZ PUBLIC HAVE_WRITESTATS)
+endif()
--- a/deps/SZ/sz/Makefile.am
+++ b/deps/SZ/sz/Makefile.am
@ -0,0 +1,93 @@
+#AM_CFLAGS = -I./include -I../zlib
+#LDFLAGS=-fPIC -shared
+
+AUTOMAKE_OPTIONS=foreign
+if FORTRAN
+include_HEADERS=include/MultiLevelCacheTable.h include/MultiLevelCacheTableWideInterval.h include/CacheTable.h include/defines.h\
+		include/CompressElement.h include/DynamicDoubleArray.h include/rw.h include/conf.h include/dataCompression.h\
+		include/dictionary.h include/DynamicFloatArray.h include/VarSet.h include/sz.h include/Huffman.h include/ByteToolkit.h include/szf.h\
+		include/sz_float.h include/sz_double.h include/callZlib.h include/iniparser.h include/TypeManager.h\
+		include/sz_int8.h include/sz_int16.h include/sz_int32.h include/sz_int64.h include/szd_int8.h include/szd_int16.h include/szd_int32.h include/szd_int64.h\
+		include/sz_uint8.h include/sz_uint16.h include/sz_uint32.h include/sz_uint64.h include/szd_uint8.h include/szd_uint16.h include/szd_uint32.h include/szd_uint64.h\
+		include/sz_float_pwr.h include/sz_double_pwr.h include/szd_float.h include/szd_double.h include/szd_float_pwr.h include/szd_double_pwr.h\
+		include/sz_float_ts.h include/szd_float_ts.h include/sz_double_ts.h include/szd_double_ts.h include/utility.h include/sz_opencl.h\
+		include/DynamicByteArray.h include/DynamicIntArray.h include/TightDataPointStorageI.h include/TightDataPointStorageD.h include/TightDataPointStorageF.h\
+		include/pastriD.h include/pastriF.h include/pastriGeneral.h include/pastri.h include/exafelSZ.h include/ArithmeticCoding.h include/sz_omp.h include/sz_stats.h sz.mod rw.mod
+lib_LTLIBRARIES=libSZ.la
+libSZ_la_CFLAGS=-I./include -I../zlib/ -I../zstd/
+if TIMECMPR
+libSZ_la_CFLAGS+=-DHAVE_TIMECMPR
+endif
+if RANDOMACCESS
+libSZ_la_CFLAGS+=-DHAVE_RANDOMACCESS
+endif
+if OPENMP
+libSZ_la_CFLAGS+=-fopenmp
+endif
+libSZ_la_LDFLAGS = -version-info  2:1:0
+# NOTE(review): "LIDADD" looks like a typo for "LIBADD" -- confirm whether these archives are meant to be linked into libSZ.
+libSZ_la_LIDADD=../zlib/.libs/libzlib.a ../zstd/.libs/libzstd.a
+libSZ_la_SOURCES=src/MultiLevelCacheTable.c src/MultiLevelCacheTableWideInterval.c \
+		src/ByteToolkit.c src/dataCompression.c src/DynamicIntArray.c src/iniparser.c src/szf.c \
+		src/CompressElement.c src/DynamicByteArray.c src/rw.c src/utility.c\
+		src/TightDataPointStorageI.c src/TightDataPointStorageD.c src/TightDataPointStorageF.c \
+		src/conf.c src/DynamicDoubleArray.c src/rwf.c src/TypeManager.c \
+		src/dictionary.c src/DynamicFloatArray.c src/VarSet.c src/callZlib.c src/Huffman.c \
+		src/sz_float.c src/sz_double.c src/sz_int8.c src/sz_int16.c src/sz_int32.c src/sz_int64.c\
+		src/sz_uint8.c src/sz_uint16.c src/sz_uint32.c src/sz_uint64.c src/szd_uint8.c src/szd_uint16.c src/szd_uint32.c src/szd_uint64.c\
+		src/szd_float.c src/szd_double.c src/szd_int8.c src/szd_int16.c src/szd_int32.c src/szd_int64.c src/sz.c\
+		src/sz_float_pwr.c src/sz_double_pwr.c src/szd_float_pwr.c src/szd_double_pwr.c src/ArithmeticCoding.c src/CacheTable.c\
+		src/sz_interface.F90 src/rw_interface.F90 src/exafelSZ.c
+libSZ_la_LINK=$(AM_V_CC)$(LIBTOOL) --tag=FC --mode=link $(FCLD) $(libSZ_la_CFLAGS) -O3 $(libSZ_la_LDFLAGS) -o $(lib_LTLIBRARIES)
+else
+include_HEADERS=include/MultiLevelCacheTable.h include/MultiLevelCacheTableWideInterval.h include/CacheTable.h include/defines.h\
+		include/CompressElement.h include/DynamicDoubleArray.h include/rw.h include/conf.h include/dataCompression.h\
+		include/dictionary.h include/DynamicFloatArray.h include/VarSet.h include/sz.h include/Huffman.h include/ByteToolkit.h\
+		include/sz_float.h include/sz_double.h include/callZlib.h include/iniparser.h include/TypeManager.h\
+		include/sz_int8.h include/sz_int16.h include/sz_int32.h include/sz_int64.h include/szd_int8.h include/szd_int16.h include/szd_int32.h include/szd_int64.h\
+		include/sz_uint8.h include/sz_uint16.h include/sz_uint32.h include/sz_uint64.h include/szd_uint8.h include/szd_uint16.h include/szd_uint32.h include/szd_uint64.h\
+		include/sz_float_pwr.h include/sz_double_pwr.h include/szd_float.h include/szd_double.h include/szd_float_pwr.h include/szd_double_pwr.h\
+		include/sz_float_ts.h include/szd_float_ts.h include/sz_double_ts.h include/szd_double_ts.h include/utility.h include/sz_opencl.h\
+		include/DynamicByteArray.h include/DynamicIntArray.h include/TightDataPointStorageI.h include/TightDataPointStorageD.h include/TightDataPointStorageF.h\
+		include/pastriD.h include/pastriF.h include/pastriGeneral.h include/pastri.h include/exafelSZ.h include/ArithmeticCoding.h include/sz_omp.h include/sz_stats.h
+
+lib_LTLIBRARIES=libSZ.la
+libSZ_la_CFLAGS=-I./include -I../zlib -I../zstd/ 
+if WRITESTATS
+libSZ_la_CFLAGS+=-DHAVE_WRITESTATS
+endif
+if TIMECMPR
+libSZ_la_CFLAGS+=-DHAVE_TIMECMPR
+endif
+if RANDOMACCESS
+libSZ_la_CFLAGS+=-DHAVE_RANDOMACCESS
+endif
+if OPENMP
+libSZ_la_CFLAGS+=-fopenmp
+endif
+libSZ_la_LDFLAGS = -version-info  1:4:0
+# libzstd.a is taken from ../zstd/.libs, the same path the FORTRAN branch of this file uses.
+# NOTE(review): "LIDADD" looks like a typo for "LIBADD" -- confirm whether these archives are meant to be linked into libSZ.
+libSZ_la_LIDADD=../zlib/.libs/libzlib.a ../zstd/.libs/libzstd.a
+libSZ_la_SOURCES=src/MultiLevelCacheTable.c src/MultiLevelCacheTableWideInterval.c \
+		src/ByteToolkit.c src/dataCompression.c src/DynamicIntArray.c src/iniparser.c\
+		src/CompressElement.c src/DynamicByteArray.c src/rw.c src/utility.c\
+		src/TightDataPointStorageI.c src/TightDataPointStorageD.c src/TightDataPointStorageF.c \
+		src/conf.c src/DynamicDoubleArray.c src/TypeManager.c \
+		src/dictionary.c src/DynamicFloatArray.c src/VarSet.c src/callZlib.c src/Huffman.c \
+		src/sz_float.c src/sz_double.c src/sz_int8.c src/sz_int16.c src/sz_int32.c src/sz_int64.c\
+		src/sz_uint8.c src/sz_uint16.c src/sz_uint32.c src/sz_uint64.c src/szd_uint8.c src/szd_uint16.c src/szd_uint32.c src/szd_uint64.c\
+		src/szd_float.c src/szd_double.c src/szd_int8.c src/szd_int16.c src/szd_int32.c src/szd_int64.c src/sz.c\
+		src/sz_float_pwr.c src/sz_double_pwr.c src/szd_float_pwr.c src/szd_double_pwr.c src/ArithmeticCoding.c src/exafelSZ.c src/CacheTable.c
+if PASTRI
+libSZ_la_SOURCES+=src/pastri.c
+endif
+if OPENMP
+libSZ_la_SOURCES+=src/sz_omp.c
+endif
+if TIMECMPR
+libSZ_la_SOURCES+=src/sz_float_ts.c src/szd_float_ts.c src/sz_double_ts.c src/szd_double_ts.c
+endif
+if WRITESTATS
+libSZ_la_SOURCES+=src/sz_stats.c
+endif
+
+libSZ_la_LINK= $(AM_V_CC)$(LIBTOOL) --tag=CC --mode=link $(CCLD) $(libSZ_la_CFLAGS) -O3 $(libSZ_la_LDFLAGS) -o $(lib_LTLIBRARIES)
+endif
--- a/deps/SZ/sz/Makefile.in
+++ b/deps/SZ/sz/Makefile.in
--- a/deps/SZ/sz/include/ArithmeticCoding.h
+++ b/deps/SZ/sz/include/ArithmeticCoding.h
@ -0,0 +1,62 @@
+/**
+ *  @file ArithmeticCoding.h
+ *  @author Sheng Di
+ *  @date Dec, 2018
+ *  @brief Header file for the ArithmeticCoding.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _ArithmeticCoding_H
+#define _ArithmeticCoding_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h>
+
+#define ONE_FOURTH (0x40000000000) //44 bits are absolutely enough to deal with a large dataset (support at most 16TB per process)
+#define ONE_HALF (0x80000000000)
+#define THREE_FOURTHS (0xC0000000000)
+#define MAX_CODE (0xFFFFFFFFFFF)
+#define MAX_INTERVALS 1048576 //the limit to the arithmetic coding (at most 2^(20) intervals)
+
+typedef struct Prob {
+    size_t low;
+    size_t high;
+    int state;
+} Prob;
+
+typedef struct AriCoder
+{
+	int numOfRealStates; //the # real states means the number of states after the optimization of # intervals
+	int numOfValidStates; //the # valid states means the number of non-zero frequency cells (some states/codes actually didn't appear)
+	size_t total_frequency;	
+	Prob* cumulative_frequency; //used to encode data more efficiently
+} AriCoder;
+
+void output_bit_1(unsigned int* buf);
+void output_bit_0(unsigned int* buf);
+unsigned int output_bit_1_plus_pending(int pending_bits);
+unsigned int output_bit_0_plus_pending(int pending_bits);
+
+AriCoder *createAriCoder(int numOfStates, int *s, size_t length);
+void freeAriCoder(AriCoder *ariCoder);
+void ari_init(AriCoder *ariCoder, int *s, size_t length);
+unsigned int pad_ariCoder(AriCoder* ariCoder, unsigned char** out);
+int unpad_ariCoder(AriCoder** ariCoder, unsigned char* bytes);
+
+unsigned char get_bit(unsigned char* p, int offset);
+
+void ari_encode(AriCoder *ariCoder, int *s, size_t length, unsigned char *out, size_t *outSize);
+void ari_decode(AriCoder *ariCoder, unsigned char *s, size_t s_len, size_t targetLength, int *out);
+
+Prob* getCode(AriCoder *ariCoder, size_t scaled_value);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _ArithmeticCoding_H  ----- */
+
--- a/deps/SZ/sz/include/ByteToolkit.h
+++ b/deps/SZ/sz/include/ByteToolkit.h
@ -0,0 +1,81 @@
+/**
+ *  @file ByteToolkit.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the ByteToolkit.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _ByteToolkit_H
+#define _ByteToolkit_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h>
+#include <stdint.h> /* uint16_t / uint32_t / uint64_t / int64_t used by the prototypes below */
+
+//ByteToolkit.c
+
+unsigned short bytesToUInt16_bigEndian(unsigned char* bytes);
+unsigned int bytesToUInt32_bigEndian(unsigned char* bytes);
+unsigned long bytesToUInt64_bigEndian(unsigned char* b);
+
+short bytesToInt16_bigEndian(unsigned char* bytes);
+int bytesToInt32_bigEndian(unsigned char* bytes);
+long bytesToInt64_bigEndian(unsigned char* b);
+int bytesToInt_bigEndian(unsigned char* bytes);
+
+void intToBytes_bigEndian(unsigned char *b, unsigned int num);
+
+void int64ToBytes_bigEndian(unsigned char *b, uint64_t num);
+void int32ToBytes_bigEndian(unsigned char *b, uint32_t num);
+void int16ToBytes_bigEndian(unsigned char *b, uint16_t num);
+
+long bytesToLong_bigEndian(unsigned char* b);
+void longToBytes_bigEndian(unsigned char *b, unsigned long num);
+long doubleToOSEndianLong(double value);
+int floatToOSEndianInt(float value);
+short getExponent_float(float value);
+short getPrecisionReqLength_float(float precision);
+short getExponent_double(double value);
+short getPrecisionReqLength_double(double precision);
+unsigned char numberOfLeadingZeros_Int(int i);
+unsigned char numberOfLeadingZeros_Long(long i);
+unsigned char getLeadingNumbers_Int(int v1, int v2);
+unsigned char getLeadingNumbers_Long(long v1, long v2);
+short bytesToShort(unsigned char* bytes);
+void shortToBytes(unsigned char* b, short value);
+int bytesToInt(unsigned char* bytes);
+long bytesToLong(unsigned char* bytes);
+float bytesToFloat(unsigned char* bytes);
+void floatToBytes(unsigned char *b, float num);
+double bytesToDouble(unsigned char* bytes);
+void doubleToBytes(unsigned char *b, double num);
+int extractBytes(unsigned char* byteArray, size_t k, int validLength);
+int getMaskRightCode(int m);
+int getLeftMovingCode(int kMod8);
+int getRightMovingSteps(int kMod8, int resiBitLength);
+int getRightMovingCode(int kMod8, int resiBitLength);
+short* convertByteDataToShortArray(unsigned char* bytes, size_t byteLength);
+unsigned short* convertByteDataToUShortArray(unsigned char* bytes, size_t byteLength);
+
+void convertShortArrayToBytes(short* states, size_t stateLength, unsigned char* bytes);
+void convertUShortArrayToBytes(unsigned short* states, size_t stateLength, unsigned char* bytes);
+void convertIntArrayToBytes(int* states, size_t stateLength, unsigned char* bytes);
+void convertUIntArrayToBytes(unsigned int* states, size_t stateLength, unsigned char* bytes);
+void convertLongArrayToBytes(int64_t* states, size_t stateLength, unsigned char* bytes);
+void convertULongArrayToBytes(uint64_t* states, size_t stateLength, unsigned char* bytes);
+
+size_t bytesToSize(unsigned char* bytes);
+void sizeToBytes(unsigned char* outBytes, size_t size);
+
+void put_codes_to_output(unsigned int buf, int bitSize, unsigned char** p, int* lackBits, size_t *outSize);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _ByteToolkit_H  ----- */
+
--- a/deps/SZ/sz/include/CacheTable.h
+++ b/deps/SZ/sz/include/CacheTable.h
@ -0,0 +1,40 @@
+/**
+ *  @file CacheTable.h
+ *  @author Xiangyu Zou, Tao Lu, Wen Xia, Xuan Wang, Weizhe Zhang, Sheng Di, Dingwen Tao
+ *  @date Jan, 2019
+ *  @brief Header file.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef SZ_MASTER_CACHETABLE_H
+#define SZ_MASTER_CACHETABLE_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "stdio.h"
+#include "stdint.h"
+#include <math.h>
+
+extern double* g_CacheTable;
+extern uint32_t * g_InverseTable;
+extern uint32_t baseIndex;
+extern uint32_t topIndex;
+extern int bits;
+
+int doubleGetExpo(double d);
+int CacheTableGetRequiredBits(double precision, int quantization_intervals);
+uint32_t CacheTableGetIndex(float value, int bits);
+uint64_t CacheTableGetIndexDouble(double value, int bits);
+int CacheTableIsInBoundary(uint32_t index);
+void CacheTableBuild(double * table, int count, double smallest, double largest, double precision, int quantization_intervals);
+uint32_t CacheTableFind(uint32_t index);
+void CacheTableFree(void); /* (void): a real prototype in C, so calls with arguments are rejected */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif //SZ_MASTER_CACHETABLE_H
--- a/deps/SZ/sz/include/CompressElement.h
+++ b/deps/SZ/sz/include/CompressElement.h
@ -0,0 +1,76 @@
+/**
+ *  @file CompressElement.h
+ *  @author Sheng Di
+ *  @date April, 2016
+ *  @brief Header file for Compress Elements such as DoubleCompressELement.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stddef.h> /* size_t, used by decompressGroupIDArray() */
+#include <stdint.h>
+
+#ifndef _CompressElement_H
+#define _CompressElement_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct DoubleValueCompressElement
+{
+	double data;
+	long curValue;
+	unsigned char curBytes[8]; //big_endian
+	int reqBytesLength;
+	int resiBitsLength;
+} DoubleValueCompressElement;
+
+typedef struct FloatValueCompressElement
+{
+	float data;
+	int curValue;
+	unsigned char curBytes[4]; //big_endian
+	int reqBytesLength;
+	int resiBitsLength;
+} FloatValueCompressElement;
+
+typedef struct LossyCompressionElement
+{
+	int leadingZeroBytes; //0,1,2,or 3
+	unsigned char integerMidBytes[8];
+	int integerMidBytes_Length; //they are mid_bits actually
+	//char curBytes[8];
+	//int curBytes_Length; //4 for single_precision or 8 for double_precision	
+	int resMidBitsLength;
+	int residualMidBits;
+} LossyCompressionElement;
+
+char* decompressGroupIDArray(unsigned char* bytes, size_t dataLength);
+
+short computeGroupNum_float(float value);
+short computeGroupNum_double(double value);
+
+void listAdd_double(double last3CmprsData[3], double value);
+void listAdd_float(float last3CmprsData[3], float value);
+void listAdd_int(int64_t last3CmprsData[3], int64_t value);
+void listAdd_int32(int32_t last3CmprsData[3], int32_t value);
+void listAdd_float_group(float *groups, int *flags, char groupNum, float oriValue, float decValue, char* curGroupID);
+void listAdd_double_group(double *groups, int *flags, char groupNum, double oriValue, double decValue, char* curGroupID);
+
+int validPrediction_double(double minErr, double precision);
+int validPrediction_float(float minErr, float precision);
+double* generateGroupErrBounds(int errorBoundMode, double realPrecision, double pwrErrBound);
+int generateGroupMaxIntervalCount(double* groupErrBounds);
+
+void new_LossyCompressionElement(LossyCompressionElement *lce, int leadingNum, unsigned char* intMidBytes, 
+		int intMidBytes_Length, int resiMidBitsLength, int resiBits);
+void updateLossyCompElement_Double(unsigned char* curBytes, unsigned char* preBytes, 
+		int reqBytesLength, int resiBitsLength,  LossyCompressionElement *lce);
+void updateLossyCompElement_Float(unsigned char* curBytes, unsigned char* preBytes, 
+		int reqBytesLength, int resiBitsLength,  LossyCompressionElement *lce);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _CompressElement_H  ----- */
--- a/deps/SZ/sz/include/DynamicByteArray.h
+++ b/deps/SZ/sz/include/DynamicByteArray.h
@ -0,0 +1,36 @@
+/**
+ *  @file DynamicByteArray.h
+ *  @author Sheng Di
+ *  @date April, 2016
+ *  @brief Header file for Dynamic Byte Array.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _DynamicByteArray_H
+#define _DynamicByteArray_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h>
+typedef struct DynamicByteArray
+{	
+	unsigned char* array;
+	size_t size;
+	size_t capacity;
+} DynamicByteArray;
+
+void new_DBA(DynamicByteArray **dba, size_t cap);
+void convertDBAtoBytes(DynamicByteArray *dba, unsigned char** bytes);
+void free_DBA(DynamicByteArray *dba);
+unsigned char getDBA_Data(DynamicByteArray *dba, size_t pos);
+void addDBA_Data(DynamicByteArray *dba, unsigned char value);
+void memcpyDBA_Data(DynamicByteArray *dba, unsigned char* data, size_t length);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _DynamicByteArray_H  ----- */
--- a/deps/SZ/sz/include/DynamicDoubleArray.h
+++ b/deps/SZ/sz/include/DynamicDoubleArray.h
@ -0,0 +1,36 @@
+/**
+ *  @file DynamicDoubleArray.h
+ *  @author Sheng Di
+ *  @date April, 2016
+ *  @brief Header file for Dynamic Double Array.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _DynamicDoubleArray_H
+#define _DynamicDoubleArray_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h>
+
+/* Dynamic array of doubles. `capacity` is a size_t, matching the `size_t cap`
+ * argument of new_DDA() and the capacity fields of the DynamicByteArray,
+ * DynamicFloatArray and DynamicIntArray structs. */
+typedef struct DynamicDoubleArray
+{	
+	double* array;
+	size_t size;
+	size_t capacity;
+} DynamicDoubleArray;
+
+void new_DDA(DynamicDoubleArray **dda, size_t cap);
+void convertDDAtoDoubles(DynamicDoubleArray *dba, double **data);
+void free_DDA(DynamicDoubleArray *dda);
+double getDDA_Data(DynamicDoubleArray *dda, size_t pos);
+void addDDA_Data(DynamicDoubleArray *dda, double value);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _DynamicDoubleArray_H  ----- */
--- a/deps/SZ/sz/include/DynamicFloatArray.h
+++ b/deps/SZ/sz/include/DynamicFloatArray.h
@ -0,0 +1,35 @@
+/**
+ *  @file DynamicFloatArray.h
+ *  @author Sheng Di
+ *  @date April, 2016
+ *  @brief Header file for Dynamic Float Array.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _DynamicFloatArray_H
+#define _DynamicFloatArray_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h>
+typedef struct DynamicFloatArray
+{	
+	float* array;
+	size_t size;
+	size_t capacity;
+} DynamicFloatArray;
+
+void new_DFA(DynamicFloatArray **dfa, size_t cap);
+void convertDFAtoFloats(DynamicFloatArray *dfa, float **data);
+void free_DFA(DynamicFloatArray *dfa);
+float getDFA_Data(DynamicFloatArray *dfa, size_t pos);
+void addDFA_Data(DynamicFloatArray *dfa, float value);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _DynamicFloatArray_H  ----- */
--- a/deps/SZ/sz/include/DynamicIntArray.h
+++ b/deps/SZ/sz/include/DynamicIntArray.h
@ -0,0 +1,35 @@
+/**
+ *  @file DynamicIntArray.h
+ *  @author Sheng Di
+ *  @date April, 2016
+ *  @brief Header file for Dynamic Int Array.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _DynamicIntArray_H
+#define _DynamicIntArray_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h>
+typedef struct DynamicIntArray
+{	
+	unsigned char* array; //char* (one byte) is enough, don't have to be int*
+	size_t size;
+	size_t capacity;
+} DynamicIntArray;
+
+void new_DIA(DynamicIntArray **dia, size_t cap);
+void convertDIAtoInts(DynamicIntArray *dia, unsigned char **data);
+void free_DIA(DynamicIntArray *dia);
+int getDIA_Data(DynamicIntArray *dia, size_t pos);
+void addDIA_Data(DynamicIntArray *dia, int value);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _DynamicIntArray_H  ----- */
--- a/deps/SZ/sz/include/Huffman.h
+++ b/deps/SZ/sz/include/Huffman.h
@ -0,0 +1,75 @@
+/**
+ *  @file Huffman.h
+ *  @author Sheng Di
+ *  @date Aug., 2016
+ *  @brief Header file for the exponential segment constructor.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _Huffman_H
+#define _Huffman_H
+
+#include <stddef.h> /* size_t, used by struct node_t and the prototypes below */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//Note: when changing the following settings, intvCapacity in sz.h should be changed as well.
+//#define allNodes 131072
+//#define stateNum 65536
+
+typedef struct node_t {
+	struct node_t *left, *right;
+	size_t freq;
+	char t; //in_node:0; otherwise:1
+	unsigned int c;
+} *node;
+
+typedef struct HuffmanTree {
+	unsigned int stateNum;
+	unsigned int allNodes;
+	struct node_t* pool;
+	node *qqq, *qq; //the root node of the HuffmanTree is qq[1]
+	int n_nodes; //n_nodes is for compression
+	int qend; 
+	unsigned long **code;
+	unsigned char *cout;
+	int n_inode; //n_inode is for decompression
+	int maxBitCount;
+} HuffmanTree;
+
+HuffmanTree* createHuffmanTree(int stateNum);
+HuffmanTree* createDefaultHuffmanTree(void); /* (void): a real prototype in C */
+
+node new_node(HuffmanTree *huffmanTree, size_t freq, unsigned int c, node a, node b);
+node new_node2(HuffmanTree *huffmanTree, unsigned int c, unsigned char t);
+void qinsert(HuffmanTree *huffmanTree, node n);
+node qremove(HuffmanTree *huffmanTree);
+void build_code(HuffmanTree *huffmanTree, node n, int len, unsigned long out1, unsigned long out2);
+void init(HuffmanTree *huffmanTree, int *s, size_t length);
+void init_static(HuffmanTree *huffmanTree, int *s, size_t length);
+void encode(HuffmanTree *huffmanTree, int *s, size_t length, unsigned char *out, size_t *outSize);
+
+void decode(unsigned char *s, size_t targetLength, node t, int *out);
+void decode_MSST19(unsigned char *s, size_t targetLength, node t, int *out, int maxBits);
+
+void pad_tree_uchar(HuffmanTree* huffmanTree, unsigned char* L, unsigned char* R, unsigned int* C, unsigned char* t, unsigned int i, node root);
+void pad_tree_ushort(HuffmanTree* huffmanTree, unsigned short* L, unsigned short* R, unsigned int* C, unsigned char* t, unsigned int i, node root);
+void pad_tree_uint(HuffmanTree* huffmanTree, unsigned int* L, unsigned int* R, unsigned int* C, unsigned char* t, unsigned int i, node root);
+unsigned int convert_HuffTree_to_bytes_anyStates(HuffmanTree* huffmanTree, int nodeCount, unsigned char** out);
+void unpad_tree_uchar(HuffmanTree* huffmanTree, unsigned char* L, unsigned char* R, unsigned int* C, unsigned char *t, unsigned int i, node root);
+void unpad_tree_ushort(HuffmanTree* huffmanTree, unsigned short* L, unsigned short* R, unsigned int* C, unsigned char* t, unsigned int i, node root);
+void unpad_tree_uint(HuffmanTree* huffmanTree, unsigned int* L, unsigned int* R, unsigned int* C, unsigned char* t, unsigned int i, node root);
+node reconstruct_HuffTree_from_bytes_anyStates(HuffmanTree *huffmanTree, unsigned char* bytes, int nodeCount);
+
+void encode_withTree(HuffmanTree* huffmanTree, int *s, size_t length, unsigned char **out, size_t *outSize);
+int encode_withTree_MSST19(HuffmanTree* huffmanTree, int *s, size_t length, unsigned char **out, size_t *outSize);
+void decode_withTree(HuffmanTree* huffmanTree, unsigned char *s, size_t targetLength, int *out);
+void decode_withTree_MSST19(HuffmanTree* huffmanTree, unsigned char *s, size_t targetLength, int *out, int maxBits);
+void SZ_ReleaseHuffman(HuffmanTree* huffmanTree);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/deps/SZ/sz/include/MultiLevelCacheTable.h
+++ b/deps/SZ/sz/include/MultiLevelCacheTable.h
@ -0,0 +1,50 @@
+/**
+ *  @file MultiLevelCacheTable.h
+ *  @author Xiangyu Zou, Tao Lu, Wen Xia, Xuan Wang, Weizhe Zhang, Sheng Di, Dingwen Tao
+ *  @date Jan, 2019
+ *  @brief Header file for the multi-level cache table (float version).
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _MULTILEVELCACHETABLE_H
+#define _MULTILEVELCACHETABLE_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdint.h>
+#include <memory.h>
+#include <stdlib.h>
+#include "stdio.h"
+
+typedef struct SubLevelTable{ // second-level table; TopLevelTable.subTables is a pointer to this type
+    uint32_t baseIndex; // presumably the first mantissa index covered (cf. MLCT_GetMantiIndex) -- TODO confirm
+    uint32_t topIndex;  // presumably the last mantissa index covered -- TODO confirm
+    uint32_t* table;
+    uint8_t expoIndex;  // same type as the return value of MLCT_GetExpoIndex
+} SubLevelTable;
+
+typedef struct TopLevelTable{ // handle passed to MultiLevelCacheTableBuild / GetIndex / Free
+    uint8_t bits; // same type as the return value of MLCT_GetRequiredBits
+    uint8_t baseIndex;
+    uint8_t topIndex;
+    struct SubLevelTable* subTables;
+    float bottomBoundary;
+    float topBoundary;
+} TopLevelTable;
+
+uint8_t MLCT_GetExpoIndex(float value);
+uint8_t MLCT_GetRequiredBits(float precision);
+uint32_t MLCT_GetMantiIndex(float value, int bits);
+float MLTC_RebuildFloat(uint8_t expo, uint32_t manti, int bits); // NOTE(review): prefix is spelled MLTC_ here but MLCT_ on the three functions above
+void MultiLevelCacheTableBuild(struct TopLevelTable* topTable, float* precisionTable, int count, float precision);
+uint32_t MultiLevelCacheTableGetIndex(float value, struct TopLevelTable* topLevelTable);
+void MultiLevelCacheTableFree(struct TopLevelTable* table);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif //_MULTILEVELCACHETABLE_H
--- a/deps/SZ/sz/include/MultiLevelCacheTableWideInterval.h
+++ b/deps/SZ/sz/include/MultiLevelCacheTableWideInterval.h
@ -0,0 +1,54 @@
+/**
+ *  @file MultiLevelCacheTableWideInterval.h
+ *  @author Xiangyu Zou, Tao Lu, Wen Xia, Xuan Wang, Weizhe Zhang, Sheng Di, Dingwen Tao
+ *  @date Jan, 2019
+ *  @brief Header file for MultiLevelCacheTableWideInterval.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+
+#ifndef _MULTILEVELCACHETABLEWIDEINTERVAL_H
+#define _MULTILEVELCACHETABLEWIDEINTERVAL_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdint.h>
+#include <memory.h>
+#include <stdlib.h>
+#include "stdio.h"
+
+typedef struct SubLevelTableWideInterval{ // double-precision counterpart of SubLevelTable, with wider index fields
+    uint64_t baseIndex; // same type as the return value of MLCTWI_GetMantiIndex
+    uint64_t topIndex;
+    uint16_t* table;    // NOTE(review): 16-bit entries, whereas SubLevelTable.table holds uint32_t
+    uint16_t expoIndex; // same type as the return value of MLCTWI_GetExpoIndex
+} SubLevelTableWideInterval;
+
+typedef struct TopLevelTableWideInterval{ // handle passed to MultiLevelCacheTableWideIntervalBuild / GetIndex / Free
+    uint16_t bits; // same type as the return value of MLCTWI_GetRequiredBits
+    uint16_t baseIndex;
+    uint16_t topIndex;
+    struct SubLevelTableWideInterval* subTables;
+    double bottomBoundary;
+    double topBoundary;
+} TopLevelTableWideInterval;
+
+void freeTopLevelTableWideInterval(struct TopLevelTableWideInterval* topTable); // NOTE(review): a second release function, MultiLevelCacheTableWideIntervalFree, is declared below -- confirm which one callers should use
+
+uint16_t MLCTWI_GetExpoIndex(double value);
+uint16_t MLCTWI_GetRequiredBits(double precision);
+uint64_t MLCTWI_GetMantiIndex(double value, int bits);
+// NOTE(review): the next function is spelled MLTCWI_ while its siblings above use MLCTWI_.
+double MLTCWI_RebuildDouble(uint16_t expo, uint64_t manti, int bits);
+void MultiLevelCacheTableWideIntervalBuild(struct TopLevelTableWideInterval* topTable, double* precisionTable, int count, double precision, int plus_bits);
+uint32_t MultiLevelCacheTableWideIntervalGetIndex(double value, struct TopLevelTableWideInterval* topLevelTable);
+void MultiLevelCacheTableWideIntervalFree(struct TopLevelTableWideInterval* table);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif //_MULTILEVELCACHETABLEWIDEINTERVAL_H
--- a/deps/SZ/sz/include/TightDataPointStorageD.h
+++ b/deps/SZ/sz/include/TightDataPointStorageD.h
@ -0,0 +1,99 @@
+/**
+ *  @file TightDataPointStorageD.h
+ *  @author Sheng Di
+ *  @date April, 2016
+ *  @brief Header file for the tight data point storage (TDPS).
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _TightDataPointStorageD_H
+#define _TightDataPointStorageD_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct TightDataPointStorageD
+{
+	size_t dataSeriesLength;
+	int allSameData;
+	double realPrecision;
+	double medianValue;
+	char reqLength;	
+	char radExpo; //used to compute reqLength based on segmented precisions in "pw_rel_compression"
+
+	double minLogValue;
+
+	int stateNum;
+	int allNodes;
+
+	size_t exactDataNum;
+	double reservedValue;
+	
+	unsigned char* rtypeArray;
+	size_t rtypeArray_size;
+	
+	unsigned char* typeArray; //its size is dataSeriesLength/4 (or xxx/4+1) 
+	size_t typeArray_size;
+	
+	unsigned char* leadNumArray; //its size is exactDataNum/4 (or exactDataNum/4+1)
+	size_t leadNumArray_size;
+	
+	unsigned char* exactMidBytes;
+	size_t exactMidBytes_size;
+	
+	unsigned char* residualMidBits;
+	size_t residualMidBits_size;
+	
+	unsigned int intervals;
+	
+	unsigned char isLossless; //a mark to denote whether it's lossless compression (1 is yes, 0 is no)
+	
+	size_t segment_size;
+	
+	unsigned char* pwrErrBoundBytes;
+	int pwrErrBoundBytes_size;
+		
+	unsigned char* raBytes;
+	size_t raBytes_size;
+	
+	unsigned char plus_bits;
+	unsigned char max_bits;
+	
+} TightDataPointStorageD;
+
+void new_TightDataPointStorageD_Empty(TightDataPointStorageD **self);
+int new_TightDataPointStorageD_fromFlatBytes(TightDataPointStorageD **self, unsigned char* flatBytes, size_t flatBytesLength);
+// The two constructors below differ only in resiBitLength: a single unsigned char here, an array plus resiBitLengthSize in the ...D2 variant.
+void new_TightDataPointStorageD(TightDataPointStorageD **self, 
+		size_t dataSeriesLength, size_t exactDataNum, 
+		int* type, unsigned char* exactMidBytes, size_t exactMidBytes_size,
+		unsigned char* leadNumIntArray,  //leadNumIntArray contains readable numbers....
+		unsigned char* resiMidBits, size_t resiMidBits_size,
+		unsigned char resiBitLength, 
+		double realPrecision, double medianValue, char reqLength, unsigned int intervals, 
+		unsigned char* pwrErrBoundBytes, size_t pwrErrBoundBytes_size, unsigned char radExpo);
+
+void new_TightDataPointStorageD2(TightDataPointStorageD **self, 
+		size_t dataSeriesLength, size_t exactDataNum, 
+		int* type, unsigned char* exactMidBytes, size_t exactMidBytes_size,
+		unsigned char* leadNumIntArray,  //leadNumIntArray contains readable numbers....
+		unsigned char* resiMidBits, size_t resiMidBits_size,
+		unsigned char* resiBitLength, size_t resiBitLengthSize,
+		double realPrecision, double medianValue, char reqLength, unsigned int intervals,
+		unsigned char* pwrErrBoundBytes, size_t pwrErrBoundBytes_size, unsigned char radExpo);
+// Serialisers: convertTDPStoFlatBytes_double takes unsigned char** bytes, the _args variant takes unsigned char* bytes (presumably a caller-provided buffer -- TODO confirm).
+void convertTDPStoBytes_double(TightDataPointStorageD* tdps, unsigned char* bytes, unsigned char* dsLengthBytes, unsigned char sameByte);
+void convertTDPStoBytes_double_reserve(TightDataPointStorageD* tdps, unsigned char* bytes, unsigned char* dsLengthBytes, unsigned char sameByte);
+void convertTDPStoFlatBytes_double(TightDataPointStorageD *tdps, unsigned char** bytes, size_t *size);
+void convertTDPStoFlatBytes_double_args(TightDataPointStorageD *tdps, unsigned char* bytes, size_t *size);
+// NOTE(review): two release functions are declared; the difference is not visible from this header.
+void free_TightDataPointStorageD(TightDataPointStorageD *tdps);
+void free_TightDataPointStorageD2(TightDataPointStorageD *tdps);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _TightDataPointStorageD_H  ----- */
--- a/deps/SZ/sz/include/TightDataPointStorageF.h
+++ b/deps/SZ/sz/include/TightDataPointStorageF.h
@ -0,0 +1,105 @@
+/**
+ *  @file TightDataPointStorageF.h
+ *  @author Sheng Di and Dingwen Tao
+ *  @date Aug, 2016
+ *  @brief Header file for the tight data point storage (TDPS).
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _TightDataPointStorageF_H
+#define _TightDataPointStorageF_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h> 
+
+typedef struct TightDataPointStorageF // tight data point storage (TDPS) for single-precision (float) data
+{
+	size_t dataSeriesLength;
+	int allSameData;
+	double realPrecision; //it's used as the pwrErrBoundRatio when errBoundMode==PW_REL (kept as double although the data fields are float)
+	float medianValue;
+	char reqLength;
+	char radExpo; //used to compute reqLength based on segmented precisions in "pw_rel_compression"; NOTE(review): the constructors below take it as unsigned char
+	
+	int stateNum;
+	int allNodes;
+	
+	size_t exactDataNum;
+	float reservedValue;
+	
+	unsigned char* rtypeArray;
+	size_t rtypeArray_size;
+	
+	float minLogValue;
+
+	unsigned char* typeArray; //its size is dataSeriesLength/4 (or xxx/4+1) 
+	size_t typeArray_size;
+	
+	unsigned char* leadNumArray; //its size is exactDataNum/4 (or exactDataNum/4+1)
+	size_t leadNumArray_size;
+	
+	unsigned char* exactMidBytes;
+	size_t exactMidBytes_size;
+	
+	unsigned char* residualMidBits;
+	size_t residualMidBits_size;
+	
+	unsigned int intervals; //quantization_intervals
+	
+	unsigned char isLossless; //a mark to denote whether it's lossless compression (1 is yes, 0 is no)
+	
+	size_t segment_size;
+	
+	unsigned char* pwrErrBoundBytes;
+	int pwrErrBoundBytes_size; //NOTE(review): int here, but the constructors take size_t pwrErrBoundBytes_size
+	
+	unsigned char* raBytes;
+	size_t raBytes_size;
+
+	unsigned char plus_bits;
+	unsigned char max_bits;
+	
+} TightDataPointStorageF;
+
+void new_TightDataPointStorageF_Empty(TightDataPointStorageF **self);
+int new_TightDataPointStorageF_fromFlatBytes(TightDataPointStorageF **self, unsigned char* flatBytes, size_t flatBytesLength);
+// Takes a single resiBitLength byte; the ...F2 constructor further down takes an array plus resiBitLengthSize instead.
+void new_TightDataPointStorageF(TightDataPointStorageF **self,
+		size_t dataSeriesLength, size_t exactDataNum,
+		int* type, unsigned char* exactMidBytes, size_t exactMidBytes_size,
+		unsigned char* leadNumIntArray,  //leadNumIntArray contains readable numbers....
+		unsigned char* resiMidBits, size_t resiMidBits_size,
+		unsigned char resiBitLength,
+		double realPrecision, float medianValue, char reqLength, unsigned int intervals, 
+		unsigned char* pwrErrBoundBytes, size_t pwrErrBoundBytes_size, unsigned char radExpo);
+
+/**
+ * This function is designed for the first version of the point-wise relative error bound (developed by Sheng Di for the TPDS18 paper)
+ * 
+ * */
+void new_TightDataPointStorageF2(TightDataPointStorageF **self,
+		size_t dataSeriesLength, size_t exactDataNum, 
+		int* type, unsigned char* exactMidBytes, size_t exactMidBytes_size,
+		unsigned char* leadNumIntArray,  //leadNumIntArray contains readable numbers....
+		unsigned char* resiMidBits, size_t resiMidBits_size,
+		unsigned char* resiBitLength, size_t resiBitLengthSize, 
+		double realPrecision, float medianValue, char reqLength, unsigned int intervals, 
+		unsigned char* pwrErrBoundBytes, size_t pwrErrBoundBytes_size, unsigned char radExpo);
+// Serialisers: convertTDPStoFlatBytes_float takes unsigned char** bytes, the _args variant takes unsigned char* bytes (presumably a caller-provided buffer -- TODO confirm).
+void convertTDPStoBytes_float(TightDataPointStorageF* tdps, unsigned char* bytes, unsigned char* dsLengthBytes, unsigned char sameByte);
+void convertTDPStoBytes_float_reserve(TightDataPointStorageF* tdps, unsigned char* bytes, unsigned char* dsLengthBytes, unsigned char sameByte);
+void convertTDPStoFlatBytes_float(TightDataPointStorageF *tdps, unsigned char** bytes, size_t *size);
+void convertTDPStoFlatBytes_float_args(TightDataPointStorageF *tdps, unsigned char* bytes, size_t *size);
+// NOTE(review): two release functions are declared; the difference is not visible from this header.
+void free_TightDataPointStorageF(TightDataPointStorageF *tdps);
+void free_TightDataPointStorageF2(TightDataPointStorageF *tdps);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _TightDataPointStorageF_H  ----- */
--- a/deps/SZ/sz/include/TightDataPointStorageI.h
+++ b/deps/SZ/sz/include/TightDataPointStorageI.h
@ -0,0 +1,65 @@
+/**
+ *  @file TightDataPointStorageI.h
+ *  @author Sheng Di and Dingwen Tao
+ *  @date Aug, 2017
+ *  @brief Header file for the tight data point storage (TDPS).
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _TightDataPointStorageI_H
+#define _TightDataPointStorageI_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h> 
+
+typedef struct TightDataPointStorageI // tight data point storage (TDPS) for integer data
+{
+	size_t dataSeriesLength;
+	int allSameData;
+	double realPrecision; //it's used as the pwrErrBoundRatio when errBoundMode==PW_REL
+	size_t exactDataNum;
+	long minValue; //NOTE(review): long is 32 bits on ILP32 targets (the build defines a linux32 configuration) -- confirm this is wide enough for 64-bit integer data
+	int exactByteSize;
+	int dataTypeSize; //the size of data type, e.g., it's 4 when data type is int32_t
+	
+	int stateNum;
+	int allNodes;
+	
+	unsigned char* typeArray; //its size is dataSeriesLength/4 (or xxx/4+1) 
+	size_t typeArray_size;
+	
+	unsigned char* exactDataBytes;
+	size_t exactDataBytes_size;
+	
+	unsigned int intervals; //quantization_intervals; NOTE(review): new_TightDataPointStorageI takes it as (signed) int
+	
+	unsigned char isLossless; //a mark to denote whether it's lossless compression (1 is yes, 0 is no)
+
+} TightDataPointStorageI;
+
+int computeRightShiftBits(int exactByteSize, int dataType);
+int convertDataTypeSizeCode(int dataTypeSizeCode);
+int convertDataTypeSize(int dataTypeSize);
+// Constructors; unlike the F/D variants, new_TightDataPointStorageI also takes the element byteSize and a dataType code.
+void new_TightDataPointStorageI_Empty(TightDataPointStorageI **self);
+int new_TightDataPointStorageI_fromFlatBytes(TightDataPointStorageI **self, unsigned char* flatBytes, size_t flatBytesLength);
+void new_TightDataPointStorageI(TightDataPointStorageI **self,
+		size_t dataSeriesLength, size_t exactDataNum, int byteSize, 
+		int* type, unsigned char* exactDataBytes, size_t exactDataBytes_size,
+		double realPrecision, long minValue, int intervals, int dataType);
+// Serialisers: convertTDPStoFlatBytes_int takes unsigned char** bytes, the _args variant takes unsigned char* bytes (presumably a caller-provided buffer -- TODO confirm).
+void convertTDPStoBytes_int(TightDataPointStorageI* tdps, unsigned char* bytes, unsigned char sameByte);
+void convertTDPStoFlatBytes_int(TightDataPointStorageI *tdps, unsigned char** bytes, size_t *size);
+void convertTDPStoFlatBytes_int_args(TightDataPointStorageI *tdps, unsigned char* bytes, size_t *size);
+void free_TightDataPointStorageI(TightDataPointStorageI *tdps);
+void free_TightDataPointStorageI2(TightDataPointStorageI *tdps);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _TightDataPointStorageI_H  ----- */
--- a/deps/SZ/sz/include/TypeManager.h
+++ b/deps/SZ/sz/include/TypeManager.h
@ -0,0 +1,40 @@
+/**
+ *  @file TypeManager.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the TypeManager.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _TypeManager_H
+#define _TypeManager_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h>
+#include <stdint.h>
+
+//TypeManager.c
+size_t convertIntArray2ByteArray_fast_1b(unsigned char* intArray, size_t intArrayLength, unsigned char **result);
+size_t convertIntArray2ByteArray_fast_1b_to_result(unsigned char* intArray, size_t intArrayLength, unsigned char *result); // single-pointer result, unlike the variant above -- presumably writes into a caller-provided buffer (TODO confirm)
+void convertByteArray2IntArray_fast_1b(size_t intArrayLength, unsigned char* byteArray, size_t byteArrayLength, unsigned char **intArray);
+size_t convertIntArray2ByteArray_fast_2b(unsigned char* timeStepType, size_t timeStepTypeLength, unsigned char **result);
+size_t convertIntArray2ByteArray_fast_2b_inplace(unsigned char* timeStepType, size_t timeStepTypeLength, unsigned char *result); // single-pointer result, unlike the variant above
+void convertByteArray2IntArray_fast_2b(size_t stepLength, unsigned char* byteArray, size_t byteArrayLength, unsigned char **intArray);
+size_t convertIntArray2ByteArray_fast_3b(unsigned char* timeStepType, size_t timeStepTypeLength, unsigned char **result);
+void convertByteArray2IntArray_fast_3b(size_t stepLength, unsigned char* byteArray, size_t byteArrayLength, unsigned char **intArray);
+int getLeftMovingSteps(size_t k, unsigned char resiBitLength);
+size_t convertIntArray2ByteArray_fast_dynamic(unsigned char* timeStepType, unsigned char resiBitLength, size_t nbEle, unsigned char **bytes); // one resiBitLength for all elements
+size_t convertIntArray2ByteArray_fast_dynamic2(unsigned char* timeStepType, unsigned char* resiBitLength, size_t resiBitLengthLength, unsigned char **bytes); // resiBitLength passed as an array with its own length
+int computeBitNumRequired(size_t dataLength);
+void decompressBitArraybySimpleLZ77(int** result, unsigned char* bytes, size_t bytesLength, size_t totalLength, int validLength);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _TypeManager_H  ----- */
+
--- a/deps/SZ/sz/include/VarSet.h
+++ b/deps/SZ/sz/include/VarSet.h
@ -0,0 +1,84 @@
+/**
+ *  @file VarSet.h
+ *  @author Sheng Di
+ *  @date July, 2016
+ *  @brief Header file for the Variable.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _VarSet_H
+#define _VarSet_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h>
+
+typedef struct sz_multisteps // per-variable time-step state; SZ_Variable.multisteps points to one of these
+{
+	char compressionType;
+	int predictionMode; //presumably one of the prediction modes from defines.h (e.g. SZ_PREVIOUS_VALUE_ESTIMATE) -- TODO confirm
+	int lastSnapshotStep; //the previous snapshot step
+	unsigned int currentStep; //current time step of the execution/simulation
+	
+	//void* ori_data; //original data pointer, which serve as the key for retrieving hist_data
+	void* hist_data; //historical data in past time steps
+} sz_multisteps;
+
+typedef struct SZ_Variable // one registered variable; instances are chained through `next`
+{
+	unsigned char var_id; //NOTE(review): SZ_batchAddVar and the *_ID functions take var_id as int
+	char* varName;
+	char compressType; //102 means HZ; 101 means SZ 
+	int dataType; //SZ_FLOAT or SZ_DOUBLE
+	size_t r5; //r5..r1 appear together in every signature of this header; presumably the per-dimension sizes -- TODO confirm
+	size_t r4;
+	size_t r3;
+	size_t r2;
+	size_t r1;
+	int errBoundMode;
+	double absErrBound;
+	double relBoundRatio;
+	double pwRelBoundRatio;
+	void* data;
+	sz_multisteps *multisteps;
+	unsigned char* compressedBytes;
+	size_t compressedSize;
+	struct SZ_Variable* next;
+} SZ_Variable;
+
+typedef struct SZ_VarSet // container of SZ_Variable nodes, which link to each other via SZ_Variable.next
+{
+	unsigned short count;
+	struct SZ_Variable *header;
+	struct SZ_Variable *lastVar;
+} SZ_VarSet;
+
+void free_Variable_keepOriginalData(SZ_Variable* v);
+void free_Variable_keepCompressedBytes(SZ_Variable* v);
+void free_Variable_all(SZ_Variable* v);
+void SZ_batchAddVar(int var_id, char* varName, int dataType, void* data, 
+			int errBoundMode, double absErrBound, double relBoundRatio, double pwRelBoundRatio,
+			size_t r5, size_t r4, size_t r3, size_t r2, size_t r1);
+int SZ_batchDelVar_vset(SZ_VarSet* vset, char* varName); // the *_vset variants take the set explicitly; the plain variants take no set argument
+int SZ_batchDelVar(char* varName);
+int SZ_batchDelVar_ID_vset(SZ_VarSet* vset, int var_id);
+int SZ_batchDelVar_ID(int var_id);
+
+SZ_Variable* SZ_searchVar(char* varName);
+void* SZ_getVarData(char* varName, size_t *r5, size_t *r4, size_t *r3, size_t *r2, size_t *r1);
+// mode: presumably SZ_MAINTAIN_VAR_DATA or SZ_DESTROY_WHOLE_VARSET from defines.h -- TODO confirm
+void free_VarSet_vset(SZ_VarSet *vset, int mode);
+void SZ_freeVarSet(int mode);
+
+void free_multisteps(sz_multisteps* multisteps);
+int checkVarID(unsigned char cur_var_id, unsigned char* var_ids, int var_count);
+SZ_Variable* SZ_getVariable(int var_id);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _VarSet_H  ----- */
--- a/deps/SZ/sz/include/callZlib.h
+++ b/deps/SZ/sz/include/callZlib.h
@ -0,0 +1,44 @@
+/**
+ *  @file callZlib.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the callZlib.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _CallZlib_H
+#define _CallZlib_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//#define SZ_ZLIB_BUFFER_SIZE 1048576	
+#define SZ_ZLIB_BUFFER_SIZE 65536 //64 KiB; the commented-out line above is an alternative 1 MiB setting
+
+#include <stdio.h>
+
+int isZlibFormat(unsigned char magic1, unsigned char magic2);
+
+//callZlib.c
+unsigned long zlib_compress(unsigned char* data, unsigned long dataLength, unsigned char** compressBytes, int level);
+unsigned long zlib_compress2(unsigned char* data, unsigned long dataLength, unsigned char** compressBytes, int level);
+unsigned long zlib_compress3(unsigned char* data, unsigned long dataLength, unsigned char* compressBytes, int level); // only variant taking compressBytes as a single pointer
+unsigned long zlib_compress4(unsigned char* data, unsigned long dataLength, unsigned char** compressBytes, int level);
+unsigned long zlib_compress5(unsigned char* data, unsigned long dataLength, unsigned char** compressBytes, int level);
+// The five zlib_uncompress* variants share one signature; how they differ is not visible from this header.
+unsigned long zlib_uncompress4(unsigned char* compressBytes, unsigned long cmpSize, unsigned char** oriData, unsigned long targetOriSize);
+unsigned long zlib_uncompress5(unsigned char* compressBytes, unsigned long cmpSize, unsigned char** oriData, unsigned long targetOriSize);
+unsigned long zlib_uncompress(unsigned char* compressBytes, unsigned long cmpSize, unsigned char** oriData, unsigned long targetOriSize);
+unsigned long zlib_uncompress2(unsigned char* compressBytes, unsigned long cmpSize, unsigned char** oriData, unsigned long targetOriSize);
+unsigned long zlib_uncompress3(unsigned char* compressBytes, unsigned long cmpSize, unsigned char** oriData, unsigned long targetOriSize);
+// No targetOriSize parameter here; the 65536 in the name matches SZ_ZLIB_BUFFER_SIZE above.
+unsigned long zlib_uncompress65536bytes(unsigned char* compressBytes, unsigned long cmpSize, unsigned char** oriData);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _CallZlib_H  ----- */
+
--- a/deps/SZ/sz/include/conf.h
+++ b/deps/SZ/sz/include/conf.h
@ -0,0 +1,37 @@
+/**
+ *  @file conf.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the conf.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _Conf_H
+#define _Conf_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h>
+
+//conf.c
+void updateQuantizationInfo(int quant_intervals);
+int SZ_ReadConf(const char* sz_cfgFile);
+int SZ_LoadConf(const char* sz_cfgFile);
+int checkVersion(char* version);
+int computeVersion(int major, int minor, int revision);
+int checkVersion2(char* version);
+// (void) below makes the declaration a real prototype; in C, empty parentheses leave the parameter list unspecified.
+void initSZ_TSC(void);
+unsigned int roundUpToPowerOf2(unsigned int base);
+double computeABSErrBoundFromPSNR(double psnr, double threshold, double value_range);
+double computeABSErrBoundFromNORM_ERR(double normErr, size_t nbEle);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _Conf_H  ----- */
+
--- a/deps/SZ/sz/include/dataCompression.h
+++ b/deps/SZ/sz/include/dataCompression.h
@ -0,0 +1,104 @@
+/**
+ *  @file dataCompression.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the dataCompression.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _DataCompression_H
+#define _DataCompression_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "sz.h"
+#include <stdio.h>
+#include <stdbool.h>
+
+/* Scan data[1..size-1] and update min/max. Expands in place and uses the
+ * caller-scope variables i, size, data_, min and max; since the scan starts at
+ * index 1, the caller is expected to seed min and max beforehand. */
+#define computeMinMax(data) \
+        for(i=1;i<size;i++)\
+        {\
+                data_ = (data)[i];\
+                if(min>data_) min = data_;\
+                else if(max<data_) max = data_;\
+        }
+
+//dataCompression.c
+int computeByteSizePerIntValue(long valueRangeSize);
+long computeRangeSize_int(void* oriData, int dataType, size_t size, int64_t* valueRangeSize); // NOTE(review): returns long while the range is reported as int64_t; long is 32 bits on ILP32 targets -- confirm
+double computeRangeSize_double(double* oriData, size_t size, double* valueRangeSize, double* medianValue);
+float computeRangeSize_float(float* oriData, size_t size, float* valueRangeSize, float* medianValue);
+float computeRangeSize_float_MSST19(float* oriData, size_t size, float* valueRangeSize, float* medianValue, unsigned char * signs, bool* positive, float* nearZero);
+double computeRangeSize_double_MSST19(double* oriData, size_t size, double* valueRangeSize, double* medianValue, unsigned char * signs, bool* positive, double* nearZero);
+// Sub-block variants: r5..r1, s5..s1 and e5..e1 are presumably the dimension sizes and the start/end indices of the block -- TODO confirm.
+double computeRangeSize_double_subblock(double* oriData, double* valueRangeSize, double* medianValue,
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1,
+size_t s5, size_t s4, size_t s3, size_t s2, size_t s1,
+size_t e5, size_t e4, size_t e3, size_t e2, size_t e1);
+float computeRangeSize_float_subblock(float* oriData, float* valueRangeSize, float* medianValue,
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1,
+size_t s5, size_t s4, size_t s3, size_t s2, size_t s1,
+size_t e5, size_t e4, size_t e3, size_t e2, size_t e1);
+double min_d(double a, double b);
+double max_d(double a, double b);
+float min_f(float a, float b);
+float max_f(float a, float b);
+double getRealPrecision_double(double valueRangeSize, int errBoundMode, double absErrBound, double relBoundRatio, int *status);
+double getRealPrecision_float(float valueRangeSize, int errBoundMode, double absErrBound, double relBoundRatio, int *status); // returns double even for the float variant
+double getRealPrecision_int(long valueRangeSize, int errBoundMode, double absErrBound, double relBoundRatio, int *status);
+void symTransform_8bytes(unsigned char data[8]);
+void symTransform_2bytes(unsigned char data[2]);
+void symTransform_4bytes(unsigned char data[4]);
+// compress[U]Int{8,16,32,64}Value: same parameter shape for each integer width.
+void compressInt8Value(int8_t tgtValue, int8_t minValue, int byteSize, unsigned char* bytes);
+void compressInt16Value(int16_t tgtValue, int16_t minValue, int byteSize, unsigned char* bytes);
+void compressInt32Value(int32_t tgtValue, int32_t minValue, int byteSize, unsigned char* bytes);
+void compressInt64Value(int64_t tgtValue, int64_t minValue, int byteSize, unsigned char* bytes);
+
+void compressUInt8Value(uint8_t tgtValue, uint8_t minValue, int byteSize, unsigned char* bytes);
+void compressUInt16Value(uint16_t tgtValue, uint16_t minValue, int byteSize, unsigned char* bytes);
+void compressUInt32Value(uint32_t tgtValue, uint32_t minValue, int byteSize, unsigned char* bytes);
+void compressUInt64Value(uint64_t tgtValue, uint64_t minValue, int byteSize, unsigned char* bytes);
+// The _MSST19 variants below take no medianValue argument.
+void compressSingleFloatValue(FloatValueCompressElement *vce, float tgtValue, float precision, float medianValue, 
+		int reqLength, int reqBytesLength, int resiBitsLength);
+void compressSingleFloatValue_MSST19(FloatValueCompressElement *vce, float tgtValue, float precision, int reqLength, int reqBytesLength, int resiBitsLength);
+void compressSingleDoubleValue(DoubleValueCompressElement *vce, double tgtValue, double precision, double medianValue, 
+		int reqLength, int reqBytesLength, int resiBitsLength);
+void compressSingleDoubleValue_MSST19(DoubleValueCompressElement *vce, double tgtValue, double precision, int reqLength, int reqBytesLength, int resiBitsLength);
+                              
+int compIdenticalLeadingBytesCount_double(unsigned char* preBytes, unsigned char* curBytes);
+int compIdenticalLeadingBytesCount_float(unsigned char* preBytes, unsigned char* curBytes);
+void addExactData(DynamicByteArray *exactMidByteArray, DynamicIntArray *exactLeadNumArray, 
+		DynamicIntArray *resiBitArray, LossyCompressionElement *lce);
+
+int getPredictionCoefficients(int layers, int dimension, int **coeff_array, int *status);
+
+int computeBlockEdgeSize_3D(int segmentSize);
+int computeBlockEdgeSize_2D(int segmentSize);
+int initRandomAccessBytes(unsigned char* raBytes);
+// float pipeline: precision is passed as double although the data is float.
+int generateLossyCoefficients_float(float* oriData, double precision, size_t nbEle, int* reqBytesLength, int* resiBitsLength, float* medianValue, float* decData);
+int compressExactDataArray_float(float* oriData, double precision, size_t nbEle, unsigned char** leadArray, unsigned char** midArray, unsigned char** resiArray, 
+int reqLength, int reqBytesLength, int resiBitsLength, float medianValue);
+
+void decompressExactDataArray_float(unsigned char* leadNum, unsigned char* exactMidBytes, unsigned char* residualMidBits, size_t nbEle, int reqLength, float medianValue, float** decData);
+// double pipeline: same parameter shape as the float functions above.
+int generateLossyCoefficients_double(double* oriData, double precision, size_t nbEle, int* reqBytesLength, int* resiBitsLength, double* medianValue, double* decData);
+int compressExactDataArray_double(double* oriData, double precision, size_t nbEle, unsigned char** leadArray, unsigned char** midArray, unsigned char** resiArray, 
+int reqLength, int reqBytesLength, int resiBitsLength, double medianValue);
+
+void decompressExactDataArray_double(unsigned char* leadNum, unsigned char* exactMidBytes, unsigned char* residualMidBits, size_t nbEle, int reqLength, double medianValue, double** decData);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _DataCompression_H  ----- */
+
--- a/deps/SZ/sz/include/defines.h
+++ b/deps/SZ/sz/include/defines.h
@ -0,0 +1,106 @@
+/**
+ *  @file defines.h
+ *  @author Sheng Di
+ *  @date July, 2019
+ *  @brief Macro definitions for SZ: version numbers, algorithm and data-type codes, error-bound modes and status codes.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZ_DEFINES_H
+#define _SZ_DEFINES_H
+
+#define SZ_VERNUM 0x0200 //NOTE(review): reads as 2.0 while MAJOR.MINOR.BUILD below is 2.1.12 -- confirm the intended encoding
+#define SZ_VER_MAJOR 2
+#define SZ_VER_MINOR 1
+#define SZ_VER_BUILD 12
+#define SZ_VER_REVISION 0
+//compressor identifiers
+#define PASTRI 103
+#define HZ 102 //deprecated
+#define SZ 101
+#define SZ_Transpose 104
+
+//prediction mode of temporal dimension based compression
+#define SZ_PREVIOUS_VALUE_ESTIMATE 0
+
+#define MIN_NUM_OF_ELEMENTS 20 //if the # elements <= 20, skip the compression
+//error-bound modes
+#define ABS 0
+#define REL 1
+#define VR_REL 1  //alternative name to REL
+#define ABS_AND_REL 2
+#define ABS_OR_REL 3
+#define PSNR 4
+#define NORM 5
+//error-bound modes involving the point-wise relative bound (PW_REL)
+#define PW_REL 10
+#define ABS_AND_PW_REL 11
+#define ABS_OR_PW_REL 12
+#define REL_AND_PW_REL 13
+#define REL_OR_PW_REL 14
+//data-type codes
+#define SZ_FLOAT 0
+#define SZ_DOUBLE 1
+#define SZ_UINT8 2
+#define SZ_INT8 3
+#define SZ_UINT16 4
+#define SZ_INT16 5
+#define SZ_UINT32 6
+#define SZ_INT32 7
+#define SZ_UINT64 8
+#define SZ_INT64 9
+
+#define LITTLE_ENDIAN_DATA 0 //refers to the endian type of the data read from the disk
+#define BIG_ENDIAN_DATA 1 //big_endian (ppc, max, etc.) ; little_endian (x86, x64, etc.)
+
+#define LITTLE_ENDIAN_SYSTEM 0 //refers to the endian type of the system
+#define BIG_ENDIAN_SYSTEM 1
+
+#define DynArrayInitLen 1024
+
+#define MIN_ZLIB_DEC_ALLOMEM_BYTES 1000000
+
+//#define maxRangeRadius 32768
+//#define maxRangeRadius 1048576//131072
+
+#define SZ_BEST_SPEED 0
+#define SZ_BEST_COMPRESSION 1
+#define SZ_DEFAULT_COMPRESSION 2
+#define SZ_TEMPORAL_COMPRESSION 3
+
+#define SZ_NO_REGRESSION 0
+#define SZ_WITH_LINEAR_REGRESSION 1
+
+#define SZ_PWR_MIN_TYPE 0
+#define SZ_PWR_AVG_TYPE 1
+#define SZ_PWR_MAX_TYPE 2
+
+#define SZ_FORCE_SNAPSHOT_COMPRESSION 0
+#define SZ_FORCE_TEMPORAL_COMPRESSION 1
+#define SZ_PERIO_TEMPORAL_COMPRESSION 2
+
+//SUCCESS returning status
+#define SZ_SCES 0  //successful
+#define SZ_NSCS -1 //Not successful
+#define SZ_FERR -2 //Failed to open input file
+#define SZ_TERR -3 //wrong data type (should be only float or double); NOTE(review): integer type codes are also defined above -- confirm this restriction
+#define SZ_DERR -4 //dimension error
+#define SZ_MERR -5 //sz_mode error
+#define SZ_BERR -6 //bound-mode error (should be only ABS, REL, ABS_AND_REL, ABS_OR_REL, or PW_REL)
+
+#define SZ_MAINTAIN_VAR_DATA 0
+#define SZ_DESTROY_WHOLE_VARSET 1
+
+#define GROUP_COUNT 16 //2^{16}=65536
+	
+#define MetaDataByteLength 28
+#define MetaDataByteLength_double 36 //meta data length for double type
+	
+#define numOfBufferedSteps 1 //the number of time steps in the buffer	
+
+
+#define GZIP_COMPRESSOR 0 //i.e., ZLIB_COMPRESSOR
+#define ZSTD_COMPRESSOR 1
+
+#endif /* _SZ_DEFINES_H */
--- a/deps/SZ/sz/include/dictionary.h
+++ b/deps/SZ/sz/include/dictionary.h
@ -0,0 +1,172 @@
+
+/*-------------------------------------------------------------------------*/
+/**
+   @file    dictionary.h
+   @author  N. Devillard
+   @brief   Implements a dictionary for string variables.
+
+   This module implements a simple dictionary object, i.e. a list
+   of string/string associations. This object is useful to store e.g.
+   information retrieved from a configuration file (ini files).
+*/
+/*--------------------------------------------------------------------------*/
+
+#ifndef _DICTIONARY_H_
+#define _DICTIONARY_H_
+
+/*---------------------------------------------------------------------------
+                                Includes
+ ---------------------------------------------------------------------------*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+/*---------------------------------------------------------------------------
+                                New types
+ ---------------------------------------------------------------------------*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Dictionary object
+
+  This object contains a list of string/string associations. Each
+  association is identified by a unique string key. Looking up values
+  in the dictionary is sped up by the use of a (hopefully collision-free)
+  hash function.
+ */
+/*-------------------------------------------------------------------------*/
+typedef struct _dictionary_ {
+    int             n ;     /**< Number of entries in dictionary */
+    int             size ;  /**< Storage size */
+    char        **  val ;   /**< List of string values */
+    char        **  key ;   /**< List of string keys */
+    unsigned     *  hash ;  /**< List of hash values for keys */
+} dictionary ;
+
+
+/*---------------------------------------------------------------------------
+                            Function prototypes
+ ---------------------------------------------------------------------------*/
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Compute the hash key for a string.
+  @param    key     Character string to use for key.
+  @return   The hash value, as an unsigned int on at least 32 bits.
+
+  This hash function has been taken from an Article in Dr Dobbs Journal.
+  This is normally a collision-free function, distributing keys evenly.
+  The key is stored anyway in the struct so that collision can be avoided
+  by comparing the key itself in last resort.
+ */
+/*--------------------------------------------------------------------------*/
+unsigned dictionary_hash(const char * key);
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Create a new dictionary object.
+  @param    size    Optional initial size of the dictionary.
+  @return   A newly allocated dictionary object.
+
+  This function allocates a new dictionary object of given size and returns
+  it. If you do not know in advance (roughly) the number of entries in the
+  dictionary, give size=0.
+ */
+/*--------------------------------------------------------------------------*/
+dictionary * dictionary_new(int size);
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Delete a dictionary object
+  @param    vd  dictionary object to deallocate.
+  @return   void
+
+  Deallocate a dictionary object and all memory associated to it.
+ */
+/*--------------------------------------------------------------------------*/
+void dictionary_del(dictionary * vd);
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Get a value from a dictionary.
+  @param    d       dictionary object to search.
+  @param    key     Key to look for in the dictionary.
+  @param    def     Default value to return if key not found.
+  @return   Pointer to internally allocated character string.
+
+  This function locates a key in a dictionary and returns a pointer to its
+  value, or the passed 'def' pointer if no such key can be found in
+  dictionary. The returned character pointer points to data internal to the
+  dictionary object, you should not try to free it or modify it.
+ */
+/*--------------------------------------------------------------------------*/
+char * dictionary_get(dictionary * d, const char * key, char * def);
+
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Set a value in a dictionary.
+  @param    vd      dictionary object to modify.
+  @param    key     Key to modify or add.
+  @param    val     Value to add.
+  @return   int     0 if Ok, anything else otherwise
+
+  If the given key is found in the dictionary, the associated value is
+  replaced by the provided one. If the key cannot be found in the
+  dictionary, it is added to it.
+
+  It is Ok to provide a NULL value for val, but NULL values for the dictionary
+  or the key are considered as errors: the function will return immediately
+  in such a case.
+
+  Notice that if you dictionary_set a variable to NULL, a call to
+  dictionary_get will return a NULL value: the variable will be found, and
+  its value (NULL) is returned. In other words, setting the variable
+  content to NULL is equivalent to deleting the variable from the
+  dictionary. It is not possible (in this implementation) to have a key in
+  the dictionary without value.
+
+  This function returns non-zero in case of failure.
+ */
+/*--------------------------------------------------------------------------*/
+int dictionary_set(dictionary * vd, const char * key, const char * val);
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Delete a key in a dictionary
+  @param    d       dictionary object to modify.
+  @param    key     Key to remove.
+  @return   void
+
+  This function deletes a key in a dictionary. Nothing is done if the
+  key cannot be found.
+ */
+/*--------------------------------------------------------------------------*/
+void dictionary_unset(dictionary * d, const char * key);
+
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Dump a dictionary to an opened file pointer.
+  @param    d   Dictionary to dump
+  @param    out Opened file pointer.
+  @return   void
+
+  Dumps a dictionary onto an opened file pointer. Key pairs are printed out
+  as @c [Key]=[Value], one per line. It is Ok to provide stdout or stderr as
+  output file pointers.
+ */
+/*--------------------------------------------------------------------------*/
+void dictionary_dump(dictionary * d, FILE * out);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/deps/SZ/sz/include/exafelSZ.h
+++ b/deps/SZ/sz/include/exafelSZ.h
@ -0,0 +1,57 @@
+#ifndef EXAFELSZ_H
+#define EXAFELSZ_H
+
+/* Standard headers are pulled in before the linkage block: C++ only allows
+ * a header to be included outside of any declaration, and extern "C" { }
+ * is a declaration. */
+#include <stdint.h>
+#include <stdlib.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Parameter set taken by the exafelSZ_params_* helpers declared below.
+ * exafelSZ_Compress/exafelSZ_Decompress receive an untyped `_pr` pointer,
+ * presumably to this struct -- confirm against exafelSZ.c. */
+typedef struct exafelSZ_params{
+  //uint8_t *peaks;
+  uint16_t *peaksSegs;
+  uint16_t *peaksRows;
+  uint16_t *peaksCols;
+  uint64_t numPeaks;
+
+  uint8_t *calibPanel;
+
+  uint8_t binSize; //Binning: (pr->binSize x pr->binSize) to (1 x 1)
+  double tolerance; //SZ pr->tolerance
+  uint8_t szDim; //1D/2D/3D compression/decompression
+  //uint8_t szBlockSize; //Currently unused
+  uint8_t peakSize; //MUST BE ODD AND NOT EVEN! Each peak will have size of: (peakSize x peakSize)
+
+  // uint64_t nEvents;
+  // uint64_t panels;
+  // uint64_t rows;
+  // uint64_t cols;
+
+  //CALCULATED VARIABLES:
+  uint64_t binnedRows;
+  uint64_t binnedCols;
+  uint8_t peakRadius; //Will be calculated using peakSize
+
+} exafelSZ_params;
+
+
+/* Parameter helpers; each takes the parameter set plus the panel/row/column
+ * counts of the data. */
+void exafelSZ_params_process(exafelSZ_params*pr, size_t panels, size_t rows, size_t cols);
+void exafelSZ_params_checkDecomp(exafelSZ_params*pr, size_t panels, size_t rows, size_t cols);
+void exafelSZ_params_checkComp(exafelSZ_params*pr, size_t panels, size_t rows, size_t cols);
+
+/* Compression entry point: returns a byte buffer and reports its length
+ * through *compressedSize. */
+unsigned char * exafelSZ_Compress(void* _pr,
+                         void* _origData,
+                         size_t events, size_t panels, size_t rows, size_t cols,
+                         size_t *compressedSize);
+
+/* Decompression entry point: takes the compressed buffer and its size. */
+void* exafelSZ_Decompress(void *_pr,
+                         unsigned char*_compressedBuffer,
+                         size_t events, size_t panels, size_t rows, size_t cols,
+                         size_t compressedSize);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef EXAFELSZ_H  ----- */
+
--- a/deps/SZ/sz/include/iniparser.h
+++ b/deps/SZ/sz/include/iniparser.h
@ -0,0 +1,321 @@
+
+/*-------------------------------------------------------------------------*/
+/**
+   @file    iniparser.h
+   @author  N. Devillard
+   @brief   Parser for ini files.
+*/
+/*--------------------------------------------------------------------------*/
+
+#ifndef _INIPARSER_H_
+#define _INIPARSER_H_
+
+/*---------------------------------------------------------------------------
+                                Includes
+ ---------------------------------------------------------------------------*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+/*
+ * The following #include is necessary on many Unixes but not Linux.
+ * It is not needed for Windows platforms.
+ * Uncomment it if needed.
+ */
+/* #include <unistd.h> */
+
+#include "dictionary.h"
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Get number of sections in a dictionary
+  @param    d   Dictionary to examine
+  @return   int Number of sections found in dictionary
+
+  This function returns the number of sections found in a dictionary.
+  The test to recognize sections is done on the string stored in the
+  dictionary: a section name is given as "section" whereas a key is
+  stored as "section:key", thus the test looks for entries that do not
+  contain a colon.
+
+  This clearly fails in the case a section name contains a colon, but
+  this should simply be avoided.
+
+  This function returns -1 in case of error.
+ */
+/*--------------------------------------------------------------------------*/
+
+int iniparser_getnsec(dictionary * d);
+
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Get name for section n in a dictionary.
+  @param    d   Dictionary to examine
+  @param    n   Section number (from 0 to nsec-1).
+  @return   Pointer to char string
+
+  This function locates the n-th section in a dictionary and returns
+  its name as a pointer to a string statically allocated inside the
+  dictionary. Do not free or modify the returned string!
+
+  This function returns NULL in case of error.
+ */
+/*--------------------------------------------------------------------------*/
+
+char * iniparser_getsecname(dictionary * d, int n);
+
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Save a dictionary to a loadable ini file
+  @param    d   Dictionary to dump
+  @param    f   Opened file pointer to dump to
+  @return   void
+
+  This function dumps a given dictionary into a loadable ini file.
+  It is Ok to specify @c stderr or @c stdout as output files.
+ */
+/*--------------------------------------------------------------------------*/
+
+void iniparser_dump_ini(dictionary * d, FILE * f);
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Save a dictionary section to a loadable ini file
+  @param    d   Dictionary to dump
+  @param    s   Section name of dictionary to dump
+  @param    f   Opened file pointer to dump to
+  @return   void
+
+  This function dumps a given section of a given dictionary into a loadable ini
+  file.  It is Ok to specify @c stderr or @c stdout as output files.
+ */
+/*--------------------------------------------------------------------------*/
+
+void iniparser_dumpsection_ini(dictionary * d, char * s, FILE * f);
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Dump a dictionary to an opened file pointer.
+  @param    d   Dictionary to dump.
+  @param    f   Opened file pointer to dump to.
+  @return   void
+
+  This function prints out the contents of a dictionary, one element by
+  line, onto the provided file pointer. It is OK to specify @c stderr
+  or @c stdout as output files. This function is meant for debugging
+  purposes mostly.
+ */
+/*--------------------------------------------------------------------------*/
+void iniparser_dump(dictionary * d, FILE * f);
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Get the number of keys in a section of a dictionary.
+  @param    d   Dictionary to examine
+  @param    s   Section name of dictionary to examine
+  @return   Number of keys in section
+ */
+/*--------------------------------------------------------------------------*/
+int iniparser_getsecnkeys(dictionary * d, char * s);
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Get the keys in a section of a dictionary.
+  @param    d   Dictionary to examine
+  @param    s   Section name of dictionary to examine
+  @return   pointer to statically allocated character strings
+
+  This function queries a dictionary and finds all keys in a given section.
+  Each pointer in the returned char pointer-to-pointer is pointing to
+  a string allocated in the dictionary; do not free or modify them.
+
+  This function returns NULL in case of error.
+ */
+/*--------------------------------------------------------------------------*/
+char ** iniparser_getseckeys(dictionary * d, char * s);
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Get the string associated to a key
+  @param    d       Dictionary to search
+  @param    key     Key string to look for
+  @param    def     Default value to return if key not found.
+  @return   pointer to statically allocated character string
+
+  This function queries a dictionary for a key. A key as read from an
+  ini file is given as "section:key". If the key cannot be found,
+  the pointer passed as 'def' is returned.
+  The returned char pointer is pointing to a string allocated in
+  the dictionary, do not free or modify it.
+ */
+/*--------------------------------------------------------------------------*/
+char * iniparser_getstring(dictionary * d, const char * key, char * def);
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Get the string associated to a key, convert to an int
+  @param    d Dictionary to search
+  @param    key Key string to look for
+  @param    notfound Value to return in case of error
+  @return   integer
+
+  This function queries a dictionary for a key. A key as read from an
+  ini file is given as "section:key". If the key cannot be found,
+  the notfound value is returned.
+
+  Supported values for integers include the usual C notation
+  so decimal, octal (starting with 0) and hexadecimal (starting with 0x)
+  are supported. Examples:
+
+  - "42"      ->  42
+  - "042"     ->  34 (octal -> decimal)
+  - "0x42"    ->  66 (hexa  -> decimal)
+
+  Warning: the conversion may overflow in various ways. Conversion is
+  totally outsourced to strtol(), see the associated man page for overflow
+  handling.
+
+  Credits: Thanks to A. Becker for suggesting strtol()
+ */
+/*--------------------------------------------------------------------------*/
+int iniparser_getint(dictionary * d, const char * key, int notfound);
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Get the string associated to a key, convert to a long
+  @param    d Dictionary to search
+  @param    key Key string to look for
+  @param    notfound Value to return in case of error
+  @return   long
+
+  Credits: This function bases completely on int iniparser_getint and was
+  slightly modified to return long instead of int.
+ */
+/*--------------------------------------------------------------------------*/
+long iniparser_getlint(dictionary * d, const char * key, int notfound);
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Get the string associated to a key, convert to a double
+  @param    d Dictionary to search
+  @param    key Key string to look for
+  @param    notfound Value to return in case of error
+  @return   double
+
+  This function queries a dictionary for a key. A key as read from an
+  ini file is given as "section:key". If the key cannot be found,
+  the notfound value is returned.
+ */
+/*--------------------------------------------------------------------------*/
+double iniparser_getdouble(dictionary * d, const char * key, double notfound);
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Get the string associated to a key, convert to a boolean
+  @param    d Dictionary to search
+  @param    key Key string to look for
+  @param    notfound Value to return in case of error
+  @return   integer
+
+  This function queries a dictionary for a key. A key as read from an
+  ini file is given as "section:key". If the key cannot be found,
+  the notfound value is returned.
+
+  A true boolean is found if one of the following is matched:
+
+  - A string starting with 'y'
+  - A string starting with 'Y'
+  - A string starting with 't'
+  - A string starting with 'T'
+  - A string starting with '1'
+
+  A false boolean is found if one of the following is matched:
+
+  - A string starting with 'n'
+  - A string starting with 'N'
+  - A string starting with 'f'
+  - A string starting with 'F'
+  - A string starting with '0'
+
+  The notfound value returned if no boolean is identified, does not
+  necessarily have to be 0 or 1.
+ */
+/*--------------------------------------------------------------------------*/
+int iniparser_getboolean(dictionary * d, const char * key, int notfound);
+
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Set an entry in a dictionary.
+  @param    ini     Dictionary to modify.
+  @param    entry   Entry to modify (entry name)
+  @param    val     New value to associate to the entry.
+  @return   int 0 if Ok, -1 otherwise.
+
+  If the given entry can be found in the dictionary, it is modified to
+  contain the provided value. If it cannot be found, -1 is returned.
+  It is Ok to set val to NULL.
+ */
+/*--------------------------------------------------------------------------*/
+int iniparser_set(dictionary * ini, const char * entry, const char * val);
+
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Delete an entry in a dictionary
+  @param    ini     Dictionary to modify
+  @param    entry   Entry to delete (entry name)
+  @return   void
+
+  If the given entry can be found, it is deleted from the dictionary.
+ */
+/*--------------------------------------------------------------------------*/
+void iniparser_unset(dictionary * ini, const char * entry);
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Finds out if a given entry exists in a dictionary
+  @param    ini     Dictionary to search
+  @param    entry   Name of the entry to look for
+  @return   integer 1 if entry exists, 0 otherwise
+
+  Finds out if a given entry exists in the dictionary. Since sections
+  are stored as keys with NULL associated values, this is the only way
+  of querying for the presence of sections in a dictionary.
+ */
+/*--------------------------------------------------------------------------*/
+int iniparser_find_entry(dictionary * ini, const char * entry) ;
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Parse an ini file and return an allocated dictionary object
+  @param    ininame Name of the ini file to read.
+  @return   Pointer to newly allocated dictionary
+
+  This is the parser for ini files. This function is called, providing
+  the name of the file to be read. It returns a dictionary object that
+  should not be accessed directly, but through accessor functions
+  instead.
+
+  The returned dictionary must be freed using iniparser_freedict().
+ */
+/*--------------------------------------------------------------------------*/
+dictionary * iniparser_load(const char * ininame);
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Free all memory associated to an ini dictionary
+  @param    d Dictionary to free
+  @return   void
+
+  Free all memory associated to an ini dictionary.
+  It is mandatory to call this function before the dictionary object
+  gets out of the current context.
+ */
+/*--------------------------------------------------------------------------*/
+void iniparser_freedict(dictionary * d);
+
+#endif
--- a/deps/SZ/sz/include/pastri.h
+++ b/deps/SZ/sz/include/pastri.h
@ -0,0 +1,140 @@
+//CHECK:
+//What happens when ECQBits==1, or ECQBits==0 or ECQBits<0?
+//Rounding? Scale originalEb by 0.99?
+
+//Possible improvement: Change GAMESS format: {i i i i d} -> {i}{i}{i}{i}{d}
+//Possible improvement: Optimize bookkeeping bits
+//Possible improvement: Guess the type (C/UC, Sparse/Not)
+//Possible improvement: Get rid of writing/reading some of the indexes to in/out buffers
+//Possible improvement: Get rid of all debug stuff, including Makefile debug flags
+//Possible improvement: Get rid of "compressedBytes"
+//Possible improvement: SparseCompressed, ECQBits=2: 1's and -1's can be represented by just 0 and 1, instead 10 and 11. 
+//Possible improvement: SparseCompressed, ECQBits>2: Again: 1: 10, -1:11, Others: 0XX...XX 
+//Possible improvement: WriteBitsFast: maybe remove some masks?
+//Possible improvement: WriteBitsFast: Get rid of multiple calls!
+//Possible improvement: UCSparse: Indexes use 64 bits. It can be lowered to _1DIdxBits
+//Possible improvement: Parameters: Smaller data sizes may be possible!
+
+
+
+#ifndef PASTRI_H
+#define PASTRI_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <math.h>
+#include <assert.h> //Just for debugging purposes!
+
+//#define DATASIZE 8 //Bytes per input data point.
+//We have only 1 double per data point, so it is 8 bytes.
+
+#define MAX_PS_SIZE 100
+#define MAX_BLOCK_SIZE 10000
+#define MAX_BUFSIZE 160000  //Should be a multiple of 8
+#define D_W 0 //Debug switch: Write (input block)
+#define D_R 0 //Debug switch: Read (compressed block)
+#define D_G 0 //Debug switch: General
+#define D_G2 0 //Debug switch: General 2 (a little more detail)
+#define D_C 0 //Debug switch: C
+//#define DEBUG 1 //Debug switch
+
+//#define BOOKKEEPINGBITS 0 //Currently unused
+//#define BOOKKEEPINGBITS 120 //Includes: mode, indexOffsets, compressedBytes, Pb_, ECQBits_ (8+64+32+8+8) 
+//BOOKKEEPINGBITS is defined here, because if P & S is going to be used, they appear just after the bookkeeping part.
+//This allows us to write P and S directly onto using outBuf.
+  
+
+// IMPORTANT NOTE:
+//Read/Write up to 56 bits.
+//More than that is not supported!
+
+
+/********************************************************************/
+//Datatype Declarations:
+/********************************************************************/
+typedef struct pastri_params{
+  double originalEb; //Error Bound entered by the user
+  double usedEb; //Error Bound used during compression/decompression
+  
+  int numBlocks; //Number of blocks to be compressed
+  int dataSize; //8(=Double) or 4(=Float)
+  
+  int bf[4]; //Orbital types (basis function types). Typically in range [0,3]
+  int idxRange[4];  //Ranges of indexes. idxRange[i]=(bf[i]+1)*(bf[i]+2)/2;
+  
+  int sbSize; //=idxRange[2]*idxRange[3];
+  int sbNum;  //=idxRange[0]*idxRange[1];
+  int bSize; //=sbSize*sbNum;
+  
+  //uint16_t idxOffset[4]; //Index offset values. No longer used.
+  
+}pastri_params;
+
+//Block-specific stuff:
+typedef struct pastri_blockParams{
+  uint16_t nonZeros;
+  //int ECQ0s; //= p->bSize - numOutliers //OR: p->bSize=ECQ0s+ECQ1s+ECQOthers
+  int ECQ1s;
+  int ECQOthers;
+  int numOutliers; //=ECQ1s+ECQOthers
+  int patternBits;
+  int scaleBits;
+  double binSize;
+  double scalesBinSize;
+  uint64_t ECQExt;
+  int ECQBits;
+  int _1DIdxBits;
+}pastri_blockParams;
+
+typedef union u_UI64I64D{
+  uint64_t ui64;
+  int64_t i64;
+  double d;
+} u_UI64I64D;
+
+/********************************************************************/
+//Function Prototypes:
+/********************************************************************/
+void SZ_pastriReadParameters(char paramsFilename[512],pastri_params *paramsPtr);
+//Read the basic PaSTRI parameters from a file, specified by paramsFilename.
+
+void SZ_pastriPreprocessParameters(pastri_params *p);
+//Using basic PaSTRI parameters, generate the others.
+//For example, block and sub-block sizes are generated by using basis function types.
+
+void SZ_pastriCompressBatch(pastri_params *p,unsigned char *originalBuf, unsigned char** compressedBufP,size_t *compressedBytes);
+//INPUTS: p, originalBuf
+//OUTPUTS: compressedBufP, compressedBytes
+//Using the inputs, compressedBufP is allocated and populated by the compressed data. Compressed size is written into compressedBytes.
+//Parameters are also stored at the beginning part of the compressedBuf
+
+void SZ_pastriDecompressBatch(unsigned char*compressedBuf, pastri_params *p, unsigned char** decompressedBufP ,size_t *decompressedBytes);
+//INPUTS: compressedBuf
+//OUTPUTS: p, decompressedBufP, decompressedBytes
+//First, parameters are read from compressedBuf and written into p.
+//Then, decompressedBufP is allocated and populated by the decompressed data. Decompressed size is written into decompressedBytes.
+
+void SZ_pastriCheckBatch(pastri_params *p,unsigned char*originalBuf,unsigned char*decompressedBuf); 
+//INPUTS: p, originalBuf, decompressedBuf
+//OUTPUTS: None (Just some on-screen messages)
+//Compares originalBuf with decompressedBuf. Checks whether the absolute error condition is satisfied or not.
+
+/********************************************************************/
+//Other Includes:
+/********************************************************************/
+
+
+
+#include "pastriGeneral.h"  //General tools
+#include "pastriD.h"  //Compression/Decompression for Double data
+#include "pastriF.h"  //Compression/Decompression for Float data
+
+
+#endif
+
+
+
+
+
--- a/deps/SZ/sz/include/pastriD.h
+++ b/deps/SZ/sz/include/pastriD.h
@ -0,0 +1,911 @@
+#ifndef PASTRID_H
+#define PASTRID_H
+
+static inline int64_t pastri_double_quantize(double x, double binSize){
+  //Quantize x into units of binSize: x/binSize is rounded to the nearest
+  //integer by adding +0.5 or -0.5 (same sign as the scaled value) and then
+  //truncating toward zero with the integer cast.
+  const double scaled=x/binSize;
+
+  //Look at the top bit of the 64-bit image of the scaled value (its sign bit)
+  //to decide which way the half-step goes.
+  u_UI64I64D bits;
+  bits.d=scaled;
+  const double halfStep=(bits.ui64 & (uint64_t)0x8000000000000000) ? -0.5 : 0.5;
+
+  return (int64_t)(scaled+halfStep);
+}
+
+static inline void pastri_double_PatternMatch(double*data,pastri_params* p,pastri_blockParams* bp,int64_t* patternQ,int64_t *scalesQ, int64_t* ECQ){
+  //Pattern-matching step for one block of doubles.
+  //
+  //INPUTS:  data (p->bSize values), p (reads bSize, sbSize, sbNum, usedEb)
+  //OUTPUTS: patternQ (p->sbSize quantized pattern points),
+  //         scalesQ  (p->sbNum quantized scales, one per sub-block),
+  //         ECQ      (p->sbNum*p->sbSize quantized error corrections),
+  //         bp       (nonZeros, binSize, patternBits, scaleBits, scalesBinSize,
+  //                   ECQExt, ECQ1s and ECQOthers are written here)
+  //
+  //The sub-block that holds the element with the largest absolute value is
+  //used as the pattern. Every sub-block gets a scale (its value at the
+  //extremum's local index divided by the extremum), and ECQ stores the
+  //quantized difference between scale*pattern and the actual data.
+
+  //First, find the extremum point:
+  double absExt=0; //Absolute value of Extremum
+  //Index of Extremum. It starts at 0 (not -1) so that a block in which no
+  //element has an absolute value above 0 (e.g. an all-zero block) still
+  //yields a valid index; otherwise data[-1] would be read below and
+  //extIdx%p->sbSize would produce a negative offset.
+  int extIdx=0;
+  bp->nonZeros=0;
+  int i,sb;
+  for(i=0;i<p->bSize;i++){
+    //Values within the error bound count as zeros.
+    if(abs_FastD(data[i])>p->usedEb){
+      bp->nonZeros++;
+    }
+    if(abs_FastD(data[i])>absExt){
+      absExt=abs_FastD(data[i]);
+      extIdx=i;
+    }
+  }
+
+  //Starting index of the pattern: first element of the sub-block that contains the extremum.
+  int patternIdx=(extIdx/p->sbSize)*p->sbSize;
+
+  double patternExt=data[extIdx];
+  bp->binSize=2*p->usedEb;
+
+  //Quantize the pattern sub-block.
+  for(i=0;i<p->sbSize;i++){
+    patternQ[i]=pastri_double_quantize(data[patternIdx+i],bp->binSize);
+  }
+
+  //Scales use the same bit width as the pattern points.
+  bp->patternBits=bitsNeeded_double((abs_FastD(patternExt)/bp->binSize)+1)+1;
+  bp->scaleBits=bp->patternBits;
+  bp->scalesBinSize=1/(double)(((uint64_t)1<<(bp->scaleBits-1))-1);
+
+  //Calculate Scales.
+  //Local extremum index. This is not the actual extremum of each sub-block,
+  //but the in-sub-block position that corresponds to the global (block) extremum.
+  int localExtIdx=extIdx%p->sbSize;
+  //When the extremum itself is 0, all scales are forced to 0 to avoid dividing by zero.
+  int patternExtZero=(patternExt==0);
+  for(sb=0;sb<p->sbNum;sb++){
+    scalesQ[sb]=pastri_double_quantize((patternExtZero ? 0 : data[sb*p->sbSize+localExtIdx]/patternExt),bp->scalesBinSize);
+  }
+
+  //Error corrections: quantized (prediction - data), plus the statistics the encoder needs.
+  bp->ECQExt=0;
+  int _1DIdx;
+  bp->ECQ1s=0;
+  bp->ECQOthers=0;
+  double PS_binSize=bp->scalesBinSize*bp->binSize;
+  for(sb=0;sb<p->sbNum;sb++){
+    for(i=0;i<p->sbSize;i++){
+      _1DIdx=sb*p->sbSize+i;
+      ECQ[_1DIdx]=pastri_double_quantize( (scalesQ[sb]*patternQ[i]*PS_binSize-data[_1DIdx]),bp->binSize );
+      //Track the largest absolute ECQ value.
+      double absECQ=abs_FastD(ECQ[_1DIdx]);
+      if(absECQ > bp->ECQExt)
+        bp->ECQExt=absECQ;
+      //Count the ECQ values by class: {0} (not counted), {1,-1}, everything else.
+      switch (ECQ[_1DIdx]){
+        case 0:
+          //Zeros are currently not counted.
+          break;
+        case 1:
+          bp->ECQ1s++;
+          break;
+        case -1:
+          bp->ECQ1s++;
+          break;
+        default:
+          bp->ECQOthers++;
+          break;
+      }
+    }
+  }
+}
+
+static inline void pastri_double_Encode(double *data,int64_t* patternQ,int64_t* scalesQ,int64_t* ECQ,pastri_params *p,pastri_blockParams* bp,unsigned char* outBuf,int *numOutBytes){
+  //Encode one block into outBuf using whichever of four layouts is smallest.
+  //
+  //INPUTS:  data, patternQ, scalesQ, ECQ, p, bp (as filled by the pattern-match step)
+  //OUTPUTS: outBuf (encoded block), numOutBytes (size of the chosen layout in bytes);
+  //         bp->ECQBits, bp->_1DIdxBits and bp->numOutliers are also set here.
+  //
+  //outBuf[0] holds the mode. Multi-byte fields are stored in native byte order.
+  //  mode 0, Uncompressed Sparse:    [1:2] nonZeros, then per non-zero value
+  //                                  four 16-bit indexes (i0..i3) followed by the double.
+  //  mode 1, Uncompressed NonSparse: the p->bSize*p->dataSize data bytes start at byte 1.
+  //  mode 2, Compressed Sparse:      [1:4] used byte count, [5] patternBits, [6] ECQBits,
+  //                                  [7:8] numOutliers, then a bit stream from byte 9:
+  //                                  pattern, scales, {index, ECQ code} per non-zero ECQ.
+  //  mode 3, Compressed NonSparse:   [1:4] used byte count, [5] patternBits, [6] ECQBits,
+  //                                  then a bit stream from byte 7: pattern, scales,
+  //                                  one ECQ code per element.
+  //A layout is chosen only if it is strictly smaller than the other three;
+  //any tie falls through to mode 3.
+  //
+  //Multi-byte fields are written with memcpy because their offsets inside
+  //outBuf (1, 3, 7, ...) are not aligned for direct typed stores.
+  bp->ECQBits=bitsNeeded_UI64(bp->ECQExt)+1;
+  bp->_1DIdxBits=bitsNeeded_UI64(p->bSize);
+
+  int i;
+
+  //Encode: 4 options:
+  //Compressed, Sparse ECQ
+  //Compressed, Non-Sparse ECQ
+  //Uncompressed, Sparse Data
+  //Uncompressed, Non-sparse Data
+
+  unsigned int UCSparseBits;  //Uncompressed, Sparse bits. Just like the original GAMESS data. Includes: mode, nonZeros, {indexes, data}
+  unsigned int UCNonSparseBits;  //Uncompressed, NonSparse bits. Includes: mode, data
+  unsigned int CSparseBits;  //Includes: mode, compressedBytes, patternBits, ECQBits,numOutliers,P, S, {Indexes(Sparse), ECQ}
+  unsigned int CNonSparseBits;  //Includes: mode, compressedBytes, patternBits, ECQBits,P, S, {ECQ}
+
+  //Consider: ECQ1s, ECQOthers. Number of following values in ECQ: {1,-1}, { val<=-2, val>=2}
+  //The number of zeros is not needed.
+  //NOTE(review): the header byte counts below are multiplied by p->dataSize,
+  //which yields bits only when dataSize is 8 - confirm against the callers.
+
+  UCSparseBits = p->dataSize*(1 + 2 + bp->nonZeros*16);  //Per non-zero: 64 bits for 4 indexes, 64 bits for data.
+  UCNonSparseBits = p->dataSize*(1 + p->bSize*8);
+  bp->numOutliers=bp->ECQ1s+bp->ECQOthers;
+  if(bp->ECQBits==2){
+    CSparseBits = p->dataSize*(1+4+1+1+2) + bp->patternBits*p->sbSize + bp->scaleBits*p->sbNum + bp->ECQ1s*(1+bp->_1DIdxBits);
+    CNonSparseBits = p->dataSize*(1+4+1+1) + bp->patternBits*p->sbSize + bp->scaleBits*p->sbNum + p->bSize + bp->ECQ1s ;  //Or: ECQ0s+ECQ1s*2;
+  }else{ //ECQBits>2
+    CSparseBits = p->dataSize*(1+4+1+1+2) + bp->patternBits*p->sbSize + bp->scaleBits*p->sbNum + bp->ECQ1s*(2+bp->_1DIdxBits) + bp->ECQOthers*(1+bp->_1DIdxBits+bp->ECQBits);
+    CNonSparseBits = p->dataSize*(1+4+1+1)+ bp->patternBits*p->sbSize + bp->scaleBits*p->sbNum + p->bSize + bp->ECQ1s*2 + bp->ECQOthers*(1+bp->ECQBits);
+  }
+
+  int UCSparseBytes=(UCSparseBits+7)/8;
+  int UCNonSparseBytes=(UCNonSparseBits+7)/8;
+  int CSparseBytes=(CSparseBits+7)/8;
+  int CNonSparseBytes=(CNonSparseBits+7)/8;
+  uint64_t bitPos=0;
+  uint64_t bytePos=0;
+  int i0,i1,i2,i3;
+  int _1DIdx;
+  uint16_t field16;          //Staging value for 16-bit fields copied into outBuf
+  uint32_t compressedBytes;  //Bytes actually used by a compressed layout
+
+  //****************************************************************************************
+  //W:UCSparse
+  if((UCSparseBytes<UCNonSparseBytes) && (UCSparseBytes<CSparseBytes) && (UCSparseBytes<CNonSparseBytes) ){
+    //Uncompressed, Sparse. Just like the original GAMESS data. Includes: mode, nonZeros, indexes, data
+    *numOutBytes=UCSparseBytes;
+    outBuf[0]=0; //mode
+
+    field16=(uint16_t)bp->nonZeros;
+    memcpy(&outBuf[1],&field16,sizeof(field16));
+    bytePos=3;//0:mode, 1-2:NonZeros. So start from 3.
+
+    for(i0=0;i0<p->idxRange[0];i0++){
+      for(i1=0;i1<p->idxRange[1];i1++){
+        for(i2=0;i2<p->idxRange[2];i2++){
+          for(i3=0;i3<p->idxRange[3];i3++){
+            _1DIdx=p->idxRange[3]*(i2+p->idxRange[2]*(i1+i0*p->idxRange[1]))+i3;
+            //Only values above the error bound are stored.
+            if(abs_FastD(data[_1DIdx])>p->usedEb){
+              field16=(uint16_t)i0;
+              memcpy(&outBuf[bytePos],&field16,sizeof(field16));
+              bytePos+=2;
+              field16=(uint16_t)i1;
+              memcpy(&outBuf[bytePos],&field16,sizeof(field16));
+              bytePos+=2;
+              field16=(uint16_t)i2;
+              memcpy(&outBuf[bytePos],&field16,sizeof(field16));
+              bytePos+=2;
+              field16=(uint16_t)i3;
+              memcpy(&outBuf[bytePos],&field16,sizeof(field16));
+              bytePos+=2;
+
+              memcpy(&outBuf[bytePos],&data[_1DIdx],sizeof(double));
+              bytePos+=p->dataSize;
+            }
+          }
+        }
+      }
+    }
+
+  //****************************************************************************************
+  //W:UCNonSparse
+  }else if((UCNonSparseBytes<UCSparseBytes) && (UCNonSparseBytes<CSparseBytes) && (UCNonSparseBytes<CNonSparseBytes) ){
+    //Uncompressed, NonSparse. Includes: mode, data
+    *numOutBytes=UCNonSparseBytes;
+    outBuf[0]=1; //mode
+
+    memcpy(&outBuf[1], data, p->bSize*p->dataSize);
+
+  //****************************************************************************************
+  //W:CSparse
+  }else if((CSparseBytes<UCNonSparseBytes) && (CSparseBytes<UCSparseBytes) && (CSparseBytes<CNonSparseBytes) ){
+    //Includes: mode, compressedBytes, patternBits, ECQBits,numOutliers,P, S, {Indexes(Sparse), ECQ}
+    *numOutBytes=CSparseBytes;
+    outBuf[0]=2; //mode
+
+    //outBuf bytes [1:4] are reserved for compressedBytes (filled in below).
+    outBuf[5]=bp->patternBits;
+    outBuf[6]=bp->ECQBits;
+    //Currently, we are at the end of 7th byte.
+
+    field16=(uint16_t)bp->numOutliers;
+    memcpy(&outBuf[7],&field16,sizeof(field16));
+    //Now, we are at the end of 9th byte.
+    bitPos=9*8;
+
+    for(i=0;i<p->sbSize;i++){
+      writeBits_Fast(outBuf,&bitPos,bp->patternBits,patternQ[i]);//Pattern point
+    }
+    for(i=0;i<p->sbNum;i++){
+      writeBits_Fast(outBuf,&bitPos,bp->scaleBits,scalesQ[i]);//Scale
+    }
+    switch(bp->ECQBits){
+      case 2:
+        //Only 0 and +-1 can occur: index followed by a single sign bit.
+        for(i=0;i<p->bSize;i++){
+          switch(ECQ[i]){
+            case 0:
+              break;
+            case 1:
+              writeBits_Fast(outBuf,&bitPos,bp->_1DIdxBits,i);
+              writeBits_Fast(outBuf,&bitPos,1,0);
+              break;
+            case -1:
+              writeBits_Fast(outBuf,&bitPos,bp->_1DIdxBits,i);
+              writeBits_Fast(outBuf,&bitPos,1,1);
+              break;
+            default:
+              assert(0);
+              break;
+          }
+        }
+        break;
+      default: //ECQBits>2
+        //Index, then: 0,0 for +1; 0,1 for -1; 1 followed by ECQBits bits for other values.
+        for(i=0;i<p->bSize;i++){
+          switch(ECQ[i]){
+            case 0:
+              break;
+            case 1:
+              writeBits_Fast(outBuf,&bitPos,bp->_1DIdxBits,i);
+              writeBits_Fast(outBuf,&bitPos,1,0);
+              writeBits_Fast(outBuf,&bitPos,1,0);
+              break;
+            case -1:
+              writeBits_Fast(outBuf,&bitPos,bp->_1DIdxBits,i);
+              writeBits_Fast(outBuf,&bitPos,1,0);
+              writeBits_Fast(outBuf,&bitPos,1,1);
+              break;
+            default:
+              writeBits_Fast(outBuf,&bitPos,bp->_1DIdxBits,i);
+              writeBits_Fast(outBuf,&bitPos,1,1);
+              writeBits_Fast(outBuf,&bitPos,bp->ECQBits,ECQ[i]);
+              break;
+          }
+        }
+        break;
+    }
+
+    //Record how many bytes the bit stream ended up using.
+    compressedBytes=(uint32_t)((bitPos+7)/8);
+    memcpy(&outBuf[1],&compressedBytes,sizeof(compressedBytes));
+
+    if(D_G){assert(bitPos==CSparseBits);}
+
+  //****************************************************************************************
+  //W:CNonSparse
+  }else {
+    //Includes: mode, compressedBytes, patternBits, ECQBits,P, S, {ECQ}
+    *numOutBytes=CNonSparseBytes;
+    outBuf[0]=3; //mode
+
+    //outBuf bytes [1:4] are reserved for compressedBytes (filled in below).
+    outBuf[5]=bp->patternBits;
+    outBuf[6]=bp->ECQBits;
+    bitPos=7*8; //Currently, we are at the end of 7th byte.
+
+    for(i=0;i<p->sbSize;i++){
+      writeBits_Fast(outBuf,&bitPos,bp->patternBits,patternQ[i]);//Pattern point
+    }
+    for(i=0;i<p->sbNum;i++){
+      writeBits_Fast(outBuf,&bitPos,bp->scaleBits,scalesQ[i]);//Scale
+    }
+    switch(bp->ECQBits){
+      case 2:
+        //Per element: 1 for 0; 0,0 for +1; 0,1 for -1.
+        for(i=0;i<p->bSize;i++){
+          switch(ECQ[i]){
+            case 0:
+              writeBits_Fast(outBuf,&bitPos,1,1);
+              break;
+            case 1:
+              writeBits_Fast(outBuf,&bitPos,1,0);
+              writeBits_Fast(outBuf,&bitPos,1,0);
+              break;
+            case -1:
+              writeBits_Fast(outBuf,&bitPos,1,0);
+              writeBits_Fast(outBuf,&bitPos,1,1);
+              break;
+            default:
+              assert(0);
+              break;
+          }
+        }
+        break;
+      default: //ECQBits>2
+        //Per element: 1 for 0; 0,0,0 for +1; 0,0,1 for -1;
+        //0,1 followed by ECQBits bits for other values.
+        for(i=0;i<p->bSize;i++){
+          switch(ECQ[i]){
+            case 0:
+              writeBits_Fast(outBuf,&bitPos,1,1);
+              break;
+            case 1:
+              writeBits_Fast(outBuf,&bitPos,1,0);
+              writeBits_Fast(outBuf,&bitPos,1,0);
+              writeBits_Fast(outBuf,&bitPos,1,0);
+              break;
+            case -1:
+              writeBits_Fast(outBuf,&bitPos,1,0);
+              writeBits_Fast(outBuf,&bitPos,1,0);
+              writeBits_Fast(outBuf,&bitPos,1,1);
+              break;
+            default:
+              writeBits_Fast(outBuf,&bitPos,1,0);
+              writeBits_Fast(outBuf,&bitPos,1,1);
+              writeBits_Fast(outBuf,&bitPos,bp->ECQBits,ECQ[i]);
+              break;
+          }
+        }
+        break;
+    }
+
+    //Record how many bytes the bit stream ended up using.
+    compressedBytes=(uint32_t)((bitPos+7)/8);
+    memcpy(&outBuf[1],&compressedBytes,sizeof(compressedBytes));
+
+    if(D_G){assert(bitPos==CNonSparseBits);}
+
+  }
+}
+static inline int pastri_double_Compress(unsigned char*inBuf,pastri_params *p,unsigned char*outBuf,int *numOutBytes){
+  //Compress a single block: inBuf is viewed as an array of doubles, run
+  //through pattern matching, and the result is encoded into outBuf.
+  //The encoded size is reported through numOutBytes. Always returns 0.
+  pastri_blockParams blockParams;
+
+  //Per-block scratch space for the quantized pattern, scales and error corrections.
+  int64_t patternQ[MAX_PS_SIZE];
+  int64_t scalesQ[MAX_PS_SIZE];
+  int64_t ECQ[MAX_BLOCK_SIZE];
+
+  double *block=(double*)inBuf;
+
+  //STEP 0: PREPROCESSING (flattening the block, determining the period, etc.)
+  //is currently not needed.
+
+  //STEP 1: PATTERN MATCH
+  pastri_double_PatternMatch(block,p,&blockParams,patternQ,scalesQ,ECQ);
+
+  //STEP 2: ENCODING (includes QUANTIZE)
+  pastri_double_Encode(block,patternQ,scalesQ,ECQ,p,&blockParams,outBuf,numOutBytes);
+
+  return 0;
+}
+
+static inline double pastri_double_InverseQuantization(int64_t q, double binSize){
+  //Turn a quantization index back into a value: the index times the bin width.
+  const double index=(double)q;
+  return index*binSize;
+}
+
+static inline void pastri_double_PredictData(pastri_params *p,pastri_blockParams *bp,double *data,int64_t* patternQ,int64_t* scalesQ,int64_t* ECQ){
+  //Rebuild all p->bSize elements of a block from their quantized parts:
+  //  data[k] = (scale of k's sub-block * pattern point of k) * PS bin size
+  //            - ECQ[k] * bin size
+  const double psBinSize=bp->scalesBinSize*bp->binSize;
+  int k=0;
+  while(k<p->bSize){
+    //Integer product of the sub-block's scale and the matching pattern point.
+    const int64_t psProduct=scalesQ[k/p->sbSize]*patternQ[k%p->sbSize];
+    const double predicted=pastri_double_InverseQuantization(psProduct,psBinSize);
+    const double correction=pastri_double_InverseQuantization(ECQ[k],bp->binSize);
+    data[k]=predicted-correction;
+    k++;
+  }
+}
+
+static inline void pastri_double_Decode(unsigned char*inBuf,pastri_params *p,pastri_blockParams *bp,unsigned char*outBuf,int *numReadBytes,int64_t* patternQ,int64_t* scalesQ,int64_t* ECQ){
+  int j;
+  bp->_1DIdxBits=bitsNeeded_UI64(p->bSize);
+  //double *data=(double*)(outBuf+p->bSize*8);
+  double *data=(double*)(outBuf);
+  int i0,i1,i2,i3;
+  //uint16_t *idx0,*idx1,*idx2,*idx3;
+  int _1DIdx;
+
+  int64_t ECQTemp;
+  uint64_t bytePos=0;
+  uint64_t bitPos=0;
+  uint64_t temp,temp2;
+  //int sb,localIdx;
+
+  
+  //idx0=(uint16_t*)(outBuf           );
+  //idx1=(uint16_t*)(outBuf+p->bSize*2);
+  //idx2=(uint16_t*)(outBuf+p->bSize*4);
+  //idx3=(uint16_t*)(outBuf+p->bSize*6);
+  //p->idxOffset[0]=*(uint32_t*)(&inBuf[1]);
+  //p->idxOffset[1]=*(uint32_t*)(&inBuf[3]);
+  //p->idxOffset[2]=*(uint32_t*)(&inBuf[5]);
+  //p->idxOffset[3]=*(uint32_t*)(&inBuf[7]);
+  /*
+  for(i0=0;i0<p->idxRange[0];i0++)
+    for(i1=0;i1<p->idxRange[1];i1++)
+      for(i2=0;i2<p->idxRange[2];i2++)
+        for(i3=0;i3<p->idxRange[3];i3++){
+            //_1DIdx=i0*p->idxRange[1]*p->idxRange[2]*p->idxRange[3]+i1*p->idxRange[2]*p->idxRange[3]+i2*p->idxRange[3]+i3;
+            _1DIdx=p->idxRange[3]*(i2+p->idxRange[2]*(i1+i0*p->idxRange[1]))+i3;
+            idx0[_1DIdx]=i0+1+p->idxOffset[0];
+            idx1[_1DIdx]=i1+1+p->idxOffset[1];
+            idx2[_1DIdx]=i2+1+p->idxOffset[2];
+            idx3[_1DIdx]=i3+1+p->idxOffset[3];
+        }
+  */
+  
+  //*numOutBytes=p->bSize*16;  
+  
+  //inBuf[0] is "mode"
+  switch(inBuf[0]){
+    //R:UCSparse
+    case 0:
+      //if(D_G){printf("\nDC:UCSparse\n");} //DEBUG
+      //bp->nonZeros=*(uint16_t*)(&inBuf[9]);
+      //bytePos=11;
+      bp->nonZeros=*(uint16_t*)(&inBuf[1]);
+      bytePos=3;
+      for(j=0;j<p->bSize;j++){
+          data[j]=0;
+      }
+      for(j=0;j<bp->nonZeros;j++){
+        //i0=*(uint16_t*)(&inBuf[bytePos])-1-p->idxOffset[0]; //i0
+        i0=*(uint16_t*)(&inBuf[bytePos]); //i0
+        bytePos+=2;
+        //i1=*(uint16_t*)(&inBuf[bytePos])-1-p->idxOffset[1]; //i1
+        i1=*(uint16_t*)(&inBuf[bytePos]); //i1
+        bytePos+=2;
+        //i2=*(uint16_t*)(&inBuf[bytePos])-1-p->idxOffset[2]; //i2
+        i2=*(uint16_t*)(&inBuf[bytePos]); //i2
+        bytePos+=2;
+        //i3=*(uint16_t*)(&inBuf[bytePos])-1-p->idxOffset[3]; //i3
+        i3=*(uint16_t*)(&inBuf[bytePos]); //i3
+        bytePos+=2;
+        _1DIdx=p->idxRange[3]*(i2+p->idxRange[2]*(i1+i0*p->idxRange[1]))+i3;
+        data[_1DIdx]=*(double*)(&inBuf[bytePos]);
+        bytePos+=8; 
+      }
+      //if(D_G){printf("\nDC:bytePos:%ld\n",bytePos);} //DEBUG
+      break;
+    //R:UCNonSparse
+    case 1:
+      //if(D_G){printf("\nDC:UCNonSparse\n");} //DEBUG
+      //memcpy(&outBuf[p->bSize*8], &inBuf[9], p->bSize*8);
+      memcpy(data, &inBuf[1], p->bSize*8);
+      bytePos=p->bSize*8;
+      //if(D_G){printf("\nDC:bytePos:%ld\n",bytePos);} //DEBUG
+      break;
+    //R:CSparse
+    case 2:
+      //if(D_G){printf("\nDC:CSparse\n");} //DEBUG
+      //for(j=0;j<p->bSize;j++){
+      //  data[j]=0;
+      //}
+      
+      //bp->patternBits=inBuf[13];
+      //bp->ECQBits=inBuf[14];      
+      
+      bp->patternBits=inBuf[5];
+      bp->ECQBits=inBuf[6];
+      
+      //if(D_R){printf("bp->patternBits:%d bp->ECQBits:%d bp->_1DIdxBits:%d\n",bp->patternBits,bp->ECQBits,bp->_1DIdxBits);} //DEBUG
+      
+      //bp->numOutliers=*(uint16_t*)(&inBuf[15]);
+      //bitPos=17*8;
+      bp->numOutliers=*(uint16_t*)(&inBuf[7]);
+      bitPos=9*8;
+      //if(D_R){printf("bp->numOutliers:%d\n",bp->numOutliers);} //DEBUG
+
+      bp->scalesBinSize=1/(double)(((uint64_t)1<<(bp->patternBits-1))-1);
+  
+      bp->binSize=p->usedEb*2;
+      
+      //if(D_R){printf("bp->scalesBinSize:%.6e bp->binSize:%.6e bp->scalesBinSize*bp->binSize:%.6e\n",bp->scalesBinSize,bp->binSize,bp->scalesBinSize*bp->binSize);} //DEBUG
+
+      for(j=0;j<p->sbSize;j++){
+        patternQ[j]=readBits_I64(inBuf,&bitPos,bp->patternBits);//Pattern point
+        //if(D_R){printf("R:patternQ[%d]=%ld\n",j,patternQ[j]);}
+      }
+      for(j=0;j<p->sbNum;j++){
+        scalesQ[j]=readBits_I64(inBuf,&bitPos,bp->patternBits);//Scale
+        //if(D_R){printf("R:scalesQ[%d]=%ld\n",j,scalesQ[j]);}
+      }
+      
+      /* //Splitting
+      for(j=0;j<p->bSize;j++){
+        data[j]=scalesQ[j/p->sbSize]*patternQ[j%p->sbSize]*bp->scalesBinSize*bp->binSize;
+      }
+      */
+      for(j=0;j<p->bSize;j++){
+        ECQ[j]=0;
+      }
+      switch(bp->ECQBits){
+        case 2:
+          for(j=0;j<bp->numOutliers;j++){
+            ////if(DEBUG){printf("readBits_UI64:%ld\n",readBits_UI64(inBuf,&bitPos,bp->_1DIdxBits));} //DEBUG
+            ////if(DEBUG){printf("readBits_UI64:%ld\n",readBits_I64(inBuf,&bitPos,2));} //DEBUG
+            
+            _1DIdx=readBits_UI64(inBuf,&bitPos,bp->_1DIdxBits);
+            ECQTemp=readBits_I64(inBuf,&bitPos,1);
+            ECQTemp= ((ECQTemp<<63)>>63)|(uint64_t)0x1;
+            ////if(D_R)printf("R:ECQ[%d]: %ld \n",_1DIdx,ECQTemp);
+            //continue;
+            //sb=_1DIdx/p->sbSize; 
+            //localIdx=_1DIdx%p->sbSize;
+            
+            ////data[_1DIdx]-=ECQTemp*bp->binSize;//Splitting
+            ECQ[_1DIdx]=ECQTemp;
+            
+            ////if(DEBUG){printf("decompressed[%d]:%.6e\n",_1DIdx,data[_1DIdx]);} //DEBUG
+          }
+          break;
+        default: //bp->ECQBits>2
+          //if(D_C){if(!((bp->ECQBits>=2)||((bp->ECQBits==1) && (bp->numOutliers==0)))){printf("ERROR: bp->ECQBits:%d bp->numOutliers:%d This should not have happened!\n",bp->ECQBits,bp->numOutliers);assert(0);}} //DEBUG
+    
+          for(j=0;j<bp->numOutliers;j++){
+            _1DIdx=readBits_UI64(inBuf,&bitPos,bp->_1DIdxBits);
+            //sb=_1DIdx/p->sbSize; 
+            //localIdx=_1DIdx%p->sbSize;
+            temp=readBits_UI64(inBuf,&bitPos,1);
+            ////if(DEBUG){printf("temp:%ld\n",temp);} //DEBUG
+            switch(temp){
+              case 0:  //+-1
+                ECQTemp=readBits_I64(inBuf,&bitPos,1);
+                ECQTemp= ((ECQTemp<<63)>>63)|(uint64_t)0x1;
+                ////if(DEBUG){printf("_1DIdx:%ld ECQTemp:0x%ld\n",_1DIdx,ECQTemp);} //DEBUG
+                ////if(D_R)printf("R:ECQ[%d]: %ld \n",_1DIdx,ECQTemp);
+                break;
+              case 1: //Others
+                ECQTemp=readBits_I64(inBuf,&bitPos,bp->ECQBits);
+                ////if(DEBUG){printf("_1DIdx:%ld ECQTemp:0x%ld\n",_1DIdx,ECQTemp);} //DEBUG
+                ////if(D_R)printf("R:ECQ[%d]: %ld \n",_1DIdx,ECQTemp);
+                break;
+              //default:
+              ////  printf("ERROR: Bad 2-bit value: 0x%lx",temp);
+              // assert(0); //AMG
+              //  break;
+            }
+            
+            //data[_1DIdx]-=ECQTemp*bp->binSize;//Splitting
+            ECQ[_1DIdx]=ECQTemp;
+            
+            ////if(DEBUG){printf("decompressed[%d]:%.6e\n",_1DIdx,data[_1DIdx]);} //DEBUG
+          }
+          break;
+      }
+      //static inline uint64_t readBits_UI64(unsigned char* buffer,uint64_t *bitPosPtr,uint64_t numBits){ // numBits must be in range [0:56]
+      //patternQ=(int64_t*)(inBuf+15); 
+      //scalesQ=(int64_t*)(inBuf+15+p->sbSize*8);
+      
+      bytePos=(bitPos+7)/8;
+      //if(D_G){printf("\nDC:bytePos:%ld\n",bytePos);} //DEBUG
+      
+      //STEP 2: PREDICT DATA(Includes INVERSE QUANTIZATION)
+      pastri_double_PredictData(p,bp,data,patternQ,scalesQ,ECQ);
+
+      break;
+    //R:CNonSparse
+    case 3:
+      //if(D_G){printf("\nDC:CNonSparse\n");} //DEBUG
+      
+      //for(j=0;j<p->bSize;j++){
+      //  data[j]=0;
+      //}
+      
+      //bp->patternBits=inBuf[13];
+      //bp->ECQBits=inBuf[14];
+      
+      bp->patternBits=inBuf[5];
+      bp->ECQBits=inBuf[6];
+      
+      //if(D_R){printf("bp->patternBits:%d bp->ECQBits:%d bp->_1DIdxBits:%d\n",bp->patternBits,bp->ECQBits,bp->_1DIdxBits);} //DEBUG
+      
+      //bitPos=15*8;
+      bitPos=7*8;
+
+      bp->scalesBinSize=1/(double)(((uint64_t)1<<(bp->patternBits-1))-1);
+      bp->binSize=p->usedEb*2;
+      
+      //if(D_R){printf("bp->scalesBinSize:%.6e bp->binSize:%.6e bp->scalesBinSize*bp->binSize:%.6e\n",bp->scalesBinSize,bp->binSize,bp->scalesBinSize*bp->binSize);} //DEBUG
+
+      for(j=0;j<p->sbSize;j++){
+        patternQ[j]=readBits_I64(inBuf,&bitPos,bp->patternBits);//Pattern point
+        //if(D_R){printf("R:patternQ[%d]=%ld\n",j,patternQ[j]);}
+      }
+      for(j=0;j<p->sbNum;j++){
+        scalesQ[j]=readBits_I64(inBuf,&bitPos,bp->patternBits);//Scale
+        //if(D_R){printf("R:scalesQ[%d]=%ld\n",j,scalesQ[j]);}
+      }
+      /* //Splitting
+      for(j=0;j<p->bSize;j++){
+        data[j]=scalesQ[j/p->sbSize]*patternQ[j%p->sbSize]*bp->scalesBinSize*bp->binSize;
+        ////if(DEBUG){printf("DC:PS[%d]=%.6e\n",j,data[j]);}
+      }
+      */
+      switch(bp->ECQBits){
+        case 2:
+          for(j=0;j<p->bSize;j++){
+            ////if(DEBUG){printf("readBits_UI64:%ld\n",readBits_UI64(inBuf,&bitPos,bp->_1DIdxBits));} //DEBUG
+            ////if(DEBUG){printf("readBits_UI64:%ld\n",readBits_I64(inBuf,&bitPos,2));} //DEBUG
+            //_1DIdx=readBits_UI64(inBuf,&bitPos,bp->_1DIdxBits);
+            temp=readBits_UI64(inBuf,&bitPos,1);
+            switch(temp){
+              case 0:
+                ECQTemp=readBits_I64(inBuf,&bitPos,1);
+                ECQTemp= ((ECQTemp<<63)>>63)|(uint64_t)0x1;
+                break;
+              case 1:
+                ECQTemp=0;
+                break;
+              default:
+                assert(0);
+                break;
+            }
+            
+            ////if(DEBUG){printf("_1DIdx:%ld ECQTemp:0x%ld\n",_1DIdx,ECQTemp);} //DEBUG
+            //continue;
+            //sb=_1DIdx/p->sbSize; 
+            //localIdx=_1DIdx%p->sbSize;
+            
+            //data[j]-=ECQTemp*bp->binSize; //Splitting
+            ECQ[j]=ECQTemp;
+            
+            ////if(DEBUG){printf("decompressed[%d]:%.6e\n",_1DIdx,data[_1DIdx]);} //DEBUG
+          }
+          break;
+        default: //bp->ECQBits>2
+          ////if(DEBUG)printf("AMG_R1:bitPos: %ld\n",bitPos);
+          
+          for(j=0;j<p->bSize;j++){
+            ////if(DEBUG){printf("AMG_R3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&inBuf[bitPos/8]));}; //DEBUG
+            ////if(DEBUG)printf("AMG_R2:bitPos: %ld\n",bitPos);
+
+            ////if(DEBUG){printf("readBits_UI64:%ld\n",readBits_UI64(inBuf,&bitPos,bp->_1DIdxBits));} //DEBUG
+            ////if(DEBUG){printf("readBits_UI64:%ld\n",readBits_I64(inBuf,&bitPos,2));} //DEBUG
+            //_1DIdx=readBits_UI64(inBuf,&bitPos,bp->_1DIdxBits);
+            temp=readBits_UI64(inBuf,&bitPos,1);
+            ////if(DEBUG){printf("AMG_R3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&inBuf[bitPos/8]));}; //DEBUG
+            switch(temp){
+              case 0:
+                ////if(DEBUG)printf("Read:0");
+                temp2=readBits_UI64(inBuf,&bitPos,1);
+                switch(temp2){
+                  case 0:
+                    ////if(DEBUG)printf("0");
+                    ECQTemp=readBits_I64(inBuf,&bitPos,1);
+                    ////if(DEBUG){printf("AMG_R3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&inBuf[bitPos/8]));}; //DEBUG
+                    ////if(DEBUG)printf("R:ECQTemp:%ld\n",ECQTemp);
+                    ECQTemp= ((ECQTemp<<63)>>63)|(uint64_t)0x1;
+                    ////if(DEBUG)printf("R:ECQ[%d]: %ld\n",j,ECQTemp);
+                    break;
+                  case 1:
+                    ////if(DEBUG)printf("1\n");
+                    ECQTemp=readBits_I64(inBuf,&bitPos,bp->ECQBits);
+                    ////if(DEBUG){printf("AMG_R3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&inBuf[bitPos/8]));}; //DEBUG
+                    ////if(DEBUG)printf("R:ECQ[%d]: %ld\n",j,ECQTemp);
+                    break;
+                  default:
+                    assert(0);
+                    break;
+                }
+                break;
+              case 1:
+                ////if(DEBUG)printf("Read:1\n");
+                ECQTemp=0;
+                ////if(DEBUG)printf("R:ECQ[%d]: %ld\n",j,ECQTemp);
+                break;
+              default:
+                assert(0);
+                break;
+            }
+            
+            ////if(DEBUG){printf("_1DIdx:%ld ECQTemp:0x%ld\n",_1DIdx,ECQTemp);} //DEBUG
+            //continue;
+            //sb=_1DIdx/p->sbSize; 
+            //localIdx=_1DIdx%p->sbSize;
+            
+            //data[j]-=ECQTemp*bp->binSize; //Splitting
+            ECQ[j]=ECQTemp;
+            
+            ////if(DEBUG){printf("DC:data[%d]:%.6e\n",j,data[j]);} //DEBUG
+          }
+          break;
+      }
+      //static inline uint64_t readBits_UI64(unsigned char* buffer,uint64_t *bitPosPtr,uint64_t numBits){ // numBits must be in range [0:56]
+      //patternQ=(int64_t*)(inBuf+15); 
+      //scalesQ=(int64_t*)(inBuf+15+p->sbSize*8);
+      bytePos=(bitPos+7)/8;
+      //if(D_G){printf("\nDC:bytePos:%ld\n",bytePos);} //DEBUG
+      
+      //STEP 2: PREDICT DATA(Includes INVERSE QUANTIZATION)
+      pastri_double_PredictData(p,bp,data,patternQ,scalesQ,ECQ);
+      break;
+      
+    default:
+      assert(0);
+      break;
+  } 
+  (*numReadBytes)=bytePos;
+}
+
+/* Decompress one block of doubles from inBuf into outBuf.
+ *
+ * Provides the per-block scratch arrays (on the stack) and a block-parameter
+ * struct, then delegates all the work to pastri_double_Decode, which decodes
+ * the stream, predicts the data and stores the number of consumed input bytes
+ * in *numReadBytes.
+ *
+ * dataSize is part of the interface but is not used by this function.
+ */
+static inline void pastri_double_Decompress(unsigned char*inBuf,int dataSize,pastri_params *p,unsigned char*outBuf,int *numReadBytes){
+  pastri_blockParams blockParams;
+
+  //Scratch space filled in by the decoder:
+  int64_t ECQ[MAX_BLOCK_SIZE];
+  int64_t scalesQ[MAX_PS_SIZE];
+  int64_t patternQ[MAX_PS_SIZE];
+
+  //DECODE (includes PREDICT DATA and INVERSE QUANTIZATION, both called from inside the decoder)
+  pastri_double_Decode(inBuf,p,&blockParams,outBuf,numReadBytes,patternQ,scalesQ,ECQ);
+}
+
+//inBuf vs Decompressed
+static inline int pastri_double_Check(unsigned char*inBuf,int dataSize,unsigned char*DC,pastri_params *p){
+  int i;
+  
+  double *data=(double*)(inBuf);
+  double *data_dc=(double*)(DC);
+  
+  //Comparing Indexes:
+  /*
+  for(i=0;i<p->bSize;i++){
+    if(idx0[i]!=idx0_dc[i]){
+      //printf("idx0[%d]=%d  !=  %d=idx0_dc[%d]",i,idx0[i],idx0_dc[i],i);
+      assert(0);
+    }
+    if(idx1[i]!=idx1_dc[i]){
+      //printf("idx1[%d]=%d  !=  %d=idx1_dc[%d]",i,idx1[i],idx1_dc[i],i);
+      assert(0);
+    }
+    if(idx2[i]!=idx2_dc[i]){
+      //printf("idx2[%d]=%d  !=  %d=idx2_dc[%d]",i,idx2[i],idx2_dc[i],i);
+      assert(0);
+    }
+    if(idx3[i]!=idx3_dc[i]){
+      //printf("idx3[%d]=%d  !=  %d=idx3_dc[%d]",i,idx3[i],idx3_dc[i],i);
+      assert(0);
+    }
+  }
+  */
+  
+  //Comparing Data:
+  for(i=0;i<p->bSize;i++){
+    if(abs_FastD(data[i]-data_dc[i])>p->usedEb){
+      //printf("|data[%d]-data_dc[%d]|>originalEb : %.3e - %.3e = %.3e > %.3e\n",i,i,data[i],data_dc[i],abs_FastD(data[i]-data_dc[i]),p->usedEb);
+      assert(0);
+    }
+  }
+  return 0;
+}
+
+
+#endif
--- a/deps/SZ/sz/include/pastriF.h
+++ b/deps/SZ/sz/include/pastriF.h
@ -0,0 +1,911 @@
+#ifndef PASTRIF_H
+#define PASTRIF_H
+
+/* Quantize x to a bin index: x/binSize rounded to the nearest integer, with
+ * halves rounded away from zero.
+ *
+ * The rounding adds 0.5 carrying the sign of the scaled value and then
+ * truncates through the integer cast. The sign is transferred by OR-ing the
+ * top bit of the value's 64-bit image onto the image of 0.5.
+ * (Assumes u_UI64I64D overlays a 64-bit floating-point member .d with the
+ * uint64_t member .ui64 - confirm against its declaration.)
+ */
+static inline int64_t pastri_float_quantize(float x, float binSize){
+  u_UI64I64D scaled,offset;
+
+  x=x/binSize;
+  scaled.d=x;
+  offset.d=0.5;
+
+  //Give the 0.5 offset the same sign as the scaled value:
+  offset.ui64 |= (scaled.ui64 & (uint64_t)0x8000000000000000);
+
+  return (int64_t)(x + offset.d);
+}
+
+/* STEP 1 of compression: derive the pattern, the scales and the quantized
+ * error corrections (ECQ) of one block.
+ *
+ * The block is treated as p->sbNum sub-blocks of p->sbSize values. The
+ * sub-block containing the element of largest magnitude becomes the pattern;
+ * every sub-block is then described by one scale relative to that pattern,
+ * and the remaining difference is stored as a quantized correction.
+ *
+ * data     : p->bSize input values.
+ * p        : reads bSize, sbSize, sbNum and usedEb.
+ * bp       : receives nonZeros, binSize, patternBits, scaleBits, scalesBinSize,
+ *            ECQExt, ECQ1s and ECQOthers.
+ * patternQ : receives p->sbSize quantized pattern points.
+ * scalesQ  : receives p->sbNum quantized scales.
+ * ECQ      : receives p->bSize quantized error corrections.
+ */
+static inline void pastri_float_PatternMatch(float*data,pastri_params* p,pastri_blockParams* bp,int64_t* patternQ,int64_t *scalesQ, int64_t* ECQ){
+  //Find the pattern.
+  //First, find the extremum point:
+  float absExt=0; //Absolute value of Extremum
+  //Index of Extremum. It starts at 0 rather than -1: when no element has a
+  //magnitude above 0 (e.g. an all-zero block) the index is never updated but is
+  //still used below, and -1 made data[extIdx] read before the start of the block.
+  int extIdx=0;
+  int i,sb;
+
+  bp->nonZeros=0;
+  for(i=0;i<p->bSize;i++){
+    if(abs_FastD(data[i])>p->usedEb){
+      bp->nonZeros++;
+    }
+    if(abs_FastD(data[i])>absExt){
+      absExt=abs_FastD(data[i]);
+      extIdx=i;
+    }
+  }
+
+  int patternIdx; //Starting Index of Pattern
+  patternIdx=(extIdx/p->sbSize)*p->sbSize;
+
+  float patternExt=data[extIdx];
+  bp->binSize=2*p->usedEb;
+
+  //Quantize the pattern sub-block:
+  for(i=0;i<p->sbSize;i++){
+    patternQ[i]=pastri_float_quantize(data[patternIdx+i],bp->binSize);
+  }
+
+  bp->patternBits=bitsNeeded_float((abs_FastD(patternExt)/bp->binSize)+1)+1;
+  bp->scaleBits=bp->patternBits;
+  bp->scalesBinSize=1/(float)(((uint64_t)1<<(bp->scaleBits-1))-1);
+
+  //Calculate Scales.
+  //Local extremum index. This is not the actual extremum of the current sb, but
+  //rather the index that corresponds to the global (block) extremum.
+  int localExtIdx=extIdx%p->sbSize;
+  //A zero extremum would make the scale a division by zero; use scale 0 instead.
+  int patternExtZero=(patternExt==0);
+  for(sb=0;sb<p->sbNum;sb++){
+    scalesQ[sb]=pastri_float_quantize((patternExtZero ? 0 : data[sb*p->sbSize+localExtIdx]/patternExt),bp->scalesBinSize);
+  }
+
+  //Error corrections: quantized difference between the scaled pattern and the data.
+  //While computing them, track the largest magnitude (ECQExt) and count the
+  //values equal to +-1 (ECQ1s) and those with a magnitude of 2 or more (ECQOthers).
+  bp->ECQExt=0;
+  int _1DIdx;
+  bp->ECQ1s=0;
+  bp->ECQOthers=0;
+  float PS_binSize=bp->scalesBinSize*bp->binSize;
+  for(sb=0;sb<p->sbNum;sb++){
+    for(i=0;i<p->sbSize;i++){
+      _1DIdx=sb*p->sbSize+i;
+      ECQ[_1DIdx]=pastri_float_quantize( (scalesQ[sb]*patternQ[i]*PS_binSize-data[_1DIdx]),bp->binSize );
+      float absECQ=abs_FastD(ECQ[_1DIdx]);
+      if(absECQ > bp->ECQExt)
+        bp->ECQExt=absECQ;
+      switch (ECQ[_1DIdx]){
+        case 0:
+          //Zeros are not counted; the encoder derives them from the other counters.
+          break;
+        case 1:
+          bp->ECQ1s++;
+          break;
+        case -1:
+          bp->ECQ1s++;
+          break;
+        default:
+          bp->ECQOthers++;
+          break;
+      }
+    }
+  }
+}
+
+/* STEP 2 of compression: pick the smallest of four encodings for the block and
+ * write it to outBuf.
+ *
+ * Modes (stored in outBuf[0]):
+ *   0 = UCSparse    : uncompressed; a 16-bit nonZeros count in outBuf[1:2], then
+ *                     {four 16-bit indexes, value} for every |value| > p->usedEb
+ *   1 = UCNonSparse : uncompressed; raw copy of the block starting at outBuf[1]
+ *   2 = CSparse     : compressed; pattern, scales, then {1D index, code} for
+ *                     every non-zero ECQ
+ *   3 = CNonSparse  : compressed; pattern, scales, then one code per element
+ * For modes 2 and 3, outBuf[1:4] holds the compressed byte count, outBuf[5]
+ * patternBits and outBuf[6] ECQBits; mode 2 also stores numOutliers in
+ * outBuf[7:8].
+ *
+ * A mode is chosen only when it is strictly smaller than all the others; any
+ * tie falls through to CNonSparse.
+ *
+ * data, patternQ, scalesQ, ECQ : the block and the arrays filled in by
+ *               pastri_float_PatternMatch.
+ * p           : block geometry (bSize, sbSize, sbNum, idxRange), error bound
+ *               (usedEb) and element size in bytes (dataSize).
+ * bp          : reads ECQExt, nonZeros, ECQ1s, ECQOthers, patternBits, scaleBits;
+ *               receives ECQBits, _1DIdxBits and numOutliers.
+ * outBuf      : destination; no bounds are checked here.
+ * numOutBytes : receives the size in bytes of the chosen encoding.
+ *
+ * Size estimates are kept in bits. Byte counts are converted with 8 bits per
+ * byte; the estimates used to be multiplied by p->dataSize, which is only
+ * equivalent for 8-byte elements and under-reported the output size (and
+ * contradicted the header offsets 7*8 / 9*8 used below) for any other element
+ * size.
+ */
+static inline void pastri_float_Encode(float *data,int64_t* patternQ,int64_t* scalesQ,int64_t* ECQ,pastri_params *p,pastri_blockParams* bp,unsigned char* outBuf,int *numOutBytes){
+  const int bitsPerByte=8;
+  const int idxBytes=4*2; //UCSparse stores four 16-bit indexes in front of every value
+
+  bp->ECQBits=bitsNeeded_UI64(bp->ECQExt)+1;
+  bp->_1DIdxBits=bitsNeeded_UI64(p->bSize);
+
+  int i;
+
+  unsigned int UCSparseBits;  //Uncompressed, Sparse bits. Includes: mode, nonZeros, {indexes, data}
+  unsigned int UCNonSparseBits;  //Uncompressed, NonSparse bits. Includes: mode, data
+  unsigned int CSparseBits;  //Includes: mode, compressedBytes, patternBits, ECQBits,numOutliers,P, S, {Indexes(Sparse), ECQ}
+  unsigned int CNonSparseBits;  //Includes: mode, compressedBytes, patternBits, ECQBits,P, S, {ECQ}
+
+  //ECQ1s counts the ECQ values in {1,-1}; ECQOthers counts those <=-2 or >=2.
+  //The number of zeros is implied: p->bSize-ECQ1s-ECQOthers.
+
+  UCSparseBits = bitsPerByte*(1 + 2 + bp->nonZeros*(idxBytes + p->dataSize));
+  UCNonSparseBits = bitsPerByte*(1 + p->bSize*p->dataSize);
+  bp->numOutliers=bp->ECQ1s+bp->ECQOthers;
+  if(bp->ECQBits==2){
+    CSparseBits = bitsPerByte*(1+4+1+1+2) + bp->patternBits*p->sbSize + bp->scaleBits*p->sbNum + bp->ECQ1s*(1+bp->_1DIdxBits);
+    CNonSparseBits = bitsPerByte*(1+4+1+1) + bp->patternBits*p->sbSize + bp->scaleBits*p->sbNum + p->bSize + bp->ECQ1s ;  //Or: ECQ0s+ECQ1s*2;
+  }else{ //ECQBits>2
+    CSparseBits = bitsPerByte*(1+4+1+1+2) + bp->patternBits*p->sbSize + bp->scaleBits*p->sbNum + bp->ECQ1s*(2+bp->_1DIdxBits) + bp->ECQOthers*(1+bp->_1DIdxBits+bp->ECQBits);
+    //Equivalent to: ECQ0s + ECQ1s*3 + ECQOthers*(2+ECQBits)
+    CNonSparseBits = bitsPerByte*(1+4+1+1)+ bp->patternBits*p->sbSize + bp->scaleBits*p->sbNum + p->bSize + bp->ECQ1s*2 + bp->ECQOthers*(1+bp->ECQBits);
+  }
+
+  int UCSparseBytes=(UCSparseBits+7)/8;
+  int UCNonSparseBytes=(UCNonSparseBits+7)/8;
+  int CSparseBytes=(CSparseBits+7)/8;
+  int CNonSparseBytes=(CNonSparseBits+7)/8;
+  uint64_t bitPos=0;
+  uint64_t bytePos=0;
+  int i0,i1,i2,i3;
+  int _1DIdx;
+
+  //****************************************************************************************
+  //W:UCSparse
+  if((UCSparseBytes<UCNonSparseBytes) && (UCSparseBytes<CSparseBytes) && (UCSparseBytes<CNonSparseBytes) ){
+    //Uncompressed, Sparse. Includes: mode, nonZeros, {indexes, data}
+    *numOutBytes=UCSparseBytes;
+    outBuf[0]=0; //mode
+
+    *(uint16_t*)(&outBuf[1])=bp->nonZeros;
+    bytePos=3;//0:mode, 1-2:NonZeros. So start from 3.
+
+    for(i0=0;i0<p->idxRange[0];i0++)
+      for(i1=0;i1<p->idxRange[1];i1++)
+        for(i2=0;i2<p->idxRange[2];i2++)
+          for(i3=0;i3<p->idxRange[3];i3++){
+            _1DIdx=p->idxRange[3]*(i2+p->idxRange[2]*(i1+i0*p->idxRange[1]))+i3;
+            if(abs_FastD(data[_1DIdx])>p->usedEb){
+              *(uint16_t*)(&outBuf[bytePos])=i0;
+              bytePos+=2;
+              *(uint16_t*)(&outBuf[bytePos])=i1;
+              bytePos+=2;
+              *(uint16_t*)(&outBuf[bytePos])=i2;
+              bytePos+=2;
+              *(uint16_t*)(&outBuf[bytePos])=i3;
+              bytePos+=2;
+
+              *(float*)(&outBuf[bytePos])=data[_1DIdx];
+              bytePos+=p->dataSize;
+            }
+          }
+
+  //****************************************************************************************
+  //W:UCNonSparse
+  }else if((UCNonSparseBytes<UCSparseBytes) && (UCNonSparseBytes<CSparseBytes) && (UCNonSparseBytes<CNonSparseBytes) ){
+    //Uncompressed, NonSparse. Includes: mode, data
+    *numOutBytes=UCNonSparseBytes;
+    outBuf[0]=1; //mode
+
+    memcpy(&outBuf[1], data, p->bSize*p->dataSize);
+
+  //****************************************************************************************
+  //W:CSparse
+  }else if((CSparseBytes<UCNonSparseBytes) && (CSparseBytes<UCSparseBytes) && (CSparseBytes<CNonSparseBytes) ){
+    //Includes: mode, compressedBytes, patternBits, ECQBits,numOutliers,P, S, {Indexes(Sparse), ECQ}
+    *numOutBytes=CSparseBytes;
+    outBuf[0]=2; //mode
+
+    //outBuf bytes [1:4] are reserved for compressedBytes.
+    outBuf[5]=bp->patternBits;
+    outBuf[6]=bp->ECQBits;
+    //Currently, we are at the end of 7th byte.
+
+    *(uint16_t*)(&outBuf[7])=bp->numOutliers;
+    //Now, we are at the end of 9th byte.
+    bitPos=9*8;
+
+    for(i=0;i<p->sbSize;i++){
+      writeBits_Fast(outBuf,&bitPos,bp->patternBits,patternQ[i]);//Pattern point
+    }
+    for(i=0;i<p->sbNum;i++){
+      writeBits_Fast(outBuf,&bitPos,bp->scaleBits,scalesQ[i]);//Scale
+    }
+
+    //Only non-zero ECQ values are written, each one preceded by its 1D index.
+    switch(bp->ECQBits){
+      case 2:
+        //Codes after the index: 0 -> +1, 1 -> -1
+        for(i=0;i<p->bSize;i++){
+          switch(ECQ[i]){
+            case 0:
+              break;
+            case 1:
+              writeBits_Fast(outBuf,&bitPos,bp->_1DIdxBits,i);
+              writeBits_Fast(outBuf,&bitPos,1,0);
+              break;
+            case -1:
+              writeBits_Fast(outBuf,&bitPos,bp->_1DIdxBits,i);
+              writeBits_Fast(outBuf,&bitPos,1,1);
+              break;
+            default:
+              assert(0);
+              break;
+          }
+        }
+        break;
+      default: //ECQBits>2
+        //Codes after the index: 00 -> +1, 01 -> -1, 1 followed by ECQBits bits -> any other value
+        for(i=0;i<p->bSize;i++){
+          switch(ECQ[i]){
+            case 0:
+              break;
+            case 1:
+              writeBits_Fast(outBuf,&bitPos,bp->_1DIdxBits,i);
+              writeBits_Fast(outBuf,&bitPos,1,0);
+              writeBits_Fast(outBuf,&bitPos,1,0);
+              break;
+            case -1:
+              writeBits_Fast(outBuf,&bitPos,bp->_1DIdxBits,i);
+              writeBits_Fast(outBuf,&bitPos,1,0);
+              writeBits_Fast(outBuf,&bitPos,1,1);
+              break;
+            default:
+              writeBits_Fast(outBuf,&bitPos,bp->_1DIdxBits,i);
+              writeBits_Fast(outBuf,&bitPos,1,1);
+              writeBits_Fast(outBuf,&bitPos,bp->ECQBits,ECQ[i]);
+              break;
+          }
+        }
+        break;
+    }
+
+    //Named differently from the outer bytePos so that it no longer shadows it.
+    uint32_t compressedBytes=(bitPos+7)/8;
+    *(uint32_t*)(&outBuf[1])=compressedBytes;
+
+    if(D_G){assert(bitPos==CSparseBits);}
+
+  //****************************************************************************************
+  //W:CNonSparse
+  }else {
+    //Includes: mode, compressedBytes, patternBits, ECQBits,P, S, {ECQ}
+    *numOutBytes=CNonSparseBytes;
+    outBuf[0]=3; //mode
+
+    //outBuf bytes [1:4] are reserved for compressedBytes.
+    outBuf[5]=bp->patternBits;
+    outBuf[6]=bp->ECQBits;
+    bitPos=7*8; //Currently, we are at the end of 7th byte.
+
+    for(i=0;i<p->sbSize;i++){
+      writeBits_Fast(outBuf,&bitPos,bp->patternBits,patternQ[i]);//Pattern point
+    }
+    for(i=0;i<p->sbNum;i++){
+      writeBits_Fast(outBuf,&bitPos,bp->scaleBits,scalesQ[i]);//Scale
+    }
+
+    //Every element gets a code; no indexes are written.
+    switch(bp->ECQBits){
+      case 2:
+        //Codes: 1 -> 0, 00 -> +1, 01 -> -1
+        for(i=0;i<p->bSize;i++){
+          switch(ECQ[i]){
+            case 0:
+              writeBits_Fast(outBuf,&bitPos,1,1);
+              break;
+            case 1:
+              writeBits_Fast(outBuf,&bitPos,1,0);
+              writeBits_Fast(outBuf,&bitPos,1,0);
+              break;
+            case -1:
+              writeBits_Fast(outBuf,&bitPos,1,0);
+              writeBits_Fast(outBuf,&bitPos,1,1);
+              break;
+            default:
+              assert(0);
+              break;
+          }
+        }
+        break;
+      default: //ECQBits>2
+        //Codes: 1 -> 0, 000 -> +1, 001 -> -1, 01 followed by ECQBits bits -> any other value
+        for(i=0;i<p->bSize;i++){
+          switch(ECQ[i]){
+            case 0:
+              writeBits_Fast(outBuf,&bitPos,1,1);
+              break;
+            case 1:
+              writeBits_Fast(outBuf,&bitPos,1,0);
+              writeBits_Fast(outBuf,&bitPos,1,0);
+              writeBits_Fast(outBuf,&bitPos,1,0);
+              break;
+            case -1:
+              writeBits_Fast(outBuf,&bitPos,1,0);
+              writeBits_Fast(outBuf,&bitPos,1,0);
+              writeBits_Fast(outBuf,&bitPos,1,1);
+              break;
+            default:
+              writeBits_Fast(outBuf,&bitPos,1,0);
+              writeBits_Fast(outBuf,&bitPos,1,1);
+              writeBits_Fast(outBuf,&bitPos,bp->ECQBits,ECQ[i]);
+              break;
+          }
+        }
+        break;
+    }
+
+    //Named differently from the outer bytePos so that it no longer shadows it.
+    uint32_t compressedBytes=(bitPos+7)/8;
+    *(uint32_t*)(&outBuf[1])=compressedBytes;
+
+    if(D_G){assert(bitPos==CNonSparseBits);}
+
+  }
+}
+/* Compress one block of floats from inBuf into outBuf.
+ *
+ * inBuf is reinterpreted as an array of float. The block goes through two
+ * steps: pattern matching (which fills the pattern, scale and error-correction
+ * arrays plus the block parameters) and encoding (which writes outBuf and
+ * stores the output size in *numOutBytes).
+ *
+ * Always returns 0.
+ */
+static inline int pastri_float_Compress(unsigned char*inBuf,pastri_params *p,unsigned char*outBuf,int *numOutBytes){
+  float *block=(float*)inBuf;
+  pastri_blockParams blockParams;
+
+  //Scratch space shared by the two steps:
+  int64_t ECQ[MAX_BLOCK_SIZE];
+  int64_t scalesQ[MAX_PS_SIZE];
+  int64_t patternQ[MAX_PS_SIZE];
+
+  //STEP 0: PREPROCESSING (flattening the block, determining the period, etc.) is currently not needed.
+
+  //STEP 1: PATTERN MATCH
+  pastri_float_PatternMatch(block,p,&blockParams,patternQ,scalesQ,ECQ);
+
+  //STEP 2: ENCODING (includes QUANTIZE)
+  pastri_float_Encode(block,patternQ,scalesQ,ECQ,p,&blockParams,outBuf,numOutBytes);
+
+  return 0;
+}
+
+/* Map a bin index back to a value: the index q multiplied by the bin width binSize. */
+static inline float pastri_float_InverseQuantization(int64_t q, float binSize){
+  const float binIndex=(float)q;
+  return binIndex*binSize;
+}
+
+/* Rebuild the p->bSize values of a block from its quantized representation.
+ *
+ * Element idx is the de-quantized product of its sub-block's scale
+ * (scalesQ[idx/p->sbSize]) and its pattern point (patternQ[idx%p->sbSize]),
+ * minus the de-quantized error correction ECQ[idx]. The product uses the bin
+ * width bp->scalesBinSize*bp->binSize, the correction uses bp->binSize.
+ */
+static inline void pastri_float_PredictData(pastri_params *p,pastri_blockParams *bp,float *data,int64_t* patternQ,int64_t* scalesQ,int64_t* ECQ){
+  const float PS_binSize=bp->scalesBinSize*bp->binSize;
+  int idx;
+
+  for(idx=0;idx<p->bSize;idx++){
+    const int64_t predictionQ=scalesQ[idx/p->sbSize]*patternQ[idx%p->sbSize];
+    const float prediction=pastri_float_InverseQuantization(predictionQ,PS_binSize);
+    const float correction=pastri_float_InverseQuantization(ECQ[idx],bp->binSize);
+    data[idx]=prediction-correction;
+  }
+}
+/*
+ * pastri_float_Decode: decodes one encoded block from inBuf into p->bSize floats at outBuf.
+ *
+ *   inBuf        - encoded block; inBuf[0] is the mode byte (0..3, see the switch below).
+ *   p            - block parameters; read here: bSize, sbSize, sbNum, idxRange[1..3], usedEb.
+ *   bp           - per-block working parameters; _1DIdxBits is always set, the other fields
+ *                  (nonZeros, patternBits, ECQBits, numOutliers, scalesBinSize, binSize) depend on the mode.
+ *   outBuf       - destination, used as a float array.
+ *   numReadBytes - out: receives the final value of bytePos.
+ *   patternQ, scalesQ, ECQ - caller-provided scratch arrays; filled in modes 2 and 3 and then passed
+ *                  to pastri_float_PredictData, which writes the output values.
+ *
+ * A mode byte outside 0..3 hits assert(0).
+ * Multi-byte header fields are read through casted pointers at odd offsets (e.g. *(uint16_t*)&inBuf[1]),
+ * so the code relies on the platform tolerating unaligned loads.
+ */
+static inline void pastri_float_Decode(unsigned char*inBuf,pastri_params *p,pastri_blockParams *bp,unsigned char*outBuf,int *numReadBytes,int64_t* patternQ,int64_t* scalesQ,int64_t* ECQ){
+  int j;
+  bp->_1DIdxBits=bitsNeeded_UI64(p->bSize);
+  //float *data=(float*)(outBuf+p->bSize*8);
+  float *data=(float*)(outBuf);
+  int i0,i1,i2,i3;
+  //uint16_t *idx0,*idx1,*idx2,*idx3;
+  int _1DIdx;
+
+  int64_t ECQTemp;
+  uint64_t bytePos=0;
+  uint64_t bitPos=0;
+  uint64_t temp,temp2;
+  //int sb,localIdx;
+
+  
+  //idx0=(uint16_t*)(outBuf           );
+  //idx1=(uint16_t*)(outBuf+p->bSize*2);
+  //idx2=(uint16_t*)(outBuf+p->bSize*4);
+  //idx3=(uint16_t*)(outBuf+p->bSize*6);
+  //p->idxOffset[0]=*(uint32_t*)(&inBuf[1]);
+  //p->idxOffset[1]=*(uint32_t*)(&inBuf[3]);
+  //p->idxOffset[2]=*(uint32_t*)(&inBuf[5]);
+  //p->idxOffset[3]=*(uint32_t*)(&inBuf[7]);
+  /*
+  for(i0=0;i0<p->idxRange[0];i0++)
+    for(i1=0;i1<p->idxRange[1];i1++)
+      for(i2=0;i2<p->idxRange[2];i2++)
+        for(i3=0;i3<p->idxRange[3];i3++){
+            //_1DIdx=i0*p->idxRange[1]*p->idxRange[2]*p->idxRange[3]+i1*p->idxRange[2]*p->idxRange[3]+i2*p->idxRange[3]+i3;
+            _1DIdx=p->idxRange[3]*(i2+p->idxRange[2]*(i1+i0*p->idxRange[1]))+i3;
+            idx0[_1DIdx]=i0+1+p->idxOffset[0];
+            idx1[_1DIdx]=i1+1+p->idxOffset[1];
+            idx2[_1DIdx]=i2+1+p->idxOffset[2];
+            idx3[_1DIdx]=i3+1+p->idxOffset[3];
+        }
+  */
+  
+  //*numOutBytes=p->bSize*16;  
+  
+  //inBuf[0] is "mode": 0=UCSparse, 1=UCNonSparse, 2=CSparse, 3=CNonSparse (names taken from the case labels below)
+  switch(inBuf[0]){
+    //R:UCSparse
+    case 0:
+      //Layout: uint16 nonZeros at inBuf[1]; then, per entry, four uint16 indices (i0..i3) followed by the value. All other outputs are 0.
+      //bp->nonZeros=*(uint16_t*)(&inBuf[9]);
+      //bytePos=11;
+      bp->nonZeros=*(uint16_t*)(&inBuf[1]);
+      bytePos=3;
+      for(j=0;j<p->bSize;j++){
+          data[j]=0;
+      }
+      for(j=0;j<bp->nonZeros;j++){
+        //i0=*(uint16_t*)(&inBuf[bytePos])-1-p->idxOffset[0]; //i0
+        i0=*(uint16_t*)(&inBuf[bytePos]); //i0
+        bytePos+=2;
+        //i1=*(uint16_t*)(&inBuf[bytePos])-1-p->idxOffset[1]; //i1
+        i1=*(uint16_t*)(&inBuf[bytePos]); //i1
+        bytePos+=2;
+        //i2=*(uint16_t*)(&inBuf[bytePos])-1-p->idxOffset[2]; //i2
+        i2=*(uint16_t*)(&inBuf[bytePos]); //i2
+        bytePos+=2;
+        //i3=*(uint16_t*)(&inBuf[bytePos])-1-p->idxOffset[3]; //i3
+        i3=*(uint16_t*)(&inBuf[bytePos]); //i3
+        bytePos+=2;
+        _1DIdx=p->idxRange[3]*(i2+p->idxRange[2]*(i1+i0*p->idxRange[1]))+i3;
+        data[_1DIdx]=*(float*)(&inBuf[bytePos]);
+        bytePos+=8; //NOTE(review): advances 8 bytes although the value above is read as a float - confirm against the record size the encoder writes
+      }
+      //if(D_G){printf("\nDC:bytePos:%ld\n",bytePos);} //DEBUG
+      break;
+    //R:UCNonSparse
+    case 1:
+      //Raw copy of p->bSize*8 bytes starting right after the mode byte. NOTE(review): data is a float array, so bSize*8 bytes is twice bSize*sizeof(float) when float is 4 bytes, and bytePos below does not count the mode byte - confirm both against the encoder and the size of outBuf.
+      //memcpy(&outBuf[p->bSize*8], &inBuf[9], p->bSize*8);
+      memcpy(data, &inBuf[1], p->bSize*8);
+      bytePos=p->bSize*8;
+      //if(D_G){printf("\nDC:bytePos:%ld\n",bytePos);} //DEBUG
+      break;
+    //R:CSparse
+    case 2:
+      //Header: patternBits at inBuf[5], ECQBits at inBuf[6], uint16 numOutliers at inBuf[7]; the bit stream starts at byte 9. inBuf[1..4] are not read here.
+      //for(j=0;j<p->bSize;j++){
+      //  data[j]=0;
+      //}
+      
+      //bp->patternBits=inBuf[13];
+      //bp->ECQBits=inBuf[14];      
+      
+      bp->patternBits=inBuf[5];
+      bp->ECQBits=inBuf[6];
+      
+      //if(D_R){printf("bp->patternBits:%d bp->ECQBits:%d bp->_1DIdxBits:%d\n",bp->patternBits,bp->ECQBits,bp->_1DIdxBits);} //DEBUG
+      
+      //bp->numOutliers=*(uint16_t*)(&inBuf[15]);
+      //bitPos=17*8;
+      bp->numOutliers=*(uint16_t*)(&inBuf[7]);
+      bitPos=9*8;
+      //if(D_R){printf("bp->numOutliers:%d\n",bp->numOutliers);} //DEBUG
+
+      bp->scalesBinSize=1/(float)(((uint64_t)1<<(bp->patternBits-1))-1);
+      //NOTE(review): patternBits comes straight from the input; 1 makes the divisor above 0 and 0 makes the shift count patternBits-1 invalid - presumably the encoder never emits these, confirm.
+      bp->binSize=p->usedEb*2;
+      
+      //if(D_R){printf("bp->scalesBinSize:%.6e bp->binSize:%.6e bp->scalesBinSize*bp->binSize:%.6e\n",bp->scalesBinSize,bp->binSize,bp->scalesBinSize*bp->binSize);} //DEBUG
+
+      for(j=0;j<p->sbSize;j++){
+        patternQ[j]=readBits_I64(inBuf,&bitPos,bp->patternBits);//Pattern point
+        //if(D_R){printf("R:patternQ[%d]=%ld\n",j,patternQ[j]);}
+      }
+      for(j=0;j<p->sbNum;j++){
+        scalesQ[j]=readBits_I64(inBuf,&bitPos,bp->patternBits);//Scale
+        //if(D_R){printf("R:scalesQ[%d]=%ld\n",j,scalesQ[j]);}
+      }
+      
+      /* //Splitting
+      for(j=0;j<p->bSize;j++){
+        data[j]=scalesQ[j/p->sbSize]*patternQ[j%p->sbSize]*bp->scalesBinSize*bp->binSize;
+      }
+      */
+      for(j=0;j<p->bSize;j++){
+        ECQ[j]=0;
+      }
+      switch(bp->ECQBits){
+        case 2:
+          for(j=0;j<bp->numOutliers;j++){
+            //Each outlier: a _1DIdxBits-wide position followed by one sign bit.
+            ////if(DEBUG){printf("readBits_UI64:%ld\n",readBits_I64(inBuf,&bitPos,2));} //DEBUG
+            
+            _1DIdx=readBits_UI64(inBuf,&bitPos,bp->_1DIdxBits);
+            ECQTemp=readBits_I64(inBuf,&bitPos,1);
+            ECQTemp= ((ECQTemp<<63)>>63)|(uint64_t)0x1; //sign bit only: a 0 bit yields +1, a 1 bit yields -1
+            ////if(D_R)printf("R:ECQ[%d]: %ld \n",_1DIdx,ECQTemp);
+            //continue;
+            //sb=_1DIdx/p->sbSize; 
+            //localIdx=_1DIdx%p->sbSize;
+            
+            ////data[_1DIdx]-=ECQTemp*bp->binSize;//Splitting
+            ECQ[_1DIdx]=ECQTemp;
+            
+            ////if(DEBUG){printf("decompressed[%d]:%.6e\n",_1DIdx,data[_1DIdx]);} //DEBUG
+          }
+          break;
+        default: //bp->ECQBits>2
+          //if(D_C){if(!((bp->ECQBits>=2)||((bp->ECQBits==1) && (bp->numOutliers==0)))){printf("ERROR: bp->ECQBits:%d bp->numOutliers:%d This should not have happened!\n",bp->ECQBits,bp->numOutliers);assert(0);}} //DEBUG
+    
+          for(j=0;j<bp->numOutliers;j++){
+            _1DIdx=readBits_UI64(inBuf,&bitPos,bp->_1DIdxBits);
+            //sb=_1DIdx/p->sbSize; 
+            //localIdx=_1DIdx%p->sbSize;
+            temp=readBits_UI64(inBuf,&bitPos,1);
+            //temp is a single bit, so only the two cases below can occur.
+            switch(temp){
+              case 0:  //+-1
+                ECQTemp=readBits_I64(inBuf,&bitPos,1);
+                ECQTemp= ((ECQTemp<<63)>>63)|(uint64_t)0x1;
+                ////if(DEBUG){printf("_1DIdx:%ld ECQTemp:0x%ld\n",_1DIdx,ECQTemp);} //DEBUG
+                ////if(D_R)printf("R:ECQ[%d]: %ld \n",_1DIdx,ECQTemp);
+                break;
+              case 1: //Others
+                ECQTemp=readBits_I64(inBuf,&bitPos,bp->ECQBits);
+                ////if(DEBUG){printf("_1DIdx:%ld ECQTemp:0x%ld\n",_1DIdx,ECQTemp);} //DEBUG
+                ////if(D_R)printf("R:ECQ[%d]: %ld \n",_1DIdx,ECQTemp);
+                break;
+              //default:
+              ////  printf("ERROR: Bad 2-bit value: 0x%lx",temp);
+              // assert(0); //AMG
+              //  break;
+            }
+            
+            //data[_1DIdx]-=ECQTemp*bp->binSize;//Splitting
+            ECQ[_1DIdx]=ECQTemp;
+            
+            ////if(DEBUG){printf("decompressed[%d]:%.6e\n",_1DIdx,data[_1DIdx]);} //DEBUG
+          }
+          break;
+      }
+      //static inline uint64_t readBits_UI64(unsigned char* buffer,uint64_t *bitPosPtr,uint64_t numBits){ // numBits must be in range [0:56]
+      //patternQ=(int64_t*)(inBuf+15); 
+      //scalesQ=(int64_t*)(inBuf+15+p->sbSize*8);
+      
+      bytePos=(bitPos+7)/8;
+      //if(D_G){printf("\nDC:bytePos:%ld\n",bytePos);} //DEBUG
+      
+      //STEP 2: PREDICT DATA(Includes INVERSE QUANTIZATION)
+      pastri_float_PredictData(p,bp,data,patternQ,scalesQ,ECQ);
+
+      break;
+    //R:CNonSparse
+    case 3:
+      //Header: patternBits at inBuf[5], ECQBits at inBuf[6]; the bit stream starts at byte 7 and carries one ECQ code for every one of the p->bSize points.
+      
+      //for(j=0;j<p->bSize;j++){
+      //  data[j]=0;
+      //}
+      
+      //bp->patternBits=inBuf[13];
+      //bp->ECQBits=inBuf[14];
+      
+      bp->patternBits=inBuf[5];
+      bp->ECQBits=inBuf[6];
+      
+      //if(D_R){printf("bp->patternBits:%d bp->ECQBits:%d bp->_1DIdxBits:%d\n",bp->patternBits,bp->ECQBits,bp->_1DIdxBits);} //DEBUG
+      
+      //bitPos=15*8;
+      bitPos=7*8;
+
+      bp->scalesBinSize=1/(float)(((uint64_t)1<<(bp->patternBits-1))-1);
+      bp->binSize=p->usedEb*2;
+      
+      //if(D_R){printf("bp->scalesBinSize:%.6e bp->binSize:%.6e bp->scalesBinSize*bp->binSize:%.6e\n",bp->scalesBinSize,bp->binSize,bp->scalesBinSize*bp->binSize);} //DEBUG
+
+      for(j=0;j<p->sbSize;j++){
+        patternQ[j]=readBits_I64(inBuf,&bitPos,bp->patternBits);//Pattern point
+        //if(D_R){printf("R:patternQ[%d]=%ld\n",j,patternQ[j]);}
+      }
+      for(j=0;j<p->sbNum;j++){
+        scalesQ[j]=readBits_I64(inBuf,&bitPos,bp->patternBits);//Scale
+        //if(D_R){printf("R:scalesQ[%d]=%ld\n",j,scalesQ[j]);}
+      }
+      /* //Splitting
+      for(j=0;j<p->bSize;j++){
+        data[j]=scalesQ[j/p->sbSize]*patternQ[j%p->sbSize]*bp->scalesBinSize*bp->binSize;
+        ////if(DEBUG){printf("DC:PS[%d]=%.6e\n",j,data[j]);}
+      }
+      */
+      switch(bp->ECQBits){
+        case 2:
+          for(j=0;j<p->bSize;j++){
+            //Per point: flag bit 1 means ECQ 0; flag bit 0 is followed by one sign bit giving +1 or -1.
+            ////if(DEBUG){printf("readBits_UI64:%ld\n",readBits_I64(inBuf,&bitPos,2));} //DEBUG
+            //_1DIdx=readBits_UI64(inBuf,&bitPos,bp->_1DIdxBits);
+            temp=readBits_UI64(inBuf,&bitPos,1);
+            switch(temp){
+              case 0:
+                ECQTemp=readBits_I64(inBuf,&bitPos,1);
+                ECQTemp= ((ECQTemp<<63)>>63)|(uint64_t)0x1;
+                break;
+              case 1:
+                ECQTemp=0;
+                break;
+              default:
+                assert(0);
+                break;
+            }
+            
+            ////if(DEBUG){printf("_1DIdx:%ld ECQTemp:0x%ld\n",_1DIdx,ECQTemp);} //DEBUG
+            //continue;
+            //sb=_1DIdx/p->sbSize; 
+            //localIdx=_1DIdx%p->sbSize;
+            
+            //data[j]-=ECQTemp*bp->binSize; //Splitting
+            ECQ[j]=ECQTemp;
+            
+            ////if(DEBUG){printf("decompressed[%d]:%.6e\n",_1DIdx,data[_1DIdx]);} //DEBUG
+          }
+          break;
+        default: //bp->ECQBits>2
+          //Per point: flag bit 1 means ECQ 0; bits 0,0 are followed by one sign bit (+1/-1); bits 0,1 are followed by a signed ECQBits-wide value.
+          
+          for(j=0;j<p->bSize;j++){
+            ////if(DEBUG){printf("AMG_R3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&inBuf[bitPos/8]));}; //DEBUG
+            ////if(DEBUG)printf("AMG_R2:bitPos: %ld\n",bitPos);
+
+            ////if(DEBUG){printf("readBits_UI64:%ld\n",readBits_UI64(inBuf,&bitPos,bp->_1DIdxBits));} //DEBUG
+            ////if(DEBUG){printf("readBits_UI64:%ld\n",readBits_I64(inBuf,&bitPos,2));} //DEBUG
+            //_1DIdx=readBits_UI64(inBuf,&bitPos,bp->_1DIdxBits);
+            temp=readBits_UI64(inBuf,&bitPos,1);
+            ////if(DEBUG){printf("AMG_R3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&inBuf[bitPos/8]));}; //DEBUG
+            switch(temp){
+              case 0:
+                ////if(DEBUG)printf("Read:0");
+                temp2=readBits_UI64(inBuf,&bitPos,1);
+                switch(temp2){
+                  case 0:
+                    ////if(DEBUG)printf("0");
+                    ECQTemp=readBits_I64(inBuf,&bitPos,1);
+                    ////if(DEBUG){printf("AMG_R3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&inBuf[bitPos/8]));}; //DEBUG
+                    ////if(DEBUG)printf("R:ECQTemp:%ld\n",ECQTemp);
+                    ECQTemp= ((ECQTemp<<63)>>63)|(uint64_t)0x1;
+                    ////if(DEBUG)printf("R:ECQ[%d]: %ld\n",j,ECQTemp);
+                    break;
+                  case 1:
+                    ////if(DEBUG)printf("1\n");
+                    ECQTemp=readBits_I64(inBuf,&bitPos,bp->ECQBits);
+                    ////if(DEBUG){printf("AMG_R3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&inBuf[bitPos/8]));}; //DEBUG
+                    ////if(DEBUG)printf("R:ECQ[%d]: %ld\n",j,ECQTemp);
+                    break;
+                  default:
+                    assert(0);
+                    break;
+                }
+                break;
+              case 1:
+                ////if(DEBUG)printf("Read:1\n");
+                ECQTemp=0;
+                ////if(DEBUG)printf("R:ECQ[%d]: %ld\n",j,ECQTemp);
+                break;
+              default:
+                assert(0);
+                break;
+            }
+            
+            ////if(DEBUG){printf("_1DIdx:%ld ECQTemp:0x%ld\n",_1DIdx,ECQTemp);} //DEBUG
+            //continue;
+            //sb=_1DIdx/p->sbSize; 
+            //localIdx=_1DIdx%p->sbSize;
+            
+            //data[j]-=ECQTemp*bp->binSize; //Splitting
+            ECQ[j]=ECQTemp;
+            
+            ////if(DEBUG){printf("DC:data[%d]:%.6e\n",j,data[j]);} //DEBUG
+          }
+          break;
+      }
+      //static inline uint64_t readBits_UI64(unsigned char* buffer,uint64_t *bitPosPtr,uint64_t numBits){ // numBits must be in range [0:56]
+      //patternQ=(int64_t*)(inBuf+15); 
+      //scalesQ=(int64_t*)(inBuf+15+p->sbSize*8);
+      bytePos=(bitPos+7)/8;
+      //if(D_G){printf("\nDC:bytePos:%ld\n",bytePos);} //DEBUG
+      
+      //STEP 2: PREDICT DATA(Includes INVERSE QUANTIZATION)
+      pastri_float_PredictData(p,bp,data,patternQ,scalesQ,ECQ);
+      break;
+      
+    default:
+      assert(0);
+      break;
+  } 
+  (*numReadBytes)=bytePos;
+}
+
+//Decompresses one block: allocates the quantization scratch arrays on the stack and hands
+//everything to pastri_float_Decode, which performs the decoding and the data prediction.
+//dataSize is accepted but not used.
+static inline void pastri_float_Decompress(unsigned char*inBuf,int dataSize,pastri_params *p,unsigned char*outBuf,int *numReadBytes){
+  pastri_blockParams blockParams; //filled in by pastri_float_Decode
+
+  //Scratch space for the quantized pattern, scales and error-correction values
+  int64_t quantPattern[MAX_PS_SIZE];
+  int64_t quantScales[MAX_PS_SIZE];
+  int64_t quantEC[MAX_BLOCK_SIZE];
+
+  //STEP 1: DECODE (the remaining steps run inside pastri_float_Decode)
+  pastri_float_Decode(inBuf,p,&blockParams,outBuf,numReadBytes,quantPattern,quantScales,quantEC);
+}
+
+//inBuf vs Decompressed
+//Verifies that every decompressed value lies within the error bound of the original value.
+//  inBuf    - original block, read as p->bSize floats
+//  dataSize - not used
+//  DC       - decompressed block, read as p->bSize floats
+//  p        - supplies bSize (number of values) and usedEb (the allowed absolute difference)
+//Returns 0 when |inBuf[i]-DC[i]| <= p->usedEb for every i.
+//On the first violation assert(0) fires; when assertions are compiled out (NDEBUG) the function
+//returns 1 instead of silently reporting success.
+static inline int pastri_float_Check(unsigned char*inBuf,int dataSize,unsigned char*DC,pastri_params *p){
+  int i;
+  
+  float *data=(float*)(inBuf);
+  float *data_dc=(float*)(DC);
+  
+  (void)dataSize; //kept only for interface compatibility
+  
+  //Comparing Data:
+  for(i=0;i<p->bSize;i++){
+    if(abs_FastD(data[i]-data_dc[i])>p->usedEb){
+      assert(0);
+      return 1; //only reached when assert() is disabled
+    }
+  }
+  return 0;
+}
+
+
+#endif
--- a/deps/SZ/sz/include/pastriGeneral.h
+++ b/deps/SZ/sz/include/pastriGeneral.h
@ -0,0 +1,205 @@
+#ifndef PASTRIGENERAL_H
+#define PASTRIGENERAL_H
+
+
+//Returns |x| without a branch: the double is viewed through the u_UI64I64D union and the
+//top bit (bit 63) of its 64-bit pattern - the sign bit of an IEEE-754 double - is cleared.
+static inline double abs_FastD(double x){
+  u_UI64I64D pattern;
+  pattern.d=x;
+  pattern.ui64=pattern.ui64&(int64_t)0x7FFFFFFFFFFFFFFF;
+  return pattern.d;
+}
+
+//Returns the absolute value of x without a branch.
+//The arithmetic is done on the unsigned bit pattern, so it depends neither on how a negative
+//value is right-shifted nor on signed overflow.
+//INT64_MIN has no positive counterpart: its bit pattern comes back unchanged, i.e. the result
+//is still INT64_MIN on two's-complement implementations.
+static inline int64_t abs_FastI64(int64_t x){
+  uint64_t ux=(uint64_t)x;
+  uint64_t signMask=(uint64_t)0-(ux>>63); //all ones when x is negative, zero otherwise
+  return (int64_t)((ux^signMask)-signMask); //negative: (~ux)+1 == -x ; non-negative: ux unchanged
+}
+/*
+int abs(int x) {
+   int mask = (x >> (sizeof(int) * CHAR_BIT - 1));
+   return (x + mask) ^ mask;
+}
+*/
+
+
+
+
+//Returns the min. bits needed to represent the integer part of |x|.
+//The value is taken from the exponent field of the double: (biased exponent - 1022),
+//which equals floor(log2(|x|))+1 for normal numbers (1.0 -> 1, 2.0 -> 2, 3.0 -> 2, 4.0 -> 3).
+//Returns 0 for x==0; the result is 0 for 0.5<=|x|<1 and negative for smaller non-zero magnitudes.
+//Assumes u_UI64I64D overlays an unsigned 64-bit member (ui64) on the double member (d).
+//REMEMBER: To represent the whole range [-x:x], the number of bits required is bitsNeeded(x)+1
+static inline int bitsNeeded_double(double x){
+  u_UI64I64D u1;
+  int nonZeroMask=-(int)(x!=0); //all ones when x!=0, zero otherwise (avoids shifting a 1 into the sign bit of int)
+  u1.d=x;
+  //(ui64<<1)>>53 drops the sign bit and keeps the 11 exponent bits
+  return ((int)((u1.ui64<<1)>>53)-1022) & nonZeroMask;
+}
+
+//Returns the min. bits needed to represent the integer part of |x|.
+//x is widened to double and the value is taken from the double's exponent field:
+//(biased exponent - 1022), i.e. floor(log2(|x|))+1 for normal numbers (1.0f -> 1, 4.0f -> 3).
+//Returns 0 for x==0; the result is 0 for 0.5<=|x|<1 and negative for smaller non-zero magnitudes.
+//Assumes u_UI64I64D overlays an unsigned 64-bit member (ui64) on the double member (d).
+//NEEDS OPTIMIZATION!
+static inline int bitsNeeded_float(float x){
+  u_UI64I64D u1;
+  int nonZeroMask=-(int)(x!=0); //all ones when x!=0, zero otherwise (avoids shifting a 1 into the sign bit of int)
+  u1.d=x; //Casting to Double!
+  //(ui64<<1)>>53 drops the sign bit and keeps the 11 exponent bits
+  return ((int)((u1.ui64<<1)>>53)-1022) & nonZeroMask;
+}
+
+//Returns the bit length of x: the index of its highest set bit plus one, and 0 for x==0.
+//Implemented as a binary search: for each width 32,16,8,4,2,1 the upper part of the
+//remaining value is tested and, if it is non-zero, shifted down and counted.
+static inline int bitsNeeded_UI64(uint64_t x){
+  int bits=0;
+  int width;
+  
+  for(width=32;width>0;width>>=1){
+    if((x>>width)!=0){
+      x>>=width;
+      bits+=width;
+    }
+  }
+  
+  //x is now 0 or 1; a remaining 1 is the highest set bit itself
+  if(x!=0){
+    bits++;
+  }
+  return bits;
+}
+
+//Returns the bit length of |x|: abs_FastI64 supplies the magnitude, bitsNeeded_UI64 counts its bits.
+static inline int bitsNeeded_I64(int64_t x){
+  uint64_t magnitude=abs_FastI64(x);
+  return bitsNeeded_UI64(magnitude);
+}
+
+//Implementations(They are inline, so they should be in this header file)
+
+//Reports the byte order of the running machine: 0 for little endian, 1 for big endian.
+//Only the lowest-addressed byte of a 64-bit 1 is inspected, so this should work for most
+//cases but may not work on mixed endian systems.
+static inline int myEndianType(){
+  uint64_t probe=1;
+  unsigned char lowestByte=*(unsigned char*)&probe;
+  return (lowestByte==1)?0:1;
+}
+
+//Reverses the order of the 8 bytes of *dataPtr in place.
+static inline void flipBytes_UI64(uint64_t *dataPtr){
+  unsigned char *bytes=(unsigned char*)dataPtr;
+  int lo;
+  int hi;
+  //Swap the outermost pair first, then move inwards: (0,7) (1,6) (2,5) (3,4)
+  for(lo=0,hi=7;lo<hi;lo++,hi--){
+    unsigned char saved=bytes[hi];
+    bytes[hi]=bytes[lo];
+    bytes[lo]=saved;
+  }
+}
+
+//Reads numBits bits starting at bit offset *bitPosPtr and advances *bitPosPtr by numBits.
+//Bit 0 of the stream is the least significant bit of buffer[0]; the result is returned
+//right-aligned and zero-extended.
+//The bytes are fetched one at a time, least significant first, so buffer needs no particular
+//alignment and the result is the same on little- and big-endian machines.
+//Only the bytes that actually hold requested bits are read (at most 8).
+
+static inline uint64_t readBits_UI64(unsigned char* buffer,uint64_t *bitPosPtr,char numBits){ // numBits must be in range [0:56]
+    uint64_t mask = ((uint64_t)0x0000000000000001<<numBits)-1;
+    const unsigned char *src = buffer + ( *bitPosPtr >> 3); //NOTE: bitPos>>3 is the same as bitPos/8
+    int bitOffset = (int)((*bitPosPtr) & (uint64_t)0x0000000000000007); //position of the first bit inside src[0]
+    int numBytes = (bitOffset + numBits + 7) >> 3; //bytes covered by [bitOffset, bitOffset+numBits)
+    uint64_t temp64b = 0;
+    int i;
+    
+    if(numBytes > 8)
+      numBytes = 8; //never look beyond the 64-bit window
+    for(i=0;i<numBytes;i++){
+      temp64b |= (uint64_t)src[i] << (8*i);
+    }
+    temp64b >>= bitOffset; //drop the bits that precede the start position
+    
+    (*bitPosPtr) += numBits;
+    return (temp64b & mask);
+}
+
+//Reads numBits bits with readBits_UI64 and interprets them as a two's-complement signed
+//value of that width (the highest of the numBits bits is the sign). *bitPosPtr is advanced
+//by numBits. Returns 0 when numBits is 0.
+//The sign extension is done on the unsigned pattern, so no signed value is shifted left.
+static inline int64_t readBits_I64(unsigned char* buffer,uint64_t *bitPosPtr,char numBits){ // numBits must be in range [0:56]
+  uint64_t raw=readBits_UI64(buffer,bitPosPtr,numBits);//Read value
+  uint64_t signBit;
+  if(numBits<=0)
+    return 0; //nothing read, nothing to extend
+  signBit=(uint64_t)1<<(numBits-1);
+  return (int64_t)((raw^signBit)-signBit);//Sign correction: subtracts 2^numBits when the sign bit is set
+}
+
+//WARNING: readBits_EndianSafe is not tested on Big-Endian machines
+//Reads numBits bits starting at bit offset *bitPosPtr and advances *bitPosPtr by numBits.
+//Bit 0 of the stream is the least significant bit of buffer[0]. The bytes are combined
+//least significant first, which gives the same value on every byte order without a
+//run-time endianness test, needs no alignment of buffer, and reads only the bytes that
+//hold requested bits (at most 8).
+static inline uint64_t readBits_EndianSafe(unsigned char* buffer,uint64_t *bitPosPtr,char numBits){ // numBits must be in range [0:56]
+    uint64_t mask = ((uint64_t)0x0000000000000001<<numBits)-1;
+    const unsigned char *src = buffer + ((*bitPosPtr)>>3); //NOTE: (*bitPosPtr)>>3 is the same as (*bitPosPtr)/8
+    int bitOffset = (int)((*bitPosPtr) & (uint64_t)0x0000000000000007);
+    int numBytes = (bitOffset + numBits + 7) >> 3;
+    uint64_t temp64b = 0;
+    int i;
+    if(numBytes > 8)
+      numBytes = 8; //never look beyond the 64-bit window
+    for(i=0;i<numBytes;i++){
+      temp64b |= (uint64_t)src[i] << (8*i);
+    }
+    temp64b >>= bitOffset;
+    (*bitPosPtr) += numBits;
+    return temp64b & mask;
+}
+
+//Writes the low numBits bits of data at bit offset *bitPosPtr and advances *bitPosPtr by numBits.
+//numBits must be in range [0:56]. Bit 0 of the stream is the least significant bit of buffer[0].
+//The buffer should be initialized as 0's for this to work: the bits are OR-ed into place.
+//The range of data is not checked; bits above numBits are discarded by the mask.
+//Bytes are updated one at a time, least significant first, so buffer needs no particular
+//alignment, the layout is the same on little- and big-endian machines, and only bytes that
+//receive a set bit are touched.
+static inline void writeBits_Fast(unsigned char* buffer,uint64_t *bitPosPtr,char numBits,int64_t data){
+    uint64_t mask = ((uint64_t)0x0000000000000001<<numBits)-1;
+    unsigned char *dst = buffer + ((*bitPosPtr)>>3);
+    uint64_t pending = (((uint64_t)data)&mask) << ((*bitPosPtr) & (uint64_t)0x0000000000000007);
+    int i;
+    
+    //pending holds the bits still to be stored; its lowest byte belongs to dst[i]
+    for(i=0;pending!=0;i++){
+      dst[i] |= (unsigned char)(pending & 0xFF);
+      pending >>= 8;
+    }
+    
+    (*bitPosPtr) += numBits;
+}
+
+//WARNING: writeBits_EndianSafe is not tested on Big-Endian machines
+//Writes the low numBits bits of data at bit offset *bitPosPtr and advances *bitPosPtr by numBits.
+//numBits must be in range [0:56]. Bit 0 of the stream is the least significant bit of buffer[0].
+//The bits are OR-ed into the existing buffer content, so the target bits must be 0 beforehand.
+//Bytes are updated one at a time, least significant first: the layout does not depend on the
+//machine's byte order, buffer needs no particular alignment, and only bytes that receive a
+//set bit are touched.
+static inline void writeBits_EndianSafe(unsigned char* buffer,uint64_t *bitPosPtr,char numBits,uint64_t data){
+    uint64_t mask = ((uint64_t)0x0000000000000001<<numBits)-1;
+    unsigned char *dst = buffer + ((*bitPosPtr)>>3);
+    uint64_t pending = (data&mask) << ((*bitPosPtr) & (uint64_t)0x0000000000000007);
+    int i;
+    //pending holds the bits still to be stored; its lowest byte belongs to dst[i]
+    for(i=0;pending!=0;i++){
+      dst[i] |= (unsigned char)(pending & 0xFF);
+      pending >>= 8;
+    }
+    (*bitPosPtr) += numBits;
+}
+
+
+#endif
--- a/deps/SZ/sz/include/rw.h
+++ b/deps/SZ/sz/include/rw.h
@ -0,0 +1,89 @@
+/**
+ *  @file rw.h
+ *  @author Sheng Di
+ *  @date April, 2015
+ *  @brief Header file for the whole io interface.
+ *  (C) 2015 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _IO_H
+#define _IO_H
+
+#include <stdio.h>
+#include <stdint.h>
+
+#ifdef _WIN32
+#define PATH_SEPARATOR ';'
+#else
+#define PATH_SEPARATOR ':'
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int checkFileExistance(char* filePath);
+
+float** create2DArray_float(size_t m, size_t n);
+void free2DArray_float(float** data, size_t m);
+float*** create3DArray_float(size_t p, size_t m, size_t n);
+void free3DArray_float(float*** data, size_t p, size_t m);
+double** create2DArray_double(size_t m, size_t n);
+void free2DArray_double(double** data, size_t m);
+double*** create3DArray_double(size_t p, size_t m, size_t n);
+void free3DArray_double(double*** data, size_t p, size_t m);
+size_t checkFileSize(char *srcFilePath, int *status);
+
+unsigned char *readByteData(char *srcFilePath, size_t *byteLength, int *status);
+double *readDoubleData(char *srcFilePath, size_t *nbEle, int *status);
+int8_t *readInt8Data(char *srcFilePath, size_t *nbEle, int *status);
+int16_t *readInt16Data(char *srcFilePath, size_t *nbEle, int *status);
+uint16_t *readUInt16Data(char *srcFilePath, size_t *nbEle, int *status);
+int32_t *readInt32Data(char *srcFilePath, size_t *nbEle, int *status);
+uint32_t *readUInt32Data(char *srcFilePath, size_t *nbEle, int *status);
+int64_t *readInt64Data(char *srcFilePath, size_t *nbEle, int *status);
+uint64_t *readUInt64Data(char *srcFilePath, size_t *nbEle, int *status);
+float *readFloatData(char *srcFilePath, size_t *nbEle, int *status);
+unsigned short* readShortData(char *srcFilePath, size_t *dataLength, int *status);
+
+double *readDoubleData_systemEndian(char *srcFilePath, size_t *nbEle, int *status);
+int8_t *readInt8Data_systemEndian(char *srcFilePath, size_t *nbEle, int *status);
+int16_t *readInt16Data_systemEndian(char *srcFilePath, size_t *nbEle, int *status);
+uint16_t *readUInt16Data_systemEndian(char *srcFilePath, size_t *nbEle, int *status);
+int32_t *readInt32Data_systemEndian(char *srcFilePath, size_t *nbEle, int *status);
+uint32_t *readUInt32Data_systemEndian(char *srcFilePath, size_t *nbEle, int *status);
+int64_t *readInt64Data_systemEndian(char *srcFilePath, size_t *nbEle, int *status);
+uint64_t *readUInt64Data_systemEndian(char *srcFilePath, size_t *nbEle, int *status);
+float *readFloatData_systemEndian(char *srcFilePath, size_t *nbEle, int *status);
+
+void writeByteData(unsigned char *bytes, size_t byteLength, char *tgtFilePath, int *status);
+void writeDoubleData(double *data, size_t nbEle, char *tgtFilePath, int *status);
+void writeFloatData(float *data, size_t nbEle, char *tgtFilePath, int *status);
+void writeData(void *data, int dataType, size_t nbEle, char *tgtFilePath, int *status);
+void writeFloatData_inBytes(float *data, size_t nbEle, char* tgtFilePath, int *status);
+void writeDoubleData_inBytes(double *data, size_t nbEle, char* tgtFilePath, int *status);
+void writeShortData_inBytes(short *states, size_t stateLength, char *tgtFilePath, int *status);
+void writeUShortData_inBytes(unsigned short *states, size_t stateLength, char *tgtFilePath, int *status);
+void writeIntData_inBytes(int *states, size_t stateLength, char *tgtFilePath, int *status);
+void writeUIntData_inBytes(unsigned int *states, size_t stateLength, char *tgtFilePath, int *status);
+void writeLongData_inBytes(int64_t *states, size_t stateLength, char *tgtFilePath, int *status);
+void writeULongData_inBytes(uint64_t *states, size_t stateLength, char *tgtFilePath, int *status);
+
+void writeStrings(int nbStr, char *str[], char *tgtFilePath, int *status);
+
+//void convertToPFM_float(float *data, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, int endianType, char *tgtFilePath, int *status);
+
+void checkfilesizec_(char *srcFilePath, int *len, size_t *filesize);
+void readbytefile_(char *srcFilePath, int *len, unsigned char *bytes, size_t *byteLength);
+void readdoublefile_(char *srcFilePath, int *len, double *data, size_t *nbEle);
+void readfloatfile_(char *srcFilePath, int *len, float *data, size_t *nbEle);
+void writebytefile_(unsigned char *bytes, size_t *byteLength, char *tgtFilePath, int *len);
+void writedoublefile_(double *data, size_t *nbEle, char *tgtFilePath, int *len);
+void writefloatfile_(float *data, size_t *nbEle, char *tgtFilePath, int *len);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _IO_H  ----- */
--- a/deps/SZ/sz/include/sz.h
+++ b/deps/SZ/sz/include/sz.h
@ -0,0 +1,337 @@
+/**
+ *  @file sz.h
+ *  @author Sheng Di
+ *  @date April, 2015
+ *  @brief Header file for the whole compressor.
+ *  (C) 2015 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZ_H
+#define _SZ_H
+
+#include <stdio.h>
+#include <stdint.h>
+#include <sys/time.h>      /* For gettimeofday(), in microseconds */
+#include <time.h>          /* For time(), in seconds */
+#include "CompressElement.h"
+#include "DynamicByteArray.h"
+#include "DynamicIntArray.h"
+#include "VarSet.h"
+#include "Huffman.h"
+#include "TightDataPointStorageD.h"
+#include "TightDataPointStorageF.h"
+#include "TightDataPointStorageI.h"
+#include "conf.h"
+#include "dataCompression.h"
+#include "ByteToolkit.h"
+#include "TypeManager.h"
+#include "sz_int8.h"
+#include "sz_int16.h"
+#include "sz_int32.h"
+#include "sz_int64.h"
+#include "sz_uint8.h"
+#include "sz_uint16.h"
+#include "sz_uint32.h"
+#include "sz_uint64.h"
+#include "sz_float.h"
+#include "sz_double.h"
+#include "szd_int8.h"
+#include "szd_int16.h"
+#include "szd_int32.h"
+#include "szd_int64.h"
+#include "szd_uint8.h"
+#include "szd_uint16.h"
+#include "szd_uint32.h"
+#include "szd_uint64.h"
+#include "szd_float.h"
+#include "szd_double.h"
+#include "sz_float_pwr.h"
+#include "sz_double_pwr.h"
+#include "sz_opencl.h"
+#include "callZlib.h"
+#include "rw.h"
+#include "pastri.h"
+#include "sz_float_ts.h"
+#include "szd_float_ts.h"
+#include "utility.h"
+#include "CacheTable.h"
+#include "MultiLevelCacheTable.h"
+#include "MultiLevelCacheTableWideInterval.h"
+#include "exafelSZ.h"
+#include "sz_stats.h"
+
+#ifdef _WIN32
+#define PATH_SEPARATOR ';'
+#else
+#define PATH_SEPARATOR ':'
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//typedef char int8_t;
+//typedef unsigned char uint8_t;
+//typedef short int16_t;
+//typedef unsigned short uint16_t;
+//typedef int int32_t;
+//typedef unsigned int uint32_t;
+//typedef long int64_t;
+//typedef unsigned long uint64_t;
+
+#include "defines.h"
+	
+//Note: the following setting should be consistent with stateNum in Huffman.h
+//#define intvCapacity 65536
+//#define intvRadius 32768
+//#define intvCapacity 131072
+//#define intvRadius 65536
+
+/* Sets NUM_BLOCKS to 1 when COUNT <= BLOCK_SIZE, otherwise to COUNT / BLOCK_SIZE (integer division).
+ * Every parameter is parenthesized, so expressions such as "n + 1" can be passed safely.
+ * The macro expands to a bare if/else statement: use it as a statement of its own, not as the
+ * unbraced body of another if/else. Arguments are evaluated more than once. */
+#define SZ_COMPUTE_1D_NUMBER_OF_BLOCKS( COUNT, NUM_BLOCKS, BLOCK_SIZE ) \
+    if ((COUNT) <= (BLOCK_SIZE)){                  \
+        (NUM_BLOCKS) = 1;             \
+    }                                   \
+    else{                               \
+        (NUM_BLOCKS) = (COUNT) / (BLOCK_SIZE);       \
+    }
+
+/* Sets NUM_BLOCKS to 1 when COUNT <= BLOCK_SIZE, otherwise to COUNT / BLOCK_SIZE (integer division).
+ * Every parameter is parenthesized, so expressions such as "n + 1" can be passed safely.
+ * The macro expands to a bare if/else statement: use it as a statement of its own, not as the
+ * unbraced body of another if/else. Arguments are evaluated more than once. */
+#define SZ_COMPUTE_2D_NUMBER_OF_BLOCKS( COUNT, NUM_BLOCKS, BLOCK_SIZE ) \
+    if ((COUNT) <= (BLOCK_SIZE)){                   \
+        (NUM_BLOCKS) = 1;             \
+    }                                   \
+    else{                               \
+        (NUM_BLOCKS) = (COUNT) / (BLOCK_SIZE);        \
+    }
+
+/* Sets NUM_BLOCKS to 1 when COUNT <= BLOCK_SIZE, otherwise to COUNT / BLOCK_SIZE (integer division).
+ * Every parameter is parenthesized, so expressions such as "n + 1" can be passed safely.
+ * The macro expands to a bare if/else statement: use it as a statement of its own, not as the
+ * unbraced body of another if/else. Arguments are evaluated more than once. */
+#define SZ_COMPUTE_3D_NUMBER_OF_BLOCKS( COUNT, NUM_BLOCKS, BLOCK_SIZE ) \
+    if ((COUNT) <= (BLOCK_SIZE)){                   \
+        (NUM_BLOCKS) = 1;             \
+    }                                   \
+    else{                               \
+        (NUM_BLOCKS) = (COUNT) / (BLOCK_SIZE);        \
+    }
+
+/* Splits COUNT elements over NUM_BLOCKS blocks:
+ *   LATE_BLOCK_COUNT  = COUNT / NUM_BLOCKS
+ *   SPLIT_INDEX       = COUNT % NUM_BLOCKS
+ *   EARLY_BLOCK_COUNT = LATE_BLOCK_COUNT + 1 when SPLIT_INDEX != 0, otherwise LATE_BLOCK_COUNT
+ * Every parameter is parenthesized, so expressions can be passed safely.
+ * The macro expands to several statements: use it only as a statement of its own inside a
+ * braced block. Arguments are evaluated more than once. */
+#define SZ_COMPUTE_BLOCKCOUNT( COUNT, NUM_BLOCKS, SPLIT_INDEX,       \
+                                       EARLY_BLOCK_COUNT, LATE_BLOCK_COUNT ) \
+    (EARLY_BLOCK_COUNT) = (LATE_BLOCK_COUNT) = (COUNT) / (NUM_BLOCKS);               \
+    (SPLIT_INDEX) = (COUNT) % (NUM_BLOCKS);                                        \
+    if (0 != (SPLIT_INDEX)) {                                                  \
+        (EARLY_BLOCK_COUNT) = (EARLY_BLOCK_COUNT) + 1;                           \
+    }
+
+//typedef unsigned long unsigned long;
+//typedef unsigned int uint;
+
+typedef union lint16 //one 2-byte storage cell viewed as an unsigned short, a short, or two raw bytes (assumes short is 2 bytes wide)
+{
+	unsigned short usvalue; //unsigned integer view
+	short svalue; //signed integer view
+	unsigned char byte[2]; //the same storage byte by byte, in the machine's memory order
+} lint16;
+
+typedef union lint32 //one 4-byte storage cell viewed as an int, an unsigned int, or four raw bytes (assumes int is 4 bytes wide)
+{
+	int ivalue; //signed integer view
+	unsigned int uivalue; //unsigned integer view
+	unsigned char byte[4]; //the same storage byte by byte, in the machine's memory order
+} lint32;
+
+//One 8-byte storage cell viewed as a signed integer, an unsigned integer, or eight raw bytes.
+//The integer members use the fixed-width 64-bit types so that they cover all of byte[8] on
+//every target, including those where long is only 4 bytes wide.
+typedef union lint64
+{
+	int64_t lvalue; //signed integer view
+	uint64_t ulvalue; //unsigned integer view
+	unsigned char byte[8]; //the same storage byte by byte, in the machine's memory order
+} lint64;
+
+//One 8-byte storage cell viewed as a double, as its 64-bit pattern, or as eight raw bytes.
+//lvalue uses the fixed-width 64-bit type so that it covers the whole double on every target,
+//including those where unsigned long is only 4 bytes wide.
+typedef union ldouble
+{
+    double value; //floating-point view
+    uint64_t lvalue; //bit pattern of value
+    unsigned char byte[8]; //the same storage byte by byte, in the machine's memory order
+} ldouble;
+
+typedef union lfloat //one 4-byte storage cell viewed as a float, its bit pattern, or four raw bytes (assumes float and int are 4 bytes wide)
+{
+    float value; //floating-point view
+    unsigned int ivalue; //bit pattern of value
+    unsigned char byte[4]; //the same storage byte by byte, in the machine's memory order
+} lfloat;
+
+/* Array metadata and compression parameters, passed to SZ_Init_Params().
+ * Fields without a comment are not described anywhere in this header. */
+typedef struct sz_params
+{
+	int dataType;
+	unsigned int max_quant_intervals; //max number of quantization intervals for quantization
+	unsigned int quantization_intervals; 
+	unsigned int maxRangeRadius;
+	int sol_ID;//SZ or SZ_Transpose, unless the build is configured for PASTRI compression mode (./configure --enable-pastri)
+	int losslessCompressor;
+	int sampleDistance; //2 bytes - NOTE(review): the field is an int; presumably this is the width used when the value is stored, confirm
+	float predThreshold;  //2 bytes - NOTE(review): the field is a float; presumably this is the width used when the value is stored, confirm
+	int szMode; //* 0 (best speed) or 1 (better compression with Zstd/Gzip) or 3 temporal-dimension based compression
+	int gzipMode; //* four options: Z_NO_COMPRESSION, Z_BEST_SPEED, Z_BEST_COMPRESSION, or Z_DEFAULT_COMPRESSION
+	int  errorBoundMode; //4 bits (0.5 byte): ABS, REL, ABS_AND_REL, ABS_OR_REL, PSNR, or PW_REL
+	double absErrBound; //absolute error bound
+	double relBoundRatio; //value range based relative error bound ratio
+	double psnr; //PSNR
+	double normErr;
+	double pw_relBoundRatio; //point-wise relative error bound
+	int segment_size; //only used for 2D/3D data compression with pw_relBoundRatio (deprecated)
+	int pwr_type; //only used for 2D/3D data compression with pw_relBoundRatio
+	
+	int protectValueRange; //0 or 1
+	float fmin, fmax; //presumably the protected value range for float data (see protectValueRange) - confirm
+	double dmin, dmax; //presumably the protected value range for double data (see protectValueRange) - confirm
+	
+	int snapshotCmprStep; //perform single-snapshot-based compression if time_step == snapshotCmprStep
+	int predictionMode;
+
+	int accelerate_pw_rel_compression;
+	int plus_bits;
+	
+	int randomAccess;
+	int withRegression;
+	
+} sz_params;
+
+typedef struct sz_metadata //description of a compressed stream; per the field comments below, obtained by calling SZ_GetMetaData()
+{
+	int versionNumber[3]; //only used for checking the version by calling SZ_GetMetaData(). NOTE(review): the global versionNumber declared in this header has 4 elements - confirm the difference is intended
+	int isConstant; //only used for checking if the data are constant values by calling SZ_GetMetaData()
+	int isLossless; //only used for checking if the data compression was lossless, used only by calling SZ_GetMetaData()
+	int sizeType; //only used for checking whether the size type is "int" or "long" in the compression, used only by calling SZ_GetMetaData()
+	size_t dataSeriesLength; //number of data points in the dataset
+	int defactoNBBins; //real number of quantization bins
+	struct sz_params* conf_params; //configuration parameters
+} sz_metadata;
+
+typedef struct sz_exedata //quantization settings and size_t width; NOTE(review): name and SZ_SIZE_TYPE comment suggest run-time execution state - confirm
+{
+	char optQuantMode;	//quantization mode (0: fixed ; 1: optimized)
+	int intvCapacity; // the number of intervals for the linear-scaling quantization
+	int intvRadius;  // the number of intervals for the radius of the quantization range (intvRadius=intvCapacity/2)
+	unsigned int SZ_SIZE_TYPE; //the length (# bytes) of size_t in the system at runtime: sizeof(size_t), i.e. 4 or 8
+} sz_exedata;
+
+/*
+ * Time-step meta info for time-step based compression.
+ * NOTE(review): the original note says a linked list is used to maintain this
+ * information, but no link field is declared in this struct - confirm where
+ * the list lives.
+ * NOTE(review): the struct tag is sz_tsc_metainfo while the typedef name is
+ * sz_tsc_metadata.
+ */
+typedef struct sz_tsc_metainfo
+{
+	int totalNumOfSteps; //presumably the total number of time steps - confirm
+	int currentStep; //presumably the step currently being processed - confirm
+	char metadata_filename[256]; //fixed 256-byte buffer for the metadata file name
+	FILE *metadata_file; //stdio stream; presumably the file named by metadata_filename - confirm
+	unsigned char* bit_array; //added by sihuan
+	size_t intersect_size; //added by sihuan
+	int64_t* hist_index; //added by sihuan: prestep index
+
+} sz_tsc_metadata;
+
+//Library version numbers (4 entries).
+//NOTE(review): sz_metadata.versionNumber holds only 3 entries - confirm how the two relate.
+extern int versionNumber[4];
+
+//-------------------key global variables--------------
+extern int dataEndianType; //endian type of the data read from disk
+extern int sysEndianType; //endian type of the running system; set automatically
+
+//Global configuration/execution parameter pointers.
+//NOTE(review): "_cpr" and "_dec" presumably stand for compression and decompression - confirm.
+extern sz_params *confparams_cpr;
+extern sz_params *confparams_dec;
+extern sz_exedata *exe_params;
+
+//------------------------------------------------
+//The types SZ_VarSet and sz_multisteps are not declared in the visible part of this header.
+extern SZ_VarSet* sz_varset;
+extern sz_multisteps *multisteps; //compression based on multiple time steps (time-dimension based compression)
+extern sz_tsc_metadata *sz_tsc;
+
+//PASTRI parameters; only declared when the build defines PASTRI
+#ifdef PASTRI
+extern pastri_params pastri_par; 
+#endif
+
+//sz.h
+//Set-up entry points and dimension helpers.
+//Throughout this header the dimension sizes are passed in the order r5, r4, r3, r2, r1.
+
+//Takes no arguments. Declared with (void) rather than an empty list so that
+//C compilers treat it as a real prototype and reject calls that pass arguments.
+HuffmanTree* SZ_Reset(void);
+
+//Takes the path of a configuration file.
+//NOTE(review): the meaning of the int result is defined by the implementation - confirm.
+int SZ_Init(const char *configFilePath);
+
+//Takes an already filled sz_params structure instead of a file path.
+//NOTE(review): the meaning of the int result is defined by the implementation - confirm.
+int SZ_Init_Params(sz_params *params);
+
+//NOTE(review): presumably the total number of elements described by the five sizes - confirm.
+size_t computeDataLength(size_t r5, size_t r4, size_t r3, size_t r2, size_t r1);
+
+//NOTE(review): presumably the number of dimensions actually in use - confirm.
+int computeDimension(size_t r5, size_t r4, size_t r3, size_t r2, size_t r1);
+
+//Sub-block compression for float and double data.
+//compressedBytes is a plain buffer pointer supplied by the caller; the size is
+//reported through outSize.
+//r5..r1 are the dimension sizes. NOTE(review): s5..s1 and e5..e1 presumably
+//give the start and end positions of the sub-block per dimension - confirm.
+int SZ_compress_args_float_subblock(unsigned char* compressedBytes, float *oriData,
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1,
+size_t s5, size_t s4, size_t s3, size_t s2, size_t s1,
+size_t e5, size_t e4, size_t e3, size_t e2, size_t e1,
+size_t *outSize, int errBoundMode, double absErr_Bound, double relBoundRatio);
+
+//Same parameter list as the float variant, with double input data.
+int SZ_compress_args_double_subblock(unsigned char* compressedBytes, double *oriData,
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1,
+size_t s5, size_t s4, size_t s3, size_t s2, size_t s1,
+size_t e5, size_t e4, size_t e3, size_t e2, size_t e1,
+size_t *outSize, int errBoundMode, double absErr_Bound, double relBoundRatio);
+
+//Generic compression entry points. dataType selects the element type of the
+//untyped data pointer; outSize receives the size of the result.
+//Two calling shapes exist:
+//  - functions returning unsigned char* hand back the compressed bytes;
+//  - functions with a compressed_bytes parameter write into a caller-supplied
+//    buffer and return an int (NOTE(review): presumably a status code - confirm).
+
+//Takes no error-bound arguments (NOTE(review): presumably uses the global configuration - confirm).
+unsigned char *SZ_compress(int dataType, void *data, size_t *outSize, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1);
+
+//Error-bound settings are passed explicitly, including a point-wise relative bound (pwrBoundRatio).
+unsigned char* SZ_compress_args(int dataType, void *data, size_t *outSize, int errBoundMode, double absErrBound, 
+double relBoundRatio, double pwrBoundRatio, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1);
+
+//Same arguments as SZ_compress_args, but writes into compressed_bytes.
+int SZ_compress_args2(int dataType, void *data, unsigned char* compressed_bytes, size_t *outSize, 
+int errBoundMode, double absErrBound, double relBoundRatio, double pwrBoundRatio, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1);
+
+//Variant with additional s5..s1 / e5..e1 arguments; it has no pwrBoundRatio parameter.
+int SZ_compress_args3(int dataType, void *data, unsigned char* compressed_bytes, size_t *outSize, int errBoundMode, double absErrBound, double relBoundRatio, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1,
+size_t s5, size_t s4, size_t s3, size_t s2, size_t s1,
+size_t e5, size_t e4, size_t e3, size_t e2, size_t e1);
+
+//"rev" variants take an extra reservedValue pointer.
+//NOTE(review): the semantics of reservedValue are not visible in this header - confirm.
+unsigned char *SZ_compress_rev_args(int dataType, void *data, void *reservedValue, size_t *outSize, int errBoundMode, double absErrBound, double relBoundRatio, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1);
+
+int SZ_compress_rev_args2(int dataType, void *data, void *reservedValue, unsigned char* compressed_bytes, size_t *outSize, int errBoundMode, double absErrBound, double relBoundRatio, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1);
+unsigned char *SZ_compress_rev(int dataType, void *data, void *reservedValue, size_t *outSize, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1);
+
+//Takes the addresses of an sz_params pointer and an sz_exedata pointer.
+//NOTE(review): presumably allocates both structures - confirm, including who frees them.
+void SZ_Create_ParamsExe(sz_params** conf_params, sz_exedata** exe_params);
+
+//Decompression entry points. bytes/byteLength describe the compressed input.
+//SZ_decompress returns the result as an untyped pointer; SZ_decompress_args
+//writes into decompressed_array and returns a size_t.
+//NOTE(review): ownership of the pointer returned by SZ_decompress and the
+//meaning of the size_t result are not visible here - confirm.
+void *SZ_decompress(int dataType, unsigned char *bytes, size_t byteLength, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1);
+size_t SZ_decompress_args(int dataType, unsigned char *bytes, size_t byteLength, void* decompressed_array, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1);
+
+//Metadata access: SZ_getMetadata() takes a byte buffer and returns an
+//sz_metadata pointer; SZ_printMetadata() takes such a pointer.
+sz_metadata* SZ_getMetadata(unsigned char* bytes);
+void SZ_printMetadata(sz_metadata* metadata);
+
+
+//NOTE(review): presumably stores the used dimension sizes into dim - confirm the required length of dim.
+void filloutDimArray(size_t* dim, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1);
+
+//Variable registry and time-step ("ts") compression API.
+//The three argument-less functions below are declared with (void) rather than
+//an empty list so that C compilers treat them as real prototypes and reject
+//calls that pass arguments.
+
+size_t compute_total_batch_size(void);
+
+//Registers a variable under both a numeric id and a name, together with its
+//data pointer, error-bound settings and dimension sizes (order r5..r1).
+void SZ_registerVar(int var_id, char* varName, int dataType, void* data, 
+			int errBoundMode, double absErrBound, double relBoundRatio, double pwRelBoundRatio, 
+			size_t r5, size_t r4, size_t r3, size_t r2, size_t r1);
+
+//Deregistration by id, by name, or of everything.
+//NOTE(review): the meaning of the int results is defined by the implementation - confirm.
+int SZ_deregisterVar_ID(int var_id);
+int SZ_deregisterVar(char* varName);
+int SZ_deregisterAllVars(void);
+
+//Time-step compression/decompression. The "select_var" forms take an array of
+//variable ids plus its count (var_count is an unsigned char). The compress
+//forms return the output through newByteData and outSize.
+int SZ_compress_ts_select_var(int cmprType, unsigned char* var_ids, unsigned char var_count, unsigned char** newByteData, size_t *outSize);
+int SZ_compress_ts(int cmprType, unsigned char** newByteData, size_t *outSize);
+void SZ_decompress_ts_select_var(unsigned char* var_ids, unsigned char var_count, unsigned char *bytes, size_t bytesLength);
+void SZ_decompress_ts(unsigned char *bytes, size_t byteLength);
+
+void SZ_Finalize(void);
+
+//Conversion between an sz_params structure and a byte buffer, in both directions.
+//NOTE(review): the required size of the byte buffer is not visible in this header - confirm.
+void convertSZParamsToBytes(sz_params* params, unsigned char* result);
+void convertBytesToSZParams(unsigned char* bytes, sz_params* params);
+
+//"customize" entry points: a name string and an opaque userPara pointer are
+//passed along with the data; a status value is reported through the int* status.
+//The "_threadsafe" forms have the same parameter list except that the name
+//parameter is called cmprName instead of appName.
+//NOTE(review): what makes the "_threadsafe" forms thread-safe is not visible here - confirm.
+unsigned char* SZ_compress_customize(const char* appName, void* userPara, int dataType, void* data, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, int *status);
+
+unsigned char* SZ_compress_customize_threadsafe(const char* cmprName, void* userPara, int dataType, void* data, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, int *status);
+
+void* SZ_decompress_customize(const char* appName, void* userPara, int dataType, unsigned char* bytes, size_t byteLength, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, int* status);
+
+void* SZ_decompress_customize_threadsafe(const char* cmprName, void* userPara, int dataType, unsigned char* bytes, size_t byteLength, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, int *status);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZ_H  ----- */
--- a/deps/SZ/sz/include/sz_double.h
+++ b/deps/SZ/sz/include/sz_double.h
@ -0,0 +1,100 @@
+/**
+ *  @file sz_double.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the sz_double.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZ_Double_H
+#define _SZ_Double_H
+
+/* Standard headers are pulled in before the extern "C" block is opened:
+ * C++ requires headers to be included outside of any declaration, and a
+ * linkage specification is a declaration. */
+#include <stdio.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/* Helper and interval-optimisation routines for double data.
+ * All optimize_intervals_double_* functions take the original data, its
+ * dimension sizes and realPrecision, and return an unsigned int.
+ * NOTE(review): presumably the chosen number of quantization intervals - confirm. */
+unsigned char* SZ_skip_compress_double(double* data, size_t dataLength, size_t* outSize);
+
+/* computeReqLength_double reports its results through reqLength and medianValue;
+ * the MSST19 form takes only realPrecision and returns a short. */
+void computeReqLength_double(double realPrecision, short radExpo, int* reqLength, double* medianValue);
+short computeReqLength_double_MSST19(double realPrecision);
+
+unsigned int optimize_intervals_double_1D(double *oriData, size_t dataLength, double realPrecision);
+unsigned int optimize_intervals_double_2D(double *oriData, size_t r1, size_t r2, double realPrecision);
+unsigned int optimize_intervals_double_3D(double *oriData, size_t r1, size_t r2, size_t r3, double realPrecision);
+unsigned int optimize_intervals_double_4D(double *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision);
+
+/* "_opt" variants: same parameter lists as the plain 1D-3D functions above. */
+unsigned int optimize_intervals_double_3D_opt(double *oriData, size_t r1, size_t r2, size_t r3, double realPrecision);
+unsigned int optimize_intervals_double_2D_opt(double *oriData, size_t r1, size_t r2, double realPrecision);
+unsigned int optimize_intervals_double_1D_opt(double *oriData, size_t dataLength, double realPrecision);
+
+/* Block-level routine: works on one block (block_dim_*) of a 3D array (dim_*)
+ * and returns a size_t. type and unpredictable_data are output arrays supplied
+ * by the caller. NOTE(review): meaning of the return value to be confirmed. */
+size_t SZ_compress_double_3D_MDQ_RA_block(double * block_ori_data, double * mean, size_t dim_0, size_t dim_1, size_t dim_2, size_t block_dim_0, size_t block_dim_1, size_t block_dim_2, double realPrecision, double * P0, double * P1, int * type, double * unpredictable_data);
+
+/* "_opt_MSST19" variants: same parameter lists as the "_opt" functions. */
+unsigned int optimize_intervals_double_1D_opt_MSST19(double *oriData, size_t dataLength, double realPrecision);
+unsigned int optimize_intervals_double_2D_opt_MSST19(double *oriData, size_t r1, size_t r2, double realPrecision);
+unsigned int optimize_intervals_double_3D_opt_MSST19(double *oriData, size_t r1, size_t r2, size_t r3, double realPrecision);
+/* Compression routines for double data, 1D to 4D.
+ * The *_MDQ functions return a TightDataPointStorageD pointer; the
+ * *_NoCkRngeNoGzip_* functions return a char and deliver the bytes through
+ * newByteData and outSize.
+ * TightDataPointStorageD is not declared by anything this header includes
+ * (it only includes <stdio.h>), so its declaration must already be visible
+ * when this header is included. */
+TightDataPointStorageD* SZ_compress_double_1D_MDQ(double *oriData, 
+size_t dataLength, double realPrecision, double valueRangeSize, double medianValue_d);
+void SZ_compress_args_double_StoreOriData(double* oriData, size_t dataLength, unsigned char** newByteData, size_t *outSize);
+
+char SZ_compress_args_double_NoCkRngeNoGzip_1D(int cmprType, unsigned char** newByteData, double *oriData, size_t dataLength, double realPrecision, size_t *outSize, double valueRangeSize, double medianValue_d);
+
+TightDataPointStorageD* SZ_compress_double_2D_MDQ(double *oriData, size_t r1, size_t r2, double realPrecision, double valueRangeSize, double medianValue_d);
+char SZ_compress_args_double_NoCkRngeNoGzip_2D(int cmprType, unsigned char** newByteData, double *oriData, size_t r1, size_t r2, double realPrecision, size_t *outSize, double valueRangeSize, double medianValue_d);
+
+TightDataPointStorageD* SZ_compress_double_3D_MDQ(double *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, double valueRangeSize, double medianValue_d);
+char SZ_compress_args_double_NoCkRngeNoGzip_3D(int cmprType, unsigned char** newByteData, double *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t *outSize, double valueRangeSize, double medianValue_d);
+
+/* Unlike the 1D-3D forms, the 4D NoCkRngeNoGzip function has no cmprType parameter. */
+TightDataPointStorageD* SZ_compress_double_4D_MDQ(double *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, double valueRangeSize, double medianValue_d);
+char SZ_compress_args_double_NoCkRngeNoGzip_4D(unsigned char** newByteData, double *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, size_t *outSize, double valueRangeSize, double medianValue_d);
+
+/* MSST19 variants. The last parameter is named medianValue_f although its type is double. */
+TightDataPointStorageD* SZ_compress_double_1D_MDQ_MSST19(double *oriData, size_t dataLength, double realPrecision, double valueRangeSize, double medianValue_f);
+TightDataPointStorageD* SZ_compress_double_2D_MDQ_MSST19(double *oriData, size_t r1, size_t r2, double realPrecision, double valueRangeSize, double medianValue_f);
+TightDataPointStorageD* SZ_compress_double_3D_MDQ_MSST19(double *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, double valueRangeSize, double medianValue_f);
+
+void SZ_compress_args_double_withinRange(unsigned char** newByteData, double *oriData, size_t dataLength, size_t *outSize);
+
+/* Disabled prototype, not compiled: */
+/*int SZ_compress_args_double_wRngeNoGzip(unsigned char** newByteData, double *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio, double pwrErrRatio);*/
+
+/* General entry point for double data: takes the compression type, a
+ * withRegression flag, all five dimension sizes (order r5..r1) and the
+ * error-bound settings; output goes through newByteData and outSize. */
+int SZ_compress_args_double(int cmprType, int withRegression, unsigned char** newByteData, double *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio, double pwRelBoundRatio);
+
+/* Sub-block routines for double data, 1D to 4D.
+ * r* are dimension sizes. NOTE(review): s* and e* presumably give the start
+ * and end positions of the sub-block per dimension - confirm.
+ * Note the argument order: the NoCkRnge functions list dimensions from the
+ * highest index down (r2, r1, ...), whereas the optimize_intervals and MDQ
+ * sub-block functions list them upwards (r1, r2, ...). */
+void SZ_compress_args_double_NoCkRnge_1D_subblock(unsigned char* compressedBytes, double *oriData, double realPrecision, size_t *outSize, double valueRangeSize, double medianValue_d,
+size_t r1, size_t s1, size_t e1);
+void SZ_compress_args_double_NoCkRnge_2D_subblock(unsigned char* compressedBytes, double *oriData, double realPrecision, size_t *outSize, double valueRangeSize, double medianValue_d,
+size_t r2, size_t r1, size_t s2, size_t s1, size_t e2, size_t e1);
+void SZ_compress_args_double_NoCkRnge_3D_subblock(unsigned char* compressedBytes, double *oriData, double realPrecision, size_t *outSize, double valueRangeSize, double medianValue_d,
+size_t r3, size_t r2, size_t r1, size_t s3, size_t s2, size_t s1, size_t e3, size_t e2, size_t e1);
+void SZ_compress_args_double_NoCkRnge_4D_subblock(unsigned char* compressedBytes, double *oriData, double realPrecision, size_t *outSize, double valueRangeSize, double medianValue_d,
+size_t r4, size_t r3, size_t r2, size_t r1, size_t s4, size_t s3, size_t s2, size_t s1, size_t e4, size_t e3, size_t e2, size_t e1);
+
+unsigned int optimize_intervals_double_1D_subblock(double *oriData, double realPrecision, size_t r1, size_t s1, size_t e1);
+unsigned int optimize_intervals_double_2D_subblock(double *oriData, double realPrecision, size_t r1, size_t r2, size_t s1, size_t s2, size_t e1, size_t e2);
+unsigned int optimize_intervals_double_3D_subblock(double *oriData, double realPrecision, size_t r1, size_t r2, size_t r3, size_t s1, size_t s2, size_t s3, size_t e1, size_t e2, size_t e3);
+unsigned int optimize_intervals_double_4D_subblock(double *oriData, double realPrecision, size_t r1, size_t r2, size_t r3, size_t r4, size_t s1, size_t s2, size_t s3, size_t s4, size_t e1, size_t e2, size_t e3, size_t e4);
+
+TightDataPointStorageD* SZ_compress_double_1D_MDQ_subblock(double *oriData, double realPrecision, double valueRangeSize, double medianValue_d,
+size_t r1, size_t s1, size_t e1);
+TightDataPointStorageD* SZ_compress_double_2D_MDQ_subblock(double *oriData, double realPrecision, double valueRangeSize, double medianValue_d,
+size_t r1, size_t r2, size_t s1, size_t s2, size_t e1, size_t e2);
+TightDataPointStorageD* SZ_compress_double_3D_MDQ_subblock(double *oriData, double realPrecision, double valueRangeSize, double medianValue_d,
+size_t r1, size_t r2, size_t r3, size_t s1, size_t s2, size_t s3, size_t e1, size_t e2, size_t e3);
+TightDataPointStorageD* SZ_compress_double_4D_MDQ_subblock(double *oriData, double realPrecision, double valueRangeSize, double medianValue_d,
+size_t r1, size_t r2, size_t r3, size_t r4, size_t s1, size_t s2, size_t s3, size_t s4, size_t e1, size_t e2, size_t e3, size_t e4);
+
+/* Variants that additionally report dense_pos, max_freq and mean_freq through
+ * output pointers, and regression-based whole-array compressors that return
+ * the compressed bytes and write their size to comp_size. */
+unsigned int optimize_intervals_double_2D_with_freq_and_dense_pos(double *oriData, size_t r1, size_t r2, double realPrecision, double * dense_pos, double * max_freq, double * mean_freq);
+unsigned int optimize_intervals_double_3D_with_freq_and_dense_pos(double *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, double * dense_pos, double * max_freq, double * mean_freq);
+unsigned char * SZ_compress_double_2D_MDQ_nonblocked_with_blocked_regression(double *oriData, size_t r1, size_t r2, double realPrecision, size_t * comp_size);
+unsigned char * SZ_compress_double_3D_MDQ_nonblocked_with_blocked_regression(double *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t * comp_size);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZ_Double_H  ----- */
+
--- a/deps/SZ/sz/include/sz_double_pwr.h
+++ b/deps/SZ/sz/include/sz_double_pwr.h
@ -0,0 +1,57 @@
+/**
+ *  @file sz_double_pwr.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the sz_double_pwr.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZ_Double_PWR_H
+#define _SZ_Double_PWR_H
+
+/* Standard headers are pulled in before the extern "C" block is opened:
+ * C++ requires headers to be included outside of any declaration, and a
+ * linkage specification is a declaration. */
+#include <stdio.h>
+#include <stdbool.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Point-wise-relative ("pwr") error-bound routines for double data.
+ * TightDataPointStorageD is not declared by the standard headers this file
+ * includes, so its declaration must already be visible when this header is
+ * included. */
+
+/* Segment precision computation and interval optimisation, 1D to 3D.
+ * pwrErrBound is a double array and pwrErrBoundBytes a byte array, both
+ * supplied by the caller.
+ * NOTE(review): R2/R3 and edgeSize presumably describe the segment grid - confirm. */
+void compute_segment_precisions_double_1D(double *oriData, size_t dataLength, double* pwrErrBound, unsigned char* pwrErrBoundBytes, double globalPrecision);
+unsigned int optimize_intervals_double_1D_pwr(double *oriData, size_t dataLength, double* pwrErrBound); 
+void compute_segment_precisions_double_2D(double *oriData, double* pwrErrBound, 
+size_t r1, size_t r2, size_t R2, size_t edgeSize, unsigned char* pwrErrBoundBytes, double Min, double Max, double globalPrecision);
+unsigned int optimize_intervals_double_2D_pwr(double *oriData, size_t r1, size_t r2, size_t R2, size_t edgeSize, double* pwrErrBound);
+void compute_segment_precisions_double_3D(double *oriData, double* pwrErrBound, 
+size_t r1, size_t r2, size_t r3, size_t R2, size_t R3, size_t edgeSize, unsigned char* pwrErrBoundBytes, double Min, double Max, double globalPrecision);
+unsigned int optimize_intervals_double_3D_pwr(double *oriData, size_t r1, size_t r2, size_t r3, size_t R2, size_t R3, size_t edgeSize, double* pwrErrBound);
+/* Compression with a point-wise relative bound; output goes through newByteData and outSize. */
+void SZ_compress_args_double_NoCkRngeNoGzip_1D_pwr(unsigned char** newByteData, double *oriData, double globalPrecision, size_t dataLength, size_t *outSize, double min, double max);
+void SZ_compress_args_double_NoCkRngeNoGzip_2D_pwr(unsigned char** newByteData, double *oriData, double globalPrecision, size_t r1, size_t r2,
+size_t *outSize, double min, double max);
+void SZ_compress_args_double_NoCkRngeNoGzip_3D_pwr(unsigned char** newByteData, double *oriData, double globalPrecision, 
+size_t r1, size_t r2, size_t r3, size_t *outSize, double min, double max);
+
+/* Group-based variant (1D only). Several parameters are named medianValue_f
+ * although their type is double. */
+void createRangeGroups_double(double** posGroups, double** negGroups, int** posFlags, int** negFlags);
+void compressGroupIDArray_double(char* groupID, TightDataPointStorageD* tdps);
+TightDataPointStorageD* SZ_compress_double_1D_MDQ_pwrGroup(double* oriData, size_t dataLength, int errBoundMode, 
+double absErrBound, double relBoundRatio, double pwrErrRatio, double valueRangeSize, double medianValue_f);
+void SZ_compress_args_double_NoCkRngeNoGzip_1D_pwrgroup(unsigned char** newByteData, double *oriData,
+size_t dataLength, double absErrBound, double relBoundRatio, double pwrErrRatio, double valueRangeSize, double medianValue_f, size_t *outSize);
+
+/* "pre_log" variants: same parameter lists as the plain pwr functions above. */
+void SZ_compress_args_double_NoCkRngeNoGzip_1D_pwr_pre_log(unsigned char** newByteData, double *oriData, double globalPrecision, size_t dataLength, size_t *outSize, double min, double max);
+void SZ_compress_args_double_NoCkRngeNoGzip_2D_pwr_pre_log(unsigned char** newByteData, double *oriData, double globalPrecision, size_t r1, size_t r2, size_t *outSize, double min, double max);
+void SZ_compress_args_double_NoCkRngeNoGzip_3D_pwr_pre_log(unsigned char** newByteData, double *oriData, double globalPrecision, size_t r1, size_t r2, size_t r3, size_t *outSize, double min, double max);
+
+/* "pre_log_MSST19" variants take pwrErrRatio instead of globalPrecision plus
+ * signs/positive/nearZero arguments. Only the 1D form has a medianValue_f parameter. */
+void SZ_compress_args_double_NoCkRngeNoGzip_1D_pwr_pre_log_MSST19(unsigned char** newByteData, double *oriData, double pwrErrRatio, size_t dataLength, size_t *outSize, double valueRangeSize, double medianValue_f,
+																unsigned char* signs, bool* positive, double min, double max, double nearZero);
+void SZ_compress_args_double_NoCkRngeNoGzip_2D_pwr_pre_log_MSST19(unsigned char** newByteData, double *oriData, double pwrErrRatio, size_t r1, size_t r2, size_t *outSize, double valueRangeSize,
+																unsigned char* signs, bool* positive, double min, double max, double nearZero);
+void SZ_compress_args_double_NoCkRngeNoGzip_3D_pwr_pre_log_MSST19(unsigned char** newByteData, double *oriData, double pwrErrRatio, size_t r1, size_t r2, size_t r3, size_t *outSize, double valueRangeSize, 
+																unsigned char* signs, bool* positive, double min, double max, double nearZero);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZ_Double_PWR_H  ----- */
+
--- a/deps/SZ/sz/include/sz_double_ts.h
+++ b/deps/SZ/sz/include/sz_double_ts.h
@ -0,0 +1,27 @@
+/**
+ *  @file sz_double_ts.h
+ *  @author Sheng Di
+ *  @date May, 2018
+ *  @brief Header file for the sz_double_ts.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include "TightDataPointStorageD.h"
+
+#ifndef _SZ_Double_TS_H
+#define _SZ_Double_TS_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+unsigned int optimize_intervals_double_1D_ts(double *oriData, size_t dataLength, double* preData, double realPrecision);
+
+TightDataPointStorageD* SZ_compress_double_1D_MDQ_ts(double *oriData, size_t dataLength, sz_multisteps* multisteps,
+double realPrecision, double valueRangeSize, double medianValue_d);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZ_Double_TS_H  ----- */
+
--- a/deps/SZ/sz/include/sz_float.h
+++ b/deps/SZ/sz/include/sz_float.h
@ -0,0 +1,153 @@
+/**
+ *  @file sz_float.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the sz_float.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#ifndef _SZ_Float_H
+#define _SZ_Float_H
+
+/* Kept inside the include guard so that a repeated inclusion of this header
+ * is a complete no-op. Provides DynamicFloatArray, which is used by
+ * SZ_compress_float_1D_MDQ_RA_block_1D_pred() below. */
+#include "DynamicFloatArray.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/* Helper and interval-optimisation routines for float data.
+ * All optimize_intervals_* functions take the original data, its dimension
+ * sizes and realPrecision (a double), and return an unsigned int.
+ * NOTE(review): presumably the chosen number of quantization intervals - confirm. */
+unsigned char* SZ_skip_compress_float(float* data, size_t dataLength, size_t* outSize);
+
+/* computeReqLength_float reports its results through reqLength and medianValue;
+ * the MSST19 form takes only realPrecision and returns a short. */
+void computeReqLength_float(double realPrecision, short radExpo, int* reqLength, float* medianValue);
+short computeReqLength_float_MSST19(double realPrecision);
+
+unsigned int optimize_intervals_float_1D(float *oriData, size_t dataLength, double realPrecision);
+unsigned int optimize_intervals_float_2D(float *oriData, size_t r1, size_t r2, double realPrecision);
+unsigned int optimize_intervals_float_3D(float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision);
+unsigned int optimize_intervals_float_4D(float *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision);
+
+/* Variants that additionally report dense_pos (and, for the last one,
+ * max_freq and mean_freq) through output pointers.
+ * optimize_intervals_float_3D_with_freq_and_dense_pos is declared a second
+ * time, with an identical prototype, near the end of this header. */
+unsigned int optimize_intervals_and_compute_dense_position_float_1D(float *oriData, size_t dataLength, double realPrecision, float * dense_pos);
+unsigned int optimize_intervals_and_compute_dense_position_float_3D(float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, float * dense_pos);
+unsigned int optimize_intervals_float_3D_with_freq_and_dense_pos(float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, float * dense_pos, float * max_freq, float * mean_freq);
+unsigned int optimize_intervals_float_3D_opt(float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision);
+unsigned int optimize_intervals_float_2D_opt(float *oriData, size_t r1, size_t r2, double realPrecision);
+unsigned int optimize_intervals_float_1D_opt(float *oriData, size_t dataLength, double realPrecision);
+
+/* "_opt_MSST19" variants: same parameter lists as the "_opt" functions. */
+unsigned int optimize_intervals_float_1D_opt_MSST19(float *oriData, size_t dataLength, double realPrecision);
+unsigned int optimize_intervals_float_2D_opt_MSST19(float *oriData, size_t r1, size_t r2, double realPrecision);
+unsigned int optimize_intervals_float_3D_opt_MSST19(float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision);
+
+/* Compression routines for float data, 1D to 3D.
+ * The *_MDQ functions return a TightDataPointStorageF pointer; the
+ * *_NoCkRngeNoGzip_* functions return a char and deliver the bytes through
+ * newByteData and outSize.
+ * Note the type of realPrecision: float in the 1D-3D *_MDQ functions below,
+ * double in the *_NoCkRngeNoGzip_* functions. */
+TightDataPointStorageF* SZ_compress_float_1D_MDQ(float *oriData, 
+size_t dataLength, float realPrecision, float valueRangeSize, float medianValue_f);
+
+void SZ_compress_args_float_StoreOriData(float* oriData, size_t dataLength, unsigned char** newByteData, size_t *outSize);
+
+char SZ_compress_args_float_NoCkRngeNoGzip_1D(int cmprType, unsigned char** newByteData, float *oriData, 
+size_t dataLength, double realPrecision, size_t *outSize, float valueRangeSize, float medianValue_f);
+
+TightDataPointStorageF* SZ_compress_float_2D_MDQ(float *oriData, size_t r1, size_t r2, float realPrecision, float valueRangeSize, float medianValue_f);
+
+char SZ_compress_args_float_NoCkRngeNoGzip_2D(int cmprType, unsigned char** newByteData, float *oriData, size_t r1, size_t r2, double realPrecision, size_t *outSize, float valueRangeSize, float medianValue_f);
+
+TightDataPointStorageF* SZ_compress_float_3D_MDQ(float *oriData, size_t r1, size_t r2, size_t r3, float realPrecision, float valueRangeSize, float medianValue_f);
+
+char SZ_compress_args_float_NoCkRngeNoGzip_3D(int cmprType, unsigned char** newByteData, float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t *outSize, float valueRangeSize, float medianValue_f);
+
+/* Block-level routines for float data: each works on one block (block_dim_*)
+ * of a larger array (dim_*). type and unpredictable_data are output arrays
+ * supplied by the caller.
+ * NOTE(review): the size_t results are presumably counts of unpredictable
+ * values - confirm in the implementation. */
+size_t SZ_compress_float_1D_MDQ_RA_block(float * block_ori_data, float * mean, size_t dim_0, size_t block_dim_0, double realPrecision, int * type, float * unpredictable_data);
+size_t SZ_compress_float_2D_MDQ_RA_block(float * block_ori_data, float * mean, size_t dim_0, size_t dim_1, size_t block_dim_0, size_t block_dim_1, double realPrecision, float * P0, float * P1, int * type, float * unpredictable_data);
+
+/* The 1D "_1D_pred" form is the only one whose unpredictable_data is a
+ * DynamicFloatArray pointer rather than a float pointer. */
+size_t SZ_compress_float_1D_MDQ_RA_block_1D_pred(float * block_ori_data, float * mean, float dense_pos, size_t dim_0, size_t block_dim_0, double realPrecision, int * type, DynamicFloatArray * unpredictable_data);
+size_t SZ_compress_float_2D_MDQ_RA_block_2D_pred(float * block_ori_data, float * mean, float dense_pos, size_t dim_0, size_t dim_1, size_t block_dim_0, size_t block_dim_1, double realPrecision, float * P0, float * P1, int * type, float * unpredictable_data);
+/* SZ_compress_float_3D_MDQ_RA_block takes realPrecision as float; the other functions in this group take double. */
+size_t SZ_compress_float_3D_MDQ_RA_block(float * block_ori_data, float * mean, size_t dim_0, size_t dim_1, size_t dim_2, size_t block_dim_0, size_t block_dim_1, size_t block_dim_2, float realPrecision, float * P0, float * P1, int * type, float * unpredictable_data);
+size_t SZ_compress_float_3D_MDQ_RA_block_3D_pred(float * block_ori_data, float * mean, float dense_pos, size_t dim_0, size_t dim_1, size_t dim_2, size_t block_dim_0, size_t block_dim_1, size_t block_dim_2, double realPrecision, float * P0, float * P1, int * type, float * unpredictable_data);
+size_t SZ_compress_float_3D_MDQ_RA_block_adaptive(float * block_ori_data, float * mean, size_t dim_0, size_t dim_1, size_t dim_2, size_t block_dim_0, size_t block_dim_1, size_t block_dim_2, double realPrecision, float * P0, float * P1, int * type, float * unpredictable_data);
+/* Disabled prototype, not compiled: */
+//unsigned short SZ_compress_float_3D_MDQ_RA_block_1D_pred(float * block_ori_data, float * mean, float dense_pos, size_t dim_0, size_t dim_1, size_t dim_2, int block_dim_0, int block_dim_1, int block_dim_2, double realPrecision, int * type, float * unpredictable_data);
+size_t SZ_compress_float_3D_MDQ_RA_block_3D_pred_flush_after_compare(float * block_ori_data, float * mean, float dense_pos, size_t dim_0, size_t dim_1, size_t dim_2, size_t block_dim_0, size_t block_dim_1, size_t block_dim_2, double realPrecision, float * P0, float * P1, int * type, float * unpredictable_data);
+size_t SZ_compress_float_3D_MDQ_RA_block_2_layers(float * block_ori_data, float * mean, size_t dim_0, size_t dim_1, size_t dim_2, size_t block_dim_0, size_t block_dim_1, size_t block_dim_2, double realPrecision, float * P0, float * P1, float * P_, int * type, float * unpredictable_data);
+/* Regression-based block routines; reg_params / params are float arrays supplied by the caller. */
+size_t SZ_compress_float_3D_MDQ_pred_by_regression(float * block_ori_data, size_t dim_0, size_t dim_1, size_t dim_2, size_t block_dim_0, size_t block_dim_1, size_t block_dim_2, double realPrecision, float * reg_params, int * type, float * unpredictable_data);
+void SZ_blocked_regression(float * block_ori_data, size_t dim_0, size_t dim_1, size_t dim_2, size_t block_dim_0, size_t block_dim_1, size_t block_dim_2, float *params);
+unsigned char * SZ_compress_float_3D_MDQ_RA_all_by_regression(float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t * comp_size);
+/* These two return a float and report a count through unpred_count.
+ * NOTE(review): the float result is presumably an error measure - confirm. */
+float SZ_compress_float_3D_MDQ_RA_block_no_mean(float * block_ori_data, size_t dim_0, size_t dim_1, size_t dim_2, size_t block_dim_0, size_t block_dim_1, size_t block_dim_2, double realPrecision, float * P0, float * P1, int * type, unsigned short * unpred_count, float * unpredictable_data);
+float SZ_compress_float_3D_MDQ_pred_by_regression_with_err(float * block_ori_data, size_t dim_0, size_t dim_1, size_t dim_2, size_t block_dim_0, size_t block_dim_1, size_t block_dim_2, double realPrecision, float * reg_params, int * type, unsigned short * unpred_count, float * unpredictable_data);
+unsigned char * SZ_compress_float_3D_MDQ_RA_blocked_with_regression(float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t * comp_size);
+/* The only decompression routine declared in this header; the result is returned through the float** data. */
+void decompressDataSeries_float_3D_RA_blocked_with_regression(float** data, size_t r1, size_t r2, size_t r3, unsigned char* comp_data);
+
+/* Whole-array compressors for float data. They all share one shape: original
+ * data, dimension sizes, realPrecision (double) and a comp_size output
+ * pointer; the compressed bytes are the return value.
+ * NOTE(review): ownership of the returned buffer is not visible here - confirm. */
+unsigned char * SZ_compress_float_1D_MDQ_RA(float *oriData, size_t r1, double realPrecision, size_t * comp_size);
+unsigned char * SZ_compress_float_2D_MDQ_RA(float *oriData, size_t r1, size_t r2, double realPrecision, size_t * comp_size);
+unsigned char * SZ_compress_float_2D_MDQ_nonblocked(float *oriData, size_t r1, size_t r2, double realPrecision, size_t * comp_size);
+unsigned char * SZ_compress_float_3D_MDQ_RA(float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t * comp_size);
+unsigned char * SZ_compress_float_3D_MDQ_nonblocked(float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t * comp_size);
+unsigned char * SZ_compress_float_3D_MDQ_nonblocked_ori(float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t * comp_size);
+unsigned char * SZ_compress_float_3D_MDQ_nonblocked_multi_means(float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t * comp_size);
+unsigned char * SZ_compress_float_3D_MDQ_RA_multi_means(float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t * comp_size);
+unsigned char * SZ_compress_float_3D_MDQ_nonblocked_adaptive(float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t * comp_size);
+
+/* 2D and 1D "decompression_random_access_with_blocked_regression" variants. */
+unsigned char * SZ_compress_float_2D_MDQ_decompression_random_access_with_blocked_regression(float *oriData, size_t r1, size_t r2, double realPrecision, size_t * comp_size);
+unsigned char * SZ_compress_float_1D_MDQ_decompression_random_access_with_blocked_regression(float *oriData, size_t r1, double realPrecision, size_t * comp_size);
+
+/* 4D and MSST19 compression routines plus the general float entry points.
+ * Unlike the 1D-3D *_MDQ functions earlier in this header, the functions
+ * below take realPrecision as double. */
+TightDataPointStorageF* SZ_compress_float_4D_MDQ(float *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, float valueRangeSize, float medianValue_f);
+
+/* Unlike the 1D-3D forms, the 4D NoCkRngeNoGzip function has no cmprType parameter. */
+char SZ_compress_args_float_NoCkRngeNoGzip_4D(unsigned char** newByteData, float *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, size_t *outSize, float valueRangeSize, float medianValue_f);
+
+TightDataPointStorageF* SZ_compress_float_1D_MDQ_MSST19(float *oriData, 
+size_t dataLength, double realPrecision, float valueRangeSize, float medianValue_f);
+TightDataPointStorageF* SZ_compress_float_2D_MDQ_MSST19(float *oriData, size_t r1, size_t r2, double realPrecision, float valueRangeSize, float medianValue_f);
+TightDataPointStorageF* SZ_compress_float_3D_MDQ_MSST19(float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, float valueRangeSize, float medianValue_f);
+
+void SZ_compress_args_float_withinRange(unsigned char** newByteData, float *oriData, size_t dataLength, size_t *outSize);
+
+/* Disabled prototype, not compiled: */
+/*int SZ_compress_args_float_wRngeNoGzip(unsigned char** newByteData, float *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio, double pwrErrRatio);*/
+
+/* General entry point for float data: takes the compression type, a
+ * withRegression flag, all five dimension sizes (order r5..r1) and the
+ * error-bound settings; output goes through newByteData and outSize. */
+int SZ_compress_args_float(int cmprType, int withRegression, unsigned char** newByteData, float *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio, double pwRelBoundRatio);
+
+/* The same prototype is also declared in the header guarded by _SZ_H; the two
+ * declarations must be kept identical. */
+int SZ_compress_args_float_subblock(unsigned char* compressedBytes, float *oriData,
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1,
+size_t s5, size_t s4, size_t s3, size_t s2, size_t s1,
+size_t e5, size_t e4, size_t e3, size_t e2, size_t e1,
+size_t *outSize, int errBoundMode, double absErr_Bound, double relBoundRatio);
+
+/* Sub-block routines for float data, 1D to 4D.
+ * r* are dimension sizes. NOTE(review): s* and e* presumably give the start
+ * and end positions of the sub-block per dimension - confirm.
+ * Note the argument order: the NoCkRnge functions list dimensions from the
+ * highest index down (r2, r1, ...), whereas the optimize_intervals and MDQ
+ * sub-block functions list them upwards (r1, r2, ...). */
+void SZ_compress_args_float_NoCkRnge_1D_subblock(unsigned char* compressedBytes, float *oriData, double realPrecision, size_t *outSize, float valueRangeSize, float medianValue_f,
+size_t r1, size_t s1, size_t e1); 
+
+void SZ_compress_args_float_NoCkRnge_2D_subblock(unsigned char* compressedBytes, float *oriData, double realPrecision, size_t *outSize, float valueRangeSize, float medianValue_f,
+size_t r2, size_t r1, size_t s2, size_t s1, size_t e2, size_t e1); 
+
+void SZ_compress_args_float_NoCkRnge_3D_subblock(unsigned char* compressedBytes, float *oriData, double realPrecision, size_t *outSize, float valueRangeSize, float medianValue_f,
+size_t r3, size_t r2, size_t r1, size_t s3, size_t s2, size_t s1, size_t e3, size_t e2, size_t e1); 
+
+void SZ_compress_args_float_NoCkRnge_4D_subblock(unsigned char* compressedBytes, float *oriData, double realPrecision, size_t *outSize, float valueRangeSize, float medianValue_f,
+size_t r4, size_t r3, size_t r2, size_t r1, size_t s4, size_t s3, size_t s2, size_t s1, size_t e4, size_t e3, size_t e2, size_t e1);
+
+unsigned int optimize_intervals_float_1D_subblock(float *oriData, double realPrecision, size_t r1, size_t s1, size_t e1); 
+unsigned int optimize_intervals_float_2D_subblock(float *oriData, double realPrecision, size_t r1, size_t r2, size_t s1, size_t s2, size_t e1, size_t e2); 
+unsigned int optimize_intervals_float_3D_subblock(float *oriData, double realPrecision, size_t r1, size_t r2, size_t r3, size_t s1, size_t s2, size_t s3, size_t e1, size_t e2, size_t e3); 
+unsigned int optimize_intervals_float_4D_subblock(float *oriData, double realPrecision, size_t r1, size_t r2, size_t r3, size_t r4, size_t s1, size_t s2, size_t s3, size_t s4, size_t e1, size_t e2, size_t e3, size_t e4);
+
+TightDataPointStorageF* SZ_compress_float_1D_MDQ_subblock(float *oriData, double realPrecision, float valueRangeSize, float medianValue_f,
+size_t r1, size_t s1, size_t e1); 
+
+TightDataPointStorageF* SZ_compress_float_2D_MDQ_subblock(float *oriData, double realPrecision, float valueRangeSize, float medianValue_f,
+size_t r1, size_t r2, size_t s1, size_t s2, size_t e1, size_t e2); 
+
+TightDataPointStorageF* SZ_compress_float_3D_MDQ_subblock(float *oriData, double realPrecision, float valueRangeSize, float medianValue_f,
+size_t r1, size_t r2, size_t r3, size_t s1, size_t s2, size_t s3, size_t e1, size_t e2, size_t e3); 
+
+TightDataPointStorageF* SZ_compress_float_4D_MDQ_subblock(float *oriData, double realPrecision, float valueRangeSize, float medianValue_f,
+size_t r1, size_t r2, size_t r3, size_t r4, size_t s1, size_t s2, size_t s3, size_t s4, size_t e1, size_t e2, size_t e3, size_t e4);
+
+
+/* Variants that additionally report dense_pos, max_freq and mean_freq through
+ * output pointers. The 3D form repeats, with an identical prototype, a
+ * declaration that already appears earlier in this header. */
+unsigned int optimize_intervals_float_2D_with_freq_and_dense_pos(float *oriData, size_t r1, size_t r2, double realPrecision, float * dense_pos, float * max_freq, float * mean_freq);
+unsigned int optimize_intervals_float_3D_with_freq_and_dense_pos(float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, float * dense_pos, float * max_freq, float * mean_freq);
+
+/* Regression-based whole-array compressors; the compressed bytes are the
+ * return value and their size is written to comp_size.
+ * Note the type of realPrecision: float in the two "nonblocked" functions,
+ * double in the two "random_access" functions. */
+unsigned char * SZ_compress_float_2D_MDQ_nonblocked_with_blocked_regression(float *oriData, size_t r1, size_t r2, float realPrecision, size_t * comp_size);
+unsigned char * SZ_compress_float_3D_MDQ_nonblocked_with_blocked_regression(float *oriData, size_t r1, size_t r2, size_t r3, float realPrecision, size_t * comp_size);
+unsigned char * SZ_compress_float_3D_MDQ_random_access_with_blocked_regression(float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t * comp_size);
+unsigned char * SZ_compress_float_3D_MDQ_decompression_random_access_with_blocked_regression(float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t * comp_size);
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZ_Float_H  ----- */
+
--- a/deps/SZ/sz/include/sz_float_pwr.h
+++ b/deps/SZ/sz/include/sz_float_pwr.h
@ -0,0 +1,66 @@
+/**
+ *  @file sz_float_pwr.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the sz_float_pwr.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZ_Float_PWR_H
+#define _SZ_Float_PWR_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h>
+#include <stdbool.h>
+
+void compute_segment_precisions_float_1D(float *oriData, size_t dataLength, float* pwrErrBound, unsigned char* pwrErrBoundBytes, double globalPrecision);
+unsigned int optimize_intervals_float_1D_pwr(float *oriData, size_t dataLength, float* pwrErrBound);
+
+void compute_segment_precisions_float_2D(float *oriData, float* pwrErrBound, 
+size_t r1, size_t r2, size_t R2, size_t edgeSize, unsigned char* pwrErrBoundBytes, float Min, float Max, double globalPrecision);
+
+unsigned int optimize_intervals_float_2D_pwr(float *oriData, size_t r1, size_t r2, size_t R2, size_t edgeSize, float* pwrErrBound); 
+
+void compute_segment_precisions_float_3D(float *oriData, float* pwrErrBound, 
+size_t r1, size_t r2, size_t r3, size_t R2, size_t R3, size_t edgeSize, unsigned char* pwrErrBoundBytes, float Min, float Max, double globalPrecision);
+
+unsigned int optimize_intervals_float_3D_pwr(float *oriData, size_t r1, size_t r2, size_t r3, size_t R2, size_t R3, size_t edgeSize, float* pwrErrBound);
+
+void SZ_compress_args_float_NoCkRngeNoGzip_1D_pwr(unsigned char** newByteData, float *oriData, double globalPrecision, size_t dataLength, size_t *outSize, float min, float max);
+
+void SZ_compress_args_float_NoCkRngeNoGzip_2D_pwr(unsigned char** newByteData, float *oriData, double globalPrecision, size_t r1, size_t r2, 
+size_t *outSize, float min, float max);
+
+void SZ_compress_args_float_NoCkRngeNoGzip_3D_pwr(unsigned char** newByteData, float *oriData, double globalPrecision, size_t r1, size_t r2, 
+size_t r3, size_t *outSize, float min, float max);
+
+void createRangeGroups_float(float** posGroups, float** negGroups, int** posFlags, int** negFlags);
+void compressGroupIDArray_float(char* groupID, TightDataPointStorageF* tdps);
+int* generateGroupLowerBounds(void); /* takes no arguments ((void) makes this a real prototype in C); NOTE(review): ownership of the returned pointer is not visible in this header - confirm in the .c file */
+TightDataPointStorageF* SZ_compress_float_1D_MDQ_pwrGroup(float* oriData, size_t dataLength, int errBoundMode, 
+double absErrBound, double relBoundRatio, double pwrErrRatio, float valueRangeSize, float medianValue_f);
+
+void SZ_compress_args_float_NoCkRngeNoGzip_1D_pwrgroup(unsigned char** newByteData, float *oriData,
+size_t dataLength, double absErrBound, double relBoundRatio, double pwrErrRatio, float valueRangeSize, float medianValue_f, size_t *outSize);
+
+void SZ_compress_args_float_NoCkRngeNoGzip_1D_pwr_pre_log(unsigned char** newByteData, float *oriData, double pwrErrRatio, size_t dataLength, size_t *outSize, float min, float max);
+void SZ_compress_args_float_NoCkRngeNoGzip_2D_pwr_pre_log(unsigned char** newByteData, float *oriData, double pwrErrRatio, size_t r1, size_t r2, size_t *outSize, float min, float max);
+void SZ_compress_args_float_NoCkRngeNoGzip_3D_pwr_pre_log(unsigned char** newByteData, float *oriData, double pwrErrRatio, size_t r1, size_t r2, size_t r3, size_t *outSize, float min, float max);
+
+void SZ_compress_args_float_NoCkRngeNoGzip_1D_pwr_pre_log_MSST19(unsigned char** newByteData, float *oriData, double pwrErrRatio, size_t dataLength, size_t *outSize, float valueRangeSize, float medianValue_f,
+																unsigned char* signs, bool* positive, float min, float max, float nearZero);
+void SZ_compress_args_float_NoCkRngeNoGzip_2D_pwr_pre_log_MSST19(unsigned char** newByteData, float *oriData, double pwrErrRatio, size_t r1, size_t r2, size_t *outSize, float valueRangeSize,
+																unsigned char* signs, bool* positive, float min, float max, float nearZero);																
+void SZ_compress_args_float_NoCkRngeNoGzip_3D_pwr_pre_log_MSST19(unsigned char** newByteData, float *oriData, double pwrErrRatio, size_t r1, size_t r2, size_t r3, size_t *outSize, float valueRangeSize, 
+																unsigned char* signs, bool* positive, float min, float max, float nearZero);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZ_Float_PWR_H  ----- */
+
--- a/deps/SZ/sz/include/sz_float_ts.h
+++ b/deps/SZ/sz/include/sz_float_ts.h
@ -0,0 +1,27 @@
+/**
+ *  @file sz_float_ts.h
+ *  @author Sheng Di
+ *  @date May, 2018
+ *  @brief Header file for the sz_float_ts.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include "TightDataPointStorageF.h"
+
+#ifndef _SZ_Float_TS_H
+#define _SZ_Float_TS_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+unsigned int optimize_intervals_float_1D_ts(float *oriData, size_t dataLength, float* preData, double realPrecision);
+
+TightDataPointStorageF* SZ_compress_float_1D_MDQ_ts(float *oriData, size_t dataLength, sz_multisteps* multisteps,
+double realPrecision, float valueRangeSize, float medianValue_f);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZ_Float_TS_H  ----- */
+
--- a/deps/SZ/sz/include/sz_int16.h
+++ b/deps/SZ/sz/include/sz_int16.h
@ -0,0 +1,48 @@
+/**
+ *  @file sz_int16.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the sz_int16.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZ_Int16_H
+#define _SZ_Int16_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h>
+
+unsigned int optimize_intervals_int16_1D(int16_t *oriData, size_t dataLength, double realPrecision);
+unsigned int optimize_intervals_int16_2D(int16_t *oriData, size_t r1, size_t r2, double realPrecision);
+unsigned int optimize_intervals_int16_3D(int16_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision);
+unsigned int optimize_intervals_int16_4D(int16_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision);
+TightDataPointStorageI* SZ_compress_int16_1D_MDQ(int16_t *oriData, size_t dataLength, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_int16_StoreOriData(int16_t* oriData, size_t dataLength, TightDataPointStorageI* tdps, unsigned char** newByteData, size_t *outSize);
+void SZ_compress_args_int16_NoCkRngeNoGzip_1D(unsigned char** newByteData, int16_t *oriData, 
+size_t dataLength, double realPrecision, size_t *outSize, int64_t valueRangeSize, int16_t minValue);
+TightDataPointStorageI* SZ_compress_int16_2D_MDQ(int16_t *oriData, size_t r1, size_t r2, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+TightDataPointStorageI* SZ_compress_int16_3D_MDQ(int16_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_int16_NoCkRngeNoGzip_3D(unsigned char** newByteData, int16_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t *outSize, int64_t valueRangeSize, int64_t minValue);
+TightDataPointStorageI* SZ_compress_int16_4D_MDQ(int16_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_int16_NoCkRngeNoGzip_4D(unsigned char** newByteData, int16_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, 
+size_t *outSize, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_int16_withinRange(unsigned char** newByteData, int16_t *oriData, size_t dataLength, size_t *outSize);
+
+int SZ_compress_args_int16_wRngeNoGzip(unsigned char** newByteData, int16_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio);
+
+int SZ_compress_args_int16(unsigned char** newByteData, int16_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZ_Int16_H  ----- */
+
--- a/deps/SZ/sz/include/sz_int32.h
+++ b/deps/SZ/sz/include/sz_int32.h
@ -0,0 +1,48 @@
+/**
+ *  @file sz_int32.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the sz_int32.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZ_Int32_H
+#define _SZ_Int32_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h>
+
+unsigned int optimize_intervals_int32_1D(int32_t *oriData, size_t dataLength, double realPrecision);
+unsigned int optimize_intervals_int32_2D(int32_t *oriData, size_t r1, size_t r2, double realPrecision);
+unsigned int optimize_intervals_int32_3D(int32_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision);
+unsigned int optimize_intervals_int32_4D(int32_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision);
+TightDataPointStorageI* SZ_compress_int32_1D_MDQ(int32_t *oriData, size_t dataLength, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_int32_StoreOriData(int32_t* oriData, size_t dataLength, TightDataPointStorageI* tdps, unsigned char** newByteData, size_t *outSize);
+void SZ_compress_args_int32_NoCkRngeNoGzip_1D(unsigned char** newByteData, int32_t *oriData, 
+size_t dataLength, double realPrecision, size_t *outSize, int64_t valueRangeSize, int32_t minValue);
+TightDataPointStorageI* SZ_compress_int32_2D_MDQ(int32_t *oriData, size_t r1, size_t r2, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+TightDataPointStorageI* SZ_compress_int32_3D_MDQ(int32_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_int32_NoCkRngeNoGzip_3D(unsigned char** newByteData, int32_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t *outSize, int64_t valueRangeSize, int64_t minValue);
+TightDataPointStorageI* SZ_compress_int32_4D_MDQ(int32_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_int32_NoCkRngeNoGzip_4D(unsigned char** newByteData, int32_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, 
+size_t *outSize, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_int32_withinRange(unsigned char** newByteData, int32_t *oriData, size_t dataLength, size_t *outSize);
+
+int SZ_compress_args_int32_wRngeNoGzip(unsigned char** newByteData, int32_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio);
+
+int SZ_compress_args_int32(unsigned char** newByteData, int32_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZ_Int32_H  ----- */
+
--- a/deps/SZ/sz/include/sz_int64.h
+++ b/deps/SZ/sz/include/sz_int64.h
@ -0,0 +1,48 @@
+/**
+ *  @file sz_int64.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the sz_int64.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZ_Int64_H
+#define _SZ_Int64_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h>
+
+unsigned int optimize_intervals_int64_1D(int64_t *oriData, size_t dataLength, double realPrecision);
+unsigned int optimize_intervals_int64_2D(int64_t *oriData, size_t r1, size_t r2, double realPrecision);
+unsigned int optimize_intervals_int64_3D(int64_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision);
+unsigned int optimize_intervals_int64_4D(int64_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision);
+TightDataPointStorageI* SZ_compress_int64_1D_MDQ(int64_t *oriData, size_t dataLength, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_int64_StoreOriData(int64_t* oriData, size_t dataLength, TightDataPointStorageI* tdps, unsigned char** newByteData, size_t *outSize);
+void SZ_compress_args_int64_NoCkRngeNoGzip_1D(unsigned char** newByteData, int64_t *oriData, 
+size_t dataLength, double realPrecision, size_t *outSize, int64_t valueRangeSize, int64_t minValue);
+TightDataPointStorageI* SZ_compress_int64_2D_MDQ(int64_t *oriData, size_t r1, size_t r2, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+TightDataPointStorageI* SZ_compress_int64_3D_MDQ(int64_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_int64_NoCkRngeNoGzip_3D(unsigned char** newByteData, int64_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t *outSize, int64_t valueRangeSize, int64_t minValue);
+TightDataPointStorageI* SZ_compress_int64_4D_MDQ(int64_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_int64_NoCkRngeNoGzip_4D(unsigned char** newByteData, int64_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, 
+size_t *outSize, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_int64_withinRange(unsigned char** newByteData, int64_t *oriData, size_t dataLength, size_t *outSize);
+
+int SZ_compress_args_int64_wRngeNoGzip(unsigned char** newByteData, int64_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio);
+
+int SZ_compress_args_int64(unsigned char** newByteData, int64_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZ_Int64_H  ----- */
+
--- a/deps/SZ/sz/include/sz_int8.h
+++ b/deps/SZ/sz/include/sz_int8.h
@ -0,0 +1,48 @@
+/**
+ *  @file sz_int8.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the sz_int8.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZ_Int8_H
+#define _SZ_Int8_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h>
+
+unsigned int optimize_intervals_int8_1D(int8_t *oriData, size_t dataLength, double realPrecision);
+unsigned int optimize_intervals_int8_2D(int8_t *oriData, size_t r1, size_t r2, double realPrecision);
+unsigned int optimize_intervals_int8_3D(int8_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision);
+unsigned int optimize_intervals_int8_4D(int8_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision);
+TightDataPointStorageI* SZ_compress_int8_1D_MDQ(int8_t *oriData, size_t dataLength, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_int8_StoreOriData(int8_t* oriData, size_t dataLength, TightDataPointStorageI* tdps, unsigned char** newByteData, size_t *outSize);
+void SZ_compress_args_int8_NoCkRngeNoGzip_1D(unsigned char** newByteData, int8_t *oriData, 
+size_t dataLength, double realPrecision, size_t *outSize, int64_t valueRangeSize, int8_t minValue);
+TightDataPointStorageI* SZ_compress_int8_2D_MDQ(int8_t *oriData, size_t r1, size_t r2, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+TightDataPointStorageI* SZ_compress_int8_3D_MDQ(int8_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_int8_NoCkRngeNoGzip_3D(unsigned char** newByteData, int8_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t *outSize, int64_t valueRangeSize, int64_t minValue);
+TightDataPointStorageI* SZ_compress_int8_4D_MDQ(int8_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_int8_NoCkRngeNoGzip_4D(unsigned char** newByteData, int8_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, 
+size_t *outSize, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_int8_withinRange(unsigned char** newByteData, int8_t *oriData, size_t dataLength, size_t *outSize);
+
+int SZ_compress_args_int8_wRngeNoGzip(unsigned char** newByteData, int8_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio);
+
+int SZ_compress_args_int8(unsigned char** newByteData, int8_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZ_Int8_H  ----- */
+
--- a/deps/SZ/sz/include/sz_omp.h
+++ b/deps/SZ/sz/include/sz_omp.h
@ -0,0 +1,47 @@
+/**
+ *  @file sz_omp.h
+ *  @author Xin Liang
+ *  @date July, 2017
+ *  @brief Header file for the sz_omp.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#ifdef _OPENMP
+#include "omp.h"
+#endif
+#include "sz.h"
+
+#ifndef _SZ_OMP_H
+#define _SZ_OMP_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+unsigned char * SZ_compress_float_1D_MDQ_openmp(float *oriData, size_t r1, double realPrecision, size_t * comp_size);
+unsigned char * SZ_compress_float_2D_MDQ_openmp(float *oriData, size_t r1, size_t r2, double realPrecision, size_t * comp_size);
+unsigned char * SZ_compress_float_3D_MDQ_openmp(float *oriData, size_t r1, size_t r2, size_t r3, float realPrecision, size_t * comp_size);
+
+void decompressDataSeries_float_1D_openmp(float** data, size_t r1, unsigned char* comp_data);
+void decompressDataSeries_float_3D_openmp(float** data, size_t r1, size_t r2, size_t r3, unsigned char* comp_data);
+void decompressDataSeries_float_2D_openmp(float** data, size_t r1, size_t r2, unsigned char* comp_data);
+
+unsigned char * SZ_compress_double_1D_MDQ_openmp(double *oriData, size_t r1, double realPrecision, size_t * comp_size);
+unsigned char * SZ_compress_double_2D_MDQ_openmp(double *oriData, size_t r1, size_t r2, double realPrecision, size_t * comp_size);
+unsigned char * SZ_compress_double_3D_MDQ_openmp(double *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t * comp_size);
+
+void decompressDataSeries_double_1D_openmp(double** data, size_t r1, unsigned char* comp_data);
+void decompressDataSeries_double_2D_openmp(double** data, size_t r1, size_t r2, unsigned char* comp_data);
+void decompressDataSeries_double_3D_openmp(double** data, size_t r1, size_t r2, size_t r3, unsigned char* comp_data);
+
+//void Huffman_init_openmp(HuffmanTree* huffmanTree, int *s, size_t length, int thread_num);
+void Huffman_init_openmp(HuffmanTree* huffmanTree, int *s, size_t length, int thread_num, size_t * freq);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZ_OMP_H  ----- */
--- a/deps/SZ/sz/include/sz_opencl.h
+++ b/deps/SZ/sz/include/sz_opencl.h
@ -0,0 +1,68 @@
+//make header C++/C inter-operable
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef SZ_OPENCL_H
+#define SZ_OPENCL_H
+
+#include<stddef.h>
+
+	//opaque pointer for opencl state
+  struct sz_opencl_state;
+
+  /**
+   * creates an opencl state for multiple uses of the compressor or
+   * returns an error code.
+   *
+   * \post if return code is SZ_NCES, the state object may only be passed to
+   * sz_opencl_release or sz_opencl_error_* otherwise it may be used in any
+   * sz_opencl_* function.
+   *
+   * \param[out] state the sz opencl state
+   * \return SZ_SCES for success or SZ_NCES on error
+   */
+  int sz_opencl_init(struct sz_opencl_state** state);
+
+	/**
+	 * deinitializes an opencl state
+	 *
+	 * \param[in] state the sz opencl state
+	 * \return SZ_SCES
+	 */
+  int sz_opencl_release(struct sz_opencl_state** state);
+
+	/**
+	 * returns a human readable error message for the last error received by state
+	 *
+	 * \param[in] state the sz opencl state
+	 * \return a pointer to a string that describes the error
+	 */
+	const char* sz_opencl_error_msg(struct sz_opencl_state* state);
+
+
+	/**
+	 * returns a numeric code for the last error received by state
+	 *
+	 * \param[in] state the sz opencl state
+	 * \return the numeric error code
+	 */
+  int sz_opencl_error_code(struct sz_opencl_state* state);
+
+	/**
+	 * confirms that the sz opencl state is ready to use by performing a vector addition
+	 *
+	 * \param[in] state the sz opencl state
+	 * \return SZ_SCES if the opencl implementation is functioning
+	 */
+	int sz_opencl_check(struct sz_opencl_state* state);
+
+  unsigned char* sz_compress_float3d_opencl(float* data, size_t r1, size_t r2, size_t r3, double, size_t* out_size); /* NOTE(review): the unnamed double is presumably the error bound (realPrecision elsewhere in SZ) - confirm against the definition */
+
+
+#endif /* SZ_OPENCL_H */
+
+//make header C++/C inter-operable
+#ifdef __cplusplus
+}
+#endif
--- a/deps/SZ/sz/include/sz_stats.h
+++ b/deps/SZ/sz/include/sz_stats.h
@ -0,0 +1,58 @@
+/**
+ *  @file sz_stats.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the sz_stats.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _STATS_H
+#define _STATS_H
+
+#include <stdint.h>
+#include <stdio.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct sz_stats // record of compression statistics; a single shared instance is declared after this struct as sz_stat
+{
+	int use_mean; // same name as writeBlockInfo()'s first parameter; presumably a 0/1 flag - TODO confirm
+	
+	size_t blockSize; // NOTE(review): unit (elements per block edge vs. per block) is not visible here
+	
+	float lorenzoPercent; // presumably the share of blocks predicted with Lorenzo; scale (0-1 vs 0-100) not visible here
+	float regressionPercent; // presumably the share of blocks predicted with regression; same scale caveat
+	size_t lorenzoBlocks; // presumably the count of Lorenzo-predicted blocks - TODO confirm
+	size_t regressionBlocks; // presumably the count of regression-predicted blocks - TODO confirm
+	size_t totalBlocks; // presumably lorenzoBlocks + regressionBlocks - verify in sz_stats.c
+	
+	//size_t huffmanTreeHeight;
+	size_t huffmanTreeSize; // Huffman tree size before the final zstd pass (unit presumably bytes - confirm)
+	size_t huffmanCodingSize; // Huffman coding size before the final zstd pass (unit presumably bytes - confirm)
+	float huffmanCompressionRatio; // NOTE(review): numerator/denominator not visible here - see writeHuffmanInfo() in sz_stats.c
+	int huffmanNodeCount; // presumably the number of nodes in the Huffman tree - TODO confirm
+		
+	size_t unpredictCount; // presumably the number of unpredictable data points - TODO confirm
+	float unpredictPercent; // presumably unpredictCount relative to the total element count; scale not visible here
+	
+	float zstdCompressionRatio; // not available yet
+	
+} sz_stats;
+
+extern sz_stats sz_stat;
+
+/* Statistics recorders and reporter. NOTE(review): presumably these fill the global sz_stat - confirm in sz_stats.c. */
+void writeBlockInfo(int use_mean, size_t blockSize, size_t regressionBlocks, size_t totalBlocks);
+void writeHuffmanInfo(size_t huffmanTreeSize, size_t huffmanCodingSize, size_t totalDataSize, int huffmanNodeCount);
+void writeZstdCompressionRatio(float zstdCompressionRatio);
+void writeUnpredictDataCounts(size_t unpredictCount, size_t totalNumElements);
+void printSZStats(void); /* (void): a true no-argument prototype in C, so stray arguments are rejected */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _STATS_H  ----- */
--- a/deps/SZ/sz/include/sz_uint16.h
+++ b/deps/SZ/sz/include/sz_uint16.h
@ -0,0 +1,48 @@
+/**
+ *  @file sz_uint16.h
+ *  @author Sheng Di
+ *  @date Nov, 2017
+ *  @brief Header file for the sz_uint16.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZ_UInt16_H
+#define _SZ_UInt16_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h>
+
+unsigned int optimize_intervals_uint16_1D(uint16_t *oriData, size_t dataLength, double realPrecision);
+unsigned int optimize_intervals_uint16_2D(uint16_t *oriData, size_t r1, size_t r2, double realPrecision);
+unsigned int optimize_intervals_uint16_3D(uint16_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision);
+unsigned int optimize_intervals_uint16_4D(uint16_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision);
+TightDataPointStorageI* SZ_compress_uint16_1D_MDQ(uint16_t *oriData, size_t dataLength, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_uint16_StoreOriData(uint16_t* oriData, size_t dataLength, TightDataPointStorageI* tdps, unsigned char** newByteData, size_t *outSize);
+void SZ_compress_args_uint16_NoCkRngeNoGzip_1D(unsigned char** newByteData, uint16_t *oriData, 
+size_t dataLength, double realPrecision, size_t *outSize, int64_t valueRangeSize, uint16_t minValue);
+TightDataPointStorageI* SZ_compress_uint16_2D_MDQ(uint16_t *oriData, size_t r1, size_t r2, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+TightDataPointStorageI* SZ_compress_uint16_3D_MDQ(uint16_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_uint16_NoCkRngeNoGzip_3D(unsigned char** newByteData, uint16_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t *outSize, int64_t valueRangeSize, int64_t minValue);
+TightDataPointStorageI* SZ_compress_uint16_4D_MDQ(uint16_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_uint16_NoCkRngeNoGzip_4D(unsigned char** newByteData, uint16_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, 
+size_t *outSize, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_uint16_withinRange(unsigned char** newByteData, uint16_t *oriData, size_t dataLength, size_t *outSize);
+
+int SZ_compress_args_uint16_wRngeNoGzip(unsigned char** newByteData, uint16_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio);
+
+int SZ_compress_args_uint16(unsigned char** newByteData, uint16_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZ_UInt16_H  ----- */
+
--- a/deps/SZ/sz/include/sz_uint32.h
+++ b/deps/SZ/sz/include/sz_uint32.h
@ -0,0 +1,48 @@
+/**
+ *  @file sz_uint32.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the sz_uint32.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZ_UInt32_H
+#define _SZ_UInt32_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h>
+
+unsigned int optimize_intervals_uint32_1D(uint32_t *oriData, size_t dataLength, double realPrecision);
+unsigned int optimize_intervals_uint32_2D(uint32_t *oriData, size_t r1, size_t r2, double realPrecision);
+unsigned int optimize_intervals_uint32_3D(uint32_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision);
+unsigned int optimize_intervals_uint32_4D(uint32_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision);
+TightDataPointStorageI* SZ_compress_uint32_1D_MDQ(uint32_t *oriData, size_t dataLength, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_uint32_StoreOriData(uint32_t* oriData, size_t dataLength, TightDataPointStorageI* tdps, unsigned char** newByteData, size_t *outSize);
+void SZ_compress_args_uint32_NoCkRngeNoGzip_1D(unsigned char** newByteData, uint32_t *oriData, 
+size_t dataLength, double realPrecision, size_t *outSize, int64_t valueRangeSize, uint32_t minValue);
+TightDataPointStorageI* SZ_compress_uint32_2D_MDQ(uint32_t *oriData, size_t r1, size_t r2, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+TightDataPointStorageI* SZ_compress_uint32_3D_MDQ(uint32_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_uint32_NoCkRngeNoGzip_3D(unsigned char** newByteData, uint32_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t *outSize, int64_t valueRangeSize, int64_t minValue);
+TightDataPointStorageI* SZ_compress_uint32_4D_MDQ(uint32_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_uint32_NoCkRngeNoGzip_4D(unsigned char** newByteData, uint32_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, 
+size_t *outSize, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_uint32_withinRange(unsigned char** newByteData, uint32_t *oriData, size_t dataLength, size_t *outSize);
+
+int SZ_compress_args_uint32_wRngeNoGzip(unsigned char** newByteData, uint32_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio);
+
+int SZ_compress_args_uint32(unsigned char** newByteData, uint32_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZ_UInt32_H  ----- */
+
--- a/deps/SZ/sz/include/sz_uint64.h
+++ b/deps/SZ/sz/include/sz_uint64.h
@ -0,0 +1,48 @@
+/**
+ *  @file sz_uint64.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the sz_uint64.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZ_UInt64_H
+#define _SZ_UInt64_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h>
+
+unsigned int optimize_intervals_uint64_1D(uint64_t *oriData, size_t dataLength, double realPrecision);
+unsigned int optimize_intervals_uint64_2D(uint64_t *oriData, size_t r1, size_t r2, double realPrecision);
+unsigned int optimize_intervals_uint64_3D(uint64_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision);
+unsigned int optimize_intervals_uint64_4D(uint64_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision);
+TightDataPointStorageI* SZ_compress_uint64_1D_MDQ(uint64_t *oriData, size_t dataLength, double realPrecision, uint64_t valueRangeSize, uint64_t minValue);
+void SZ_compress_args_uint64_StoreOriData(uint64_t* oriData, size_t dataLength, TightDataPointStorageI* tdps, unsigned char** newByteData, size_t *outSize);
+void SZ_compress_args_uint64_NoCkRngeNoGzip_1D(unsigned char** newByteData, uint64_t *oriData, 
+size_t dataLength, double realPrecision, size_t *outSize, uint64_t valueRangeSize, uint64_t minValue);
+TightDataPointStorageI* SZ_compress_uint64_2D_MDQ(uint64_t *oriData, size_t r1, size_t r2, double realPrecision, uint64_t valueRangeSize, uint64_t minValue);
+TightDataPointStorageI* SZ_compress_uint64_3D_MDQ(uint64_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, uint64_t valueRangeSize, uint64_t minValue);
+void SZ_compress_args_uint64_NoCkRngeNoGzip_3D(unsigned char** newByteData, uint64_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t *outSize, uint64_t valueRangeSize, uint64_t minValue);
+TightDataPointStorageI* SZ_compress_uint64_4D_MDQ(uint64_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, uint64_t valueRangeSize, uint64_t minValue);
+void SZ_compress_args_uint64_NoCkRngeNoGzip_4D(unsigned char** newByteData, uint64_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, 
+size_t *outSize, uint64_t valueRangeSize, uint64_t minValue);
+void SZ_compress_args_uint64_withinRange(unsigned char** newByteData, uint64_t *oriData, size_t dataLength, size_t *outSize);
+
+int SZ_compress_args_uint64_wRngeNoGzip(unsigned char** newByteData, uint64_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio);
+
+int SZ_compress_args_uint64(unsigned char** newByteData, uint64_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZ_UInt64_H  ----- */
+
--- a/deps/SZ/sz/include/sz_uint8.h
+++ b/deps/SZ/sz/include/sz_uint8.h
@ -0,0 +1,48 @@
+/**
+ *  @file sz_uint8.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the sz_uint8.c: declares the optimize_intervals_uint8_{1,2,3,4}D, SZ_compress_uint8_{1,2,3,4}D_MDQ and SZ_compress_args_uint8* prototypes.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZ_UInt8_H
+#define _SZ_UInt8_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h> /* NOTE(review): uint8_t, int64_t and TightDataPointStorageI are used below but are not brought in by this header; presumably the includer pulls in <stdint.h> and TightDataPointStorageI.h first -- confirm */
+
+unsigned int optimize_intervals_uint8_1D(uint8_t *oriData, size_t dataLength, double realPrecision);
+unsigned int optimize_intervals_uint8_2D(uint8_t *oriData, size_t r1, size_t r2, double realPrecision);
+unsigned int optimize_intervals_uint8_3D(uint8_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision);
+unsigned int optimize_intervals_uint8_4D(uint8_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision);
+TightDataPointStorageI* SZ_compress_uint8_1D_MDQ(uint8_t *oriData, size_t dataLength, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_uint8_StoreOriData(uint8_t* oriData, size_t dataLength, TightDataPointStorageI* tdps, unsigned char** newByteData, size_t *outSize);
+void SZ_compress_args_uint8_NoCkRngeNoGzip_1D(unsigned char** newByteData, uint8_t *oriData, 
+size_t dataLength, double realPrecision, size_t *outSize, int64_t valueRangeSize, uint8_t minValue); /* NOTE(review): minValue is uint8_t here but int64_t in the 3D/4D variants below -- confirm against the definition in sz_uint8.c */
+TightDataPointStorageI* SZ_compress_uint8_2D_MDQ(uint8_t *oriData, size_t r1, size_t r2, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+TightDataPointStorageI* SZ_compress_uint8_3D_MDQ(uint8_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_uint8_NoCkRngeNoGzip_3D(unsigned char** newByteData, uint8_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t *outSize, int64_t valueRangeSize, int64_t minValue);
+TightDataPointStorageI* SZ_compress_uint8_4D_MDQ(uint8_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_uint8_NoCkRngeNoGzip_4D(unsigned char** newByteData, uint8_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, 
+size_t *outSize, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_uint8_withinRange(unsigned char** newByteData, uint8_t *oriData, size_t dataLength, size_t *outSize);
+
+int SZ_compress_args_uint8_wRngeNoGzip(unsigned char** newByteData, uint8_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio); /* dimension arguments are listed from r5 down to r1 */
+
+int SZ_compress_args_uint8(unsigned char** newByteData, uint8_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio); /* same parameter list as SZ_compress_args_uint8_wRngeNoGzip above */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZ_UInt8_H  ----- */
+
--- a/deps/SZ/sz/include/szd_double.h
+++ b/deps/SZ/sz/include/szd_double.h
@ -0,0 +1,43 @@
+/**
+ *  @file szd_double.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the szd_double.c: declares the decompressDataSeries_double_*, getSnapshotData_double_{1,2,3,4}D and SZ_decompress_args_double prototypes.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZD_Double_H
+#define _SZD_Double_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "TightDataPointStorageD.h" /* provides the TightDataPointStorageD type used in the prototypes below */
+
+void decompressDataSeries_double_1D(double** data, size_t dataSeriesLength, double* hist_data, TightDataPointStorageD* tdps);
+void decompressDataSeries_double_2D(double** data, size_t r1, size_t r2, double* hist_data, TightDataPointStorageD* tdps);
+void decompressDataSeries_double_3D(double** data, size_t r1, size_t r2, size_t r3, double* hist_data, TightDataPointStorageD* tdps);
+void decompressDataSeries_double_4D(double** data, size_t r1, size_t r2, size_t r3, size_t r4, double* hist_data, TightDataPointStorageD* tdps);
+
+void decompressDataSeries_double_1D_MSST19(double** data, size_t dataSeriesLength, TightDataPointStorageD* tdps); /* the _MSST19 variants take no hist_data argument */
+void decompressDataSeries_double_2D_MSST19(double** data, size_t r1, size_t r2, TightDataPointStorageD* tdps);
+void decompressDataSeries_double_3D_MSST19(double** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageD* tdps);
+
+void getSnapshotData_double_1D(double** data, size_t dataSeriesLength, TightDataPointStorageD* tdps, int errBoundMode, int compressionType, double* hist_data);
+void getSnapshotData_double_2D(double** data, size_t r1, size_t r2, TightDataPointStorageD* tdps, int errBoundMode, int compressionType, double* hist_data);
+void getSnapshotData_double_3D(double** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageD* tdps, int errBoundMode, int compressionType, double* hist_data);
+void getSnapshotData_double_4D(double** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageD* tdps, int errBoundMode, int compressionType, double* hist_data);
+void decompressDataSeries_double_2D_nonblocked_with_blocked_regression(double** data, size_t r1, size_t r2, unsigned char* comp_data, double* hist_data); /* takes raw comp_data bytes rather than a TightDataPointStorageD */
+void decompressDataSeries_double_3D_nonblocked_with_blocked_regression(double** data, size_t r1, size_t r2, size_t r3, unsigned char* comp_data, double* hist_data);
+
+size_t decompressDataSeries_double_3D_RA_block(double * data, double mean, size_t dim_0, size_t dim_1, size_t dim_2, size_t block_dim_0, size_t block_dim_1, size_t block_dim_2, double realPrecision, int * type, double * unpredictable_data); /* unlike the functions above, data is double* (not double**) and a size_t is returned */
+
+int SZ_decompress_args_double(double** newData, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, unsigned char* cmpBytes, size_t cmpSize, int compressionType, double* hist_data); /* dimension arguments are listed from r5 down to r1 */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZD_Double_H  ----- */
--- a/deps/SZ/sz/include/szd_double_pwr.h
+++ b/deps/SZ/sz/include/szd_double_pwr.h
@ -0,0 +1,38 @
+/**
+ *  @file szd_double_pwr.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the szd_double_pwr.c: declares the decompressDataSeries_double_*_pwr*, and extractRealPrecision_{2,3}D_double prototypes.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZD_Double_PWR_H
+#define _SZD_Double_PWR_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "TightDataPointStorageD.h" /* every prototype below uses TightDataPointStorageD; include it here, as szd_double.h and szd_double_ts.h do, so this header does not depend on include order */
+
+void decompressDataSeries_double_1D_pwr(double** data, size_t dataSeriesLength, TightDataPointStorageD* tdps);
+double* extractRealPrecision_2D_double(size_t R1, size_t R2, int blockSize, TightDataPointStorageD* tdps);
+void decompressDataSeries_double_2D_pwr(double** data, size_t r1, size_t r2, TightDataPointStorageD* tdps);
+double* extractRealPrecision_3D_double(size_t R1, size_t R2, size_t R3, int blockSize, TightDataPointStorageD* tdps);
+void decompressDataSeries_double_3D_pwr(double** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageD* tdps);
+
+void decompressDataSeries_double_1D_pwrgroup(double** data, size_t dataSeriesLength, TightDataPointStorageD* tdps);
+void decompressDataSeries_double_1D_pwr_pre_log(double** data, size_t dataSeriesLength, TightDataPointStorageD* tdps);
+void decompressDataSeries_double_2D_pwr_pre_log(double** data, size_t r1, size_t r2, TightDataPointStorageD* tdps);
+void decompressDataSeries_double_3D_pwr_pre_log(double** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageD* tdps);
+
+void decompressDataSeries_double_1D_pwr_pre_log_MSST19(double** data, size_t dataSeriesLength, TightDataPointStorageD* tdps);
+void decompressDataSeries_double_2D_pwr_pre_log_MSST19(double** data, size_t r1, size_t r2, TightDataPointStorageD* tdps);
+void decompressDataSeries_double_3D_pwr_pre_log_MSST19(double** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageD* tdps);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZD_Double_PWR_H  ----- */
--- a/deps/SZ/sz/include/szd_double_ts.h
+++ b/deps/SZ/sz/include/szd_double_ts.h
@ -0,0 +1,25 @@
+/**
+ *  @file szd_double_ts.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the szd_double_ts.c: declares the single prototype decompressDataSeries_double_1D_ts.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZD_Double_TS_H
+#define _SZD_Double_TS_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "TightDataPointStorageD.h" /* provides the TightDataPointStorageD type used below */
+
+void decompressDataSeries_double_1D_ts(double** data, size_t dataSeriesLength, double* hist_data, TightDataPointStorageD* tdps); /* same parameter list as decompressDataSeries_double_1D in szd_double.h */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZD_Double_TS_H  ----- */
--- a/deps/SZ/sz/include/szd_float.h
+++ b/deps/SZ/sz/include/szd_float.h
@ -0,0 +1,58 @@
+/**
+ *  @file szd_float.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the szd_float.c: declares the decompressDataSeries_float_*, getSnapshotData_float_{1,2,3,4}D, SZ_decompress_args_float and SZ_decompress_args_randomaccess_float prototypes.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZD_Float_H
+#define _SZD_Float_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "TightDataPointStorageF.h" /* provides the TightDataPointStorageF type used in the prototypes below */
+
+void decompressDataSeries_float_1D(float** data, size_t dataSeriesLength, float* hist_data, TightDataPointStorageF* tdps);
+void decompressDataSeries_float_2D(float** data, size_t r1, size_t r2, float* hist_data, TightDataPointStorageF* tdps);
+void decompressDataSeries_float_3D(float** data, size_t r1, size_t r2, size_t r3, float* hist_data, TightDataPointStorageF* tdps);
+void decompressDataSeries_float_4D(float** data, size_t r1, size_t r2, size_t r3, size_t r4, float* hist_data, TightDataPointStorageF* tdps);
+
+void decompressDataSeries_float_1D_MSST19(float** data, size_t dataSeriesLength, TightDataPointStorageF* tdps); /* the _MSST19 variants take no hist_data argument */
+void decompressDataSeries_float_2D_MSST19(float** data, size_t r1, size_t r2, TightDataPointStorageF* tdps);
+void decompressDataSeries_float_3D_MSST19(float** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageF* tdps);
+
+void getSnapshotData_float_1D(float** data, size_t dataSeriesLength, TightDataPointStorageF* tdps, int errBoundMode, int compressionType, float* hist_data);
+void getSnapshotData_float_2D(float** data, size_t r1, size_t r2, TightDataPointStorageF* tdps, int errBoundMode, int compressionType, float* hist_data);
+void getSnapshotData_float_3D(float** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageF* tdps, int errBoundMode, int compressionType, float* hist_data);
+void getSnapshotData_float_4D(float** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageF* tdps, int errBoundMode, int compressionType, float* hist_data);
+
+size_t decompressDataSeries_float_1D_RA_block(float * data, float mean, size_t dim_0, size_t block_dim_0, double realPrecision, int * type, float * unpredictable_data); /* the _RA_block functions take data as float* (not float**) and return size_t */
+size_t decompressDataSeries_float_2D_RA_block(float * data, float mean, size_t dim_0, size_t dim_1, size_t block_dim_0, size_t block_dim_1, double realPrecision, int * type, float * unpredictable_data);
+
+int SZ_decompress_args_float(float** newData, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, unsigned char* cmpBytes, size_t cmpSize, int compressionType, float* hist_data); /* dimension arguments are listed from r5 down to r1 */
+
+size_t decompressDataSeries_float_3D_RA_block(float * data, float mean, size_t dim_0, size_t dim_1, size_t dim_2, size_t block_dim_0, size_t block_dim_1, size_t block_dim_2, double realPrecision, int * type, float * unpredictable_data);
+
+void decompressDataSeries_float_1D_decompression_given_areas_with_blocked_regression(float** data, size_t r1, size_t s1, size_t e1, unsigned char* comp_data); /* s1/e1: presumably start/end of the requested area, matching the "start point"/"end point" labels further down -- confirm */
+
+void decompressDataSeries_float_2D_nonblocked_with_blocked_regression(float** data, size_t r1, size_t r2, unsigned char* comp_data, float* hist_data);
+void decompressDataSeries_float_2D_decompression_given_areas_with_blocked_regression(float** data, size_t r1, size_t r2, size_t s1, size_t s2, size_t e1, size_t e2, unsigned char* comp_data);
+void decompressDataSeries_float_3D_nonblocked_with_blocked_regression(float** data, size_t r1, size_t r2, size_t r3, unsigned char* comp_data, float* hist_data);
+void decompressDataSeries_float_3D_random_access_with_blocked_regression(float** data, size_t r1, size_t r2, size_t r3, unsigned char* comp_data);
+void decompressDataSeries_float_3D_decompression_random_access_with_blocked_regression(float** data, size_t r1, size_t r2, size_t r3, unsigned char* comp_data);
+void decompressDataSeries_float_3D_decompression_given_areas_with_blocked_regression(float** data, size_t r1, size_t r2, size_t r3, size_t s1, size_t s2, size_t s3, size_t e1, size_t e2, size_t e3, unsigned char* comp_data);
+int SZ_decompress_args_randomaccess_float(float** newData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, 
+size_t s5, size_t s4, size_t s3, size_t s2, size_t s1, // start point
+size_t e5, size_t e4, size_t e3, size_t e2, size_t e1, // end point
+unsigned char* cmpBytes, size_t cmpSize);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZD_Float_H  ----- */
--- a/deps/SZ/sz/include/szd_float_pwr.h
+++ b/deps/SZ/sz/include/szd_float_pwr.h
@ -0,0 +1,40 @
+/**
+ *  @file szd_float_pwr.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the szd_float_pwr.c: declares the decompressDataSeries_float_*_pwr*, extractRealPrecision_{2,3}D_float and decompressGroupIDArray prototypes.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZD_Float_PWR_H
+#define _SZD_Float_PWR_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "TightDataPointStorageF.h" /* the prototypes below use TightDataPointStorageF; include it here, as szd_float.h and szd_float_ts.h do, so this header does not depend on include order */
+
+void decompressDataSeries_float_1D_pwr(float** data, size_t dataSeriesLength, TightDataPointStorageF* tdps);
+float* extractRealPrecision_2D_float(size_t R1, size_t R2, int blockSize, TightDataPointStorageF* tdps);
+void decompressDataSeries_float_2D_pwr(float** data, size_t r1, size_t r2, TightDataPointStorageF* tdps);
+float* extractRealPrecision_3D_float(size_t R1, size_t R2, size_t R3, int blockSize, TightDataPointStorageF* tdps);
+void decompressDataSeries_float_3D_pwr(float** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageF* tdps);
+
+char* decompressGroupIDArray(unsigned char* bytes, size_t dataLength); /* the only prototype here that takes raw bytes instead of a TightDataPointStorageF */
+void decompressDataSeries_float_1D_pwrgroup(float** data, size_t dataSeriesLength, TightDataPointStorageF* tdps);
+void decompressDataSeries_float_1D_pwr_pre_log(float** data, size_t dataSeriesLength, TightDataPointStorageF* tdps);
+void decompressDataSeries_float_2D_pwr_pre_log(float** data, size_t r1, size_t r2, TightDataPointStorageF* tdps);
+void decompressDataSeries_float_3D_pwr_pre_log(float** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageF* tdps);
+
+void decompressDataSeries_float_1D_pwr_pre_log_MSST19(float** data, size_t dataSeriesLength, TightDataPointStorageF* tdps);
+void decompressDataSeries_float_2D_pwr_pre_log_MSST19(float** data, size_t r1, size_t r2, TightDataPointStorageF* tdps);
+void decompressDataSeries_float_3D_pwr_pre_log_MSST19(float** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageF* tdps);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZD_Float_PWR_H  ----- */
+
--- a/deps/SZ/sz/include/szd_float_ts.h
+++ b/deps/SZ/sz/include/szd_float_ts.h
@ -0,0 +1,25 @@
+/**
+ *  @file szd_float_ts.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the szd_float_ts.c: declares the single prototype decompressDataSeries_float_1D_ts.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZD_Float_TS_H
+#define _SZD_Float_TS_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "TightDataPointStorageF.h" /* provides the TightDataPointStorageF type used below */
+
+void decompressDataSeries_float_1D_ts(float** data, size_t dataSeriesLength, float* hist_data, TightDataPointStorageF* tdps); /* same parameter list as decompressDataSeries_float_1D in szd_float.h */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZD_Float_TS_H  ----- */
--- a/deps/SZ/sz/include/szd_int16.h
+++ b/deps/SZ/sz/include/szd_int16.h
@ -0,0 +1,38 @@
+/**
+ *  @file szd_int16.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the szd_int16.c: declares the decompressDataSeries_int16_{1,2,3,4}D, getSnapshotData_int16_{1,2,3,4}D and SZ_decompress_args_int16 prototypes and the SZ_INT16_MIN/MAX limits.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZD_Int16_H
+#define _SZD_Int16_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "TightDataPointStorageI.h"
+
+#define SZ_INT16_MIN (-32768) /* parenthesized so the expansion is always a single primary expression */
+#define SZ_INT16_MAX 32767
+
+void decompressDataSeries_int16_1D(int16_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps);
+void decompressDataSeries_int16_2D(int16_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps);
+void decompressDataSeries_int16_3D(int16_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps);
+void decompressDataSeries_int16_4D(int16_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps);
+
+void getSnapshotData_int16_1D(int16_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps, int errBoundMode);
+void getSnapshotData_int16_2D(int16_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps, int errBoundMode);
+void getSnapshotData_int16_3D(int16_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps, int errBoundMode);
+void getSnapshotData_int16_4D(int16_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps, int errBoundMode);
+
+int SZ_decompress_args_int16(int16_t** newData, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, unsigned char* cmpBytes, size_t cmpSize); /* dimension arguments are listed from r5 down to r1 */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZD_Int16_H  ----- */
--- a/deps/SZ/sz/include/szd_int32.h
+++ b/deps/SZ/sz/include/szd_int32.h
@ -0,0 +1,38 @@
+/**
+ *  @file szd_int32.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the szd_int32.c: declares the decompressDataSeries_int32_{1,2,3,4}D, getSnapshotData_int32_{1,2,3,4}D and SZ_decompress_args_int32 prototypes and the SZ_INT32_MIN/MAX limits.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZD_Int32_H
+#define _SZD_Int32_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "TightDataPointStorageI.h"
+
+#define SZ_INT32_MIN (-2147483648) /* parenthesized so the expansion is a single primary expression; where int is 32 bits the literal 2147483648 does not fit in int, so this constant has a wider signed type (value and type intentionally unchanged) */
+#define SZ_INT32_MAX 2147483647
+
+void decompressDataSeries_int32_1D(int32_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps);
+void decompressDataSeries_int32_2D(int32_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps);
+void decompressDataSeries_int32_3D(int32_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps);
+void decompressDataSeries_int32_4D(int32_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps);
+
+void getSnapshotData_int32_1D(int32_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps, int errBoundMode);
+void getSnapshotData_int32_2D(int32_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps, int errBoundMode);
+void getSnapshotData_int32_3D(int32_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps, int errBoundMode);
+void getSnapshotData_int32_4D(int32_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps, int errBoundMode);
+
+int SZ_decompress_args_int32(int32_t** newData, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, unsigned char* cmpBytes, size_t cmpSize); /* dimension arguments are listed from r5 down to r1 */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZD_Int32_H  ----- */
--- a/deps/SZ/sz/include/szd_int64.h
+++ b/deps/SZ/sz/include/szd_int64.h
@ -0,0 +1,35 @@
+/**
+ *  @file szd_int64.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the szd_int64.c: declares the decompressDataSeries_int64_{1,2,3,4}D, getSnapshotData_int64_{1,2,3,4}D and SZ_decompress_args_int64 prototypes.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZD_Int64_H
+#define _SZD_Int64_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "TightDataPointStorageI.h" /* provides the TightDataPointStorageI type used below; unlike the 8/16/32-bit headers, no SZ_*_MIN/MAX macros are defined here */
+
+void decompressDataSeries_int64_1D(int64_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps);
+void decompressDataSeries_int64_2D(int64_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps);
+void decompressDataSeries_int64_3D(int64_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps);
+void decompressDataSeries_int64_4D(int64_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps);
+
+void getSnapshotData_int64_1D(int64_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps, int errBoundMode);
+void getSnapshotData_int64_2D(int64_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps, int errBoundMode);
+void getSnapshotData_int64_3D(int64_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps, int errBoundMode);
+void getSnapshotData_int64_4D(int64_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps, int errBoundMode);
+
+int SZ_decompress_args_int64(int64_t** newData, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, unsigned char* cmpBytes, size_t cmpSize); /* dimension arguments are listed from r5 down to r1 */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZD_Int64_H  ----- */
--- a/deps/SZ/sz/include/szd_int8.h
+++ b/deps/SZ/sz/include/szd_int8.h
@ -0,0 +1,38 @@
+/**
+ *  @file szd_int8.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the szd_int8.c: declares the decompressDataSeries_int8_{1,2,3,4}D, getSnapshotData_int8_{1,2,3,4}D and SZ_decompress_args_int8 prototypes and the SZ_INT8_MIN/MAX limits.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZD_Int8_H
+#define _SZD_Int8_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "TightDataPointStorageI.h"
+
+#define SZ_INT8_MIN (-128) /* parenthesized so the expansion is always a single primary expression */
+#define SZ_INT8_MAX 127
+
+void decompressDataSeries_int8_1D(int8_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps);
+void decompressDataSeries_int8_2D(int8_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps);
+void decompressDataSeries_int8_3D(int8_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps);
+void decompressDataSeries_int8_4D(int8_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps);
+
+void getSnapshotData_int8_1D(int8_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps, int errBoundMode);
+void getSnapshotData_int8_2D(int8_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps, int errBoundMode);
+void getSnapshotData_int8_3D(int8_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps, int errBoundMode);
+void getSnapshotData_int8_4D(int8_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps, int errBoundMode);
+
+int SZ_decompress_args_int8(int8_t** newData, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, unsigned char* cmpBytes, size_t cmpSize); /* dimension arguments are listed from r5 down to r1 */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZD_Int8_H  ----- */
--- a/deps/SZ/sz/include/szd_uint16.h
+++ b/deps/SZ/sz/include/szd_uint16.h
@ -0,0 +1,38 @@
+/**
+ *  @file szd_uint16.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the szd_uint16.c: declares the decompressDataSeries_uint16_{1,2,3,4}D, getSnapshotData_uint16_{1,2,3,4}D and SZ_decompress_args_uint16 prototypes and the SZ_UINT16_MIN/MAX limits.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZD_UInt16_H
+#define _SZD_UInt16_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "TightDataPointStorageI.h" /* provides the TightDataPointStorageI type used below */
+
+#define SZ_UINT16_MIN 0
+#define SZ_UINT16_MAX 65535
+
+void decompressDataSeries_uint16_1D(uint16_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps);
+void decompressDataSeries_uint16_2D(uint16_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps);
+void decompressDataSeries_uint16_3D(uint16_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps);
+void decompressDataSeries_uint16_4D(uint16_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps);
+
+void getSnapshotData_uint16_1D(uint16_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps, int errBoundMode);
+void getSnapshotData_uint16_2D(uint16_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps, int errBoundMode);
+void getSnapshotData_uint16_3D(uint16_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps, int errBoundMode);
+void getSnapshotData_uint16_4D(uint16_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps, int errBoundMode);
+
+int SZ_decompress_args_uint16(uint16_t** newData, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, unsigned char* cmpBytes, size_t cmpSize); /* dimension arguments are listed from r5 down to r1 */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZD_UInt16_H  ----- */
--- a/deps/SZ/sz/include/szd_uint32.h
+++ b/deps/SZ/sz/include/szd_uint32.h
@ -0,0 +1,38 @@
+/**
+ *  @file szd_uint32.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the szd_uint32.c: declares the decompressDataSeries_uint32_{1,2,3,4}D, getSnapshotData_uint32_{1,2,3,4}D and SZ_decompress_args_uint32 prototypes and the SZ_UINT32_MIN/MAX limits.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZD_UInt32_H
+#define _SZD_UInt32_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "TightDataPointStorageI.h" /* provides the TightDataPointStorageI type used below */
+
+#define SZ_UINT32_MIN 0
+#define SZ_UINT32_MAX 4294967295 /* unsuffixed decimal literal: where int is 32 bits, C99 gives it a wider signed type rather than unsigned int */
+
+void decompressDataSeries_uint32_1D(uint32_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps);
+void decompressDataSeries_uint32_2D(uint32_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps);
+void decompressDataSeries_uint32_3D(uint32_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps);
+void decompressDataSeries_uint32_4D(uint32_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps);
+
+void getSnapshotData_uint32_1D(uint32_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps, int errBoundMode);
+void getSnapshotData_uint32_2D(uint32_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps, int errBoundMode);
+void getSnapshotData_uint32_3D(uint32_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps, int errBoundMode);
+void getSnapshotData_uint32_4D(uint32_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps, int errBoundMode);
+
+int SZ_decompress_args_uint32(uint32_t** newData, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, unsigned char* cmpBytes, size_t cmpSize); /* dimension arguments are listed from r5 down to r1 */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZD_UInt32_H  ----- */
--- a/deps/SZ/sz/include/szd_uint64.h
+++ b/deps/SZ/sz/include/szd_uint64.h
@ -0,0 +1,35 @@
+/**
+ *  @file szd_uint64.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the szd_uint64.c: declares the decompressDataSeries_uint64_{1,2,3,4}D, getSnapshotData_uint64_{1,2,3,4}D and SZ_decompress_args_uint64 prototypes.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZD_UInt64_H
+#define _SZD_UInt64_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "TightDataPointStorageI.h" /* provides the TightDataPointStorageI type used below; unlike the 8/16/32-bit headers, no SZ_*_MIN/MAX macros are defined here */
+
+void decompressDataSeries_uint64_1D(uint64_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps);
+void decompressDataSeries_uint64_2D(uint64_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps);
+void decompressDataSeries_uint64_3D(uint64_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps);
+void decompressDataSeries_uint64_4D(uint64_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps);
+
+void getSnapshotData_uint64_1D(uint64_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps, int errBoundMode);
+void getSnapshotData_uint64_2D(uint64_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps, int errBoundMode);
+void getSnapshotData_uint64_3D(uint64_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps, int errBoundMode);
+void getSnapshotData_uint64_4D(uint64_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps, int errBoundMode);
+
+int SZ_decompress_args_uint64(uint64_t** newData, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, unsigned char* cmpBytes, size_t cmpSize); /* dimension arguments are listed from r5 down to r1 */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZD_UInt64_H  ----- */
--- a/deps/SZ/sz/include/szd_uint8.h
+++ b/deps/SZ/sz/include/szd_uint8.h
@ -0,0 +1,38 @@
+/**
+ *  @file szd_uint8.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the szd_uint8.c: declares the decompressDataSeries_uint8_{1,2,3,4}D, getSnapshotData_uint8_{1,2,3,4}D and SZ_decompress_args_uint8 prototypes and the SZ_UINT8_MIN/MAX limits.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZD_UInt8_H
+#define _SZD_UInt8_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "TightDataPointStorageI.h" /* provides the TightDataPointStorageI type used below */
+
+#define SZ_UINT8_MIN 0
+#define SZ_UINT8_MAX 255
+
+void decompressDataSeries_uint8_1D(uint8_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps);
+void decompressDataSeries_uint8_2D(uint8_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps);
+void decompressDataSeries_uint8_3D(uint8_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps);
+void decompressDataSeries_uint8_4D(uint8_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps);
+
+void getSnapshotData_uint8_1D(uint8_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps, int errBoundMode);
+void getSnapshotData_uint8_2D(uint8_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps, int errBoundMode);
+void getSnapshotData_uint8_3D(uint8_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps, int errBoundMode);
+void getSnapshotData_uint8_4D(uint8_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps, int errBoundMode);
+
+int SZ_decompress_args_uint8(uint8_t** newData, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, unsigned char* cmpBytes, size_t cmpSize); /* dimension arguments are listed from r5 down to r1 */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZD_UInt8_H  ----- */
--- a/deps/SZ/sz/include/szf.h
+++ b/deps/SZ/sz/include/szf.h
@ -0,0 +1,102 @@
+/**
+ *  @file szf.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the szf.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZF_H
+#define _SZF_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h>
+
+//szf.c
+// NOTE(review): the trailing underscore on every name and the fact that all
+// scalars are passed by pointer suggest these are Fortran-callable wrappers --
+// confirm against szf.c.
+
+// Library set-up / tear-down.
+void sz_init_c_(char *configFile,int *len,int *ierr);
+void sz_finalize_c_();
+void SZ_writeData_inBinary_d1_Float_(float* data, char *fileName, int *len);
+
+// Float compression for 1-5 dimensions (r1..r5). The "_rev_" variants take an
+// additional reservedValue pointer.
+void sz_compress_d1_float_(float* data, unsigned char *bytes, size_t *outSize, size_t *r1);
+void sz_compress_d1_float_rev_(float* data, float *reservedValue, unsigned char *bytes, size_t *outSize, size_t *r1);
+void sz_compress_d2_float_(float* data, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2);
+void sz_compress_d2_float_rev_(float* data, float *reservedValue, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2);
+void sz_compress_d3_float_(float* data, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2, size_t *r3);
+void sz_compress_d3_float_rev_(float* data, float *reservedValue, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2, size_t *r3);
+void sz_compress_d4_float_(float* data, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2, size_t *r3, size_t *r4);
+void sz_compress_d4_float_rev_(float* data, float *reservedValue, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2, size_t *r3, size_t *r4);
+void sz_compress_d5_float_(float* data, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2, size_t *r3, size_t *r4, size_t *r5);
+void sz_compress_d5_float_rev_(float* data, float *reservedValue, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2, size_t *r3, size_t *r4, size_t *r5);
+
+// Double compression for 1-5 dimensions; same parameter layout as the float set.
+void sz_compress_d1_double_(double* data, unsigned char *bytes, size_t *outSize, size_t *r1);
+void sz_compress_d1_double_rev_(double* data, double *reservedValue, unsigned char *bytes, size_t *outSize, size_t *r1);
+void sz_compress_d2_double_(double* data, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2);
+void sz_compress_d2_double_rev_(double* data, double *reservedValue, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2);
+void sz_compress_d3_double_(double* data, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2, size_t *r3);
+void sz_compress_d3_double_rev_(double* data, double *reservedValue, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2, size_t *r3);
+void sz_compress_d4_double_(double* data, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2, size_t *r3, size_t *r4);
+void sz_compress_d4_double_rev_(double* data, double *reservedValue, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2, size_t *r3, size_t *r4);
+void sz_compress_d5_double_(double* data, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2, size_t *r3, size_t *r4, size_t *r5);
+void sz_compress_d5_double_rev_(double* data, double *reservedValue, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2, size_t *r3, size_t *r4, size_t *r5);
+
+// "_args_" variants: compression with an explicit error-bound mode and the
+// absolute / relative bound values passed per call.
+void sz_compress_d1_float_args_(float* data, unsigned char *bytes, size_t *outSize, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1);
+void sz_compress_d2_float_args_(float* data, unsigned char *bytes, size_t *outSize, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1, size_t *r2);
+void sz_compress_d3_float_args_(float* data, unsigned char *bytes, size_t *outSize, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1, size_t *r2, size_t *r3);
+void sz_compress_d4_float_args_(float* data, unsigned char *bytes, size_t *outSize, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1, size_t *r2, size_t *r3, size_t *r4);
+void sz_compress_d5_float_args_(float* data, unsigned char *bytes, size_t *outSize, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1, size_t *r2, size_t *r3, size_t *r4, size_t *r5);
+void sz_compress_d1_double_args_(double* data, unsigned char *bytes, size_t *outSize, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1);
+void sz_compress_d2_double_args_(double* data, unsigned char *bytes, size_t *outSize, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1, size_t *r2);
+void sz_compress_d3_double_args_(double* data, unsigned char *bytes, size_t *outSize, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1, size_t *r2, size_t *r3);
+void sz_compress_d4_double_args_(double* data, unsigned char *bytes, size_t *outSize, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1, size_t *r2, size_t *r3, size_t *r4);
+void sz_compress_d5_double_args_(double* data, unsigned char *bytes, size_t *outSize, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1, size_t *r2, size_t *r3, size_t *r4, size_t *r5);
+
+// "_rev_args_" variants: as above plus a reservedValue pointer.
+// NOTE(review): the d1/d2/d3 double variants declare `float *reservedValue`
+// while d4/d5 declare `double *reservedValue`. This looks inconsistent; the
+// declarations must match the definitions in szf.c, so verify there before
+// changing either side.
+void sz_compress_d1_float_rev_args_(float* data, float *reservedValue, unsigned char *bytes, size_t *outSize, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1);
+void sz_compress_d2_float_rev_args_(float* data, float *reservedValue, unsigned char *bytes, size_t *outSize, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1, size_t *r2);
+void sz_compress_d3_float_rev_args_(float* data, float *reservedValue, unsigned char *bytes, size_t *outSize, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1, size_t *r2, size_t *r3);
+void sz_compress_d4_float_rev_args_(float* data, float *reservedValue, unsigned char *bytes, size_t *outSize, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1, size_t *r2, size_t *r3, size_t *r4);
+void sz_compress_d5_float_rev_args_(float* data, float *reservedValue, unsigned char *bytes, size_t *outSize, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1, size_t *r2, size_t *r3, size_t *r4, size_t *r5);
+void sz_compress_d1_double_rev_args_(double* data, float *reservedValue, unsigned char *bytes, size_t *outSize, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1);
+void sz_compress_d2_double_rev_args_(double* data, float *reservedValue, unsigned char *bytes, size_t *outSize, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1, size_t *r2);
+void sz_compress_d3_double_rev_args_(double* data, float *reservedValue, unsigned char *bytes, size_t *outSize, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1, size_t *r2, size_t *r3);
+void sz_compress_d4_double_rev_args_(double* data, double *reservedValue, unsigned char *bytes, size_t *outSize, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1, size_t *r2, size_t *r3, size_t *r4);
+void sz_compress_d5_double_rev_args_(double* data, double *reservedValue, unsigned char *bytes, size_t *outSize, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1, size_t *r2, size_t *r3, size_t *r4, size_t *r5);
+
+// Decompression for 1-5 dimensions: `bytes`/`byteLength` describe the compressed
+// input, `data` is a single-level pointer (unlike the uint8_t** outputs used by the
+// C-level decompressors), so presumably the caller supplies the output array --
+// TODO confirm against szf.c.
+void sz_decompress_d1_float_(unsigned char *bytes, size_t *byteLength, float *data, size_t *r1);
+void sz_decompress_d2_float_(unsigned char *bytes, size_t *byteLength, float *data, size_t *r1, size_t *r2);
+void sz_decompress_d3_float_(unsigned char *bytes, size_t *byteLength, float *data, size_t *r1, size_t *r2, size_t *r3);
+void sz_decompress_d4_float_(unsigned char *bytes, size_t *byteLength, float *data, size_t *r1, size_t *r2, size_t *r3, size_t *r4);
+void sz_decompress_d5_float_(unsigned char *bytes, size_t *byteLength, float *data, size_t *r1, size_t *r2, size_t *r3, size_t *r4, size_t *r5);
+void sz_decompress_d1_double_(unsigned char *bytes, size_t *byteLength, double *data, size_t *r1);
+void sz_decompress_d2_double_(unsigned char *bytes, size_t *byteLength, double *data, size_t *r1, size_t *r2);
+void sz_decompress_d3_double_(unsigned char *bytes, size_t *byteLength, double *data, size_t *r1, size_t *r2, size_t *r3);
+void sz_decompress_d4_double_(unsigned char *bytes, size_t *byteLength, double *data, size_t *r1, size_t *r2, size_t *r3, size_t *r4);
+void sz_decompress_d5_double_(unsigned char *bytes, size_t *byteLength, double *data, size_t *r1, size_t *r2, size_t *r3, size_t *r4, size_t *r5);
+
+// Batch (multi-variable) interface: register variables by name, compress or
+// decompress them together, then query dimensions and data per variable.
+// NOTE(review): the first name below is spelled "batchaddVar" (capital V) while
+// every sibling uses "batchaddvar"; check whether that is intentional before
+// relying on a naming pattern. Also note that var_id is the only scalar in
+// this header passed by value rather than by pointer.
+void sz_batchaddVar_d1_float_(int var_id, char* varName, int *len, float* data, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1);
+void sz_batchaddvar_d2_float_(int var_id, char* varName, int *len, float* data, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1, size_t *r2);
+void sz_batchaddvar_d3_float_(int var_id, char* varName, int *len, float* data, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1, size_t *r2, size_t *r3);
+void sz_batchaddvar_d4_float_(int var_id, char* varName, int *len, float* data, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1, size_t *r2, size_t *r3, size_t *r4);
+void sz_batchaddvar_d5_float_(int var_id, char* varName, int *len, float* data, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1, size_t *r2, size_t *r3, size_t *r4, size_t *r5);
+void sz_batchaddvar_d1_double_(int var_id, char* varName, int *len, double* data, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1);
+void sz_batchaddvar_d2_double_(int var_id, char* varName, int *len, double* data, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1, size_t *r2);
+void sz_batchaddvar_d3_double_(int var_id, char* varName, int *len, double* data, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1, size_t *r2, size_t *r3);
+void sz_batchaddvar_d4_double_(int var_id, char* varName, int *len, double* data, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1, size_t *r2, size_t *r3, size_t *r4);
+void sz_batchaddvar_d5_double_(int var_id, char* varName, int *len, double* data, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1, size_t *r2, size_t *r3, size_t *r4, size_t *r5);
+void sz_batchdelvar_c_(char* varName, int *len, int *errState);
+void sz_batch_compress_c_(unsigned char* bytes, size_t *outSize);
+void sz_batch_decompress_c_(unsigned char* bytes, size_t *byteLength, int *ierr);
+void sz_getvardim_c_(char* varName, int *len, int *dim, size_t *r1, size_t *r2, size_t *r3, size_t *r4, size_t *r5);
+void compute_total_batch_size_c_(size_t *totalSize);
+void sz_getvardata_float_(char* varName, int *len, float* data);
+void sz_getvardata_double_(char* varName, int *len, double* data);
+void sz_freevarset_c_(int *mode);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZF_H  ----- */
+
--- a/deps/SZ/sz/include/utility.h
+++ b/deps/SZ/sz/include/utility.h
@ -0,0 +1,45 @@
+/**
+ *  @file utility.h
+ *  @author Sheng Di, Sihuan Li
+ *  @date July, 2018
+ *  @brief Header file for the utility.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _UTILITY_H
+#define _UTILITY_H
+
+#include "sz.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Sihuan added: an assistant struct that makes sorting and swapping easy to
+// implement; performance optimization should be considered later.
+typedef struct sort_ast_particle{
+	int64_t id;     // presumably the key the particles are ordered by -- confirm in utility.c
+	float var[6];   // six float values that travel with the id
+} sort_ast_particle;
+
+// Sihuan added: comparison callback with the qsort() signature, for two structures.
+int compare_struct(const void* obj1, const void* obj2);
+// Sihuan added: reorder the variables increasingly by their index.
+void reorder_vars(SZ_VarSet* vset);
+// Sihuan added: find the intersection and keep the new variable sorted by id.
+size_t intersectAndsort(int64_t* preIndex, size_t preLen, SZ_VarSet* curVar, size_t dataLen, unsigned char* bitarray);
+// Sihuan added: write the reordered input to files for later decompression validation.
+void write_reordered_tofile(SZ_VarSet* curVar, size_t dataLen);
+// Sihuan added.
+float calculate_delta_t(size_t size);
+
+// Lossless back-end helpers. `losslessCompressor` selects the back end; the
+// decompress functions return their output through the `oriData` out-parameter.
+// NOTE(review): ownership of the buffers returned through compressBytes/oriData
+// is not visible from this header.
+int is_lossless_compressed_data(unsigned char* compressedBytes, size_t cmpSize);
+unsigned long sz_lossless_compress(int losslessCompressor, int level, unsigned char* data, unsigned long dataLength, unsigned char** compressBytes);
+unsigned long sz_lossless_decompress(int losslessCompressor, unsigned char* compressBytes, unsigned long cmpSize, unsigned char** oriData, unsigned long targetOriSize);
+unsigned long sz_lossless_decompress65536bytes(int losslessCompressor, unsigned char* compressBytes, unsigned long cmpSize, unsigned char** oriData);
+// Transpose / inverse-transpose helpers; dimensions are listed from r5 down to r1.
+void* detransposeData(void* data, int dataType, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1);
+void* transposeData(void* data, int dataType, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _UTILITY_H  ----- */
--- a/deps/SZ/sz/src/ArithmeticCoding.c
+++ b/deps/SZ/sz/src/ArithmeticCoding.c
@ -0,0 +1,692 @@
+/**
+ *  @file ArithmeticCoding.c
+ *  @author Sheng Di, Mark Thomas Nelson
+ *  @date April, 2016
+ *  @brief Arithmetic coding (encoder and decoder) for the integer code array
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ *  (C) The MIT License (MIT), this code was modified from Mark's arithmetic coding code: http://www.drdobbs.com/cpp/data-compression-with-arithmetic-encodin/240169251?pgno=1
+ */
+#include <sz.h>
+#include <ArithmeticCoding.h>
+
+/* Append a 1 bit to the accumulator: shift *buf left by one and set the new lowest bit. */
+inline void output_bit_1(unsigned int* buf)
+{
+	*buf = (*buf << 1) | 1u;
+}
+
+/* Append a 0 bit to the accumulator: shifting left already leaves the new lowest bit clear. */
+inline void output_bit_0(unsigned int* buf)
+{
+	*buf <<= 1;
+}
+
+/**
+ * Build the bit pattern "1 followed by pending_bits zeros", aligned to the
+ * most significant bit of a 32-bit word (the caller tells the output routine
+ * how many of the leading bits are meaningful).
+ *
+ * A left-aligned 1 followed by zeros is 0x80000000 whatever the number of
+ * zeros is, so the value is returned directly. The earlier implementation
+ * built it with a loop and a final shift by 32-(pending_bits+1), which is a
+ * negative shift count (undefined behaviour) once pending_bits exceeds 31.
+ * For pending_bits in 0..31 the result is unchanged.
+ *
+ * @param int pending_bits (input): number of zero bits that follow the 1
+ * @return the left-aligned pattern; only 32 bits can be represented, so for
+ *         pending_bits > 31 this is the first 32 bits of the pattern
+ * */
+inline unsigned int output_bit_1_plus_pending(int pending_bits)
+{
+	(void)pending_bits; //the left-aligned pattern does not depend on the count
+	return 0x80000000u;
+}
+
+/**
+ * Build the bit pattern "0 followed by pending_bits ones", aligned to the
+ * most significant bit of a 32-bit word.
+ *
+ * The earlier implementation shifted by 32-(pending_bits+1), which is a
+ * negative shift count (undefined behaviour) once pending_bits exceeds 31.
+ * The count is now clamped to what a 32-bit word can hold; for pending_bits
+ * in 0..31 the result is unchanged.
+ *
+ * @param int pending_bits (input): number of one bits that follow the 0
+ * @return the left-aligned pattern; for pending_bits > 31 this is the first
+ *         32 bits of the pattern (0x7FFFFFFF)
+ * */
+inline unsigned int output_bit_0_plus_pending(int pending_bits)
+{
+	if(pending_bits <= 0)
+		return 0;
+	if(pending_bits > 31)
+		pending_bits = 31;
+	//keep the top (pending_bits+1) bits of 0111...1 and clear everything below them
+	return (0x7FFFFFFFu >> (31-pending_bits)) << (31-pending_bits);
+}
+
+/**
+ * Create AriCoder for the following arithmetic encoding operation. 
+ * In this function, it will compute the real frequency of the integer codes.
+ * @param int numOfStates (input): numOfStates is the real # states calculated to the optimization_num_of_interval code
+ * @param int *s (input): the integer code array (i.e., type_array generated by prediction+quantization)
+ * @param size_t length: the number of integer codes in the type_array
+ * 
+ * @return the new coder (release it with freeAriCoder), or NULL if the coder
+ *         structure itself could not be allocated
+ * */
+AriCoder *createAriCoder(int numOfStates, int *s, size_t length)
+{
+	//calloc gives the zero-filled structure the old malloc+memset pair produced,
+	//and the result is checked before it is written to
+	AriCoder *ariCoder = (AriCoder*)calloc(1, sizeof(AriCoder));
+	if(ariCoder == NULL)
+		return NULL;
+	ariCoder->numOfRealStates = numOfStates;
+	ari_init(ariCoder, s, length);
+	return ariCoder;
+}
+
+/**
+ * Release a coder and its cumulative frequency table.
+ * @param AriCoder *ariCoder (input): the coder to release; NULL is accepted and
+ *        ignored, mirroring free()
+ * */
+void freeAriCoder(AriCoder *ariCoder)
+{
+	if(ariCoder == NULL)
+		return;
+	free(ariCoder->cumulative_frequency);
+	free(ariCoder);
+}
+
+/**
+ * Fill in the cumulative frequency table of a coder from the integer codes.
+ *
+ * Every state that occurs in 's' receives a [low, high) slice whose width is
+ * its count; when there are more than MAX_INTERVALS codes the counts are
+ * divided by a common interval size (never dropping below 1) to limit the total.
+ * States that do not occur keep an all-zero entry.
+ *
+ * If a working buffer cannot be allocated the coder is left with no table
+ * (cumulative_frequency == NULL, zero valid states, zero total frequency).
+ *
+ * @param AriCoder *ariCoder (input/output): numOfRealStates must already be set
+ * @param int *s (input): the integer code array; every value is used directly
+ *        as an index, so it must lie in [0, numOfRealStates) -- this is not checked
+ * @param size_t length (input): the number of integer codes in 's'
+ * */
+void ari_init(AriCoder *ariCoder, int *s, size_t length)
+{
+	size_t i; //# states is in the range of integer.
+	int index = 0, counter = 0;
+	size_t numStates = (size_t)ariCoder->numOfRealStates;
+	size_t lowerBound = 0, sum = 0, freqDiv = 0;
+	size_t intvSize = 1; //1 leaves the raw counts untouched
+	size_t *freq = (size_t *)calloc(numStates, sizeof(size_t));
+
+	ariCoder->numOfValidStates = 0;
+	ariCoder->total_frequency = 0;
+	ariCoder->cumulative_frequency = (Prob *)calloc(numStates, sizeof(Prob));
+	if(freq == NULL || ariCoder->cumulative_frequency == NULL)
+	{
+		free(freq);
+		free(ariCoder->cumulative_frequency);
+		ariCoder->cumulative_frequency = NULL;
+		return;
+	}
+
+	for(i = 0;i < length;i++)
+		freq[s[i]]++;
+
+	//control the sum of frequency to be no greater than MAX_INTERVALS:
+	//intvSize = ceil(length / MAX_INTERVALS) once the input is larger than that
+	if(length > MAX_INTERVALS)
+		intvSize = length/MAX_INTERVALS + (length%MAX_INTERVALS==0 ? 0 : 1);
+
+	for (index = 0; index < ariCoder->numOfRealStates; index++)
+	{
+		if (freq[index] == 0)
+			continue;
+		freqDiv = freq[index]/intvSize;
+		if(freqDiv==0) //a state that occurs must keep a non-empty slice
+			freqDiv = 1;
+		sum += freqDiv;
+		(ariCoder->cumulative_frequency[index]).low = lowerBound;
+		(ariCoder->cumulative_frequency[index]).high = sum;
+		(ariCoder->cumulative_frequency[index]).state = index;
+		lowerBound = sum;
+		counter++;
+	}
+	ariCoder->numOfValidStates = counter;
+	ariCoder->total_frequency = sum;
+
+	free(freq);
+}
+
+/**
+ * Width in bytes of each serialised low/high bound for a given total frequency.
+ * The largest value ever stored is total_frequency itself (the 'high' of the
+ * last valid state), so the field must be able to hold it: 2 bytes up to
+ * 65535, 4 bytes up to 4294967295, 8 bytes otherwise.
+ * */
+static size_t ariCoder_boundWidth(uint64_t total_frequency)
+{
+	if(total_frequency <= 0xFFFFu)
+		return sizeof(uint16_t);
+	if(total_frequency <= 0xFFFFFFFFu)
+		return sizeof(uint32_t);
+	return sizeof(uint64_t);
+}
+
+/**
+ * Width in bytes of each serialised state index: 1 byte for up to 256 states,
+ * 2 bytes for up to 65536 states, 4 bytes otherwise.
+ * */
+static size_t ariCoder_stateWidth(int numOfRealStates)
+{
+	if(numOfRealStates<=256)
+		return 1;
+	if(numOfRealStates<=65536)
+		return sizeof(uint16_t);
+	return sizeof(uint32_t);
+}
+
+/**
+ * Store the low 'width' bytes of 'value' big-endian at p (width is 1, 2, 4 or 8).
+ * @return the position just after the stored field
+ * */
+static unsigned char* ariCoder_putField(unsigned char* p, uint64_t value, size_t width)
+{
+	switch(width)
+	{
+	case 1:
+		*p = (unsigned char)value;
+		break;
+	case 2:
+		int16ToBytes_bigEndian(p, (uint16_t)value);
+		break;
+	case 4:
+		int32ToBytes_bigEndian(p, (uint32_t)value);
+		break;
+	default:
+		int64ToBytes_bigEndian(p, value);
+		break;
+	}
+	return p + width;
+}
+
+/**
+ * Load a big-endian unsigned field of 'width' bytes from p (width is 1, 2, 4 or 8).
+ * */
+static uint64_t ariCoder_getField(unsigned char* p, size_t width)
+{
+	switch(width)
+	{
+	case 1:
+		return *p;
+	case 2:
+		return bytesToUInt16_bigEndian(p);
+	case 4:
+		return bytesToUInt32_bigEndian(p);
+	default:
+		return bytesToUInt64_bigEndian(p);
+	}
+}
+
+/**
+ * Convert AriCoder to bytes for storage
+ *
+ * Layout: numOfRealStates (int), numOfValidStates (int), total_frequency
+ * (64 bits), then one {low, high, state} entry per state whose 'high' is
+ * non-zero. The field widths depend on total_frequency and numOfRealStates
+ * (see ariCoder_boundWidth / ariCoder_stateWidth).
+ *
+ * The previous code chose the 16-bit (32-bit) bound width for
+ * total_frequency <= 65536 (<= 4294967296). At exactly those totals the
+ * 'high' of the last state does not fit, was truncated to 0 and the state was
+ * silently left out of the output. The limits are now 65535 and 4294967295;
+ * all other totals are serialised exactly as before.
+ *
+ * @param AriCoder* ariCoder (input)
+ * @param unsigned char** out (output): receives a malloc'ed buffer (NULL if
+ *        the allocation fails)
+ * 
+ * @return outSize (0 if the allocation fails)
+ * */
+unsigned int pad_ariCoder(AriCoder* ariCoder, unsigned char** out)
+{
+	int numOfRealStates = ariCoder->numOfRealStates;
+	int numOfValidStates = ariCoder->numOfValidStates;
+	uint64_t total_frequency = ariCoder->total_frequency;
+	Prob* cumulative_frequency = ariCoder->cumulative_frequency;
+	size_t boundWidth = ariCoder_boundWidth(total_frequency);
+	size_t stateWidth = ariCoder_stateWidth(numOfRealStates);
+	size_t headerSize = 2*sizeof(int)+sizeof(uint64_t);
+	size_t entrySize = 2*boundWidth+stateWidth;
+	//room for the header plus one entry per state, the most the loop below can write
+	size_t capacity = headerSize + (numOfRealStates > 0 ? (size_t)numOfRealStates*entrySize : 0);
+	unsigned char* p = NULL;
+	int i = 0;
+
+	*out = (unsigned char*)malloc(capacity);
+	if(*out == NULL)
+		return 0;
+
+	p = *out;
+	intToBytes_bigEndian(p, numOfRealStates);
+	p+=sizeof(int);
+	intToBytes_bigEndian(p, numOfValidStates);
+	p+=sizeof(int);
+	int64ToBytes_bigEndian(p, total_frequency);
+	p+=sizeof(uint64_t);
+
+	for(i=0;i<numOfRealStates;i++)
+	{
+		if(cumulative_frequency[i].high == 0) //this state cell is null: nothing to store
+			continue;
+		p = ariCoder_putField(p, cumulative_frequency[i].low, boundWidth);
+		p = ariCoder_putField(p, cumulative_frequency[i].high, boundWidth);
+		p = ariCoder_putField(p, (uint64_t)cumulative_frequency[i].state, stateWidth);
+	}
+	return (unsigned int)(headerSize + (size_t)numOfValidStates*entrySize);
+}
+
+/**
+ * Reconstruct AriCoder based on the bytes loaded from compressed data
+ *
+ * Reads the layout written by pad_ariCoder. Entries whose state index lies
+ * outside [0, numOfRealStates) are skipped instead of being written outside
+ * the table.
+ *
+ * @param AriCoder** ariCoder (output): receives the new coder, or NULL if an
+ *        allocation fails
+ * @param unsigned char* bytes (input)
+ * 
+ * @return offset: the number of bytes occupied by the serialised coder
+ *         (0 if an allocation fails)
+ * */
+int unpad_ariCoder(AriCoder** ariCoder, unsigned char* bytes)
+{
+	unsigned char *p = bytes;
+	size_t boundWidth = 0, stateWidth = 0, entrySize = 0;
+	uint64_t total_frequency = 0;
+	int numOfRealStates = 0, numOfValidStates = 0, state = 0, i = 0;
+	AriCoder *coder = (AriCoder*)calloc(1, sizeof(AriCoder));
+
+	*ariCoder = coder;
+	if(coder == NULL)
+		return 0;
+
+	numOfRealStates = coder->numOfRealStates = bytesToInt_bigEndian(p);
+	p += sizeof(int);
+	numOfValidStates = coder->numOfValidStates = bytesToInt_bigEndian(p);
+	p += sizeof(int);
+	total_frequency = (uint64_t)bytesToInt64_bigEndian(p);
+	coder->total_frequency = total_frequency;
+	p += sizeof(uint64_t);
+
+	//always allocate at least one zeroed entry so that a zero state count is not
+	//mistaken for an allocation failure
+	coder->cumulative_frequency = (Prob*)calloc(numOfRealStates > 0 ? (size_t)numOfRealStates : 1, sizeof(Prob));
+	if(coder->cumulative_frequency == NULL)
+	{
+		free(coder);
+		*ariCoder = NULL;
+		return 0;
+	}
+
+	boundWidth = ariCoder_boundWidth(total_frequency);
+	stateWidth = ariCoder_stateWidth(numOfRealStates);
+	entrySize = 2*boundWidth+stateWidth;
+
+	for(i=0;i<numOfValidStates;i++)
+	{
+		state = (int)ariCoder_getField(p+2*boundWidth, stateWidth);
+		if(state >= 0 && state < numOfRealStates) //ignore indices a damaged stream could carry
+		{
+			coder->cumulative_frequency[state].low = ariCoder_getField(p, boundWidth);
+			coder->cumulative_frequency[state].high = ariCoder_getField(p+boundWidth, boundWidth);
+			coder->cumulative_frequency[state].state = state;
+		}
+		p += entrySize;
+	}
+	return (int)(2*sizeof(int)+sizeof(uint64_t)+(size_t)numOfValidStates*entrySize);
+}
+
+/**
+ * Arithmetic Encoding
+ *
+ * For every code the current [low, high] interval is narrowed to the slice
+ * that the code owns in the cumulative frequency table; whenever the leading
+ * bit of the interval is settled it is written out (together with any pending
+ * opposite bits) and the interval is rescaled.
+ *
+ * NOTE(review): the pending-bit helpers assemble their pattern in one 32-bit
+ * word -- confirm that the number of pending bits can never exceed 31.
+ *
+ * @param AriCoder *ariCoder (input)
+ * @param int *s (input)
+ * @param size_t length (input)
+ * @param unsigned char *out (output)
+ * @param size_t *outSize (output)
+ * 
+ * */
+void ari_encode(AriCoder *ariCoder, int *s, size_t length, unsigned char *out, size_t *outSize)
+{
+	Prob *table = ariCoder->cumulative_frequency;
+	size_t total = ariCoder->total_frequency;
+	size_t lo = 0, hi = MAX_CODE;
+	size_t span = 0, idx = 0;
+	int pending = 0, lackBits = 0;
+	unsigned int code = 0;
+	unsigned char *cursor = out;
+
+	*outSize = 0;
+
+	for (idx = 0; idx < length; idx++)
+	{
+		Prob sym = table[s[idx]];
+
+		//narrow the interval to the slice owned by this code
+		span = hi - lo + 1;
+		hi = lo + (span * sym.high / total) - 1;
+		lo = lo + (span * sym.low / total);
+
+		while (1)
+		{
+			if (hi < ONE_HALF)
+			{
+				//interval entirely in the lower half: emit 0 then the pending 1s
+				code = output_bit_0_plus_pending(pending);
+				put_codes_to_output(code, pending+1, &cursor, &lackBits, outSize);
+				pending = 0;
+			}
+			else if (lo >= ONE_HALF)
+			{
+				//interval entirely in the upper half: emit 1 then the pending 0s
+				code = output_bit_1_plus_pending(pending);
+				put_codes_to_output(code, pending+1, &cursor, &lackBits, outSize);
+				pending = 0;
+			}
+			else if (lo >= ONE_FOURTH && hi < THREE_FOURTHS)
+			{
+				//interval straddles the middle: postpone the decision
+				pending++;
+				lo -= ONE_FOURTH;
+				hi -= ONE_FOURTH;
+			}
+			else
+				break;
+
+			//rescale: shift both bounds left, feeding a 1 into high and a 0 into low
+			hi = ((hi << 1) + 1) & MAX_CODE;
+			lo = (lo << 1) & MAX_CODE;
+		}
+	}
+
+	//flush: one more bit (plus pending ones) pins down the final interval
+	pending++;
+	code = (lo < ONE_FOURTH) ? output_bit_0_plus_pending(pending) : output_bit_1_plus_pending(pending);
+	put_codes_to_output(code, pending+1, &cursor, &lackBits, outSize);
+}
+
+/**
+ * Get the integer code based on Arithmetic Coding Value 
+ *
+ * Scans the cumulative frequency table from the first state and stops at the
+ * first entry whose 'high' is greater than scaled_value. Entries of unused
+ * states have high == 0 and therefore never match. If no entry matches, the
+ * returned pointer is one past the last table entry.
+ *
+ * @param AriCoder *ariCoder (input)
+ * @param size_t scaled_value (input)
+ * 
+ * @return Prob* (output)
+ * 
+ * */
+Prob* getCode(AriCoder *ariCoder, size_t scaled_value)
+{
+	Prob *entry = ariCoder->cumulative_frequency;
+	int remaining = ariCoder->numOfRealStates;
+
+	while(remaining > 0 && scaled_value >= entry->high)
+	{
+		entry++;
+		remaining--;
+	}
+	return entry;
+}
+
+/**
+ * Get one bit from the input stream of bytes
+ * @param unsigned char* p (input): the byte to read from
+ * @param int offset (input): which bit of that byte to read, counted from the
+ *        most significant bit (0) to the least significant bit (7)
+ * 
+ * @return unsigned char (output) : 1 or 0
+ * */
+inline unsigned char get_bit(unsigned char* p, int offset)
+{
+	int distanceFromLsb = 7-offset;
+	unsigned int shifted = (unsigned int)((*p) >> distanceFromLsb);
+	return (unsigned char)(shifted & 0x01);
+}
+
+/**
+ * Arithmetic Decoding algorithm 
+ *
+ * Mirrors the interval narrowing of the encoder: 'value' holds the current
+ * window of the code stream, the matching table entry is looked up with
+ * getCode(), and the interval and window are rescaled together while new bits
+ * are pulled in from 's'.
+ *
+ * NOTE(review): the first 8 bytes of 's' are read unconditionally, whatever
+ * s_len is -- confirm that callers always provide at least 8 readable bytes.
+ *
+ * @param AriCoder *ariCoder (input): the encoder with the constructed frequency information
+ * @param unsigned char *s (input): the compressed stream of bytes
+ * @param size_t s_len (input): the number of bytes in the 'unsigned char *s'
+ * @param size_t targetLength (input): the target number of elements in the type array
+ * @param int *out (output) : the result (type array decompressed from the stream 's')
+ * 
+ * */
+void ari_decode(AriCoder *ariCoder, unsigned char *s, size_t s_len, size_t targetLength, int *out)
+{
+	size_t high = MAX_CODE;
+	size_t low = 0, i = 0;
+	size_t range = 0, scaled_value = 0;
+	size_t total_frequency = ariCoder->total_frequency;
+	//the initial window below consumes 64-20 = 44 bits, i.e. 5 whole bytes plus 4 bits,
+	//so the next unread bit is bit 4 of s[5]
+	unsigned char *sp = s+5;
+	unsigned int offset = 4;
+	size_t value = (bytesToUInt64_bigEndian(s) >> 20); //alignment with the MAX_CODE
+	//NOTE(review): s_counter starts at sizeof(int) while sp starts at s+5, so sp-s stays
+	//one ahead of s_counter; the 's_counter < s_len' guard below therefore appears to
+	//allow a read of s[s_len] -- confirm whether the buffer has a spare byte
+	size_t s_counter = sizeof(int);
+	
+	for(i=0;i<targetLength;i++)
+	{
+		range = high -  low + 1;
+		//map the window back onto the cumulative frequency scale
+		scaled_value = ((value - low + 1) * ariCoder->total_frequency  - 1 ) / range;
+		Prob *p = getCode(ariCoder, scaled_value);
+		out[i] = p->state;  //output the state to the 'out' array
+		//narrow the interval exactly as the encoder did for this state
+		high = low + (range*p->high)/total_frequency -1;
+		low = low + (range*p->low)/total_frequency;
+		
+		for( ; ; )
+		{
+			if (high < ONE_HALF) {
+			  //do nothing, bit is a zero
+			} else if ( low >= ONE_HALF ) 
+			{
+			  value -= ONE_HALF;  //subtract one half from all three code values
+			  low -= ONE_HALF;
+			  high -= ONE_HALF;
+			} else if ( low >= ONE_FOURTH && high < THREE_FOURTHS ) 
+			{
+			  value -= ONE_FOURTH;
+			  low -= ONE_FOURTH;
+			  high -= ONE_FOURTH;
+			} else
+			  break;
+			//rescale: shift the interval and the window left by one bit
+			low <<= 1;
+			high <<= 1;
+			high++;
+			value <<= 1;
+			//load one bit from the input byte stream	
+			//(once the guard fails no bit is added, so the window is padded with zeros)
+			if(s_counter < s_len)
+			{	
+				value += get_bit(sp, offset++);
+				if(offset==8)
+				{
+					//current byte exhausted: move on to the next one
+					sp++;
+					s_counter++;
+					offset = 0;
+				}
+			}
+		}
+	}
+}
--- a/deps/SZ/sz/src/ByteToolkit.c
+++ b/deps/SZ/sz/src/ByteToolkit.c
--- a/deps/SZ/sz/src/CacheTable.c
+++ b/deps/SZ/sz/src/CacheTable.c
@ -0,0 +1,100 @@
+/**
+ *  @file CacheTable.c
+ *  @author Xiangyu Zou, Tao Lu, Wen Xia, Xuan Wang, Weizhe Zhang, Sheng Di, Dingwen Tao
+ *  @date Jan, 2019
+ *  @brief Cache Table
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdlib.h>
+#include "CacheTable.h"
+
+/* Not referenced by any function visible in this file. */
+double* g_CacheTable;
+/* Lookup table allocated and filled by CacheTableBuild(), read by CacheTableFind(),
+ * released by CacheTableFree(). Slot k corresponds to index baseIndex + k. */
+uint32_t * g_InverseTable;
+/* First index covered by g_InverseTable (set by CacheTableBuild). */
+uint32_t baseIndex;
+/* Last index covered by g_InverseTable (set by CacheTableBuild). */
+uint32_t topIndex;
+/* Bit count returned by CacheTableGetRequiredBits() during the last CacheTableBuild(). */
+int bits;
+
+/**
+ * Return the unbiased binary exponent stored in an IEEE-754 double.
+ *
+ * The bit pattern is read through a union rather than a `long*` cast: `long`
+ * is only 32 bits wide on some targets, and the cast violated strict aliasing.
+ * The sign bit is masked off, so negative inputs yield the same exponent as
+ * their absolute value.
+ *
+ * @param d the value to inspect
+ * @return the 11-bit exponent field of d minus the bias 1023
+ */
+inline int doubleGetExpo(double d){
+    union {
+        double   value;
+        uint64_t raw;
+    } view;
+    view.value = d;
+    return (int)((view.raw >> 52) & 0x7FF) - 1023;
+}
+
+/**
+ * Compute the bit count used to index the cache table.
+ *
+ * @param precision               relative precision
+ * @param quantization_intervals  number of quantization intervals
+ * @return the negated binary exponent of
+ *         precision * (1+precision)^-(quantization_intervals/2)
+ */
+int CacheTableGetRequiredBits(double precision, int quantization_intervals){
+    int halfIntervals = quantization_intervals>>1;
+    double growth = 1+precision;
+    double min_distance = pow(growth, -halfIntervals) * precision;
+    int exponent = doubleGetExpo(min_distance);
+    return -exponent;
+}
+
+/**
+ * Derive a table index from the leading bits of a float.
+ *
+ * The raw bit pattern of value is shifted right by (32 - 9 - bits), which
+ * keeps the sign, the exponent and the top `bits` mantissa bits.
+ * The pattern is read through a union (the former pointer cast violated
+ * strict aliasing), and shift counts of 32 or more, which are undefined for a
+ * 32-bit operand, now return 0.
+ *
+ * @param value the value to index
+ * @param bits  number of mantissa bits to keep
+ * @return the index, or 0 when the shift is not in the range 1..31
+ */
+inline uint32_t CacheTableGetIndex(float value, int bits){
+    union {
+        float    value;
+        uint32_t raw;
+    } view;
+    int shift = 32 - 9 - bits;
+    if(shift <= 0 || shift >= 32){
+        return 0;
+    }
+    view.value = value;
+    return view.raw >> shift;
+}
+
+/**
+ * Derive a table index from the leading bits of a double.
+ *
+ * The raw bit pattern of value is shifted right by (64 - 12 - bits), which
+ * keeps the sign, the exponent and the top `bits` mantissa bits.
+ * The pattern is read through a union (the former pointer cast violated
+ * strict aliasing), and shift counts of 64 or more, which are undefined for a
+ * 64-bit operand, now return 0.
+ *
+ * @param value the value to index
+ * @param bits  number of mantissa bits to keep
+ * @return the index, or 0 when the shift is not in the range 1..63
+ */
+inline uint64_t CacheTableGetIndexDouble(double value, int bits){
+    union {
+        double   value;
+        uint64_t raw;
+    } view;
+    int shift = 64 - 12 - bits;
+    if(shift <= 0 || shift >= 64){
+        return 0;
+    }
+    view.value = value;
+    return view.raw >> shift;
+}
+
+/**
+ * Tell whether index lies in the half-open range (baseIndex, topIndex].
+ * Note that baseIndex itself is reported as outside.
+ *
+ * @return 1 when inside, 0 otherwise
+ */
+inline int CacheTableIsInBoundary(uint32_t index){
+    int aboveBase = index > baseIndex;
+    int withinTop = index <= topIndex;
+    return (withinTop && aboveBase) ? 1 : 0;
+}
+
+/**
+ * Build the global inverse lookup table (g_InverseTable) for a quantization table.
+ *
+ * Sets the globals bits, baseIndex and topIndex, then allocates one slot per
+ * index in [baseIndex, topIndex]. For every table entry i (count-1 down to 1;
+ * entry 0 is skipped) the slots between the indices of table[i]/(1+precision)
+ * and table[i]*(1+precision) are set to i, so lower i win where ranges overlap.
+ *
+ * Compared with the previous version:
+ *  - an empty index range (topIndex < baseIndex) no longer wraps the unsigned
+ *    size computation; the table is left NULL instead;
+ *  - the table is zero-initialised, so slots not covered by any entry read as
+ *    0 rather than as indeterminate memory;
+ *  - an allocation failure leaves the table NULL instead of writing through it.
+ *
+ * The previous table, if any, is not released here; call CacheTableFree() first.
+ *
+ * @param table                   quantization table
+ * @param count                   number of entries in table
+ * @param smallest                smallest value to cover
+ * @param largest                 largest value to cover
+ * @param precision               relative precision
+ * @param quantization_intervals  number of quantization intervals
+ */
+void CacheTableBuild(double * table, int count, double smallest, double largest, double precision, int quantization_intervals){
+    bits = CacheTableGetRequiredBits(precision, quantization_intervals);
+    baseIndex = CacheTableGetIndex((float)smallest, bits)+1;
+    topIndex = CacheTableGetIndex((float)largest, bits);
+    g_InverseTable = NULL;
+    if(topIndex < baseIndex){
+        return; /* nothing to cover */
+    }
+
+    uint32_t range = topIndex - baseIndex + 1;
+    g_InverseTable = (uint32_t *)calloc(range, sizeof(uint32_t));
+    if(g_InverseTable == NULL){
+        return;
+    }
+
+    for(int i=count-1; i>0; i--){
+        uint32_t upperIndex = CacheTableGetIndex((float)table[i]*(1+precision), bits);
+        uint32_t lowerIndex = CacheTableGetIndex((float)table[i]/(1+precision), bits);
+        /* clamp to the covered range instead of testing every j */
+        uint32_t first = lowerIndex < baseIndex ? baseIndex : lowerIndex;
+        uint32_t last = upperIndex > topIndex ? topIndex : upperIndex;
+        for(uint32_t j = first; j<=last; j++){
+            g_InverseTable[j-baseIndex] = i;
+        }
+    }
+}
+
+/**
+ * Look up the table entry recorded for index.
+ * No range check is done here; index is used as an offset from baseIndex.
+ */
+inline uint32_t CacheTableFind(uint32_t index){
+    uint32_t slot = index-baseIndex;
+    return g_InverseTable[slot];
+}
+
+/**
+ * Release the inverse lookup table.
+ * The global pointer is reset so that a repeated call, or a call before any
+ * build, is a harmless free(NULL) instead of a double free.
+ */
+void CacheTableFree(){
+    free(g_InverseTable);
+    g_InverseTable = NULL;
+}
--- a/deps/SZ/sz/src/CompressElement.c
+++ b/deps/SZ/sz/src/CompressElement.c
@ -0,0 +1,255 @@
+/**
+ *  @file CompressElement.c
+ *  @author Sheng Di
+ *  @date May, 2016
+ *  @brief Functions of CompressElement
+ *  (C) 2015 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wchar-subscripts"
+
+#include <stdlib.h> 
+#include <stdio.h>
+#include <math.h>
+#include <sz.h>
+#include <CompressElement.h>
+
+/**
+ * Decode a Huffman-compressed, delta-encoded group-ID array.
+ *
+ * The stream is decoded with a freshly created default Huffman tree. The
+ * first decoded value is stored as (value - GROUP_COUNT); every following
+ * value is a difference to its predecessor, biased by 2*(GROUP_COUNT+2).
+ *
+ * dataLength == 0 no longer touches element 0 of the zero-sized buffers, and
+ * allocation failures return NULL instead of being dereferenced.
+ *
+ * @param bytes      compressed input
+ * @param dataLength number of group IDs to decode
+ * @return malloc'ed array of dataLength IDs owned by the caller (NULL on
+ *         allocation failure; for dataLength == 0 whatever malloc(0) returns)
+ */
+char* decompressGroupIDArray(unsigned char* bytes, size_t dataLength)
+{
+	HuffmanTree* huffmanTree = SZ_Reset(); //create a default huffman tree
+	int* standGroupID = (int*)malloc(dataLength*sizeof(int));
+	if(standGroupID == NULL && dataLength > 0)
+	{
+		SZ_ReleaseHuffman(huffmanTree);
+		return NULL;
+	}
+	decode_withTree(huffmanTree, bytes, dataLength, standGroupID);
+	SZ_ReleaseHuffman(huffmanTree);
+	
+	char* groupID = (char*)malloc(dataLength*sizeof(char));
+	if(groupID == NULL || dataLength == 0)
+	{
+		free(standGroupID);
+		return groupID;
+	}
+	
+	size_t i = 0;
+	int lastGroupIDValue = 0, curGroupIDValue = 0;
+	int offset = 2*(GROUP_COUNT + 2);
+	
+	//the first element is stored directly, the rest as biased differences
+	curGroupIDValue = groupID[0] = standGroupID[0] - GROUP_COUNT;
+	lastGroupIDValue = curGroupIDValue;
+	for(i=1;i<dataLength;i++)
+	{
+		curGroupIDValue = standGroupID[i] + lastGroupIDValue - offset;
+		lastGroupIDValue = curGroupIDValue;
+		groupID[i] = curGroupIDValue;
+	}
+	free(standGroupID);
+	
+	return groupID;
+}
+
+/**
+ * Group number of a float: its exponent as reported by getExponent_float(),
+ * with every negative exponent collapsed to -1.
+ */
+inline short computeGroupNum_float(float value)
+{
+	short exponent = getExponent_float(value);
+	return (exponent < 0) ? -1 : exponent;
+}
+
+/**
+ * Group number of a double: its exponent as reported by getExponent_double(),
+ * with every negative exponent collapsed to -1.
+ */
+inline short computeGroupNum_double(double value)
+{
+	short exponent = getExponent_double(value);
+	return (exponent < 0) ? -1 : exponent;
+}
+
+/**
+ * Push a value onto the front of a three-slot history buffer.
+ * The two newest entries move one slot towards the back; the oldest is dropped.
+ * @param  last3CmprsData three-element buffer, newest value at index 0
+ * @param  value the value stored at index 0
+ * */
+inline void listAdd_double(double last3CmprsData[3], double value)
+{
+	int slot;
+	for(slot = 2; slot > 0; slot--)
+		last3CmprsData[slot] = last3CmprsData[slot-1];
+	last3CmprsData[0] = value;
+}
+
+/* Same as listAdd_double, for a three-element float history (newest at index 0). */
+inline void listAdd_float(float last3CmprsData[3], float value)
+{
+	int slot;
+	for(slot = 2; slot > 0; slot--)
+		last3CmprsData[slot] = last3CmprsData[slot-1];
+	last3CmprsData[0] = value;
+}
+
+/* Shift a three-element int64_t history one slot back and store value at index 0. */
+inline void listAdd_int(int64_t last3CmprsData[3], int64_t value)
+{
+	int64_t newest = last3CmprsData[0];
+	int64_t middle = last3CmprsData[1];
+	last3CmprsData[0] = value;
+	last3CmprsData[1] = newest;
+	last3CmprsData[2] = middle;
+}
+
+/* Shift a three-element int32_t history one slot back and store value at index 0. */
+inline void listAdd_int32(int32_t last3CmprsData[3], int32_t value)
+{
+	int32_t newest = last3CmprsData[0];
+	int32_t middle = last3CmprsData[1];
+	last3CmprsData[0] = value;
+	last3CmprsData[1] = newest;
+	last3CmprsData[2] = middle;
+}
+
+/**
+ * Record the latest decompressed value of a group and report the signed group ID.
+ *
+ * @param groups     per-group latest decompressed value
+ * @param flags      per-group flag; a zero flag becomes 1 (slot 0 is always set to 1
+ *                   when groupNum is negative)
+ * @param groupNum   group number; negative numbers share slot 0
+ * @param oriValue   original value, only its sign is used
+ * @param decValue   decompressed value to store
+ * @param curGroupID (output) groupNum+2 for oriValue >= 0, otherwise -(groupNum+2)
+ */
+inline void listAdd_float_group(float *groups, int *flags, char groupNum, float oriValue, float decValue, char* curGroupID)
+{
+	int mappedID = groupNum+2; //[-1,0,1,...,16] is mapped to [1,2,...,18]
+
+	if(groupNum<0)
+	{
+		groups[0] = decValue;
+		flags[0] = 1;
+	}
+	else
+	{
+		if(flags[groupNum]==0)
+			flags[groupNum] = 1;
+		groups[groupNum] = decValue;
+	}
+
+	//the sign of the original value becomes the sign of the ID
+	*curGroupID = (oriValue>=0) ? mappedID : -mappedID;
+}
+
+/**
+ * Record the latest decompressed value of a group and report the signed group ID.
+ *
+ * @param groups     per-group latest decompressed value
+ * @param flags      per-group flag; a zero flag becomes 1 (slot 0 is always set to 1
+ *                   when groupNum is negative)
+ * @param groupNum   group number; negative numbers share slot 0
+ * @param oriValue   original value, only its sign is used
+ * @param decValue   decompressed value to store
+ * @param curGroupID (output) groupNum+2 for oriValue >= 0, otherwise -(groupNum+2)
+ */
+inline void listAdd_double_group(double *groups, int *flags, char groupNum, double oriValue, double decValue, char* curGroupID)
+{
+	int mappedID = groupNum+2; //[-1,0,1,...,16] is mapped to [1,2,...,18]
+
+	if(groupNum<0)
+	{
+		groups[0] = decValue;
+		flags[0] = 1;
+	}
+	else
+	{
+		if(flags[groupNum]==0)
+			flags[groupNum] = 1;
+		groups[groupNum] = decValue;
+	}
+
+	//the sign of the original value becomes the sign of the ID
+	*curGroupID = (oriValue>=0) ? mappedID : -mappedID;
+}
+
+/**
+ * Check a prediction error against the allowed precision.
+ * @return 1 when minErr <= precision, 0 otherwise (including NaN operands)
+ * */
+inline int validPrediction_double(double minErr, double precision)
+{
+	return (minErr<=precision) ? 1 : 0;
+}
+
+/* Float variant: 1 when minErr <= precision, 0 otherwise (including NaN operands). */
+inline int validPrediction_float(float minErr, float precision)
+{
+	return (minErr<=precision) ? 1 : 0;
+}
+
+/**
+ * Compute one error bound per group.
+ *
+ * For group g the point-wise bound is 2^g * pwrErrBound. The *_AND_PW_REL
+ * modes take the smaller of that bound and realPrecision, the *_OR_PW_REL
+ * modes the larger, and PW_REL the point-wise bound itself. For any other
+ * mode the entries are left unset.
+ *
+ * @return malloc'ed array of GROUP_COUNT doubles owned by the caller
+ */
+double* generateGroupErrBounds(int errorBoundMode, double realPrecision, double pwrErrBound)
+{
+	double* bounds = (double*)malloc(GROUP_COUNT*sizeof(double));
+	int g;
+	for(g=0;g<GROUP_COUNT;g++)
+	{
+		double scaled = ((double)pow(2, g))*pwrErrBound;
+		if(errorBoundMode==ABS_AND_PW_REL || errorBoundMode==REL_AND_PW_REL)
+		{
+			//both bounds must hold: keep the tighter one
+			bounds[g] = (scaled<realPrecision) ? scaled : realPrecision;
+		}
+		else if(errorBoundMode==ABS_OR_PW_REL || errorBoundMode==REL_OR_PW_REL)
+		{
+			//either bound suffices: keep the looser one
+			bounds[g] = (scaled<realPrecision) ? realPrecision : scaled;
+		}
+		else if(errorBoundMode==PW_REL)
+		{
+			bounds[g] = scaled;
+		}
+	}
+	return bounds;
+}
+
+/**
+ * Largest rounded value of 2^g / groupErrBounds[g] over all GROUP_COUNT groups.
+ * The result is never below 0 because the running maximum starts at 0.
+ */
+int generateGroupMaxIntervalCount(double* groupErrBounds)
+{
+	int largest = 0;
+	int g;
+	for(g=0;g<GROUP_COUNT;g++)
+	{
+		int intervals = (int)(pow(2, g)/groupErrBounds[g] + 0.5);
+		if(intervals>largest)
+			largest = intervals;
+	}
+	return largest;
+}
+
+/**
+ * Fill a LossyCompressionElement from explicit field values.
+ *
+ * @param lce                element to fill
+ * @param leadingNum         number of leading bytes (0,1,2,or 3)
+ * @param intMidBytes        source of the mid bytes
+ * @param intMidBytes_Length number of bytes copied from intMidBytes
+ * @param resiMidBitsLength  number of residual bits
+ * @param resiBits           residual bits
+ */
+void new_LossyCompressionElement(LossyCompressionElement *lce, int leadingNum, unsigned char* intMidBytes, 
+int intMidBytes_Length, int resiMidBitsLength, int resiBits)
+{
+	//scalar fields first, then the byte payload
+	lce->leadingZeroBytes = leadingNum;
+	lce->integerMidBytes_Length = intMidBytes_Length; //they are mid_bits actually
+	lce->resMidBitsLength = resiMidBitsLength;
+	lce->residualMidBits = resiBits;
+	memcpy(lce->integerMidBytes,intMidBytes,intMidBytes_Length);
+}
+
+/**
+ * Fill lce from the bytes of the current double value.
+ *
+ * Leading bytes shared with the previous value are only counted; the
+ * remaining bytes up to reqBytesLength are copied as mid bytes. When
+ * resiBitsLength is non-zero and reqBytesLength < 8, the top resiBitsLength
+ * bits of the byte at reqBytesLength are stored as residual bits.
+ *
+ * @param curBytes       bytes of the current value
+ * @param preBytes       bytes of the previous value
+ * @param reqBytesLength number of whole bytes required
+ * @param resiBitsLength number of residual bits after the whole bytes
+ * @param lce            element to fill
+ */
+void updateLossyCompElement_Double(unsigned char* curBytes, unsigned char* preBytes, 
+		int reqBytesLength, int resiBitsLength,  LossyCompressionElement *lce)
+{
+	int sharedLeading = compIdenticalLeadingBytesCount_double(preBytes, curBytes);
+	int midLength = 0;
+	int residual = 0;
+
+	if(sharedLeading < reqBytesLength)
+	{
+		midLength = reqBytesLength - sharedLeading;
+		memcpy(lce->integerMidBytes, curBytes + sharedLeading, midLength);
+	}
+	if(resiBitsLength!=0 && reqBytesLength < 8)
+		residual = (curBytes[reqBytesLength] & 0xFF) >> (8-resiBitsLength);
+
+	lce->leadingZeroBytes = sharedLeading;
+	lce->integerMidBytes_Length = midLength;
+	lce->resMidBitsLength = resiBitsLength;
+	lce->residualMidBits = residual;
+}
+
+/**
+ * Fill lce from the bytes of the current float value.
+ *
+ * Leading bytes shared with the previous value are only counted; the
+ * remaining bytes up to reqBytesLength are copied as mid bytes. When
+ * resiBitsLength is non-zero and reqBytesLength < 8, the top resiBitsLength
+ * bits of the byte at reqBytesLength are stored as residual bits.
+ * NOTE(review): the bound 8 is the same as in the double variant; if curBytes
+ * holds only the 4 bytes of a float, indices 4..7 would be out of range -- confirm.
+ *
+ * @param curBytes       bytes of the current value
+ * @param preBytes       bytes of the previous value
+ * @param reqBytesLength number of whole bytes required
+ * @param resiBitsLength number of residual bits after the whole bytes
+ * @param lce            element to fill
+ */
+inline void updateLossyCompElement_Float(unsigned char* curBytes, unsigned char* preBytes, 
+		int reqBytesLength, int resiBitsLength,  LossyCompressionElement *lce)
+{
+	int sharedLeading = compIdenticalLeadingBytesCount_float(preBytes, curBytes);
+	int midLength = 0;
+	int residual = 0;
+
+	if(sharedLeading < reqBytesLength)
+	{
+		midLength = reqBytesLength - sharedLeading;
+		memcpy(lce->integerMidBytes, curBytes + sharedLeading, midLength);
+	}
+	if(resiBitsLength!=0 && reqBytesLength < 8)
+		residual = (curBytes[reqBytesLength] & 0xFF) >> (8-resiBitsLength);
+
+	lce->leadingZeroBytes = sharedLeading;
+	lce->integerMidBytes_Length = midLength;
+	lce->resMidBitsLength = resiBitsLength;
+	lce->residualMidBits = residual;
+}
+
+#pragma GCC diagnostic pop
--- a/deps/SZ/sz/src/DynamicByteArray.c
+++ b/deps/SZ/sz/src/DynamicByteArray.c
@ -0,0 +1,68 @@
+/**
+ *  @file DynamicByteArray.c
+ *  @author Sheng Di
+ *  @date May, 2016
+ *  @brief Dynamic Byte Array
+ *  (C) 2015 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdlib.h> 
+#include <stdio.h>
+#include <string.h>
+#include "DynamicByteArray.h"
+
+/**
+ * Allocate an empty dynamic byte array.
+ *
+ * @param dba (output) receives the new array, or NULL when an allocation fails
+ * @param cap initial capacity in bytes
+ */
+void new_DBA(DynamicByteArray **dba, size_t cap) {
+	DynamicByteArray *fresh = (DynamicByteArray *)malloc(sizeof(DynamicByteArray));
+	if(fresh == NULL)
+	{
+		*dba = NULL;
+		return;
+	}
+	fresh->size = 0;
+	fresh->capacity = cap;
+	fresh->array = (unsigned char*)malloc(sizeof(unsigned char)*cap);
+	//malloc(0) may legitimately return NULL, so only cap > 0 counts as a failure
+	if(fresh->array == NULL && cap > 0)
+	{
+		free(fresh);
+		fresh = NULL;
+	}
+	*dba = fresh;
+}
+
+/**
+ * Copy the contents of a dynamic byte array into a freshly malloc'ed buffer.
+ *
+ * @param dba   source array
+ * @param bytes (output) receives the copy owned by the caller; NULL when the
+ *              array is empty or the allocation fails. memcpy is no longer
+ *              called with a NULL destination in those cases.
+ */
+void convertDBAtoBytes(DynamicByteArray *dba, unsigned char** bytes)
+{
+	size_t size = dba->size;
+	*bytes = NULL;
+	if(size == 0)
+		return;
+	*bytes = (unsigned char*)malloc(size * sizeof(unsigned char));
+	if(*bytes != NULL)
+		memcpy(*bytes, dba->array, size*sizeof(unsigned char));
+}
+
+/**
+ * Release a dynamic byte array and its storage.
+ * Like free(), a NULL handle is accepted and ignored.
+ */
+void free_DBA(DynamicByteArray *dba)
+{
+	if(dba == NULL)
+		return;
+	free(dba->array);
+	free(dba);
+}
+
+/**
+ * Read the byte at position pos.
+ * An out-of-range position is treated as a fatal internal error: a message is
+ * printed and the process terminates with a failure status (it used to exit
+ * with status 0, which reports success).
+ */
+inline unsigned char getDBA_Data(DynamicByteArray *dba, size_t pos)
+{
+	if(pos>=dba->size)
+	{
+		printf("Error: wrong position of DBA (impossible case unless bugs elsewhere in the code?).\n");
+		exit(EXIT_FAILURE);
+	}
+	return dba->array[pos];
+}
+
+/**
+ * Append one byte, doubling the capacity when the array is full.
+ *
+ * A capacity of 0 now grows to 1 (doubling 0 stayed 0 and the store below
+ * wrote out of bounds). A failed realloc is reported and terminates the
+ * process instead of overwriting the array pointer with NULL.
+ */
+inline void addDBA_Data(DynamicByteArray *dba, unsigned char value)
+{
+	if(dba->size==dba->capacity)
+	{
+		size_t grown = dba->capacity==0 ? 1 : dba->capacity << 1;
+		unsigned char *bigger = (unsigned char *)realloc(dba->array, grown*sizeof(unsigned char));
+		if(bigger == NULL)
+		{
+			printf("Error: out of memory while growing DBA.\n");
+			exit(EXIT_FAILURE);
+		}
+		dba->array = bigger;
+		dba->capacity = grown;
+	}
+	dba->array[dba->size] = value;
+	dba->size ++;
+}
+
+/**
+ * Append length bytes from data, growing the array to exactly the needed size
+ * when it does not fit.
+ *
+ * A zero-length append is a no-op. A failed realloc is reported and
+ * terminates the process instead of overwriting the array pointer with NULL.
+ */
+inline void memcpyDBA_Data(DynamicByteArray *dba, unsigned char* data, size_t length)
+{
+	if(length == 0)
+		return;
+	if(dba->size + length > dba->capacity)
+	{
+		size_t needed = dba->size + length;
+		unsigned char *bigger = (unsigned char *)realloc(dba->array, needed*sizeof(unsigned char));
+		if(bigger == NULL)
+		{
+			printf("Error: out of memory while growing DBA.\n");
+			exit(EXIT_FAILURE);
+		}
+		dba->array = bigger;
+		dba->capacity = needed;
+	}
+	memcpy(&(dba->array[dba->size]), data, length);
+	dba->size += length;
+}
--- a/deps/SZ/sz/src/DynamicDoubleArray.c
+++ b/deps/SZ/sz/src/DynamicDoubleArray.c
@ -0,0 +1,57 @@
+/**
+ *  @file DynamicDoubleArray.c
+ *  @author Sheng Di
+ *  @date May, 2016
+ *  @brief Dynamic Double Array
+ *  (C) 2015 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdlib.h> 
+#include <stdio.h>
+#include <string.h>
+#include "DynamicDoubleArray.h"
+
+/**
+ * Allocate an empty dynamic double array.
+ *
+ * @param dda (output) receives the new array, or NULL when an allocation fails
+ * @param cap initial capacity in elements
+ */
+void new_DDA(DynamicDoubleArray **dda, size_t cap) {
+	DynamicDoubleArray *fresh = (DynamicDoubleArray *)malloc(sizeof(DynamicDoubleArray));
+	if(fresh == NULL)
+	{
+		*dda = NULL;
+		return;
+	}
+	fresh->size = 0;
+	fresh->capacity = cap;
+	fresh->array = (double*)malloc(sizeof(double)*cap);
+	//malloc(0) may legitimately return NULL, so only cap > 0 counts as a failure
+	if(fresh->array == NULL && cap > 0)
+	{
+		free(fresh);
+		fresh = NULL;
+	}
+	*dda = fresh;
+}
+
+/**
+ * Copy the contents of a dynamic double array into a freshly malloc'ed buffer.
+ *
+ * @param dba  source array
+ * @param data (output) receives the copy owned by the caller; NULL when the
+ *             array is empty or the allocation fails. memcpy is no longer
+ *             called with a NULL destination in those cases.
+ */
+void convertDDAtoDoubles(DynamicDoubleArray *dba, double **data)
+{
+	size_t size = dba->size;
+	*data = NULL;
+	if(size == 0)
+		return;
+	*data = (double*)malloc(size * sizeof(double));
+	if(*data != NULL)
+		memcpy(*data, dba->array, size*sizeof(double));
+}
+
+/**
+ * Release a dynamic double array and its storage.
+ * Like free(), a NULL handle is accepted and ignored.
+ */
+void free_DDA(DynamicDoubleArray *dda)
+{
+	if(dda == NULL)
+		return;
+	free(dda->array);
+	free(dda);
+}
+
+/**
+ * Read the element at position pos.
+ * An out-of-range position is treated as a fatal internal error: a message is
+ * printed and the process terminates with a failure status. The message now
+ * names the DDA (it was copied from the int-array variant and said "DIA"),
+ * and the exit status is no longer 0.
+ */
+double getDDA_Data(DynamicDoubleArray *dda, size_t pos)
+{
+	if(pos>=dda->size)
+	{
+		printf("Error: wrong position of DDA.\n");
+		exit(EXIT_FAILURE);
+	}
+	return dda->array[pos];
+}
+
+/**
+ * Append one double, doubling the capacity when the array is full.
+ *
+ * A capacity of 0 now grows to 1 (doubling 0 stayed 0 and the store below
+ * wrote out of bounds). A failed realloc is reported and terminates the
+ * process instead of overwriting the array pointer with NULL.
+ */
+void addDDA_Data(DynamicDoubleArray *dda, double value)
+{
+	if(dda->size==dda->capacity)
+	{
+		size_t grown = dda->capacity==0 ? 1 : dda->capacity*2;
+		double *bigger = (double *)realloc(dda->array, grown*sizeof(double));
+		if(bigger == NULL)
+		{
+			printf("Error: out of memory while growing DDA.\n");
+			exit(EXIT_FAILURE);
+		}
+		dda->array = bigger;
+		dda->capacity = grown;
+	}
+	dda->array[dda->size] = value;
+	dda->size ++;
+}
--- a/deps/SZ/sz/src/DynamicFloatArray.c
+++ b/deps/SZ/sz/src/DynamicFloatArray.c
@ -0,0 +1,57 @@
+/**
+ *  @file DynamicFloatArray.c
+ *  @author Sheng Di
+ *  @date May, 2016
+ *  @brief Dynamic Float Array
+ *  (C) 2015 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdlib.h> 
+#include <stdio.h>
+#include <string.h>
+#include "DynamicFloatArray.h"
+
+/**
+ * Allocate an empty dynamic float array.
+ *
+ * @param dfa (output) receives the new array, or NULL when an allocation fails
+ * @param cap initial capacity in elements
+ */
+void new_DFA(DynamicFloatArray **dfa, size_t cap) {
+	DynamicFloatArray *fresh = (DynamicFloatArray *)malloc(sizeof(DynamicFloatArray));
+	if(fresh == NULL)
+	{
+		*dfa = NULL;
+		return;
+	}
+	fresh->size = 0;
+	fresh->capacity = cap;
+	fresh->array = (float*)malloc(sizeof(float)*cap);
+	//malloc(0) may legitimately return NULL, so only cap > 0 counts as a failure
+	if(fresh->array == NULL && cap > 0)
+	{
+		free(fresh);
+		fresh = NULL;
+	}
+	*dfa = fresh;
+}
+
+/**
+ * Copy the contents of a dynamic float array into a freshly malloc'ed buffer.
+ *
+ * @param dfa  source array
+ * @param data (output) receives the copy owned by the caller; NULL when the
+ *             array is empty or the allocation fails. memcpy is no longer
+ *             called with a NULL destination in those cases.
+ */
+void convertDFAtoFloats(DynamicFloatArray *dfa, float **data)
+{
+	size_t size = dfa->size;
+	*data = NULL;
+	if(size == 0)
+		return;
+	*data = (float*)malloc(size * sizeof(float));
+	if(*data != NULL)
+		memcpy(*data, dfa->array, size*sizeof(float));
+}
+
+/**
+ * Release a dynamic float array and its storage.
+ * Like free(), a NULL handle is accepted and ignored.
+ */
+void free_DFA(DynamicFloatArray *dfa)
+{
+	if(dfa == NULL)
+		return;
+	free(dfa->array);
+	free(dfa);
+}
+
+/**
+ * Read the element at position pos.
+ * An out-of-range position is treated as a fatal internal error: a message is
+ * printed and the process terminates with a failure status. The message now
+ * names the DFA (it was copied from the int-array variant and said "DIA"),
+ * and the exit status is no longer 0.
+ */
+float getDFA_Data(DynamicFloatArray *dfa, size_t pos)
+{
+	if(pos>=dfa->size)
+	{
+		printf("Error: wrong position of DFA.\n");
+		exit(EXIT_FAILURE);
+	}
+	return dfa->array[pos];
+}
+
+/**
+ * Append one float, doubling the capacity when the array is full.
+ *
+ * A capacity of 0 now grows to 1 (doubling 0 stayed 0 and the store below
+ * wrote out of bounds). A failed realloc is reported and terminates the
+ * process instead of overwriting the array pointer with NULL.
+ */
+void addDFA_Data(DynamicFloatArray *dfa, float value)
+{
+	if(dfa->size==dfa->capacity)
+	{
+		size_t grown = dfa->capacity==0 ? 1 : dfa->capacity*2;
+		float *bigger = (float *)realloc(dfa->array, grown*sizeof(float));
+		if(bigger == NULL)
+		{
+			printf("Error: out of memory while growing DFA.\n");
+			exit(EXIT_FAILURE);
+		}
+		dfa->array = bigger;
+		dfa->capacity = grown;
+	}
+	dfa->array[dfa->size] = value;
+	dfa->size++;
+}
--- a/deps/SZ/sz/src/DynamicIntArray.c
+++ b/deps/SZ/sz/src/DynamicIntArray.c
@ -0,0 +1,57 @@
+/**
+ *  @file DynamicIntArray.c
+ *  @author Sheng Di
+ *  @date May, 2016
+ *  @brief Dynamic Int Array
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdlib.h> 
+#include <stdio.h>
+#include <string.h>
+#include "DynamicIntArray.h"
+
+/**
+ * Allocate an empty dynamic int array. Elements are stored as unsigned char.
+ *
+ * @param dia (output) receives the new array, or NULL when an allocation fails
+ * @param cap initial capacity in elements
+ */
+void new_DIA(DynamicIntArray **dia, size_t cap) {
+	DynamicIntArray *fresh = (DynamicIntArray *)malloc(sizeof(DynamicIntArray));
+	if(fresh == NULL)
+	{
+		*dia = NULL;
+		return;
+	}
+	fresh->size = 0;
+	fresh->capacity = cap;
+	fresh->array = (unsigned char*)malloc(sizeof(unsigned char)*cap);
+	//malloc(0) may legitimately return NULL, so only cap > 0 counts as a failure
+	if(fresh->array == NULL && cap > 0)
+	{
+		free(fresh);
+		fresh = NULL;
+	}
+	*dia = fresh;
+}
+
+/**
+ * Copy the contents of a dynamic int array into a freshly malloc'ed buffer.
+ * Elements are stored, and copied, as unsigned char.
+ *
+ * @param dia  source array
+ * @param data (output) receives the copy owned by the caller; NULL when the
+ *             array is empty or the allocation fails. memcpy is no longer
+ *             called with a NULL destination in those cases.
+ */
+void convertDIAtoInts(DynamicIntArray *dia, unsigned char **data)
+{
+	size_t size = dia->size;
+	*data = NULL;
+	if(size == 0)
+		return;
+	*data = (unsigned char*)malloc(size * sizeof(unsigned char));
+	if(*data != NULL)
+		memcpy(*data, dia->array, size*sizeof(unsigned char));
+}
+
+/**
+ * Release a dynamic int array and its storage.
+ * Like free(), a NULL handle is accepted and ignored.
+ */
+void free_DIA(DynamicIntArray *dia)
+{
+	if(dia == NULL)
+		return;
+	free(dia->array);
+	free(dia);
+}
+
+/**
+ * Read the element at position pos (stored as unsigned char, returned as int).
+ * An out-of-range position is treated as a fatal internal error: a message is
+ * printed and the process terminates with a failure status (it used to exit
+ * with status 0, which reports success).
+ */
+int getDIA_Data(DynamicIntArray *dia, size_t pos)
+{
+	if(pos>=dia->size)
+	{
+		printf("Error: wrong position of DIA.\n");
+		exit(EXIT_FAILURE);
+	}
+	return dia->array[pos];
+}
+
+/**
+ * Append one value, truncated to unsigned char, doubling the capacity when
+ * the array is full.
+ *
+ * A capacity of 0 now grows to 1 (doubling 0 stayed 0 and the store below
+ * wrote out of bounds). A failed realloc is reported and terminates the
+ * process instead of overwriting the array pointer with NULL.
+ */
+inline void addDIA_Data(DynamicIntArray *dia, int value)
+{
+	if(dia->size==dia->capacity)
+	{
+		size_t grown = dia->capacity==0 ? 1 : dia->capacity << 1;
+		unsigned char *bigger = (unsigned char *)realloc(dia->array, grown*sizeof(unsigned char));
+		if(bigger == NULL)
+		{
+			printf("Error: out of memory while growing DIA.\n");
+			exit(EXIT_FAILURE);
+		}
+		dia->array = bigger;
+		dia->capacity = grown;
+	}
+	dia->array[dia->size] = (unsigned char)value;
+	dia->size ++;
+}
--- a/deps/SZ/sz/src/Huffman.c
+++ b/deps/SZ/sz/src/Huffman.c
@ -0,0 +1,932 @@
+/**
+ *  @file Huffman.c
+ *  @author Sheng Di
+ *  @date Aug., 2016
+ *  @brief Customized Huffman Encoding, Compression and Decompression functions
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "Huffman.h"
+#include "sz.h"
+
+
+/**
+ * Allocate an empty Huffman tree for stateNum distinct states.
+ *
+ * All buffers are zero-initialised: the node pool and the queue storage hold
+ * 4*stateNum entries each, the code and code-length tables stateNum entries.
+ * qq points one element before qqq so that the priority queue can be indexed
+ * from 1.
+ *
+ * @param stateNum number of states
+ * @return the new tree, or NULL when an allocation fails (the allocations
+ *         used to be passed to memset unchecked)
+ */
+HuffmanTree* createHuffmanTree(int stateNum)
+{			
+	HuffmanTree *huffmanTree = (HuffmanTree*)calloc(1, sizeof(HuffmanTree));
+	if(huffmanTree == NULL)
+		return NULL;
+	huffmanTree->stateNum = stateNum;
+	huffmanTree->allNodes = 2*stateNum;
+	
+	huffmanTree->pool = (struct node_t*)calloc(huffmanTree->allNodes*2, sizeof(struct node_t));
+	huffmanTree->qqq = (node*)calloc(huffmanTree->allNodes*2, sizeof(node));
+	huffmanTree->code = (unsigned long**)calloc(huffmanTree->stateNum, sizeof(unsigned long*));
+	huffmanTree->cout = (unsigned char *)calloc(huffmanTree->stateNum, sizeof(unsigned char));
+	
+	//a zero-sized request may return NULL, so only stateNum > 0 counts as a failure
+	if(stateNum > 0 && (huffmanTree->pool == NULL || huffmanTree->qqq == NULL
+		|| huffmanTree->code == NULL || huffmanTree->cout == NULL))
+	{
+		free(huffmanTree->pool);
+		free(huffmanTree->qqq);
+		free(huffmanTree->code);
+		free(huffmanTree->cout);
+		free(huffmanTree);
+		return NULL;
+	}
+	
+	huffmanTree->qq = huffmanTree->qqq - 1;
+	huffmanTree->n_nodes = 0;
+	huffmanTree->n_inode = 0;
+	huffmanTree->qend = 1;
+	
+	return huffmanTree;
+}
+
+/**
+ * Create a Huffman tree sized for the default range radius of 32768,
+ * i.e. 65536 states.
+ */
+HuffmanTree* createDefaultHuffmanTree()
+{
+	const int defaultRadius = 32768;
+	return createHuffmanTree(defaultRadius << 1); //two states per unit of radius
+}
+ 
+/**
+ * Take the next node from the tree's pool and initialise it.
+ *
+ * With freq != 0 the node becomes a leaf (t = 1) for state c; a and b are
+ * ignored. With freq == 0 it becomes an internal node (t = 0) with children
+ * a and b and the sum of their frequencies; c is ignored.
+ */
+node new_node(HuffmanTree* huffmanTree, size_t freq, unsigned int c, node a, node b)
+{
+	node fresh = &huffmanTree->pool[huffmanTree->n_nodes];
+	huffmanTree->n_nodes++;
+
+	if (freq == 0)
+	{
+		//internal node
+		fresh->left = a;
+		fresh->right = b;
+		fresh->freq = a->freq + b->freq;
+		fresh->t = 0;
+		return fresh;
+	}
+
+	//leaf
+	fresh->c = c;
+	fresh->freq = freq;
+	fresh->t = 1;
+	return fresh;
+}
+ 
+/**
+ * Take the next node from the tree's pool and set only its state c and its
+ * type flag t; the other fields are left as they are.
+ */
+node new_node2(HuffmanTree *huffmanTree, unsigned int c, unsigned char t)
+{
+	node fresh = huffmanTree->pool + huffmanTree->n_nodes;
+	fresh->c = c;
+	fresh->t = t;
+	huffmanTree->n_nodes++;
+	return fresh;
+} 
+ 
+/* priority queue */
+/**
+ * Insert a node into the 1-based min-heap ordered by frequency.
+ * The node starts in the new last slot and parents with a larger frequency
+ * are moved down until its place is found.
+ */
+void qinsert(HuffmanTree *huffmanTree, node n)
+{
+	int slot = huffmanTree->qend++;
+	int parent;
+	for (parent = slot>>1; parent != 0; parent = slot>>1)
+	{
+		if (huffmanTree->qq[parent]->freq <= n->freq)
+			break;
+		huffmanTree->qq[slot] = huffmanTree->qq[parent];
+		slot = parent;
+	}
+	huffmanTree->qq[slot] = n;
+}
+ 
+/**
+ * Remove and return the node with the smallest frequency from the 1-based
+ * min-heap, or 0 when the heap is empty (qend < 2).
+ * The last element is moved to the root and sifted down.
+ */
+node qremove(HuffmanTree* huffmanTree)
+{
+	node *heap = huffmanTree->qq;
+	node smallest = heap[1];
+	int parent = 1;
+	int child;
+
+	if (huffmanTree->qend < 2) return 0;
+	huffmanTree->qend --;
+	heap[parent] = heap[huffmanTree->qend];
+
+	for (child = parent<<1; child < huffmanTree->qend; child = parent<<1)
+	{
+		//pick the smaller of the two children
+		if (child + 1 < huffmanTree->qend && heap[child + 1]->freq < heap[child]->freq)
+			child++;
+		if (heap[parent]->freq <= heap[child]->freq)
+			break;
+
+		node swapped = heap[parent];
+		heap[parent] = heap[child];
+		heap[child] = swapped;
+		parent = child;
+	}
+
+	return smallest;
+}
+ 
+/**
+ * Walk the tree and assign a bit code to every leaf.
+ *
+ * Going left appends a 0 bit, going right a 1 bit. The first 64 bits of a
+ * code are collected in out1, further bits in out2. At a leaf the code is
+ * stored left-aligned in a freshly malloc'ed pair code[c][0..1] and its
+ * length in cout[c].
+ * The shifts assume a 64-bit unsigned long.
+ *
+ * @param huffmanTree tree whose code/cout tables are filled
+ * @param n    current node (pass the root)
+ * @param len  number of bits collected so far (pass 0)
+ * @param out1 first 64 code bits collected so far (pass 0)
+ * @param out2 code bits beyond the first 64 (pass 0)
+ * */
+void build_code(HuffmanTree *huffmanTree, node n, int len, unsigned long out1, unsigned long out2)
+{
+	if (n->t) {
+		huffmanTree->code[n->c] = (unsigned long*)malloc(2*sizeof(unsigned long));
+		if(len==0)
+		{
+			//the root itself is a leaf: the code is empty. Left-aligning it
+			//would shift by the full width (64 - 0), which is undefined.
+			(huffmanTree->code[n->c])[0] = 0;
+			(huffmanTree->code[n->c])[1] = out2;
+		}
+		else if(len<=64)
+		{
+			(huffmanTree->code[n->c])[0] = out1 << (64 - len);
+			(huffmanTree->code[n->c])[1] = out2;
+		}
+		else
+		{
+			(huffmanTree->code[n->c])[0] = out1;
+			(huffmanTree->code[n->c])[1] = out2 << (128 - len);
+		}
+		huffmanTree->cout[n->c] = (unsigned char)len;
+		return;
+	}
+	int index = len >> 6; //=len/64
+	if(index == 0)
+	{
+		//still within the first 64 bits
+		out1 = out1 << 1;
+		out1 = out1 | 0;
+		build_code(huffmanTree, n->left, len + 1, out1, 0);
+		out1 = out1 | 1;
+		build_code(huffmanTree, n->right, len + 1, out1, 0);		
+	}
+	else
+	{
+		//bit 65 and later go to out2; at len == 64 out2 is still empty, so no shift
+		if(len%64!=0)
+			out2 = out2 << 1;
+		out2 = out2 | 0;
+		build_code(huffmanTree, n->left, len + 1, out1, out2);
+		out2 = out2 | 1;
+		build_code(huffmanTree, n->right, len + 1, out1, out2);	
+	}
+}
+
+/**
+ * Compute the frequency of the data and build the Huffman tree
+ * @param HuffmanTree* huffmanTree (output)
+ * @param int *s (input)
+ * @param size_t length (input)
+ * */
+/**
+ * Count the state frequencies in s, build the Huffman tree and assign the codes.
+ *
+ * Changes to the previous version:
+ *  - the two nodes merged in each step are removed in a defined order (the
+ *    first one removed becomes the left child). They used to be removed by two
+ *    calls inside one argument list, whose evaluation order is unspecified, so
+ *    the tree shape depended on the compiler;
+ *  - with no input (length == 0) the queue is empty and build_code is skipped
+ *    instead of being called on a NULL root;
+ *  - a failed allocation of the frequency table is reported instead of being
+ *    passed to memset.
+ *
+ * Every s[i] is used as an index and is expected to lie in [0, allNodes).
+ *
+ * @param huffmanTree tree to fill (output)
+ * @param s           states to encode (input)
+ * @param length      number of states in s (input)
+ */
+void init(HuffmanTree* huffmanTree, int *s, size_t length)
+{
+	size_t i, index;
+	size_t *freq = (size_t *)calloc(huffmanTree->allNodes, sizeof(size_t));
+	if(freq == NULL)
+	{
+		printf("Error: out of memory while building the Huffman tree.\n");
+		return;
+	}
+	for(i = 0;i < length;i++)
+	{
+		index = s[i];
+		freq[index]++;
+	}
+
+	//one leaf per state that occurs
+	for (i = 0; i < huffmanTree->allNodes; i++)
+		if (freq[i])
+			qinsert(huffmanTree, new_node(huffmanTree, freq[i], i, 0, 0));
+
+	//merge the two least frequent nodes until one root is left
+	while (huffmanTree->qend > 2)
+	{
+		node first = qremove(huffmanTree);
+		node second = qremove(huffmanTree);
+		qinsert(huffmanTree, new_node(huffmanTree, 0, 0, first, second));
+	}
+
+	if (huffmanTree->qend > 1)
+		build_code(huffmanTree, huffmanTree->qq[1], 0, 0, 0);
+	free(freq);
+}
+
+/**
+ * Build the Huffman tree from a frequency table that is all zeros.
+ *
+ * NOTE(review): s and length are not used and the frequency table is never
+ * filled, so no leaf is created. The previous version then called build_code
+ * on a NULL root; that call is now skipped when the queue is empty, so the
+ * function leaves the tree untouched. The intended source of the static
+ * frequencies is not visible here -- confirm.
+ *
+ * @param huffmanTree tree to fill
+ * @param s           unused
+ * @param length      unused
+ */
+void init_static(HuffmanTree* huffmanTree, int *s, size_t length)
+{
+	size_t i;
+	size_t *freq = (size_t *)calloc(huffmanTree->allNodes, sizeof(size_t));
+	(void)s;
+	(void)length;
+	if(freq == NULL)
+	{
+		printf("Error: out of memory while building the Huffman tree.\n");
+		return;
+	}
+
+	for (i = 0; i < huffmanTree->allNodes; i++)
+		if (freq[i])
+			qinsert(huffmanTree, new_node(huffmanTree, freq[i], i, 0, 0));
+
+	while (huffmanTree->qend > 2)
+	{
+		node first = qremove(huffmanTree);
+		node second = qremove(huffmanTree);
+		qinsert(huffmanTree, new_node(huffmanTree, 0, 0, first, second));
+	}
+
+	if (huffmanTree->qend > 1)
+		build_code(huffmanTree, huffmanTree->qq[1], 0, 0, 0);
+	free(freq);
+}
+ 
+/**
+ * Pack the Huffman codes of the states in s into the byte stream out.
+ *
+ * @param huffmanTree tree whose code/cout tables were filled by build_code
+ * @param s           states to encode
+ * @param length      number of states in s
+ * @param out         output buffer. Each code is written with
+ *                    longToBytes_bigEndian (presumably 8 bytes at a time), so the
+ *                    buffer presumably needs slack beyond the packed size -- confirm.
+ * @param outSize     incremented by the number of bytes used; it is not reset here
+ */
+void encode(HuffmanTree *huffmanTree, int *s, size_t length, unsigned char *out, size_t *outSize)
+{
+	size_t i = 0;
+	unsigned char bitSize = 0, byteSize, byteSizep;
+	int state;
+	unsigned char *p = out;
+	// Number of still-unused low bits in the byte at p (0 = p is at a byte boundary).
+	int lackBits = 0;
+	//long totalBitSize = 0, maxBitSize = 0, bitSize21 = 0, bitSize32 = 0;
+	for (i = 0;i<length;i++) 
+	{
+		state = s[i];
+		bitSize = huffmanTree->cout[state];	
+		
+		//printf("%d %d : %d %u\n",i, state, bitSize, (code[state])[0] >> (64-cout[state])); 
+		//debug: compute the average bitSize and the count that is over 32... 	
+		/*if(bitSize>=21)
+			bitSize21++;
+		if(bitSize>=32)
+			bitSize32++;
+		if(maxBitSize<bitSize)
+			maxBitSize = bitSize;
+		totalBitSize+=bitSize;*/
+
+		if(lackBits==0)
+		{
+			// Byte-aligned: write the left-aligned code directly at p.
+			byteSize = bitSize%8==0 ? bitSize/8 : bitSize/8+1; //it's equal to the number of bytes involved (for *outSize)
+			byteSizep = bitSize/8; //it's used to move the pointer p for next data
+			if(byteSize<=8)
+			{
+				longToBytes_bigEndian(p, (huffmanTree->code[state])[0]);
+				p += byteSizep;
+			}
+			else //byteSize>8
+			{
+				longToBytes_bigEndian(p, (huffmanTree->code[state])[0]);
+				p += 8;
+				longToBytes_bigEndian(p, (huffmanTree->code[state])[1]);
+				p += (byteSizep - 8);
+			}
+			*outSize += byteSize;
+			lackBits = bitSize%8==0 ? 0 : 8 - bitSize%8;
+		}
+		else
+		{
+			// Not aligned: first fill the free low bits of *p with the top bits of the code.
+			*p = (*p) | (unsigned char)((huffmanTree->code[state])[0] >> (64 - lackBits));
+			if(lackBits < bitSize)
+			{
+				// The code does not fit into the free bits: write the rest starting at the next byte.
+				p++;
+				//(*outSize)++;
+				long newCode = (huffmanTree->code[state])[0] << lackBits;
+				longToBytes_bigEndian(p, newCode);
+
+				if(bitSize<=64)
+				{
+					bitSize -= lackBits;
+					byteSize = bitSize%8==0 ? bitSize/8 : bitSize/8+1;
+					byteSizep = bitSize/8;
+					p += byteSizep;
+					(*outSize)+=byteSize;
+					lackBits = bitSize%8==0 ? 0 : 8 - bitSize%8;
+				}
+				else //bitSize > 64
+				{
+					byteSizep = 7; //must be 7 bytes, because lackBits!=0
+					p+=byteSizep;
+					// NOTE(review): byteSize is not recomputed on this path; it still holds the
+					// value from an earlier code -- confirm that this is the intended increment.
+					(*outSize)+=byteSize;
+
+					bitSize -= 64;
+					if(lackBits < bitSize)
+					{
+						// NOTE(review): this and the else branch below read code[0] although the first
+						// 64 bits were just written; code[1] may have been intended -- confirm.
+						*p = (*p) | (unsigned char)((huffmanTree->code[state])[0] >> (64 - lackBits));
+						p++;
+						//(*outSize)++;
+						newCode = (huffmanTree->code[state])[1] << lackBits;
+						longToBytes_bigEndian(p, newCode);
+						bitSize -= lackBits;
+						byteSize = bitSize%8==0 ? bitSize/8 : bitSize/8+1;
+						byteSizep = bitSize/8;
+						p += byteSizep;
+						(*outSize)+=byteSize;
+						lackBits = bitSize%8==0 ? 0 : 8 - bitSize%8;
+					}
+					else //lackBits >= bitSize
+					{
+						*p = (*p) | (unsigned char)((huffmanTree->code[state])[0] >> (64 - bitSize));
+						lackBits -= bitSize;
+					}
+				}
+			}
+			else //lackBits >= bitSize
+			{
+				// The whole code fitted into the free bits; advance only when the byte is now full.
+				lackBits -= bitSize;
+				if(lackBits==0)
+					p++;
+			}
+		}
+	}
+//	for(i=0;i<stateNum;i++)
+//		if(code[i]!=NULL) free(code[i]);
+	/*printf("max bitsize = %d\n", maxBitSize);
+	printf("bitSize21 ratio = %f\n", ((float)bitSize21)/length);
+	printf("bitSize32 ratio = %f\n", ((float)bitSize32)/length);
+	printf("avg bit size = %f\n", ((float)totalBitSize)/length);*/
+}
+ 
+/**
+ * Decode targetLength states from a Huffman bit stream by walking the tree.
+ *
+ * Bits are read most significant bit first; a 0 bit goes to the left child, a
+ * 1 bit to the right child. Reaching a leaf emits its state and restarts at
+ * the root. When the root itself is a leaf, every output element is that
+ * leaf's state and the stream is not read.
+ *
+ * @param s            encoded bit stream
+ * @param targetLength number of states to produce
+ * @param t            root of the Huffman tree
+ * @param out          (output) decoded states
+ */
+void decode(unsigned char *s, size_t targetLength, node t, int *out)
+{
+	size_t bitPos = 0, produced = 0;
+	node cur = t;
+	
+	if(t->t) //a leaf root means that all state values are the same (constant)
+	{
+		while(produced<targetLength)
+			out[produced++] = t->c;
+		return;
+	}
+	
+	while(produced<targetLength)
+	{
+		unsigned char curByte = s[bitPos>>3];
+		int shift = 7 - (int)(bitPos%8);
+		if(((curByte >> shift) & 0x01) == 0)
+			cur = cur->left;
+		else
+			cur = cur->right;
+
+		if (cur->t)
+		{
+			out[produced] = cur->c;
+			produced++;
+			cur = t;
+		}
+		bitPos++;
+	}
+	if (t != cur) printf("garbage input\n");
+}
+
+/**
+ * Table-driven Huffman decoder.
+ *
+ * A lookup table indexed by the next maxBits bits of the stream (maxBits is capped
+ * at 16) resolves every code of at most maxBits bits in one step; longer codes
+ * continue bit by bit from the tree node reached after maxBits bits.
+ *
+ * @param s            encoded bit stream; no length is passed, so reads are not bounds-checked here
+ * @param targetLength number of states to produce
+ * @param t            root of the Huffman tree
+ * @param out          (output) decoded states
+ * @param maxBits      width of the lookup index in bits
+ */
+void decode_MSST19(unsigned char *s, size_t targetLength, node t, int *out, int maxBits)
+{
+	size_t count = 0;
+	node n = t;
+
+	if(n->t) //root->t==1 means that all state values are the same (constant)
+	{
+		for(count=0;count<targetLength;count++)
+			out[count] = n->c;
+		return;
+	}
+
+	if(maxBits > 16){
+		maxBits = 16;
+	}
+
+    // valueTable[i]:  decoded state for bit pattern i, or -1 when the code is longer than maxBits
+    // lengthTable[i]: number of bits consumed for pattern i
+    // nodeTable[i]:   node reached after maxBits bits (only set for the -1 entries)
+    int tableSize = 1 << maxBits;
+    int* valueTable = (int*)malloc(tableSize * sizeof(int));
+    // NOTE(review): sized with sizeof(int) although the elements are uint8_t (over-allocation only).
+    uint8_t* lengthTable = (uint8_t*)malloc(tableSize * sizeof(int));
+    node* nodeTable = (node*)malloc(tableSize * sizeof(node));
+    uint32_t maskTable[maxBits+8];
+    int j;
+    for(uint32_t i=0; i<tableSize; i++){
+        // Walk the tree along the bits of i, most significant bit first.
+        n = t;
+        j = 0;
+        while(!n->t && j < maxBits){
+            uint32_t res = i >> (maxBits - j - 1);
+            if((res & 0x00000001) == 0){
+                n = n->left;
+            }else{
+                n = n->right;
+            }
+            j++;
+        }
+        if(!n->t){
+        	nodeTable[i] = n;
+        	valueTable[i] = -1;
+        	lengthTable[i] = maxBits;
+        }else{
+			valueTable[i] = n->c;
+			lengthTable[i] = j;
+        }
+    }
+    // maskTable[k] keeps the low (maxBits+8-k-1) bits; indexed below with
+    // k = maxBits+8-leftBits-1 it keeps exactly the leftBits unread bits.
+    for(int i=0; i<maxBits+8; i++){
+        maskTable[i] = (1 << (maxBits+8-i-1)) - 1;
+    }
+
+    // leftBits: number of unread bits held in the low end of currentValue; i: next byte of s to load.
+    int leftBits = 0;
+	uint32_t currentValue = 0;
+	size_t i = 0;
+
+    while(count<targetLength)
+	{
+	    // Refill until at least maxBits unread bits are available.
+	    while(leftBits < maxBits){
+	        currentValue = currentValue << 8;
+	        currentValue += s[i];
+	        leftBits += 8;
+	        i++;
+	    }
+
+        uint32_t index = currentValue >> (leftBits - maxBits);
+        int value = valueTable[index];
+        if(value != -1){
+			// Short code: resolved by the table; drop the consumed bits.
+			out[count] = value;
+			int bitLength = lengthTable[index];
+			leftBits -= bitLength;
+			uint32_t avoidHeadMask = maskTable[maxBits + 8 - leftBits - 1];
+			currentValue = (currentValue & avoidHeadMask);
+			count++;
+        }else{
+			// Long code: continue bit by bit from the node reached after maxBits bits.
+			int bitLength = lengthTable[index];
+			leftBits -= bitLength;
+        	n = nodeTable[index];
+        	while(!n->t){
+        		if(!leftBits){
+					currentValue = currentValue << 8;
+					currentValue += s[i];
+					leftBits += 8;
+					i++;
+        		}
+				if(((currentValue >> (leftBits - 1)) & 0x01) == 0)
+					n = n->left;
+				else
+					n = n->right;
+				leftBits--;
+        	}
+        	currentValue &= maskTable[maxBits + 8 - leftBits - 1];
+			out[count] = n->c;
+			count++;
+        }
+
+	}
+    free(valueTable);
+    free(lengthTable);
+    free(nodeTable);
+	return;
+}
+/**
+ * Flatten the subtree rooted at root into parallel arrays (pre-order).
+ *
+ * Slot i receives the node's state (C) and type flag (t). Each existing child
+ * gets the next free slot, taken from huffmanTree->n_inode, which is recorded
+ * in L[i] or R[i] before the child is flattened. Child slots are stored as
+ * unsigned char.
+ */
+void pad_tree_uchar(HuffmanTree* huffmanTree, unsigned char* L, unsigned char* R, unsigned int* C, unsigned char* t, unsigned int i, node root)
+{
+	node child;
+
+	C[i] = root->c;
+	t[i] = root->t;
+
+	child = root->left;
+	if(child!=0)
+	{
+		L[i] = ++huffmanTree->n_inode;
+		pad_tree_uchar(huffmanTree, L,R,C,t, huffmanTree->n_inode, child);
+	}
+
+	child = root->right;
+	if(child!=0)
+	{
+		R[i] = ++huffmanTree->n_inode;
+		pad_tree_uchar(huffmanTree, L,R,C,t, huffmanTree->n_inode, child);
+	}
+}  
+
+/**
+ * Flatten the subtree rooted at root into parallel arrays (pre-order).
+ *
+ * Slot i receives the node's state (C) and type flag (t). Each existing child
+ * gets the next free slot, taken from huffmanTree->n_inode, which is recorded
+ * in L[i] or R[i] before the child is flattened. Child slots are stored as
+ * unsigned short.
+ */
+void pad_tree_ushort(HuffmanTree* huffmanTree, unsigned short* L, unsigned short* R, unsigned int* C, unsigned char* t, unsigned int i, node root)
+{
+	node child;
+
+	C[i] = root->c;
+	t[i] = root->t;
+
+	child = root->left;
+	if(child!=0)
+	{
+		L[i] = ++huffmanTree->n_inode;
+		pad_tree_ushort(huffmanTree,L,R,C,t,huffmanTree->n_inode, child);
+	}
+
+	child = root->right;
+	if(child!=0)
+	{
+		R[i] = ++huffmanTree->n_inode;
+		pad_tree_ushort(huffmanTree,L,R,C,t,huffmanTree->n_inode, child);
+	}
+}
+
+/**
+ * Flatten the subtree rooted at root into parallel arrays (pre-order).
+ *
+ * Slot i receives the node's state (C) and type flag (t). Each existing child
+ * gets the next free slot, taken from huffmanTree->n_inode, which is recorded
+ * in L[i] or R[i] before the child is flattened. Child slots are stored as
+ * unsigned int.
+ */
+void pad_tree_uint(HuffmanTree* huffmanTree, unsigned int* L, unsigned int* R, unsigned int* C, unsigned char* t, unsigned int i, node root)
+{
+	node child;
+
+	C[i] = root->c;
+	t[i] = root->t;
+
+	child = root->left;
+	if(child!=0)
+	{
+		L[i] = ++huffmanTree->n_inode;
+		pad_tree_uint(huffmanTree,L,R,C,t,huffmanTree->n_inode, child);
+	}
+
+	child = root->right;
+	if(child!=0)
+	{
+		R[i] = ++huffmanTree->n_inode;
+		pad_tree_uint(huffmanTree,L,R,C,t,huffmanTree->n_inode, child);
+	}
+}
+ 
+/**
+ * Serialize the Huffman tree into a newly allocated byte buffer.
+ *
+ * Layout: 1 byte sysEndianType, then the left-child slots, the right-child
+ * slots, the states (unsigned int each) and the type flags (1 byte each),
+ * every array holding nodeCount entries. Child slots are stored as unsigned
+ * char for nodeCount <= 256, as unsigned short for nodeCount <= 65536 and as
+ * unsigned int otherwise. The arrays are copied in host byte order.
+ *
+ * @param huffmanTree tree to serialize; its root is taken from qq[1]
+ * @param nodeCount   number of nodes in the tree
+ * @param out         (output) receives the malloc'ed buffer owned by the caller
+ * @return size of the buffer in bytes
+ */
+unsigned int convert_HuffTree_to_bytes_anyStates(HuffmanTree* huffmanTree, int nodeCount, unsigned char** out) 
+{
+	//the state and flag arrays have the same shape in all three layouts
+	size_t stateBytes = nodeCount*sizeof(unsigned int);
+	size_t flagBytes = nodeCount*sizeof(unsigned char);
+	size_t childBytes;
+	unsigned int totalSize;
+	unsigned char* cursor;
+	void* leftSlots;
+	void* rightSlots;
+
+	unsigned int* C = (unsigned int*)malloc(stateBytes);
+	memset(C, 0, stateBytes);
+	unsigned char* t = (unsigned char*)malloc(flagBytes);
+	memset(t, 0, flagBytes);
+
+	if(nodeCount<=256)
+	{
+		childBytes = nodeCount*sizeof(unsigned char);
+		unsigned char* L = (unsigned char*)malloc(childBytes);
+		memset(L, 0, childBytes);
+		unsigned char* R = (unsigned char*)malloc(childBytes);
+		memset(R, 0, childBytes);
+		pad_tree_uchar(huffmanTree,L,R,C,t,0,huffmanTree->qq[1]);
+		leftSlots = L;
+		rightSlots = R;
+	}
+	else if(nodeCount<=65536)
+	{
+		childBytes = nodeCount*sizeof(unsigned short);
+		unsigned short* L = (unsigned short*)malloc(childBytes);
+		memset(L, 0, childBytes);
+		unsigned short* R = (unsigned short*)malloc(childBytes);
+		memset(R, 0, childBytes);
+		pad_tree_ushort(huffmanTree,L,R,C,t,0,huffmanTree->qq[1]);
+		leftSlots = L;
+		rightSlots = R;
+	}
+	else //nodeCount>65536
+	{
+		childBytes = nodeCount*sizeof(unsigned int);
+		unsigned int* L = (unsigned int*)malloc(childBytes);
+		memset(L, 0, childBytes);
+		unsigned int* R = (unsigned int*)malloc(childBytes);
+		memset(R, 0, childBytes);
+		pad_tree_uint(huffmanTree, L,R,C,t,0,huffmanTree->qq[1]);
+		leftSlots = L;
+		rightSlots = R;
+	}
+
+	//header byte + L + R + C + t
+	totalSize = 1 + 2*childBytes + stateBytes + flagBytes;
+	*out = (unsigned char*)malloc(totalSize);
+	cursor = *out;
+	*cursor = (unsigned char)sysEndianType;
+	cursor += 1;
+	memcpy(cursor, leftSlots, childBytes);
+	cursor += childBytes;
+	memcpy(cursor, rightSlots, childBytes);
+	cursor += childBytes;
+	memcpy(cursor, C, stateBytes);
+	cursor += stateBytes;
+	memcpy(cursor, t, flagBytes);
+
+	free(leftSlots);
+	free(rightSlots);
+	free(C);
+	free(t);
+	return totalSize;
+}
+
+void unpad_tree_uchar(HuffmanTree* huffmanTree, unsigned char* L, unsigned char* R, unsigned int* C, unsigned char *t, unsigned int i, node root)
+{
+	/*
+	 * Recursively rebuild the subtree below `root` from the flattened arrays
+	 * (1-byte child indices). Only nodes whose t field is 0 are expanded.
+	 * A zero entry in L/R means "no child on that side"; otherwise the child
+	 * is created with new_node2(C[idx], t[idx]) and expanded in turn.
+	 * The left child is always created and expanded before the right one.
+	 */
+	if(root->t!=0)
+		return;
+
+	const unsigned char leftIdx = L[i];
+	if(leftIdx!=0)
+	{
+		node leftChild = new_node2(huffmanTree,C[leftIdx],t[leftIdx]);
+		root->left = leftChild;
+		unpad_tree_uchar(huffmanTree,L,R,C,t,leftIdx,leftChild);
+	}
+
+	const unsigned char rightIdx = R[i];
+	if(rightIdx!=0)
+	{
+		node rightChild = new_node2(huffmanTree,C[rightIdx],t[rightIdx]);
+		root->right = rightChild;
+		unpad_tree_uchar(huffmanTree,L,R,C,t,rightIdx,rightChild);
+	}
+}
+
+void unpad_tree_ushort(HuffmanTree* huffmanTree, unsigned short* L, unsigned short* R, unsigned int* C, unsigned char* t, unsigned int i, node root)
+{
+	/*
+	 * Recursively rebuild the subtree below `root` from the flattened arrays
+	 * (2-byte child indices). Only nodes whose t field is 0 are expanded.
+	 * A zero entry in L/R means "no child on that side"; otherwise the child
+	 * is created with new_node2(C[idx], t[idx]) and expanded in turn.
+	 * The left child is always created and expanded before the right one.
+	 */
+	if(root->t!=0)
+		return;
+
+	const unsigned short leftIdx = L[i];
+	if(leftIdx!=0)
+	{
+		node leftChild = new_node2(huffmanTree,C[leftIdx],t[leftIdx]);
+		root->left = leftChild;
+		unpad_tree_ushort(huffmanTree,L,R,C,t,leftIdx,leftChild);
+	}
+
+	const unsigned short rightIdx = R[i];
+	if(rightIdx!=0)
+	{
+		node rightChild = new_node2(huffmanTree,C[rightIdx],t[rightIdx]);
+		root->right = rightChild;
+		unpad_tree_ushort(huffmanTree,L,R,C,t,rightIdx,rightChild);
+	}
+}
+
+void unpad_tree_uint(HuffmanTree* huffmanTree, unsigned int* L, unsigned int* R, unsigned int* C, unsigned char* t, unsigned int i, node root)
+{
+	/*
+	 * Recursively rebuild the subtree below `root` from the flattened arrays
+	 * (4-byte child indices). Only nodes whose t field is 0 are expanded.
+	 * A zero entry in L/R means "no child on that side"; otherwise the child
+	 * is created with new_node2(C[idx], t[idx]) and expanded in turn.
+	 * The left child is always created and expanded before the right one.
+	 */
+	if(root->t!=0)
+		return;
+
+	const unsigned int leftIdx = L[i];
+	if(leftIdx!=0)
+	{
+		node leftChild = new_node2(huffmanTree,C[leftIdx],t[leftIdx]);
+		root->left = leftChild;
+		unpad_tree_uint(huffmanTree,L,R,C,t,leftIdx,leftChild);
+	}
+
+	const unsigned int rightIdx = R[i];
+	if(rightIdx!=0)
+	{
+		node rightChild = new_node2(huffmanTree,C[rightIdx],t[rightIdx]);
+		root->right = rightChild;
+		unpad_tree_uint(huffmanTree,L,R,C,t,rightIdx,rightChild);
+	}
+}
+
+node reconstruct_HuffTree_from_bytes_anyStates(HuffmanTree *huffmanTree, unsigned char* bytes, int nodeCount)
+{
+	/*
+	 * Rebuild a Huffman tree from the flat form written by
+	 * convert_HuffTree_to_bytes_anyStates and return its root.
+	 *
+	 * Expected layout of `bytes`:
+	 *   [1 byte endian tag][L array][R array][C array][t array]
+	 * L and R hold nodeCount entries each, 1, 2 or 4 bytes wide depending on
+	 * nodeCount (<=256, <=65536, larger). C holds nodeCount unsigned ints and
+	 * t holds nodeCount bytes.
+	 *
+	 * If the endian tag differs from sysEndianType, the multi-byte entries are
+	 * byte-swapped IN PLACE inside `bytes` before being copied out (the tag
+	 * byte itself is left untouched, as before).
+	 *
+	 * Fix: for 2-byte links the C array used to be "swapped" with a stale
+	 * pointer and counter left over from the L/R pass, which corrupted the
+	 * last R entry and never converted C. Each section is now swapped from
+	 * its own start offset with its own element count.
+	 */
+	const size_t count = (size_t)nodeCount;
+	size_t linkWidth, k;
+
+	if(nodeCount<=256)
+		linkWidth = sizeof(unsigned char);
+	else if(nodeCount<=65536)
+		linkWidth = sizeof(unsigned short);
+	else //nodeCount>65536
+		linkWidth = sizeof(unsigned int);
+
+	const size_t linkBytes = count*linkWidth;
+	const size_t codeBytes = count*sizeof(unsigned int);
+	const size_t flagBytes = count*sizeof(unsigned char);
+
+	/* Section start offsets inside the serialized buffer. */
+	unsigned char* const linkSection = bytes+1;
+	unsigned char* const codeSection = linkSection+2*linkBytes;
+	unsigned char* const flagSection = codeSection+codeBytes;
+
+	/* Zero-filled working copies of the four arrays. */
+	void* leftLinks = malloc(linkBytes);
+	memset(leftLinks, 0, linkBytes);
+	void* rightLinks = malloc(linkBytes);
+	memset(rightLinks, 0, linkBytes);
+	unsigned int* codes = (unsigned int*)malloc(codeBytes);
+	memset(codes, 0, codeBytes);
+	unsigned char* flags = (unsigned char*)malloc(flagBytes);
+	memset(flags, 0, flagBytes);
+
+	if(bytes[0]!=(unsigned char)sysEndianType)
+	{
+		/* L and R are contiguous, so both are converted in one pass of 2*count entries. */
+		if(linkWidth==sizeof(unsigned short))
+		{
+			for(k=0;k<2*count;k++)
+				symTransform_2bytes(linkSection+k*sizeof(unsigned short));
+		}
+		else if(linkWidth==sizeof(unsigned int))
+		{
+			for(k=0;k<2*count;k++)
+				symTransform_4bytes(linkSection+k*sizeof(unsigned int));
+		}
+		/* 1-byte links need no conversion. */
+
+		/* C always holds 4-byte entries and starts right after L and R. */
+		for(k=0;k<count;k++)
+			symTransform_4bytes(codeSection+k*sizeof(unsigned int));
+	}
+
+	memcpy(leftLinks, linkSection, linkBytes);
+	memcpy(rightLinks, linkSection+linkBytes, linkBytes);
+	memcpy(codes, codeSection, codeBytes);
+	memcpy(flags, flagSection, flagBytes);
+
+	node root;
+	if(nodeCount<=256)
+	{
+		/* Small trees take the root's fields from entry 0 of C and t. */
+		root = new_node2(huffmanTree, codes[0], flags[0]);
+		unpad_tree_uchar(huffmanTree,(unsigned char*)leftLinks,(unsigned char*)rightLinks,codes,flags,0,root);
+	}
+	else if(nodeCount<=65536)
+	{
+		root = new_node2(huffmanTree,0,0);
+		unpad_tree_ushort(huffmanTree,(unsigned short*)leftLinks,(unsigned short*)rightLinks,codes,flags,0,root);
+	}
+	else
+	{
+		root = new_node2(huffmanTree,0,0);
+		unpad_tree_uint(huffmanTree,(unsigned int*)leftLinks,(unsigned int*)rightLinks,codes,flags,0,root);
+	}
+
+	free(leftLinks);
+	free(rightLinks);
+	free(codes);
+	free(flags);
+	return root;
+}
+
+void encode_withTree(HuffmanTree* huffmanTree, int *s, size_t length, unsigned char **out, size_t *outSize)
+{
+	/*
+	 * Huffman-encode the `length` symbols in s. A newly malloc'ed buffer is
+	 * returned through *out (caller frees) and its used size through *outSize.
+	 *
+	 * Output layout:
+	 *   [4 bytes nodeCount, big-endian][4 bytes stateNum/2, big-endian]
+	 *   [serialized tree, treeByteSize bytes][encoded symbol stream]
+	 *
+	 * Fix: the allocation now includes the 8 header bytes. Previously the
+	 * buffer was sized as length*sizeof(int)+treeByteSize, so the header plus
+	 * tree alone overran it whenever length < 2.
+	 * NOTE(review): the payload budget of length*sizeof(int) bytes is kept
+	 * from the original; confirm encode() can never emit more than that.
+	 */
+	const size_t headerSize = 8;
+	size_t i; 
+	int nodeCount = 0;
+	unsigned char *treeBytes, buffer[4];
+	
+	init(huffmanTree, s, length);
+
+	/* nodeCount = 2 * (number of states that received a code) - 1 */
+	for (i = 0; i < huffmanTree->stateNum; i++)
+		if (huffmanTree->code[i]) nodeCount++; 
+	nodeCount = nodeCount*2-1;
+	unsigned int treeByteSize = convert_HuffTree_to_bytes_anyStates(huffmanTree,nodeCount, &treeBytes);
+
+	*out = (unsigned char*)malloc(headerSize+treeByteSize+length*sizeof(int));
+	intToBytes_bigEndian(buffer, nodeCount);
+	memcpy(*out, buffer, 4);
+	intToBytes_bigEndian(buffer, huffmanTree->stateNum/2); //real number of intervals
+	memcpy(*out+4, buffer, 4);
+	memcpy(*out+headerSize, treeBytes, treeByteSize);
+	free(treeBytes);
+
+	size_t enCodeSize = 0;
+	encode(huffmanTree, s, length, *out+headerSize+treeByteSize, &enCodeSize);
+	*outSize = headerSize+treeByteSize+enCodeSize;
+}
+
+int encode_withTree_MSST19(HuffmanTree* huffmanTree, int *s, size_t length, unsigned char **out, size_t *outSize)
+{
+	/*
+	 * Same output as encode_withTree, but additionally returns the largest
+	 * cout[] value among the states that received a code.
+	 *
+	 * Output layout (buffer returned through *out, caller frees):
+	 *   [4 bytes nodeCount, big-endian][4 bytes stateNum/2, big-endian]
+	 *   [serialized tree, treeByteSize bytes][encoded symbol stream]
+	 *
+	 * Fix: the allocation now includes the 8 header bytes. Previously the
+	 * buffer was sized as length*sizeof(int)+treeByteSize, so the header plus
+	 * tree alone overran it whenever length < 2.
+	 * NOTE(review): the payload budget of length*sizeof(int) bytes is kept
+	 * from the original; confirm encode() can never emit more than that.
+	 */
+	const size_t headerSize = 8;
+	size_t i;
+	int nodeCount = 0;
+	unsigned char *treeBytes, buffer[4];
+
+	init(huffmanTree, s, length);
+
+	/* Count coded states and track the maximum cout[] value among them. */
+	int maxBits = 0;
+	for (i = 0; i < huffmanTree->stateNum; i++)
+		if (huffmanTree->code[i]){
+			nodeCount++;
+			if(huffmanTree->cout[i] > maxBits) maxBits = huffmanTree->cout[i];
+		}
+	nodeCount = nodeCount*2-1;
+
+	unsigned int treeByteSize = convert_HuffTree_to_bytes_anyStates(huffmanTree,nodeCount, &treeBytes);
+
+	*out = (unsigned char*)malloc(headerSize+treeByteSize+length*sizeof(int));
+	intToBytes_bigEndian(buffer, nodeCount);
+	memcpy(*out, buffer, 4);
+	intToBytes_bigEndian(buffer, huffmanTree->stateNum/2); //real number of intervals
+	memcpy(*out+4, buffer, 4);
+	memcpy(*out+headerSize, treeBytes, treeByteSize);
+	free(treeBytes);
+
+	size_t enCodeSize = 0;
+	encode(huffmanTree, s, length, *out+headerSize+treeByteSize, &enCodeSize);
+	*outSize = headerSize+treeByteSize+enCodeSize;
+	return maxBits;
+}
+
+/**
+ * Decode a stream laid out as [4 bytes nodeCount, big-endian][4 more header
+ * bytes][serialized tree][encoded payload].
+ *
+ * @param s            start of the compressed stream (header included)
+ * @param targetLength forwarded unchanged to decode()
+ * @param out          caller-allocated int array receiving the decoded values;
+ *                     presumably it must hold targetLength entries -- confirm
+ *                     against decode().
+ * */
+void decode_withTree(HuffmanTree* huffmanTree, unsigned char *s, size_t targetLength, int *out)
+{
+	size_t nodeCount = bytesToInt_bigEndian(s);
+	node root = reconstruct_HuffTree_from_bytes_anyStates(huffmanTree,s+8, nodeCount);
+
+	/* Width of one L/R entry in the serialized tree, chosen from nodeCount. */
+	size_t linkWidth = sizeof(unsigned int);
+	if(nodeCount<=256)
+		linkWidth = sizeof(unsigned char);
+	else if(nodeCount<=65536)
+		linkWidth = sizeof(unsigned short);
+
+	/* Tree size: endian tag + L and R arrays + C array (unsigned int) + t array (bytes). */
+	size_t encodeStartIndex = 1
+		+ 2*nodeCount*linkWidth
+		+ nodeCount*sizeof(unsigned int)
+		+ nodeCount*sizeof(unsigned char);
+
+	/* The payload begins after the 8 header bytes and the serialized tree. */
+	decode(s+8+encodeStartIndex, targetLength, root, out);
+}
+
+void decode_withTree_MSST19(HuffmanTree* huffmanTree, unsigned char *s, size_t targetLength, int *out, int maxBits)
+{
+	/*
+	 * Rebuild the Huffman tree stored after the 8 header bytes of s, skip
+	 * over it, and pass the remaining payload to decode_MSST19 together with
+	 * targetLength, out and maxBits.
+	 */
+	size_t nodeCount = bytesToInt_bigEndian(s);
+	node root = reconstruct_HuffTree_from_bytes_anyStates(huffmanTree,s+8, nodeCount);
+
+	/* Width of one L/R entry in the serialized tree, chosen from nodeCount. */
+	size_t linkWidth = sizeof(unsigned int);
+	if(nodeCount<=256)
+		linkWidth = sizeof(unsigned char);
+	else if(nodeCount<=65536)
+		linkWidth = sizeof(unsigned short);
+
+	/* Tree size: endian tag + L and R arrays + C array (unsigned int) + t array (bytes). */
+	size_t encodeStartIndex = 1
+		+ 2*nodeCount*linkWidth
+		+ nodeCount*sizeof(unsigned int)
+		+ nodeCount*sizeof(unsigned char);
+
+	decode_MSST19(s+8+encodeStartIndex, targetLength, root, out, maxBits);
+}
+
+void SZ_ReleaseHuffman(HuffmanTree* huffmanTree)
+{
+	/*
+	 * Free a HuffmanTree together with its pool, qqq, per-state code entries,
+	 * code array and cout array. Passing NULL is a no-op, and a tree whose
+	 * code array is NULL is handled without dereferencing it.
+	 * The caller's own pointer is not modified and must not be used afterwards.
+	 */
+	size_t i;
+	if(huffmanTree==NULL)
+		return;
+
+	free(huffmanTree->pool);
+	free(huffmanTree->qqq);
+	if(huffmanTree->code!=NULL)
+	{
+		/* free(NULL) is a no-op, so unused slots need no separate check. */
+		for(i=0;i<huffmanTree->stateNum;i++)
+			free(huffmanTree->code[i]);
+		free(huffmanTree->code);
+	}
+	free(huffmanTree->cout);
+	free(huffmanTree);
+}
--- a/deps/SZ/sz/src/MultiLevelCacheTable.c
+++ b/deps/SZ/sz/src/MultiLevelCacheTable.c
@ -0,0 +1,193 @@
+/**
+ *  @file MultiLevelCacheTable.c
+ *  @author Xiangyu Zou, Tao Lu, Wen Xia, Xuan Wang, Weizhe Zhang, Sheng Di, Dingwen Tao
+ *  @date Jan, 2019
+ *  @brief Implementation of the multi-level cache table (float version).
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdint.h>
+#include <memory.h>
+#include <stdlib.h>
+#include "stdio.h"
+#include "MultiLevelCacheTable.h"
+
+uint8_t MLCT_GetExpoIndex(float value){
+    /*
+     * Return the 8-bit exponent field of a float's bit pattern (bits 23..30).
+     * The sign bit is discarded by the narrowing to uint8_t.
+     * The bits are read through a union rather than a uint32_t* cast, which
+     * would violate the strict-aliasing rule.
+     */
+    union { float f; uint32_t u; } raw;
+    raw.f = value;
+    return raw.u >> 23;
+}
+
+uint8_t MLCT_GetRequiredBits(float precision){
+    /*
+     * Return 127 minus the exponent field of `precision`, narrowed to uint8_t.
+     * For a positive precision below 1 this is the negated unbiased binary
+     * exponent (e.g. 0.25 -> 2).
+     * The bits are read through a union rather than an int32_t* cast, which
+     * would violate the strict-aliasing rule.
+     */
+    union { float f; int32_t i; } raw;
+    raw.f = precision;
+    return -((raw.i >> 23) - 127);
+}
+
+
+uint32_t MLCT_GetMantiIndex(float value, int bits){
+    /*
+     * Return the leading `bits` bits of the 23-bit mantissa field of `value`.
+     * When bits >= 23 the whole mantissa field is returned unshifted.
+     * The bits are read through a union rather than a uint32_t* cast, which
+     * would violate the strict-aliasing rule.
+     */
+    union { float f; uint32_t u; } raw;
+    raw.f = value;
+    /* Drop sign and exponent (the top 9 bits), keeping the low 23 bits. */
+    uint32_t mantissa = raw.u << 9 >> 9;
+    int shift = 32 - 9 - bits;
+    if(shift > 0){
+        return mantissa >> shift;
+    }
+    return mantissa;
+}
+
+float MLTC_RebuildFloat(uint8_t expo, uint32_t manti, int bits){
+    /*
+     * Build a float whose exponent field is `expo` and whose mantissa field
+     * has `manti` OR-ed in, left-aligned to `bits` bits (i.e. shifted left by
+     * 23-bits). The sign bit is left clear.
+     * The bit pattern is written through a union rather than a uint32_t*
+     * cast, which would violate the strict-aliasing rule.
+     * NOTE(review): bits > 23 gives a negative shift count; callers are
+     * assumed to pass bits <= 23.
+     */
+    union { float f; uint32_t u; } raw;
+    raw.u = expo;
+    raw.u = raw.u << 23;
+    raw.u |= (manti << (23-bits));
+    return raw.f;
+}
+
+/*
+ * Build a two-level lookup table over float bit patterns.
+ *
+ * The top level has one SubLevelTable per exponent value between the exponent
+ * of precisionTable[1]/(1+precision) and that of
+ * precisionTable[count-1]/(1-precision). Each sub-table is indexed by the
+ * leading `bits` mantissa bits (bits = MLCT_GetRequiredBits(precision)) and
+ * stores an index into precisionTable.
+ *
+ * topTable->subTables and every subTables[i].table are malloc'ed here.
+ */
+void MultiLevelCacheTableBuild(struct TopLevelTable* topTable, float* precisionTable, int count, float precision){
+    uint8_t bits = MLCT_GetRequiredBits(precision);
+    topTable->bits = bits;
+    // Value range covered by the table, widened by the relative precision.
+    topTable->bottomBoundary = precisionTable[1]/(1+precision);
+    topTable->topBoundary = precisionTable[count-1]/(1-precision);
+    // Exponent fields of the two boundaries delimit the top-level index range.
+    topTable->baseIndex = MLCT_GetExpoIndex(topTable->bottomBoundary);
+    topTable->topIndex = MLCT_GetExpoIndex(topTable->topBoundary);
+    int subTableCount = topTable->topIndex - topTable->baseIndex + 1;
+    topTable->subTables = (struct SubLevelTable*)malloc(sizeof(struct SubLevelTable) * subTableCount);
+    memset(topTable->subTables, 0, sizeof(struct SubLevelTable) * subTableCount);
+
+    //uint32_t expoBoundary[subTableCount];
+    // NOTE(review): this loop only updates lastExpo/lastIndex, which are never
+    // read afterwards; it looks like a leftover of the disabled code at the
+    // end of the function.
+    uint8_t lastExpo = 0xff;
+    uint8_t lastIndex = 0;
+    for(int i=0; i<count; i++){
+        uint8_t expo = MLCT_GetExpoIndex(precisionTable[i]);
+        if(expo != lastExpo){
+            //expoBoundary[lastIndex] = i;
+            lastExpo = expo;
+            lastIndex++;
+        }
+    }
+
+    // Pass 1: fix each sub-table's mantissa index range and allocate its
+    // zero-filled table.
+    for(int i=topTable->topIndex-topTable->baseIndex; i>=0; i--){
+        struct SubLevelTable* processingSubTable = &topTable->subTables[i];
+        if(i == topTable->topIndex - topTable->baseIndex &&
+            MLCT_GetExpoIndex(topTable->topBoundary) == MLCT_GetExpoIndex(precisionTable[count-1])){
+            // Last sub-table: stop just below the mantissa index of the top boundary.
+            processingSubTable->topIndex = MLCT_GetMantiIndex(topTable->topBoundary, bits) - 1;
+        }else{
+            // Otherwise cover every mantissa index: maxIndex is the sum of
+            // 1<<j for j in [0, bits).
+            uint32_t maxIndex = 0;
+            for(int j=0; j<bits; j++){
+                maxIndex += 1 << j;
+            }
+            processingSubTable->topIndex = maxIndex;
+        }
+        if(i == 0 && MLCT_GetExpoIndex(topTable->bottomBoundary) == MLCT_GetExpoIndex(precisionTable[0])){
+            // First sub-table: start just above the mantissa index of the bottom boundary.
+            processingSubTable->baseIndex = MLCT_GetMantiIndex(topTable->bottomBoundary, bits)+1;
+        }else{
+            processingSubTable->baseIndex = 0;
+        }
+
+        int subTableLength = processingSubTable->topIndex - processingSubTable-> baseIndex+ 1;
+        processingSubTable->table = (uint32_t*)malloc(sizeof(uint32_t) * subTableLength);
+        memset(processingSubTable->table, 0, sizeof(uint32_t) * subTableLength);
+        processingSubTable->expoIndex = topTable->baseIndex + i;
+    }
+
+    // Pass 2: walk every (exponent, mantissa index) slot in increasing order
+    // and record which precisionTable entry it belongs to. `index` only ever
+    // moves forward.
+    uint32_t index = 1;
+    for(uint8_t i = 0; i<=topTable->topIndex-topTable->baseIndex; i++){
+        struct SubLevelTable* processingSubTable = &topTable->subTables[i];
+        uint8_t expoIndex = i+topTable->baseIndex;
+        for(uint32_t j = 0; j<=processingSubTable->topIndex - processingSubTable->baseIndex; j++){
+            uint32_t mantiIndex = j+processingSubTable->baseIndex;
+            float sample = MLTC_RebuildFloat(expoIndex, mantiIndex, topTable->bits);
+            float bottomBoundary = precisionTable[index] / (1+precision);
+            float topBoundary = precisionTable[index] / (1-precision);
+            if(sample < topBoundary && sample > bottomBoundary){
+                processingSubTable->table[j] = index;
+            }else{
+                // The sample left the current entry's interval: advance to the
+                // next entry and also overwrite the previous slot with it.
+                //float newPrecision = precisionTable[index];
+                index++;
+                processingSubTable->table[j] = index;
+                if(j)
+                    processingSubTable->table[j-1] = index;
+                else{
+                    // NOTE(review): with i == 0 and j == 0 this addresses
+                    // subTables[-1]; confirm that case cannot be reached.
+                    struct SubLevelTable* pastSubTable = &topTable->subTables[i-1];
+                    pastSubTable->table[pastSubTable->topIndex - pastSubTable->baseIndex] = index;
+                }
+            }
+        }
+        if(i == topTable->topIndex - topTable->baseIndex){
+            // After the last sub-table, test the sample one slot past its end
+            // and, if it falls outside the current interval, move the last
+            // slot on to the next entry.
+            uint32_t j = processingSubTable->topIndex - processingSubTable->baseIndex + 1;
+            uint32_t mantiIndex = j + processingSubTable->baseIndex;
+            float sample = MLTC_RebuildFloat(expoIndex, mantiIndex, topTable->bits);
+            float bottomBoundary = precisionTable[index] / (1+precision);
+            float topBoundary = precisionTable[index] / (1-precision);
+            if(sample > topBoundary || sample < bottomBoundary){
+                index++;
+                processingSubTable->table[j-1] = index;
+            }
+        }
+    }
+
+    /*
+    long lastIndexInExpoRange = count-1;
+    bool trigger = false;
+    float preRange = 0.0;
+    uint32_t preIndex = 0;
+    for(int i=topTable->topIndex-topTable->baseIndex; i>=0; i--){
+        struct SubLevelTable* processingSubTable = &topTable->subTables[i];
+        if(trigger){
+            uint32_t bound = MLCT_GetMantiIndex(preRange, bits);
+            for(int j = processingSubTable->topIndex; j>=processingSubTable->baseIndex; j--){
+                if(j >= bound){
+                    processingSubTable->table[j-processingSubTable->baseIndex] = preIndex;
+                }else{
+                    break;
+                }
+            }
+            trigger = false;
+        }
+        long firstIndexInExpoRange = expoBoundary[i];
+        uint8_t expoInRange = MLCT_GetExpoIndex(precisionTable[firstIndexInExpoRange]);
+        for(int j=lastIndexInExpoRange; j>=firstIndexInExpoRange; j--){
+            float test = precisionTable[j];
+            uint32_t rangeTop = MLCT_GetMantiIndex(precisionTable[j]*(1+precision), bits) - 1;
+            uint32_t rangeBottom;
+            if(j == firstIndexInExpoRange){
+                preRange = precisionTable[j]/(1+precision);
+                if(expoInRange != MLCT_GetExpoIndex(preRange)){
+                    trigger = true;
+                    preIndex = firstIndexInExpoRange;
+                    rangeBottom = 0;
+                }else{
+                    rangeBottom= MLCT_GetMantiIndex(precisionTable[j]/(1+precision), bits) + 1;
+                }
+            }else{
+                rangeBottom= MLCT_GetMantiIndex(precisionTable[j]/(1+precision), bits) + 1;
+            }
+            for(int k = rangeBottom; k<=rangeTop; k++){
+                if( k <= processingSubTable->topIndex && k >= processingSubTable->baseIndex)
+                    processingSubTable->table[k - processingSubTable->baseIndex] = j;
+            }
+        }
+        lastIndexInExpoRange = firstIndexInExpoRange-1;
+    }
+     */
+}
+
+uint32_t MultiLevelCacheTableGetIndex(float value, struct TopLevelTable* topLevelTable){
+    /*
+     * Look up the table entry for `value`: select the sub-table by the
+     * value's exponent field, then index it by the leading mantissa bits.
+     * Returns 0 when the exponent or the mantissa index lies outside the
+     * ranges stored in the table.
+     *
+     * The earlier version also called MLTC_RebuildFloat() here and discarded
+     * the result; that call had no effect and has been removed.
+     */
+    uint8_t expoIndex = MLCT_GetExpoIndex(value);
+    if(expoIndex > topLevelTable->topIndex || expoIndex < topLevelTable->baseIndex)
+        return 0;
+
+    struct SubLevelTable* subLevelTable = &topLevelTable->subTables[expoIndex-topLevelTable->baseIndex];
+    uint32_t mantiIndex = MLCT_GetMantiIndex(value, topLevelTable->bits);
+    if(mantiIndex >= subLevelTable->baseIndex && mantiIndex <= subLevelTable->topIndex)
+        return subLevelTable->table[mantiIndex - subLevelTable->baseIndex];
+    return 0;
+}
+
+void MultiLevelCacheTableFree(struct TopLevelTable* table){
+    /*
+     * Release each sub-table's lookup array, in ascending order, and then the
+     * sub-table array itself. The TopLevelTable struct is not freed.
+     */
+    int subIdx = 0;
+    while(subIdx < table->topIndex - table->baseIndex + 1){
+        free(table->subTables[subIdx].table);
+        subIdx++;
+    }
+    free(table->subTables);
+}
--- a/deps/SZ/sz/src/MultiLevelCacheTableWideInterval.c
+++ b/deps/SZ/sz/src/MultiLevelCacheTableWideInterval.c
@ -0,0 +1,125 @@
+/**
+ *  @file MultiLevelCacheTableWideInterval.c
+ *  @author Xiangyu Zou, Tao Lu, Wen Xia, Xuan Wang, Weizhe Zhang, Sheng Di, Dingwen Tao
+ *  @date Jan, 2019
+ *  @brief Implementation of the multi-level cache table for wide intervals (double version).
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdbool.h>
+#include "MultiLevelCacheTableWideInterval.h"
+
+void freeTopLevelTableWideInterval(struct TopLevelTableWideInterval* topTable)
+{
+	/*
+	 * Release each sub-table's lookup array, highest index first, and then
+	 * the sub-table array itself. The top-level struct is not freed.
+	 */
+	int subIdx = topTable->topIndex-topTable->baseIndex;
+	while(subIdx>=0)
+	{
+		free(topTable->subTables[subIdx].table);
+		subIdx--;
+	}
+	free(topTable->subTables);
+}
+
+uint16_t MLCTWI_GetExpoIndex(double value){
+    /*
+     * Return the top 12 bits of a double's bit pattern: the sign bit followed
+     * by the 11-bit exponent field. Negative inputs therefore yield values
+     * with bit 11 set.
+     * The bits are read through a union rather than a uint64_t* cast, which
+     * would violate the strict-aliasing rule.
+     */
+    union { double d; uint64_t u; } raw;
+    raw.d = value;
+    return raw.u >> 52;
+}
+
+uint16_t MLCTWI_GetRequiredBits(double precision){
+    /*
+     * Return 1023 minus the top 12 bits of `precision`, computed in unsigned
+     * 64-bit arithmetic and narrowed to uint16_t. For a positive precision
+     * below 1 this is the negated unbiased binary exponent (e.g. 0.25 -> 2).
+     * The bits are read through a union rather than a uint64_t* cast, which
+     * would violate the strict-aliasing rule.
+     */
+    union { double d; uint64_t u; } raw;
+    raw.d = precision;
+    return -((raw.u >> 52) - 1023);
+}
+
+uint64_t MLCTWI_GetMantiIndex(double value, int bits){
+    /*
+     * Return the leading `bits` bits of the 52-bit mantissa field of `value`.
+     * When bits >= 52 the whole mantissa field is returned unshifted.
+     * The bits are read through a union rather than a uint64_t* cast, which
+     * would violate the strict-aliasing rule.
+     */
+    union { double d; uint64_t u; } raw;
+    raw.d = value;
+    /* Drop sign and exponent (the top 12 bits), keeping the low 52 bits. */
+    uint64_t mantissa = raw.u << 12 >> 12;
+    int shift = 64 - 12 - bits;
+    if(shift > 0){
+        return mantissa >> shift;
+    }
+    return mantissa;
+}
+
+double MLTCWI_RebuildDouble(uint16_t expo, uint64_t manti, int bits){
+    /*
+     * Build a double whose top 12 bits are `expo` and whose mantissa field
+     * has `manti` ADDED in, left-aligned to `bits` bits (i.e. shifted left by
+     * 52-bits). Because this is an addition, manti == 1<<bits carries into
+     * the exponent and yields the first value of the next binade.
+     * The bit pattern is written through a union rather than a uint64_t*
+     * cast, which would violate the strict-aliasing rule.
+     * NOTE(review): bits > 52 gives a negative shift count; callers are
+     * assumed to pass bits <= 52.
+     */
+    union { double d; uint64_t u; } raw;
+    raw.u = expo;
+    raw.u = raw.u << 52;
+    raw.u += (manti << (52-bits));
+    return raw.d;
+}
+
+void MultiLevelCacheTableWideIntervalBuild(struct TopLevelTableWideInterval* topTable, double* precisionTable, int count, double precision, int plus_bits){
+    /*
+     * Build the two-level lookup table for doubles.
+     *
+     * One sub-table is created per value of MLCTWI_GetExpoIndex between
+     * precisionTable[1]/(1+precision) and precisionTable[count-1]/(1-precision).
+     * Each sub-table covers every mantissa index of width
+     * bits = MLCTWI_GetRequiredBits(precision) + plus_bits and stores, per
+     * slot, an index into precisionTable (0 when no entry applies).
+     *
+     * topTable->subTables and every subTables[i].table are malloc'ed here.
+     */
+    const uint16_t bits = MLCTWI_GetRequiredBits(precision) + plus_bits;
+    topTable->bits = bits;
+    topTable->bottomBoundary = precisionTable[1]/(1+precision);
+    topTable->topBoundary = precisionTable[count-1]/(1-precision);
+    topTable->baseIndex = MLCTWI_GetExpoIndex(topTable->bottomBoundary);
+    topTable->topIndex = MLCTWI_GetExpoIndex(topTable->topBoundary);
+
+    int subTableCount = topTable->topIndex - topTable->baseIndex + 1;
+    const size_t topLevelBytes = sizeof(struct SubLevelTableWideInterval) * subTableCount;
+    topTable->subTables = (struct SubLevelTableWideInterval*)malloc(topLevelBytes);
+    memset(topTable->subTables, 0, topLevelBytes);
+
+    /* Pass 1 (highest sub-table first): set the index range and allocate a zeroed table. */
+    int subIdx = topTable->topIndex-topTable->baseIndex;
+    while(subIdx >= 0){
+        struct SubLevelTableWideInterval* sub = &topTable->subTables[subIdx];
+
+        /* Largest mantissa index: sum of 1<<b for b in [0, bits). */
+        uint32_t maxIndex = 0;
+        int b = 0;
+        while(b < bits){
+            maxIndex += 1 << b;
+            b++;
+        }
+        sub->topIndex = maxIndex;
+        sub->baseIndex = 0;
+
+        uint64_t slotCount = sub->topIndex - sub-> baseIndex+ 1;
+        sub->table = (uint16_t*)malloc(sizeof(uint16_t) * slotCount);
+        memset(sub->table, 0, sizeof(uint16_t) * slotCount);
+        sub->expoIndex = topTable->baseIndex + subIdx;
+
+        subIdx--;
+    }
+
+    /*
+     * Pass 2 (ascending): for each slot, compare the value interval it spans
+     * against the tolerance interval of the current precisionTable entry.
+     * `entry` only moves forward, and only after a first slot has matched.
+     */
+    uint32_t entry = 0;
+    bool matchedOnce = false;
+    for(uint16_t i = 0; i<=topTable->topIndex-topTable->baseIndex; i++){
+        struct SubLevelTableWideInterval* sub = &topTable->subTables[i];
+        const uint16_t expoIndex = i+topTable->baseIndex;
+        for(uint32_t j = 0; j<=sub->topIndex - sub->baseIndex; j++){
+            const uint64_t mantiIndex = j + sub->baseIndex;
+            const double slotLow = MLTCWI_RebuildDouble(expoIndex, mantiIndex, topTable->bits);
+            const double slotHigh = MLTCWI_RebuildDouble(expoIndex, mantiIndex+1, topTable->bits);
+            const double entryLow = precisionTable[entry] / (1+precision);
+            const double entryHigh = precisionTable[entry] / (1-precision);
+
+            if(slotHigh < entryHigh && slotLow > entryLow){
+                /* The slot lies strictly inside the current entry's interval. */
+                sub->table[j] = entry;
+                matchedOnce = true;
+            }else if(matchedOnce && entry < count-1){
+                /* Left the current interval: move on to the next entry. */
+                entry++;
+                sub->table[j] = entry;
+            }else{
+                sub->table[j] = 0;
+            }
+        }
+    }
+
+}
+
+uint32_t MultiLevelCacheTableWideIntervalGetIndex(double value, struct TopLevelTableWideInterval* topLevelTable){
+    /*
+     * Look up the table entry for `value`: select the sub-table by
+     * MLCTWI_GetExpoIndex(value), then index it by the leading mantissa bits.
+     * Returns 0 when the exponent index lies outside [baseIndex, topIndex].
+     * The mantissa index is not range-checked here.
+     */
+    const uint16_t expo = MLCTWI_GetExpoIndex(value);
+    if(expo > topLevelTable->topIndex || expo < topLevelTable->baseIndex)
+        return 0;
+
+    struct SubLevelTableWideInterval* sub = &topLevelTable->subTables[expo-topLevelTable->baseIndex];
+    const uint64_t manti = MLCTWI_GetMantiIndex(value, topLevelTable->bits);
+    return sub->table[manti - sub->baseIndex];
+}
+
+void MultiLevelCacheTableWideIntervalFree(struct TopLevelTableWideInterval* table){
+    /*
+     * Release each sub-table's lookup array, in ascending order, and then the
+     * sub-table array itself. The top-level struct is not freed.
+     */
+    int subIdx = 0;
+    while(subIdx < table->topIndex - table->baseIndex + 1){
+        free(table->subTables[subIdx].table);
+        subIdx++;
+    }
+    free(table->subTables);
+}
+
--- a/deps/SZ/sz/src/TightDataPointStorageD.c
+++ b/deps/SZ/sz/src/TightDataPointStorageD.c
@ -0,0 +1,751 @@
+/**
+ *  @file TightDataPointStorageD.c
+ *  @author Sheng Di and Dingwen Tao
+ *  @date Aug, 2016
+ *  @brief The functions used to construct the tightPointDataStorage element for storing compressed bytes.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdlib.h> 
+#include <stdio.h>
+#include <string.h>
+#include "TightDataPointStorageD.h"
+#include "sz.h"
+#include "Huffman.h"
+//#include "rw.h"
+
+/**
+ * Allocate a TightDataPointStorageD and reset it to an empty state
+ * (all sizes/counters 0, all array pointers NULL).
+ *
+ * @param this  receives the new object, or NULL when the allocation fails
+ */
+void new_TightDataPointStorageD_Empty(TightDataPointStorageD **this)
+{
+	TightDataPointStorageD *tdps = (TightDataPointStorageD*)malloc(sizeof(TightDataPointStorageD));
+	*this = tdps;
+	if(tdps == NULL)
+		return; //allocation failed: hand NULL back instead of writing through it
+
+	//zero the whole object first so that members without an explicit reset below
+	//(e.g. medianValue, realPrecision, plus_bits, max_bits, minLogValue, allNodes, stateNum)
+	//never hold indeterminate values
+	memset(tdps, 0, sizeof(TightDataPointStorageD));
+
+	tdps->dataSeriesLength = 0;
+	tdps->allSameData = 0;
+	tdps->exactDataNum = 0;
+	tdps->reservedValue = 0;
+	tdps->reqLength = 0;
+	tdps->radExpo = 0;
+
+	tdps->rtypeArray = NULL;
+	tdps->rtypeArray_size = 0;
+
+	tdps->typeArray = NULL; //its size is dataSeriesLength/4 (or xxx/4+1) 
+	tdps->typeArray_size = 0;
+
+	tdps->leadNumArray = NULL; //its size is exactDataNum/4 (or exactDataNum/4+1)
+	tdps->leadNumArray_size = 0;
+
+	tdps->exactMidBytes = NULL;
+	tdps->exactMidBytes_size = 0;
+
+	tdps->residualMidBits = NULL;
+	tdps->residualMidBits_size = 0;
+
+	tdps->intervals = 0;
+	tdps->isLossless = 0;
+
+	tdps->segment_size = 0;
+	tdps->pwrErrBoundBytes = NULL;
+	tdps->pwrErrBoundBytes_size = 0;
+
+	tdps->raBytes = NULL;
+	tdps->raBytes_size = 0;
+}
+
+/**
+ * Rebuild a TightDataPointStorageD from a serialized (flat) byte stream.
+ *
+ * The array members set here (typeArray, pwrErrBoundBytes, leadNumArray, exactMidBytes,
+ * residualMidBits, raBytes) point directly into flatBytes instead of being copied, so
+ * flatBytes must stay valid for as long as the resulting object is used.
+ *
+ * Side effects: writes exe_params->SZ_SIZE_TYPE and several fields of the global
+ * confparams_dec (which is allocated here when it is still NULL). When the version check
+ * fails, a message is printed and the process exits via exit(0).
+ *
+ * @param this             receives the newly allocated storage object
+ * @param flatBytes        the serialized stream
+ * @param flatBytesLength  total length of flatBytes in bytes
+ * @return ABS, or PW_REL when the 0x20 flag of the header byte is set
+ */
+int new_TightDataPointStorageD_fromFlatBytes(TightDataPointStorageD **this, unsigned char* flatBytes, size_t flatBytesLength)
+{
+	new_TightDataPointStorageD_Empty(this);
+	size_t i, index = 0;
+	size_t pwrErrBoundBytes_size = 0, segmentL = 0, radExpoL = 0, pwrErrBoundBytesL = 0;
+	char version[3];
+	for (i = 0; i < 3; i++)
+		version[i] = flatBytes[index++]; //3
+	unsigned char sameRByte = flatBytes[index++]; //1
+	if(checkVersion2(version)!=1)
+	{
+		//wrong version
+		printf("Wrong version: \nCompressed-data version (%d.%d.%d)\n",version[0], version[1], version[2]);
+		printf("Current sz version: (%d.%d.%d)\n", versionNumber[0], versionNumber[1], versionNumber[2]);
+		printf("Please double-check if the compressed data (or file) is correct.\n");
+		exit(0);
+	}
+
+	//confparams_dec is written just below, so make sure it exists before the first access
+	//(the allocation used to happen only after those writes)
+	if(confparams_dec==NULL)
+	{
+		confparams_dec = (sz_params*)malloc(sizeof(sz_params));
+		memset(confparams_dec, 0, sizeof(sz_params));
+	}
+
+	//decode the flag byte
+	int same = sameRByte & 0x01;											//0000,0001
+	(*this)->isLossless = (sameRByte & 0x10)>>4;							//0001,0000
+	int isPW_REL = (sameRByte & 0x20)>>5;									//0010,0000
+	exe_params->SZ_SIZE_TYPE = ((sameRByte & 0x40)>>6)==1?8:4;				//0100,0000
+	//the bits 0000,0110 are not used for szMode any more
+	confparams_dec->protectValueRange = (sameRByte & 0x04)>>2;				//0000,0100
+	confparams_dec->accelerate_pw_rel_compression = (sameRByte & 0x08) >> 3;//0000,1000
+	int errorBoundMode = ABS;
+	if(isPW_REL)
+	{
+		errorBoundMode = PW_REL;
+		segmentL = exe_params->SZ_SIZE_TYPE;
+		pwrErrBoundBytesL = 4;
+	}
+
+	convertBytesToSZParams(&(flatBytes[index]), confparams_dec);
+
+	index += MetaDataByteLength_double;
+
+	int isRegression = (sameRByte >> 7) & 0x01;								//1000,0000
+
+	unsigned char dsLengthBytes[8];
+	for (i = 0; i < exe_params->SZ_SIZE_TYPE; i++)
+		dsLengthBytes[i] = flatBytes[index++];
+	(*this)->dataSeriesLength = bytesToSize(dsLengthBytes);
+
+	if((*this)->isLossless==1)
+	{
+		//lossless stream: nothing else is decoded here
+		return errorBoundMode;
+	}
+	else if(same==1)
+	{
+		//all data points are identical: the remaining bytes are the exact value
+		(*this)->allSameData = 1;
+		(*this)->exactMidBytes = &(flatBytes[index]);
+		return errorBoundMode;
+	}
+	else
+		(*this)->allSameData = 0;
+
+	if(isRegression == 1)
+	{
+		//regression stream: everything after the common header is exposed as raBytes
+		(*this)->raBytes_size = flatBytesLength - 3 - 1 - MetaDataByteLength_double - exe_params->SZ_SIZE_TYPE;
+		(*this)->raBytes = &(flatBytes[index]);
+		return errorBoundMode;
+	}
+
+	int rtype_ = 0;//sameRByte & 0x08; //1000 -- the reserved-value path is currently disabled
+
+	unsigned char byteBuf[8];
+
+	for (i = 0; i < 4; i++)
+		byteBuf[i] = flatBytes[index++];
+	int max_quant_intervals = bytesToInt_bigEndian(byteBuf);// 4
+
+	confparams_dec->maxRangeRadius = max_quant_intervals/2;
+
+	if(errorBoundMode>=PW_REL)
+	{
+		(*this)->radExpo = flatBytes[index++];//1
+		radExpoL = 1;
+		for (i = 0; i < exe_params->SZ_SIZE_TYPE; i++)
+			byteBuf[i] = flatBytes[index++];
+		confparams_dec->segment_size = (*this)->segment_size = bytesToSize(byteBuf);// exe_params->SZ_SIZE_TYPE
+
+		for (i = 0; i < 4; i++)
+			byteBuf[i] = flatBytes[index++];
+		pwrErrBoundBytes_size = (*this)->pwrErrBoundBytes_size = bytesToInt_bigEndian(byteBuf);// 4
+	}
+	else
+	{
+		pwrErrBoundBytes_size = 0;
+		(*this)->pwrErrBoundBytes = NULL;
+	}
+
+	for (i = 0; i < 4; i++)
+		byteBuf[i] = flatBytes[index++];
+	(*this)->intervals = bytesToInt_bigEndian(byteBuf);// 4
+
+	for (i = 0; i < 8; i++)
+		byteBuf[i] = flatBytes[index++];
+	(*this)->medianValue = bytesToDouble(byteBuf);//8
+
+	(*this)->reqLength = flatBytes[index++]; //1
+
+	if(isPW_REL && confparams_dec->accelerate_pw_rel_compression)
+	{
+		(*this)->plus_bits = flatBytes[index++];
+		(*this)->max_bits = flatBytes[index++];
+	}
+
+	for (i = 0; i < 8; i++)
+		byteBuf[i] = flatBytes[index++];
+	(*this)->realPrecision = bytesToDouble(byteBuf);//8
+
+	for (i = 0; i < exe_params->SZ_SIZE_TYPE; i++)
+		byteBuf[i] = flatBytes[index++];
+	(*this)->typeArray_size = bytesToSize(byteBuf);// exe_params->SZ_SIZE_TYPE
+
+	if(rtype_!=0)
+	{
+		for(i = 0;i<exe_params->SZ_SIZE_TYPE;i++)
+			byteBuf[i] = flatBytes[index++];
+		(*this)->rtypeArray_size = bytesToSize(byteBuf);//ST
+	}
+	else
+		(*this)->rtypeArray_size = 0;
+
+	for (i = 0; i < exe_params->SZ_SIZE_TYPE; i++)
+		byteBuf[i] = flatBytes[index++];
+	(*this)->exactDataNum = bytesToSize(byteBuf);// ST
+
+	for (i = 0; i < exe_params->SZ_SIZE_TYPE; i++)
+		byteBuf[i] = flatBytes[index++];
+	(*this)->exactMidBytes_size = bytesToSize(byteBuf);// ST
+
+	if (rtype_ != 0) {
+		if((*this)->rtypeArray_size>0)
+			(*this)->rtypeArray = (unsigned char*)malloc(sizeof(unsigned char)*(*this)->rtypeArray_size);
+		else
+			(*this)->rtypeArray = NULL;
+
+		for (i = 0; i < 8; i++)
+			byteBuf[i] = flatBytes[index++];
+		(*this)->reservedValue = bytesToDouble(byteBuf);//8
+	}
+
+	//two bits per exact data point, rounded up to whole bytes
+	size_t logicLeadNumBitsNum = (*this)->exactDataNum * 2;
+	if (logicLeadNumBitsNum % 8 == 0)
+	{
+		(*this)->leadNumArray_size = logicLeadNumBitsNum >> 3;
+	}
+	else
+	{
+		(*this)->leadNumArray_size = (logicLeadNumBitsNum >> 3) + 1;
+	}
+
+	int minLogValueSize = 0;
+	if(errorBoundMode>=PW_REL)
+		minLogValueSize = 8;
+
+	//residualMidBits_size is whatever remains of flatBytesLength after every other section.
+	//NOTE(review): the trailing "- 1 - 1" is subtracted unconditionally, whereas the two
+	//plus_bits/max_bits bytes are only read above in the accelerated PW_REL case -- confirm.
+	if ((*this)->rtypeArray != NULL) 
+	{
+		(*this)->residualMidBits_size = flatBytesLength - 3 - 1 - MetaDataByteLength_double - exe_params->SZ_SIZE_TYPE - 4 - radExpoL - segmentL - pwrErrBoundBytesL - 4 - 8 - 1 - 8 
+				- exe_params->SZ_SIZE_TYPE - exe_params->SZ_SIZE_TYPE - exe_params->SZ_SIZE_TYPE - minLogValueSize - exe_params->SZ_SIZE_TYPE - 8 - (*this)->rtypeArray_size 
+				- minLogValueSize - (*this)->typeArray_size - (*this)->leadNumArray_size
+				- (*this)->exactMidBytes_size - pwrErrBoundBytes_size - 1 - 1;
+		for (i = 0; i < (*this)->rtypeArray_size; i++)
+			(*this)->rtypeArray[i] = flatBytes[index++];
+	}
+	else
+	{
+		(*this)->residualMidBits_size = flatBytesLength - 3 - 1 - MetaDataByteLength_double - exe_params->SZ_SIZE_TYPE - 4 - radExpoL - segmentL - pwrErrBoundBytesL - 4 - 8 - 1 - 8
+				- exe_params->SZ_SIZE_TYPE - exe_params->SZ_SIZE_TYPE - exe_params->SZ_SIZE_TYPE - minLogValueSize - (*this)->typeArray_size
+				- (*this)->leadNumArray_size - (*this)->exactMidBytes_size - pwrErrBoundBytes_size - 1 - 1;
+	}
+
+	if(errorBoundMode >= PW_REL){
+		(*this)->minLogValue = bytesToDouble(&flatBytes[index]);
+		index+=8;
+	}
+
+	//the remaining sections are referenced in place, in stream order
+	(*this)->typeArray = &flatBytes[index];
+	//retrieve the number of states (i.e., stateNum)
+	(*this)->allNodes = bytesToInt_bigEndian((*this)->typeArray); //the first 4 bytes store the stateNum
+	(*this)->stateNum = ((*this)->allNodes+1)/2;
+
+	index+=(*this)->typeArray_size;
+
+	(*this)->pwrErrBoundBytes = &flatBytes[index];
+
+	index+=pwrErrBoundBytes_size;
+
+	(*this)->leadNumArray = &flatBytes[index];
+
+	index+=(*this)->leadNumArray_size;
+
+	(*this)->exactMidBytes = &flatBytes[index];
+
+	index+=(*this)->exactMidBytes_size;
+
+	(*this)->residualMidBits = &flatBytes[index];
+
+	return errorBoundMode;
+}
+
+/**
+ * Build a TightDataPointStorageD from the intermediate compression results.
+ *
+ * - type is Huffman-encoded (dataSeriesLength entries, 2*intervals states) into typeArray
+ * - leadNumIntArray (exactDataNum entries) is packed into leadNumArray
+ * - resiMidBits is packed with a single bit length (resiBitLength) into residualMidBits
+ * - exactMidBytes and pwrErrBoundBytes are stored by pointer, not copied
+ *
+ * resiMidBits_size is accepted but not used by this function.
+ * */
+void new_TightDataPointStorageD(TightDataPointStorageD **this, 
+		size_t dataSeriesLength, size_t exactDataNum, 
+		int* type, unsigned char* exactMidBytes, size_t exactMidBytes_size,
+		unsigned char* leadNumIntArray,  //leadNumIntArray contains readable numbers....
+		unsigned char* resiMidBits, size_t resiMidBits_size,
+		unsigned char resiBitLength, 
+		double realPrecision, double medianValue, char reqLength, unsigned int intervals,
+		unsigned char* pwrErrBoundBytes, size_t pwrErrBoundBytes_size, unsigned char radExpo) {
+	TightDataPointStorageD *tdps = (TightDataPointStorageD *)malloc(sizeof(TightDataPointStorageD));
+	*this = tdps;
+
+	//plain scalar members
+	tdps->allSameData = 0;
+	tdps->realPrecision = realPrecision;
+	tdps->medianValue = medianValue;
+	tdps->reqLength = reqLength;
+	tdps->dataSeriesLength = dataSeriesLength;
+	tdps->exactDataNum = exactDataNum;
+
+	//the reserved-value type array is not produced on this path
+	tdps->rtypeArray = NULL;
+	tdps->rtypeArray_size = 0;
+
+	//Huffman-encode the type codes; the MSST19 variant also reports max_bits
+	int numStates = 2*intervals;
+	HuffmanTree* tree = createHuffmanTree(numStates);
+	if(confparams_cpr->errorBoundMode == PW_REL && confparams_cpr->accelerate_pw_rel_compression)
+	{
+		tdps->max_bits = encode_withTree_MSST19(tree, type, dataSeriesLength, &tdps->typeArray, &tdps->typeArray_size);
+	}
+	else
+	{
+		encode_withTree(tree, type, dataSeriesLength, &tdps->typeArray, &tdps->typeArray_size);
+	}
+	SZ_ReleaseHuffman(tree);
+
+	tdps->exactMidBytes = exactMidBytes;
+	tdps->exactMidBytes_size = exactMidBytes_size;
+
+	tdps->leadNumArray_size = convertIntArray2ByteArray_fast_2b(leadNumIntArray, exactDataNum, &(tdps->leadNumArray));
+
+	tdps->residualMidBits_size = convertIntArray2ByteArray_fast_dynamic(resiMidBits, resiBitLength, exactDataNum, &(tdps->residualMidBits));
+
+	tdps->intervals = intervals;
+	tdps->isLossless = 0;
+
+	//the point-wise error bound bytes are only kept in PW_REL (or higher) modes
+	tdps->pwrErrBoundBytes = (confparams_cpr->errorBoundMode>=PW_REL) ? pwrErrBoundBytes : NULL;
+	tdps->radExpo = radExpo;
+	tdps->pwrErrBoundBytes_size = pwrErrBoundBytes_size;
+}
+
+/**
+ * Variant of new_TightDataPointStorageD in which resiBitLength is an array
+ * (resiBitLengthSize entries) rather than a single bit length; the residual bits are
+ * packed with convertIntArray2ByteArray_fast_dynamic2. The type codes are always encoded
+ * with encode_withTree here.
+ *
+ * resiMidBits_size is accepted but not used by this function.
+ */
+void new_TightDataPointStorageD2(TightDataPointStorageD **this, 
+		size_t dataSeriesLength, size_t exactDataNum, 
+		int* type, unsigned char* exactMidBytes, size_t exactMidBytes_size,
+		unsigned char* leadNumIntArray,  //leadNumIntArray contains readable numbers....
+		unsigned char* resiMidBits, size_t resiMidBits_size,
+		unsigned char* resiBitLength, size_t resiBitLengthSize,
+		double realPrecision, double medianValue, char reqLength, unsigned int intervals,
+		unsigned char* pwrErrBoundBytes, size_t pwrErrBoundBytes_size, unsigned char radExpo) {
+	TightDataPointStorageD *tdps = (TightDataPointStorageD *)malloc(sizeof(TightDataPointStorageD));
+	*this = tdps;
+
+	//plain scalar members
+	tdps->allSameData = 0;
+	tdps->realPrecision = realPrecision;
+	tdps->medianValue = medianValue;
+	tdps->reqLength = reqLength;
+	tdps->dataSeriesLength = dataSeriesLength;
+	tdps->exactDataNum = exactDataNum;
+
+	//the reserved-value type array is not produced on this path
+	tdps->rtypeArray = NULL;
+	tdps->rtypeArray_size = 0;
+
+	//Huffman-encode the type codes
+	int numStates = 2*intervals;
+	HuffmanTree* tree = createHuffmanTree(numStates);
+	encode_withTree(tree, type, dataSeriesLength, &tdps->typeArray, &tdps->typeArray_size);
+	SZ_ReleaseHuffman(tree);
+
+	tdps->exactMidBytes = exactMidBytes;
+	tdps->exactMidBytes_size = exactMidBytes_size;
+
+	tdps->leadNumArray_size = convertIntArray2ByteArray_fast_2b(leadNumIntArray, exactDataNum, &(tdps->leadNumArray));
+
+	tdps->residualMidBits_size = convertIntArray2ByteArray_fast_dynamic2(resiMidBits, resiBitLength, resiBitLengthSize, &(tdps->residualMidBits));
+
+	tdps->intervals = intervals;
+	tdps->isLossless = 0;
+
+	//the point-wise error bound bytes are only kept in PW_REL (or higher) modes
+	tdps->pwrErrBoundBytes = (confparams_cpr->errorBoundMode>=PW_REL) ? pwrErrBoundBytes : NULL;
+	tdps->radExpo = radExpo;
+	tdps->pwrErrBoundBytes_size = pwrErrBoundBytes_size;
+}
+
+/**
+ * Serialize a TightDataPointStorageD (the case without a reserved-value type array) into a
+ * caller-provided buffer.
+ *
+ * Stream order: version (3) | sameByte (1) | SZ params (MetaDataByteLength_double) |
+ * data length (ST) | max_quant_intervals (4) | [PW_REL: radExpo (1), segment_size (ST),
+ * pwrErrBoundBytes_size (4)] | intervals (4) | medianValue (8) | reqLength (1) |
+ * [accelerated PW_REL: plus_bits (1), max_bits (1)] | realPrecision (8) | typeArray_size (ST) |
+ * exactDataNum (ST) | exactMidBytes_size (ST) | [PW_REL: minLogValue (8)] | typeArray |
+ * [PW_REL: pwrErrBoundBytes] | leadNumArray | exactMidBytes | residualMidBits (if non-NULL),
+ * where ST is exe_params->SZ_SIZE_TYPE.
+ *
+ * @param tdps           storage object to serialize
+ * @param bytes          destination buffer; the caller must have sized it for the whole stream
+ * @param dsLengthBytes  the already-encoded data series length (ST bytes are copied)
+ * @param sameByte       the flag byte written right after the version
+ */
+void convertTDPStoBytes_double(TightDataPointStorageD* tdps, unsigned char* bytes, unsigned char* dsLengthBytes, unsigned char sameByte)
+{
+	size_t n, pos = 0;
+	unsigned char scratch[8]; //staging area for every fixed-width field
+
+	for(n = 0; n < 3; n++)//3 bytes
+		bytes[pos++] = versionNumber[n];
+	bytes[pos++] = sameByte;	//1 byte
+
+	convertSZParamsToBytes(confparams_cpr, &(bytes[pos]));
+	pos += MetaDataByteLength_double;
+
+	memcpy(&(bytes[pos]), dsLengthBytes, exe_params->SZ_SIZE_TYPE);//ST: 4 or 8 bytes
+	pos += exe_params->SZ_SIZE_TYPE;
+
+	intToBytes_bigEndian(scratch, confparams_cpr->max_quant_intervals);
+	memcpy(&(bytes[pos]), scratch, 4);//4
+	pos += 4;
+
+	if(confparams_cpr->errorBoundMode>=PW_REL)
+	{
+		bytes[pos++] = tdps->radExpo; //1 byte
+
+		sizeToBytes(scratch, confparams_cpr->segment_size);
+		memcpy(&(bytes[pos]), scratch, exe_params->SZ_SIZE_TYPE);//ST
+		pos += exe_params->SZ_SIZE_TYPE;
+
+		intToBytes_bigEndian(scratch, tdps->pwrErrBoundBytes_size);
+		memcpy(&(bytes[pos]), scratch, 4);//4
+		pos += 4;
+	}
+
+	intToBytes_bigEndian(scratch, tdps->intervals);
+	memcpy(&(bytes[pos]), scratch, 4);//4
+	pos += 4;
+
+	doubleToBytes(scratch, tdps->medianValue);
+	memcpy(&(bytes[pos]), scratch, 8);//8
+	pos += 8;
+
+	bytes[pos++] = tdps->reqLength; //1 byte
+
+	if(confparams_cpr->errorBoundMode == PW_REL && confparams_cpr->accelerate_pw_rel_compression==1)
+	{
+		bytes[pos++] = tdps->plus_bits;
+		bytes[pos++] = tdps->max_bits;
+	}
+
+	doubleToBytes(scratch, tdps->realPrecision);
+	memcpy(&(bytes[pos]), scratch, 8);//8
+	pos += 8;
+
+	sizeToBytes(scratch, tdps->typeArray_size);
+	memcpy(&(bytes[pos]), scratch, exe_params->SZ_SIZE_TYPE);//ST
+	pos += exe_params->SZ_SIZE_TYPE;
+
+	sizeToBytes(scratch, tdps->exactDataNum);
+	memcpy(&(bytes[pos]), scratch, exe_params->SZ_SIZE_TYPE);//ST
+	pos += exe_params->SZ_SIZE_TYPE;
+
+	sizeToBytes(scratch, tdps->exactMidBytes_size);
+	memcpy(&(bytes[pos]), scratch, exe_params->SZ_SIZE_TYPE);//ST
+	pos += exe_params->SZ_SIZE_TYPE;
+
+	if(confparams_cpr->errorBoundMode>=PW_REL)
+	{
+		doubleToBytes(scratch, tdps->minLogValue);
+		memcpy(&(bytes[pos]), scratch, 8);//8
+		pos += 8;
+	}
+
+	//variable-length sections
+	memcpy(&(bytes[pos]), tdps->typeArray, tdps->typeArray_size);
+	pos += tdps->typeArray_size;
+	if(confparams_cpr->errorBoundMode>=PW_REL)
+	{
+		memcpy(&(bytes[pos]), tdps->pwrErrBoundBytes, tdps->pwrErrBoundBytes_size);
+		pos += tdps->pwrErrBoundBytes_size;
+	}
+
+	memcpy(&(bytes[pos]), tdps->leadNumArray, tdps->leadNumArray_size);
+	pos += tdps->leadNumArray_size;
+	memcpy(&(bytes[pos]), tdps->exactMidBytes, tdps->exactMidBytes_size);
+	pos += tdps->exactMidBytes_size;
+
+	if(tdps->residualMidBits!=NULL)
+	{
+		memcpy(&(bytes[pos]), tdps->residualMidBits, tdps->residualMidBits_size);
+		pos += tdps->residualMidBits_size;
+	}
+}
+
+/**
+ * Serialize a TightDataPointStorageD that carries a reserved-value type array (rtypeArray)
+ * into a caller-provided buffer.
+ *
+ * Compared with convertTDPStoBytes_double, this layout additionally writes rtypeArray_size (ST)
+ * after typeArray_size, and reservedValue (8) followed by the rtypeArray bytes after
+ * exactMidBytes_size; it does not write plus_bits/max_bits.
+ *
+ * @param tdps           storage object to serialize
+ * @param bytes          destination buffer; the caller must have sized it for the whole stream
+ * @param dsLengthBytes  the already-encoded data series length (ST bytes are copied)
+ * @param sameByte       the flag byte written right after the version
+ */
+void convertTDPStoBytes_double_reserve(TightDataPointStorageD* tdps, unsigned char* bytes, unsigned char* dsLengthBytes, unsigned char sameByte)
+{
+	size_t n, pos = 0;
+	unsigned char scratch[8]; //staging area for every fixed-width field
+
+	for(n = 0; n < 3; n++)//3
+		bytes[pos++] = versionNumber[n];
+	bytes[pos++] = sameByte;			//1
+
+	convertSZParamsToBytes(confparams_cpr, &(bytes[pos]));
+	pos += MetaDataByteLength_double;
+
+	memcpy(&(bytes[pos]), dsLengthBytes, exe_params->SZ_SIZE_TYPE);//ST
+	pos += exe_params->SZ_SIZE_TYPE;
+
+	intToBytes_bigEndian(scratch, confparams_cpr->max_quant_intervals);
+	memcpy(&(bytes[pos]), scratch, 4);//4
+	pos += 4;
+
+	if(confparams_cpr->errorBoundMode>=PW_REL)
+	{
+		bytes[pos++] = tdps->radExpo; //1 byte
+
+		sizeToBytes(scratch, confparams_cpr->segment_size);
+		memcpy(&(bytes[pos]), scratch, exe_params->SZ_SIZE_TYPE);//ST
+		pos += exe_params->SZ_SIZE_TYPE;
+
+		intToBytes_bigEndian(scratch, tdps->pwrErrBoundBytes_size);
+		memcpy(&(bytes[pos]), scratch, 4);//4
+		pos += 4;
+	}
+
+	intToBytes_bigEndian(scratch, tdps->intervals);
+	memcpy(&(bytes[pos]), scratch, 4);//4
+	pos += 4;
+
+	doubleToBytes(scratch, tdps->medianValue);
+	memcpy(&(bytes[pos]), scratch, 8);//8
+	pos += 8;
+
+	bytes[pos++] = tdps->reqLength; //1 byte
+
+	doubleToBytes(scratch, tdps->realPrecision);
+	memcpy(&(bytes[pos]), scratch, 8);//8
+	pos += 8;
+
+	sizeToBytes(scratch, tdps->typeArray_size);
+	memcpy(&(bytes[pos]), scratch, exe_params->SZ_SIZE_TYPE);//ST
+	pos += exe_params->SZ_SIZE_TYPE;
+
+	sizeToBytes(scratch, tdps->rtypeArray_size);
+	memcpy(&(bytes[pos]), scratch, exe_params->SZ_SIZE_TYPE);//ST
+	pos += exe_params->SZ_SIZE_TYPE;
+
+	sizeToBytes(scratch, tdps->exactDataNum);
+	memcpy(&(bytes[pos]), scratch, exe_params->SZ_SIZE_TYPE);//ST
+	pos += exe_params->SZ_SIZE_TYPE;
+
+	sizeToBytes(scratch, tdps->exactMidBytes_size);
+	memcpy(&(bytes[pos]), scratch, exe_params->SZ_SIZE_TYPE);//ST
+	pos += exe_params->SZ_SIZE_TYPE;
+
+	doubleToBytes(scratch, tdps->reservedValue);
+	memcpy(&(bytes[pos]), scratch, 8);//8
+	pos += 8;
+
+	memcpy(&(bytes[pos]), tdps->rtypeArray, tdps->rtypeArray_size);
+	pos += tdps->rtypeArray_size;
+
+	if(confparams_cpr->errorBoundMode>=PW_REL)
+	{
+		doubleToBytes(scratch, tdps->minLogValue);
+		memcpy(&(bytes[pos]), scratch, 8);//8
+		pos += 8;
+	}
+
+	//variable-length sections
+	memcpy(&(bytes[pos]), tdps->typeArray, tdps->typeArray_size);
+	pos += tdps->typeArray_size;
+	if(confparams_cpr->errorBoundMode>=PW_REL)
+	{
+		memcpy(&(bytes[pos]), tdps->pwrErrBoundBytes, tdps->pwrErrBoundBytes_size);
+		pos += tdps->pwrErrBoundBytes_size;
+	}
+	memcpy(&(bytes[pos]), tdps->leadNumArray, tdps->leadNumArray_size);
+	pos += tdps->leadNumArray_size;
+	memcpy(&(bytes[pos]), tdps->exactMidBytes, tdps->exactMidBytes_size);
+	pos += tdps->exactMidBytes_size;
+	if(tdps->residualMidBits!=NULL)
+	{
+		memcpy(&(bytes[pos]), tdps->residualMidBits, tdps->residualMidBits_size);
+		pos += tdps->residualMidBits_size;
+	}
+}
+
+//Serialize a TightDataPointStorageD into a newly malloc'ed flat byte buffer.
+//  tdps  : the storage object to serialize
+//  bytes : receives the malloc'ed buffer
+//  size  : receives the buffer length in bytes
+//In the reserved-value case (tdps->rtypeArray != NULL) nothing is produced yet (TODO), so
+//neither *bytes nor *size is written.
+void convertTDPStoFlatBytes_double(TightDataPointStorageD *tdps, unsigned char** bytes, size_t *size) 
+{
+	size_t n, pos = 0;
+	unsigned char dsLengthBytes[8];
+
+	if(exe_params->SZ_SIZE_TYPE==4)
+		intToBytes_bigEndian(dsLengthBytes, tdps->dataSeriesLength);//4
+	else
+		longToBytes_bigEndian(dsLengthBytes, tdps->dataSeriesLength);//8
+
+	//assemble the flag byte
+	unsigned char sameByte = (tdps->allSameData==1) ? 0x01 : 0x00;	// 0000,0001
+	if(tdps->isLossless)
+		sameByte |= 0x10;	// 0001,0000
+	if(confparams_cpr->errorBoundMode>=PW_REL)
+		sameByte |= 0x20;	// 0010,0000
+	if(exe_params->SZ_SIZE_TYPE==8)
+		sameByte |= 0x40;	// 0100,0000
+	if(confparams_cpr->errorBoundMode == PW_REL && confparams_cpr->accelerate_pw_rel_compression)
+		sameByte |= 0x08;	// 0000,1000
+
+	if(tdps->allSameData==1)
+	{
+		//header followed directly by the exact bytes of the single value
+		size_t totalByteLength = 3 + 1 + MetaDataByteLength_double + exe_params->SZ_SIZE_TYPE + tdps->exactMidBytes_size;
+		unsigned char *out = (unsigned char *)malloc(sizeof(unsigned char)*totalByteLength);
+		*bytes = out;
+
+		for (n = 0; n < 3; n++)//3
+			out[pos++] = versionNumber[n];
+		out[pos++] = sameByte;
+
+		convertSZParamsToBytes(confparams_cpr, &(out[pos]));
+		pos += MetaDataByteLength_double;
+
+		memcpy(&(out[pos]), dsLengthBytes, exe_params->SZ_SIZE_TYPE);
+		pos += exe_params->SZ_SIZE_TYPE;
+
+		for (n = 0; n < tdps->exactMidBytes_size; n++)
+			out[pos++] = tdps->exactMidBytes[n];
+
+		*size = totalByteLength;
+		return;
+	}
+
+	if (tdps->rtypeArray != NULL)
+	{
+		//the case with reserved value
+		//TODO
+		return;
+	}
+
+	size_t residualMidBitsLength = 0;
+	if(tdps->residualMidBits != NULL)
+		residualMidBitsLength = tdps->residualMidBits_size;
+
+	//sections that only exist in PW_REL (or higher) modes
+	size_t segmentL = 0, radExpoL = 0, pwrBoundArrayL = 0;
+	int minLogValueSize = 0;
+	if(confparams_cpr->errorBoundMode>=PW_REL)
+	{
+		segmentL = exe_params->SZ_SIZE_TYPE;
+		radExpoL = 1;
+		pwrBoundArrayL = 4;
+		minLogValueSize = 8;
+	}
+
+	size_t totalByteLength = 3 + 1 + MetaDataByteLength_double + exe_params->SZ_SIZE_TYPE + 4 + radExpoL + segmentL + pwrBoundArrayL + 4 + 8 + 1 + 8 
+			+ exe_params->SZ_SIZE_TYPE + exe_params->SZ_SIZE_TYPE + exe_params->SZ_SIZE_TYPE 
+			+ minLogValueSize /*max absolute log value*/
+			+ tdps->typeArray_size + tdps->leadNumArray_size
+			+ tdps->exactMidBytes_size + residualMidBitsLength + tdps->pwrErrBoundBytes_size;
+	if(confparams_cpr->errorBoundMode == PW_REL && confparams_cpr->accelerate_pw_rel_compression)
+		totalByteLength += (1+1); // for MSST19
+
+	*bytes = (unsigned char *)malloc(sizeof(unsigned char)*totalByteLength);
+
+	convertTDPStoBytes_double(tdps, *bytes, dsLengthBytes, sameByte);
+
+	*size = totalByteLength;
+}
+
+/**
+ * Serialize a TightDataPointStorageD into a caller-provided buffer and report the number of
+ * bytes that make up the stream.
+ *
+ * @param tdps   the storage object to serialize
+ * @param bytes  destination buffer; the caller must have sized it for the whole stream
+ * @param size   receives the stream length in bytes
+ *
+ * In the reserved-value case (tdps->rtypeArray != NULL) nothing is produced yet (TODO), so
+ * neither bytes nor *size is written.
+ */
+void convertTDPStoFlatBytes_double_args(TightDataPointStorageD *tdps, unsigned char* bytes, size_t *size) 
+{
+	size_t i, k = 0; 
+	unsigned char dsLengthBytes[8];
+	
+	if(exe_params->SZ_SIZE_TYPE==4)
+		intToBytes_bigEndian(dsLengthBytes, tdps->dataSeriesLength);//4
+	else
+		longToBytes_bigEndian(dsLengthBytes, tdps->dataSeriesLength);//8
+		
+	unsigned char sameByte = tdps->allSameData==1?(unsigned char)1:(unsigned char)0; //0000,0001
+	//NOTE(review): the decoder in this file reads 0x04 as protectValueRange and no longer reads
+	//szMode from these bits; depending on the szMode value this shift may set 0x04 -- confirm.
+	sameByte = sameByte | (confparams_cpr->szMode << 1); //0000,0110
+	if(tdps->isLossless)
+		sameByte = (unsigned char) (sameByte | 0x10); // 0001,0000
+	if(confparams_cpr->errorBoundMode>=PW_REL)
+		sameByte = (unsigned char) (sameByte | 0x20); // 0010,0000
+	if(exe_params->SZ_SIZE_TYPE==8)
+		sameByte = (unsigned char) (sameByte | 0x40); // 0100,0000
+	if(confparams_cpr->errorBoundMode == PW_REL && confparams_cpr->accelerate_pw_rel_compression)
+		sameByte = (unsigned char) (sameByte | 0x08); // 0000,1000
+	if(confparams_cpr->protectValueRange)
+		sameByte = (unsigned char) (sameByte | 0x04); // 0000,0100
+	if(tdps->allSameData==1)
+	{
+		size_t totalByteLength = 3 + 1 + MetaDataByteLength_double + exe_params->SZ_SIZE_TYPE + tdps->exactMidBytes_size;
+	
+		for (i = 0; i < 3; i++)//3
+			bytes[k++] = versionNumber[i];
+		bytes[k++] = sameByte;
+		
+		convertSZParamsToBytes(confparams_cpr, &(bytes[k]));
+		k = k + MetaDataByteLength_double;
+				
+		for (i = 0; i < exe_params->SZ_SIZE_TYPE; i++)
+			bytes[k++] = dsLengthBytes[i];		
+		for (i = 0; i < tdps->exactMidBytes_size; i++)
+			bytes[k++] = tdps->exactMidBytes[i];
+		
+		*size = totalByteLength;
+	}
+	else if (tdps->rtypeArray == NULL) 
+	{
+		size_t residualMidBitsLength = tdps->residualMidBits == NULL ? 0 : tdps->residualMidBits_size;
+		size_t segmentL = 0, radExpoL = 0, pwrBoundArrayL = 0;
+		//convertTDPStoBytes_double also writes the 8-byte minLogValue in PW_REL (or higher)
+		//modes, so it has to be counted here exactly as convertTDPStoFlatBytes_double does;
+		//otherwise the reported size is 8 bytes short of what was actually written.
+		int minLogValueSize = 0;
+		if(confparams_cpr->errorBoundMode>=PW_REL)
+		{			
+			segmentL = exe_params->SZ_SIZE_TYPE;
+			radExpoL = 1;
+			pwrBoundArrayL = 4;
+			minLogValueSize = 8;
+		}
+
+		size_t totalByteLength = 3 + 1 + MetaDataByteLength_double + exe_params->SZ_SIZE_TYPE+ 4 + radExpoL + segmentL + pwrBoundArrayL + 4 + 8 + 1 + 8 
+				+ exe_params->SZ_SIZE_TYPE + exe_params->SZ_SIZE_TYPE + exe_params->SZ_SIZE_TYPE 
+				+ minLogValueSize /*max absolute log value*/
+				+ tdps->typeArray_size + tdps->leadNumArray_size
+				+ tdps->exactMidBytes_size + residualMidBitsLength + tdps->pwrErrBoundBytes_size;
+		if(confparams_cpr->errorBoundMode == PW_REL && confparams_cpr->accelerate_pw_rel_compression)
+			totalByteLength += (1+1); // for MSST19
+		convertTDPStoBytes_double(tdps, bytes, dsLengthBytes, sameByte);
+		
+		*size = totalByteLength;
+	}
+	else //the case with reserved value
+	{
+		//TODO
+	}
+}
+
+
+/**
+ * Release a TightDataPointStorageD together with the arrays it references:
+ * rtypeArray, typeArray, leadNumArray, exactMidBytes, residualMidBits and pwrErrBoundBytes.
+ * */
+void free_TightDataPointStorageD(TightDataPointStorageD *tdps)
+{
+	//free(NULL) is a no-op, so members that were never set need no separate guard
+	free(tdps->rtypeArray);
+	free(tdps->typeArray);
+	free(tdps->leadNumArray);
+	free(tdps->exactMidBytes);
+	free(tdps->residualMidBits);
+	free(tdps->pwrErrBoundBytes);
+	free(tdps);
+}
+
+/**
+ * Free the memory used in the decompression.
+ * Only the TightDataPointStorageD struct itself is released; the arrays it references are
+ * left untouched (new_TightDataPointStorageD_fromFlatBytes points them into the caller's
+ * flatBytes buffer rather than allocating them).
+ * */
+void free_TightDataPointStorageD2(TightDataPointStorageD *tdps)
+{			
+	free(tdps);
+}
--- a/deps/SZ/sz/src/TightDataPointStorageF.c
+++ b/deps/SZ/sz/src/TightDataPointStorageF.c
@ -0,0 +1,754 @@
+/**
+ *  @file TightPointDataStorageF.c
+ *  @author Sheng Di and Dingwen Tao
+ *  @date Aug, 2016
+ *  @brief The functions used to construct the tightPointDataStorage element for storing compressed bytes.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdlib.h> 
+#include <stdio.h>
+#include <string.h>
+#include "TightDataPointStorageF.h"
+#include "sz.h"
+#include "Huffman.h"
+//#include "rw.h"
+
+/**
+ * Allocate a TightDataPointStorageF and reset it to an empty state
+ * (all sizes/counters 0, all array pointers NULL).
+ *
+ * @param this  receives the new object, or NULL when the allocation fails
+ */
+void new_TightDataPointStorageF_Empty(TightDataPointStorageF **this)
+{
+	TightDataPointStorageF *tdps = (TightDataPointStorageF*)malloc(sizeof(TightDataPointStorageF));
+	*this = tdps;
+	if(tdps == NULL)
+		return; //allocation failed: hand NULL back instead of writing through it
+
+	//zero the whole object first so that members without an explicit reset below
+	//(e.g. medianValue, realPrecision, plus_bits, max_bits, minLogValue, allNodes)
+	//never hold indeterminate values
+	memset(tdps, 0, sizeof(TightDataPointStorageF));
+
+	tdps->dataSeriesLength = 0;
+	tdps->allSameData = 0;
+	tdps->exactDataNum = 0;
+	tdps->reservedValue = 0;
+	tdps->reqLength = 0;
+	tdps->radExpo = 0;
+
+	tdps->rtypeArray = NULL;
+	tdps->rtypeArray_size = 0;
+
+	tdps->typeArray = NULL; //its size is dataSeriesLength/4 (or xxx/4+1) 
+	tdps->typeArray_size = 0;
+
+	tdps->leadNumArray = NULL; //its size is exactDataNum/4 (or exactDataNum/4+1)
+	tdps->leadNumArray_size = 0;
+
+	tdps->exactMidBytes = NULL;
+	tdps->exactMidBytes_size = 0;
+
+	tdps->residualMidBits = NULL;
+	tdps->residualMidBits_size = 0;
+
+	tdps->intervals = 0;
+	tdps->isLossless = 0;
+
+	tdps->segment_size = 0;
+	tdps->pwrErrBoundBytes = NULL;
+	tdps->pwrErrBoundBytes_size = 0;
+
+	tdps->raBytes = NULL;
+	tdps->raBytes_size = 0;
+}
+
+int new_TightDataPointStorageF_fromFlatBytes(TightDataPointStorageF **this, unsigned char* flatBytes, size_t flatBytesLength)
+{
+	new_TightDataPointStorageF_Empty(this);
+	size_t i, index = 0;
+	size_t pwrErrBoundBytes_size = 0, segmentL = 0, radExpoL = 0, pwrErrBoundBytesL = 0;
+	char version[3];
+	for (i = 0; i < 3; i++)
+		version[i] = flatBytes[index++]; //3
+	unsigned char sameRByte = flatBytes[index++]; //1
+	if(checkVersion2(version)!=1)
+	{
+		//wrong version
+		printf("Wrong version: \nCompressed-data version (%d.%d.%d)\n",version[0], version[1], version[2]);
+		printf("Current sz version: (%d.%d.%d)\n", versionNumber[0], versionNumber[1], versionNumber[2]);
+		printf("Please double-check if the compressed data (or file) is correct.\n");
+		exit(0);
+	}
+															      //note that 1000,0000 is reserved for regression tag.
+	int same = sameRByte & 0x01; 											//0000,0001
+	(*this)->isLossless = (sameRByte & 0x10)>>4; 							//0001,0000
+	int isPW_REL = (sameRByte & 0x20)>>5; 									//0010,0000
+	exe_params->SZ_SIZE_TYPE = ((sameRByte & 0x40)>>6)==1?8:4; 				//0100,0000
+	//confparams_dec->randomAccess = (sameRByte & 0x02) >> 1;
+	//confparams_dec->szMode = (sameRByte & 0x06) >> 1;			//0000,0110 (in fact, this szMode could be removed because convertSZParamsToBytes will overwrite it)
+	
+	confparams_dec->protectValueRange = (sameRByte & 0x04)>>2;
+	
+	confparams_dec->accelerate_pw_rel_compression = (sameRByte & 0x08) >> 3;//0000,1000
+
+	int errorBoundMode = ABS;
+	if(isPW_REL)
+	{
+		errorBoundMode = PW_REL;
+		segmentL = exe_params->SZ_SIZE_TYPE;
+		pwrErrBoundBytesL = 4;
+	}
+	
+	if(confparams_dec==NULL)
+	{
+		confparams_dec = (sz_params*)malloc(sizeof(sz_params));
+		memset(confparams_dec, 0, sizeof(sz_params));
+	}	
+	convertBytesToSZParams(&(flatBytes[index]), confparams_dec);
+	
+	index += MetaDataByteLength;
+
+	int isRegression = (sameRByte >> 7) & 0x01;
+	
+	unsigned char dsLengthBytes[8];
+	for (i = 0; i < exe_params->SZ_SIZE_TYPE; i++)
+		dsLengthBytes[i] = flatBytes[index++];
+	(*this)->dataSeriesLength = bytesToSize(dsLengthBytes);// 4 or 8	
+	
+	if((*this)->isLossless==1)
+	{
+		//(*this)->exactMidBytes = flatBytes+8;
+		return errorBoundMode;
+	}
+	else if(same==1)
+	{
+		(*this)->allSameData = 1;
+		//size_t exactMidBytesLength = sizeof(double);//flatBytesLength - 3 - 1 - MetaDataByteLength -exe_params->SZ_SIZE_TYPE;
+		(*this)->exactMidBytes = &(flatBytes[index]);
+		return errorBoundMode;
+	}
+	else
+		(*this)->allSameData = 0;
+	if(isRegression == 1)
+	{
+		(*this)->raBytes_size = flatBytesLength - 3 - 1 - MetaDataByteLength - exe_params->SZ_SIZE_TYPE;
+		(*this)->raBytes = &(flatBytes[index]);
+		return errorBoundMode;
+	}			
+
+	int rtype_ = 0;//sameRByte & 0x08;		//=00001000
+	unsigned char byteBuf[8];
+
+	for (i = 0; i < 4; i++)
+		byteBuf[i] = flatBytes[index++];
+	int max_quant_intervals = bytesToInt_bigEndian(byteBuf);// 4	
+
+	confparams_dec->maxRangeRadius = max_quant_intervals/2;
+
+	if(errorBoundMode>=PW_REL)
+	{
+		(*this)->radExpo = flatBytes[index++];//1
+		radExpoL = 1;
+		for (i = 0; i < exe_params->SZ_SIZE_TYPE; i++)
+			byteBuf[i] = flatBytes[index++];
+		confparams_dec->segment_size = (*this)->segment_size = bytesToSize(byteBuf);// exe_params->SZ_SIZE_TYPE	
+
+		for (i = 0; i < 4; i++)
+			byteBuf[i] = flatBytes[index++];
+		pwrErrBoundBytes_size = (*this)->pwrErrBoundBytes_size = bytesToInt_bigEndian(byteBuf);// 4		
+	}
+	else
+	{
+		pwrErrBoundBytes_size = 0;
+		(*this)->pwrErrBoundBytes = NULL;
+	}
+	for (i = 0; i < 4; i++)
+		byteBuf[i] = flatBytes[index++];
+	(*this)->intervals = bytesToInt_bigEndian(byteBuf);// 4	
+
+	for (i = 0; i < 4; i++)
+		byteBuf[i] = flatBytes[index++];
+	(*this)->medianValue = bytesToFloat(byteBuf); //4
+	
+	(*this)->reqLength = flatBytes[index++]; //1
+	
+	if(isPW_REL && confparams_dec->accelerate_pw_rel_compression)
+	{
+		(*this)->plus_bits = flatBytes[index++];
+		(*this)->max_bits = flatBytes[index++];
+	}
+	
+	for (i = 0; i < 8; i++)
+		byteBuf[i] = flatBytes[index++];
+	(*this)->realPrecision = bytesToDouble(byteBuf);//8
+
+	for (i = 0; i < exe_params->SZ_SIZE_TYPE; i++)
+		byteBuf[i] = flatBytes[index++];
+	(*this)->typeArray_size = bytesToSize(byteBuf);// 4		
+	if(rtype_!=0)
+	{
+		for(i = 0;i<exe_params->SZ_SIZE_TYPE;i++) 
+			byteBuf[i] = flatBytes[index++];
+		(*this)->rtypeArray_size = bytesToSize(byteBuf);//(ST)
+	}
+	else
+		(*this)->rtypeArray_size = 0;
+
+	for (i = 0; i < exe_params->SZ_SIZE_TYPE; i++)
+		byteBuf[i] = flatBytes[index++];
+	(*this)->exactDataNum = bytesToSize(byteBuf);// ST
+
+	for (i = 0; i < exe_params->SZ_SIZE_TYPE; i++)
+		byteBuf[i] = flatBytes[index++];
+	(*this)->exactMidBytes_size = bytesToSize(byteBuf);// ST
+
+	if (rtype_ != 0) {
+		if((*this)->rtypeArray_size>0)
+			(*this)->rtypeArray = (unsigned char*)malloc(sizeof(unsigned char)*(*this)->rtypeArray_size);
+		else
+			(*this)->rtypeArray = NULL;
+
+		for (i = 0; i < 4; i++)
+			byteBuf[i] = flatBytes[index++];
+		(*this)->reservedValue = bytesToFloat(byteBuf);//4
+	}
+
+	size_t logicLeadNumBitsNum = (*this)->exactDataNum * 2;
+	if (logicLeadNumBitsNum % 8 == 0)
+	{
+		(*this)->leadNumArray_size = logicLeadNumBitsNum >> 3;
+	}
+	else
+	{
+		(*this)->leadNumArray_size = (logicLeadNumBitsNum >> 3) + 1;
+	}
+
+	int minLogValueSize = 0;
+	if(errorBoundMode>=PW_REL)
+		minLogValueSize = 4;
+
+	if ((*this)->rtypeArray != NULL) 
+	{
+		(*this)->residualMidBits_size = flatBytesLength - 3 - 1 - MetaDataByteLength - exe_params->SZ_SIZE_TYPE - 4 - radExpoL - segmentL - pwrErrBoundBytesL - 4 - 4 - 1 - 8 
+				- exe_params->SZ_SIZE_TYPE - exe_params->SZ_SIZE_TYPE - exe_params->SZ_SIZE_TYPE - minLogValueSize - exe_params->SZ_SIZE_TYPE - 4 - (*this)->rtypeArray_size
+				- minLogValueSize - (*this)->typeArray_size - (*this)->leadNumArray_size
+				- (*this)->exactMidBytes_size - pwrErrBoundBytes_size - 1 - 1;
+		for (i = 0; i < (*this)->rtypeArray_size; i++)
+			(*this)->rtypeArray[i] = flatBytes[index++];
+	}
+	else
+	{
+		(*this)->residualMidBits_size = flatBytesLength - 3 - 1 - MetaDataByteLength - exe_params->SZ_SIZE_TYPE - 4 - radExpoL - segmentL - pwrErrBoundBytesL - 4 - 4 - 1 - 8 
+				- exe_params->SZ_SIZE_TYPE - exe_params->SZ_SIZE_TYPE - exe_params->SZ_SIZE_TYPE - minLogValueSize - (*this)->typeArray_size
+				- (*this)->leadNumArray_size - (*this)->exactMidBytes_size - pwrErrBoundBytes_size - 1 - 1;
+	}
+
+	if(errorBoundMode>=PW_REL)
+	{
+		(*this)->minLogValue = bytesToFloat(&flatBytes[index]);
+		index+=4;
+	}
+
+	(*this)->typeArray = &flatBytes[index]; 
+	//retrieve the number of states (i.e., stateNum)
+	(*this)->allNodes = bytesToInt_bigEndian((*this)->typeArray); //the first 4 bytes store the stateNum
+	(*this)->stateNum = ((*this)->allNodes+1)/2;	
+
+	index+=(*this)->typeArray_size;
+	
+	(*this)->pwrErrBoundBytes = &flatBytes[index];
+	
+	index+=pwrErrBoundBytes_size;
+	
+	(*this)->leadNumArray = &flatBytes[index];
+	
+	index+=(*this)->leadNumArray_size;
+	
+	(*this)->exactMidBytes = &flatBytes[index];
+	
+	index+=(*this)->exactMidBytes_size;
+	
+	(*this)->residualMidBits = &flatBytes[index];
+	
+	//index+=(*this)->residualMidBits_size;
+	
+	return errorBoundMode;
+}
+
+/**
+ * Build a TightDataPointStorageF on the compression side.
+ *
+ * type             : dataSeriesLength quantization codes; Huffman-encoded into typeArray
+ * exactMidBytes    : exactMidBytes_size bytes, stored by pointer (not copied)
+ * leadNumIntArray  : passed with length exactDataNum to convertIntArray2ByteArray_fast_2b
+ * resiMidBits      : passed with length exactDataNum and the single bit width resiBitLength
+ *                    to convertIntArray2ByteArray_fast_dynamic
+ * pwrErrBoundBytes : stored by pointer only when the error-bound mode is >= PW_REL, else NULL
+ *
+ * resiMidBits_size is part of the signature but is not read by this function.
+ * The malloc result is not checked.
+ * */
+void new_TightDataPointStorageF(TightDataPointStorageF **this,
+		size_t dataSeriesLength, size_t exactDataNum, 
+		int* type, unsigned char* exactMidBytes, size_t exactMidBytes_size,
+		unsigned char* leadNumIntArray,  //leadNumIntArray contains readable numbers....
+		unsigned char* resiMidBits, size_t resiMidBits_size,
+		unsigned char resiBitLength, 
+		double realPrecision, float medianValue, char reqLength, unsigned int intervals, 
+		unsigned char* pwrErrBoundBytes, size_t pwrErrBoundBytes_size, unsigned char radExpo) {
+
+	TightDataPointStorageF *store = (TightDataPointStorageF *)malloc(sizeof(TightDataPointStorageF));
+	*this = store;
+
+	//flags and scalar settings
+	store->allSameData = 0;
+	store->isLossless = 0;
+	store->realPrecision = realPrecision;
+	store->medianValue = medianValue;
+	store->reqLength = reqLength;
+	store->intervals = intervals;
+	store->radExpo = radExpo;
+	store->dataSeriesLength = dataSeriesLength;
+	store->exactDataNum = exactDataNum;
+
+	//no reserved-value type array on this path
+	store->rtypeArray = NULL;
+	store->rtypeArray_size = 0;
+
+	//Huffman-encode the quantization codes (the tree is sized to 2*intervals states)
+	const int treeStates = 2*intervals;
+	HuffmanTree* tree = createHuffmanTree(treeStates);
+	if(confparams_cpr->errorBoundMode == PW_REL && confparams_cpr->accelerate_pw_rel_compression)
+	{
+		//accelerated PW_REL variant also reports max_bits
+		store->max_bits = encode_withTree_MSST19(tree, type, dataSeriesLength, &store->typeArray, &store->typeArray_size);
+	}
+	else
+	{
+		encode_withTree(tree, type, dataSeriesLength, &store->typeArray, &store->typeArray_size);
+	}
+	SZ_ReleaseHuffman(tree);
+
+	//pack the leading-number codes and the residual bits
+	store->leadNumArray_size = convertIntArray2ByteArray_fast_2b(leadNumIntArray, exactDataNum, &(store->leadNumArray));
+	store->residualMidBits_size = convertIntArray2ByteArray_fast_dynamic(resiMidBits, resiBitLength, exactDataNum, &(store->residualMidBits));
+
+	//buffers taken over by pointer
+	store->exactMidBytes = exactMidBytes;
+	store->exactMidBytes_size = exactMidBytes_size;
+	store->pwrErrBoundBytes = (confparams_cpr->errorBoundMode>=PW_REL) ? pwrErrBoundBytes : NULL;
+	store->pwrErrBoundBytes_size = pwrErrBoundBytes_size;
+}
+
+/**
+ * Variant of new_TightDataPointStorageF in which the residual bits carry a
+ * per-element bit-width array: resiMidBits is packed by
+ * convertIntArray2ByteArray_fast_dynamic2 using resiBitLength (resiBitLengthSize entries).
+ * The type codes are always encoded with encode_withTree here.
+ *
+ * exactMidBytes and (when the error-bound mode is >= PW_REL) pwrErrBoundBytes are
+ * stored by pointer. resiMidBits_size is not read. The malloc result is not checked.
+ * */
+void new_TightDataPointStorageF2(TightDataPointStorageF **this,
+		size_t dataSeriesLength, size_t exactDataNum, 
+		int* type, unsigned char* exactMidBytes, size_t exactMidBytes_size,
+		unsigned char* leadNumIntArray,  //leadNumIntArray contains readable numbers....
+		unsigned char* resiMidBits, size_t resiMidBits_size,
+		unsigned char* resiBitLength, size_t resiBitLengthSize, 
+		double realPrecision, float medianValue, char reqLength, unsigned int intervals, 
+		unsigned char* pwrErrBoundBytes, size_t pwrErrBoundBytes_size, unsigned char radExpo) {
+
+	TightDataPointStorageF *store = (TightDataPointStorageF *)malloc(sizeof(TightDataPointStorageF));
+	*this = store;
+
+	//flags and scalar settings
+	store->allSameData = 0;
+	store->isLossless = 0;
+	store->realPrecision = realPrecision;
+	store->medianValue = medianValue;
+	store->reqLength = reqLength;
+	store->intervals = intervals;
+	store->radExpo = radExpo;
+	store->dataSeriesLength = dataSeriesLength;
+	store->exactDataNum = exactDataNum;
+
+	//no reserved-value type array on this path
+	store->rtypeArray = NULL;
+	store->rtypeArray_size = 0;
+
+	//Huffman-encode the quantization codes (the tree is sized to 2*intervals states)
+	const int treeStates = 2*intervals;
+	HuffmanTree* tree = createHuffmanTree(treeStates);
+	encode_withTree(tree, type, dataSeriesLength, &store->typeArray, &store->typeArray_size);
+	SZ_ReleaseHuffman(tree);
+
+	//pack the leading-number codes and the variable-width residual bits
+	store->leadNumArray_size = convertIntArray2ByteArray_fast_2b(leadNumIntArray, exactDataNum, &(store->leadNumArray));
+	store->residualMidBits_size = convertIntArray2ByteArray_fast_dynamic2(resiMidBits, resiBitLength, resiBitLengthSize, &(store->residualMidBits));
+
+	//buffers taken over by pointer
+	store->exactMidBytes = exactMidBytes;
+	store->exactMidBytes_size = exactMidBytes_size;
+	store->pwrErrBoundBytes = (confparams_cpr->errorBoundMode>=PW_REL) ? pwrErrBoundBytes : NULL;
+	store->pwrErrBoundBytes_size = pwrErrBoundBytes_size;
+}
+
+/**
+ * Serialize a TightDataPointStorageF (case without reserved value) into the
+ * caller-supplied buffer `bytes`.
+ *
+ * Field order: version (3) | sameByte (1) | SZ params (MetaDataByteLength) |
+ * dsLengthBytes (ST) | max_quant_intervals (4) |
+ * [radExpo (1) | segment_size (ST) | pwrErrBoundBytes_size (4)  -- only if mode >= PW_REL] |
+ * intervals (4) | medianValue (4) | reqLength (1) |
+ * [plus_bits (1) | max_bits (1)  -- only if mode == PW_REL with acceleration] |
+ * realPrecision (8) | typeArray_size (ST) | exactDataNum (ST) | exactMidBytes_size (ST) |
+ * [minLogValue (4)  -- only if mode >= PW_REL] |
+ * typeArray | [pwrErrBoundBytes] | leadNumArray | exactMidBytes | [residualMidBits].
+ * ST is exe_params->SZ_SIZE_TYPE. The caller must size `bytes` accordingly.
+ */
+void convertTDPStoBytes_float(TightDataPointStorageF* tdps, unsigned char* bytes, unsigned char* dsLengthBytes, unsigned char sameByte)
+{
+	unsigned char scratch[8];	//staging area for one encoded field at a time
+	size_t pos = 0, v;
+	const size_t sizeLen = exe_params->SZ_SIZE_TYPE;	//ST: 4 or 8 bytes
+
+	//version is copied element by element (each element is converted to one byte)
+	for(v = 0; v < 3; v++)
+		bytes[pos++] = versionNumber[v];
+	bytes[pos++] = sameByte;
+
+	convertSZParamsToBytes(confparams_cpr, bytes + pos);
+	pos += MetaDataByteLength;
+
+	const int pwRelOrAbove = confparams_cpr->errorBoundMode>=PW_REL;
+
+	memcpy(bytes + pos, dsLengthBytes, sizeLen);
+	pos += sizeLen;
+
+	intToBytes_bigEndian(scratch, confparams_cpr->max_quant_intervals);
+	memcpy(bytes + pos, scratch, 4);
+	pos += 4;
+
+	if(pwRelOrAbove)
+	{
+		bytes[pos++] = tdps->radExpo;
+
+		sizeToBytes(scratch, confparams_cpr->segment_size);
+		memcpy(bytes + pos, scratch, sizeLen);
+		pos += sizeLen;
+
+		intToBytes_bigEndian(scratch, tdps->pwrErrBoundBytes_size);
+		memcpy(bytes + pos, scratch, 4);
+		pos += 4;
+	}
+
+	intToBytes_bigEndian(scratch, tdps->intervals);
+	memcpy(bytes + pos, scratch, 4);
+	pos += 4;
+
+	floatToBytes(scratch, tdps->medianValue);
+	memcpy(bytes + pos, scratch, 4);
+	pos += 4;
+
+	bytes[pos++] = tdps->reqLength;
+
+	if(confparams_cpr->errorBoundMode == PW_REL && confparams_cpr->accelerate_pw_rel_compression)
+	{
+		bytes[pos++] = tdps->plus_bits;
+		bytes[pos++] = tdps->max_bits;
+	}
+
+	doubleToBytes(scratch, tdps->realPrecision);
+	memcpy(bytes + pos, scratch, 8);
+	pos += 8;
+
+	sizeToBytes(scratch, tdps->typeArray_size);
+	memcpy(bytes + pos, scratch, sizeLen);
+	pos += sizeLen;
+
+	sizeToBytes(scratch, tdps->exactDataNum);
+	memcpy(bytes + pos, scratch, sizeLen);
+	pos += sizeLen;
+
+	sizeToBytes(scratch, tdps->exactMidBytes_size);
+	memcpy(bytes + pos, scratch, sizeLen);
+	pos += sizeLen;
+
+	if(pwRelOrAbove)
+	{
+		floatToBytes(scratch, tdps->minLogValue);
+		memcpy(bytes + pos, scratch, 4);
+		pos += 4;
+	}
+
+	//variable-length payload sections
+	memcpy(bytes + pos, tdps->typeArray, tdps->typeArray_size);
+	pos += tdps->typeArray_size;
+
+	if(pwRelOrAbove)
+	{
+		memcpy(bytes + pos, tdps->pwrErrBoundBytes, tdps->pwrErrBoundBytes_size);
+		pos += tdps->pwrErrBoundBytes_size;
+	}
+
+	memcpy(bytes + pos, tdps->leadNumArray, tdps->leadNumArray_size);
+	pos += tdps->leadNumArray_size;
+
+	memcpy(bytes + pos, tdps->exactMidBytes, tdps->exactMidBytes_size);
+	pos += tdps->exactMidBytes_size;
+
+	if(tdps->residualMidBits!=NULL)
+		memcpy(bytes + pos, tdps->residualMidBits, tdps->residualMidBits_size);
+}
+
+/*deprecated*/
+/**
+ * Serialize a TightDataPointStorageF that carries a reserved-value type array
+ * (rtypeArray) into the caller-supplied buffer `bytes`.
+ *
+ * Compared with convertTDPStoBytes_float this layout additionally stores
+ * rtypeArray_size (ST) after typeArray_size, reservedValue (4) after
+ * exactMidBytes_size, and the rtypeArray payload before the optional minLogValue.
+ * It does not store plus_bits/max_bits. ST is exe_params->SZ_SIZE_TYPE.
+ *
+ * realPrecision is a double and occupies 8 bytes in the stream, so it is encoded
+ * with doubleToBytes (the reader decodes this field with bytesToDouble).
+ */
+void convertTDPStoBytes_float_reserve(TightDataPointStorageF* tdps, unsigned char* bytes, unsigned char* dsLengthBytes, unsigned char sameByte)
+{
+	size_t i, k = 0;
+	unsigned char intervalsBytes[4];
+	unsigned char typeArrayLengthBytes[8];
+	unsigned char rTypeLengthBytes[8];
+	unsigned char exactLengthBytes[8];
+	unsigned char exactMidBytesLength[8];
+	unsigned char realPrecisionBytes[8];
+	unsigned char reservedValueBytes[4];
+	
+	unsigned char medianValueBytes[4];
+	
+	unsigned char segment_sizeBytes[8];
+	unsigned char pwrErrBoundBytes_sizeBytes[4];
+	unsigned char max_quant_intervals_Bytes[4];	
+	
+	for(i = 0;i<3;i++)//3
+		bytes[k++] = versionNumber[i];		
+	bytes[k++] = sameByte;			//1
+
+	convertSZParamsToBytes(confparams_cpr, &(bytes[k]));
+	k = k + MetaDataByteLength;
+	
+	for(i = 0;i<exe_params->SZ_SIZE_TYPE;i++)//ST
+		bytes[k++] = dsLengthBytes[i];		
+
+
+	intToBytes_bigEndian(max_quant_intervals_Bytes, confparams_cpr->max_quant_intervals);
+	for(i = 0;i<4;i++)//4
+		bytes[k++] = max_quant_intervals_Bytes[i];
+
+	//PW_REL-family header: radius exponent, segment size, size of the error-bound array
+	if(confparams_cpr->errorBoundMode>=PW_REL)
+	{
+		bytes[k++] = tdps->radExpo; //1 byte			
+		
+		sizeToBytes(segment_sizeBytes, confparams_cpr->segment_size);
+		for(i = 0;i<exe_params->SZ_SIZE_TYPE;i++)//ST
+			bytes[k++] = segment_sizeBytes[i];				
+			
+		intToBytes_bigEndian(pwrErrBoundBytes_sizeBytes, tdps->pwrErrBoundBytes_size);
+		for(i = 0;i<4;i++)//4
+			bytes[k++] = pwrErrBoundBytes_sizeBytes[i];					
+	}
+	
+	intToBytes_bigEndian(intervalsBytes, tdps->intervals);
+	for(i = 0;i<4;i++)//4
+		bytes[k++] = intervalsBytes[i];	
+
+	floatToBytes(medianValueBytes, tdps->medianValue);
+	for (i = 0; i < 4; i++)// 4
+		bytes[k++] = medianValueBytes[i];		
+
+	bytes[k++] = tdps->reqLength; //1 byte
+
+	//8-byte double: must use doubleToBytes so that all 8 emitted bytes are defined
+	doubleToBytes(realPrecisionBytes, tdps->realPrecision);
+	for (i = 0; i < 8; i++)// 8
+		bytes[k++] = realPrecisionBytes[i];
+
+	sizeToBytes(typeArrayLengthBytes, tdps->typeArray_size);
+	for(i = 0;i<exe_params->SZ_SIZE_TYPE;i++)//ST
+		bytes[k++] = typeArrayLengthBytes[i];
+
+	sizeToBytes(rTypeLengthBytes, tdps->rtypeArray_size);
+	for(i = 0;i<exe_params->SZ_SIZE_TYPE;i++)//ST
+		bytes[k++] = rTypeLengthBytes[i];
+
+	sizeToBytes(exactLengthBytes, tdps->exactDataNum);
+	for(i = 0;i<exe_params->SZ_SIZE_TYPE;i++)//ST
+		bytes[k++] = exactLengthBytes[i];
+
+	sizeToBytes(exactMidBytesLength, tdps->exactMidBytes_size);
+	for(i = 0;i<exe_params->SZ_SIZE_TYPE;i++)//ST
+		bytes[k++] = exactMidBytesLength[i];
+
+	floatToBytes(reservedValueBytes, tdps->reservedValue);
+	for (i = 0; i < 4; i++)// 4
+		bytes[k++] = reservedValueBytes[i];
+
+	memcpy(&(bytes[k]), tdps->rtypeArray, tdps->rtypeArray_size);
+	k += tdps->rtypeArray_size;
+	
+	if(confparams_cpr->errorBoundMode>=PW_REL)
+	{
+		//exactMidBytesLength is reused here as a 4-byte scratch buffer
+		floatToBytes(exactMidBytesLength, tdps->minLogValue);
+		for(i=0;i<4;i++)
+			bytes[k++] = exactMidBytesLength[i];
+	}	
+	
+	//variable-length payload sections
+	memcpy(&(bytes[k]), tdps->typeArray, tdps->typeArray_size);
+	k += tdps->typeArray_size;
+	if(confparams_cpr->errorBoundMode>=PW_REL)
+	{
+		memcpy(&(bytes[k]), tdps->pwrErrBoundBytes, tdps->pwrErrBoundBytes_size);
+		k += tdps->pwrErrBoundBytes_size;
+	}
+	memcpy(&(bytes[k]), tdps->leadNumArray, tdps->leadNumArray_size);
+	k += tdps->leadNumArray_size;
+	memcpy(&(bytes[k]), tdps->exactMidBytes, tdps->exactMidBytes_size);
+	k += tdps->exactMidBytes_size;
+	if(tdps->residualMidBits!=NULL)
+	{
+		memcpy(&(bytes[k]), tdps->residualMidBits, tdps->residualMidBits_size);
+		k += tdps->residualMidBits_size;
+	}	
+}
+
+//convert TightDataPointStorageF to bytes...
+/**
+ * Allocate a flat byte stream for `tdps` and serialize it.
+ *
+ * On return *bytes points to a malloc'ed buffer and *size holds its length.
+ * Three cases:
+ *  - allSameData == 1: header + data length + the exactMidBytes payload only;
+ *  - rtypeArray == NULL: full layout produced by convertTDPStoBytes_float;
+ *  - otherwise (reserved value): not implemented; *bytes is set to NULL and
+ *    *size to 0 so that the caller never reads indeterminate outputs.
+ *
+ * The malloc result is not checked.
+ */
+void convertTDPStoFlatBytes_float(TightDataPointStorageF *tdps, unsigned char** bytes, size_t *size)
+{
+	size_t i, k = 0; 
+	unsigned char dsLengthBytes[8];
+	
+	if(exe_params->SZ_SIZE_TYPE==4)
+		intToBytes_bigEndian(dsLengthBytes, tdps->dataSeriesLength);//4
+	else
+		longToBytes_bigEndian(dsLengthBytes, tdps->dataSeriesLength);//8
+		
+	//flag byte: bit 0 = all-same data; further bits are OR-ed in below
+	unsigned char sameByte = tdps->allSameData==1?(unsigned char)1:(unsigned char)0; //0000,0001
+	//sameByte = sameByte | (confparams_cpr->szMode << 1);  //0000,0110 (no need because of convertSZParamsToBytes
+	if(tdps->isLossless)
+		sameByte = (unsigned char) (sameByte | 0x10);  // 0001,0000
+	if(confparams_cpr->errorBoundMode>=PW_REL)
+		sameByte = (unsigned char) (sameByte | 0x20); // 0010,0000, the 5th bit
+	if(exe_params->SZ_SIZE_TYPE==8)
+		sameByte = (unsigned char) (sameByte | 0x40); // 0100,0000, the 6th bit
+	if(confparams_cpr->errorBoundMode == PW_REL && confparams_cpr->accelerate_pw_rel_compression)
+		sameByte = (unsigned char) (sameByte | 0x08); //0000,1000
+	if(confparams_cpr->protectValueRange)
+		sameByte = (unsigned char) (sameByte | 0x04); //0000,0100
+	
+	if(tdps->allSameData==1)
+	{
+		size_t totalByteLength = 3 + 1 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + tdps->exactMidBytes_size;
+		*bytes = (unsigned char *)malloc(sizeof(unsigned char)*totalByteLength);
+
+		for (i = 0; i < 3; i++)//3
+			(*bytes)[k++] = versionNumber[i];
+		(*bytes)[k++] = sameByte;
+		
+		convertSZParamsToBytes(confparams_cpr, &((*bytes)[k]));
+		k = k + MetaDataByteLength;
+				
+		for (i = 0; i < exe_params->SZ_SIZE_TYPE; i++)
+			(*bytes)[k++] = dsLengthBytes[i];
+		
+		for (i = 0; i < tdps->exactMidBytes_size; i++)
+			(*bytes)[k++] = tdps->exactMidBytes[i];
+
+		*size = totalByteLength;
+	}
+	else if (tdps->rtypeArray == NULL)
+	{
+		size_t residualMidBitsLength = tdps->residualMidBits == NULL ? 0 : tdps->residualMidBits_size;
+		size_t segmentL = 0, radExpoL = 0, pwrBoundArrayL = 0;
+		int minLogValueSize = 0;
+		if(confparams_cpr->errorBoundMode>=PW_REL)
+		{			
+			//extra header fields written only for the PW_REL family
+			segmentL = exe_params->SZ_SIZE_TYPE;
+			radExpoL = 1;
+			pwrBoundArrayL = 4;
+			minLogValueSize = 4;
+		}
+
+		//must match, field for field, what convertTDPStoBytes_float writes
+		size_t totalByteLength = 3 + 1 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + 4 + radExpoL + segmentL + pwrBoundArrayL + 4 + 4 + 1 + 8 
+				+ exe_params->SZ_SIZE_TYPE + exe_params->SZ_SIZE_TYPE + exe_params->SZ_SIZE_TYPE + minLogValueSize
+				+ tdps->typeArray_size + tdps->leadNumArray_size 
+				+ tdps->exactMidBytes_size + residualMidBitsLength + tdps->pwrErrBoundBytes_size;
+		if(confparams_cpr->errorBoundMode == PW_REL && confparams_cpr->accelerate_pw_rel_compression)
+			totalByteLength += (1+1); // for MSST19
+
+		*bytes = (unsigned char *)malloc(sizeof(unsigned char)*totalByteLength);
+
+		convertTDPStoBytes_float(tdps, *bytes, dsLengthBytes, sameByte);
+		
+		*size = totalByteLength;
+	}
+	else //the case with reserved value
+	{
+		//TODO: not implemented. Report "nothing produced" instead of leaving the outputs indeterminate.
+		*bytes = NULL;
+		*size = 0;
+	}
+}
+
+/**
+ * Serialize `tdps` into the caller-supplied buffer `bytes` (no allocation here)
+ * and report the number of bytes produced in *size.
+ *
+ * Three cases:
+ *  - allSameData == 1: header + data length + the exactMidBytes payload only;
+ *  - rtypeArray == NULL: full layout produced by convertTDPStoBytes_float. The
+ *    reported size includes the 4-byte minLogValue field that
+ *    convertTDPStoBytes_float writes when the error-bound mode is >= PW_REL;
+ *  - otherwise (reserved value): not implemented; nothing is written and *size is 0.
+ *
+ * Note: unlike convertTDPStoFlatBytes_float, the flag byte here also carries
+ * szMode << 1 and does not carry the protectValueRange bit.
+ */
+void convertTDPStoFlatBytes_float_args(TightDataPointStorageF *tdps, unsigned char* bytes, size_t *size)
+{
+	size_t i, k = 0; 
+	unsigned char dsLengthBytes[8];
+	
+	if(exe_params->SZ_SIZE_TYPE==4)
+		intToBytes_bigEndian(dsLengthBytes, tdps->dataSeriesLength);//4
+	else
+		longToBytes_bigEndian(dsLengthBytes, tdps->dataSeriesLength);//8
+		
+	unsigned char sameByte = tdps->allSameData==1?(unsigned char)1:(unsigned char)0;
+	sameByte = sameByte | (confparams_cpr->szMode << 1);
+	if(tdps->isLossless)
+		sameByte = (unsigned char) (sameByte | 0x10);
+	if(confparams_cpr->errorBoundMode>=PW_REL)
+		sameByte = (unsigned char) (sameByte | 0x20); // 00100000, the 5th bit
+	if(exe_params->SZ_SIZE_TYPE==8)
+		sameByte = (unsigned char) (sameByte | 0x40); // 01000000, the 6th bit
+	if(confparams_cpr->errorBoundMode == PW_REL && confparams_cpr->accelerate_pw_rel_compression)
+		sameByte = (unsigned char) (sameByte | 0x08); 	
+				
+	if(tdps->allSameData==1)
+	{
+		size_t totalByteLength = 3 + 1 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + tdps->exactMidBytes_size;
+		//*bytes = (unsigned char *)malloc(sizeof(unsigned char)*totalByteLength);
+
+		for (i = 0; i < 3; i++)//3
+			bytes[k++] = versionNumber[i];
+		bytes[k++] = sameByte;
+
+		convertSZParamsToBytes(confparams_cpr, &(bytes[k]));
+		k = k + MetaDataByteLength;
+
+		for (i = 0; i < exe_params->SZ_SIZE_TYPE; i++)
+			bytes[k++] = dsLengthBytes[i];		
+		for (i = 0; i < tdps->exactMidBytes_size; i++)
+			bytes[k++] = tdps->exactMidBytes[i];
+
+		*size = totalByteLength;
+	}
+	else if (tdps->rtypeArray == NULL)
+	{
+		size_t residualMidBitsLength = tdps->residualMidBits == NULL ? 0 : tdps->residualMidBits_size;
+		size_t segmentL = 0, radExpoL = 0, pwrBoundArrayL = 0;
+		size_t minLogValueSize = 0;
+		if(confparams_cpr->errorBoundMode>=PW_REL)
+		{			
+			//extra header fields written only for the PW_REL family
+			segmentL = exe_params->SZ_SIZE_TYPE;
+			radExpoL = 1;
+			pwrBoundArrayL = 4;
+			minLogValueSize = 4; //minLogValue is written by convertTDPStoBytes_float in this mode
+		}
+
+		//must match, field for field, what convertTDPStoBytes_float writes
+		size_t totalByteLength = 3 + 1 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + 4 + radExpoL + segmentL + pwrBoundArrayL + 4 + 4 + 1 + 8 
+				+ exe_params->SZ_SIZE_TYPE + exe_params->SZ_SIZE_TYPE + exe_params->SZ_SIZE_TYPE + minLogValueSize
+				+ tdps->typeArray_size + tdps->leadNumArray_size 
+				+ tdps->exactMidBytes_size + residualMidBitsLength + tdps->pwrErrBoundBytes_size;
+		if(confparams_cpr->errorBoundMode == PW_REL && confparams_cpr->accelerate_pw_rel_compression)
+			totalByteLength += (1+1); // for MSST19
+		convertTDPStoBytes_float(tdps, bytes, dsLengthBytes, sameByte);
+		
+		*size = totalByteLength;
+	}
+	else //the case with reserved value
+	{
+		//TODO: not implemented. Nothing is written, so report zero bytes.
+		*size = 0;
+	}
+}
+
+/**
+ * Release a TightDataPointStorageF created on the compression side:
+ * every array member is freed, then the structure itself.
+ * Members that are NULL are simply skipped (free(NULL) does nothing).
+ * */
+void free_TightDataPointStorageF(TightDataPointStorageF *tdps)
+{
+	free(tdps->rtypeArray);
+	free(tdps->typeArray);
+	free(tdps->leadNumArray);
+	free(tdps->exactMidBytes);
+	free(tdps->residualMidBits);
+	free(tdps->pwrErrBoundBytes);
+	free(tdps);
+}
+
+/**
+ * to free the memory used in the decompression
+ *
+ * Only the structure itself is released; none of its array members are freed.
+ * NOTE(review): on the decompression path the array members appear to point
+ * into the caller's flat byte buffer rather than owning memory - confirm
+ * before using this on a structure built by the compression-side constructors.
+ * */
+void free_TightDataPointStorageF2(TightDataPointStorageF *tdps)
+{			
+	free(tdps);
+}
--- a/deps/SZ/sz/src/TightDataPointStorageI.c
+++ b/deps/SZ/sz/src/TightDataPointStorageI.c
@ -0,0 +1,463 @@
+/**
+ *  @file TightDataPointStorageI.c
+ *  @author Sheng Di and Dingwen Tao
+ *  @date Aug, 2016
+ *  @brief The functions used to construct the tightPointDataStorage element for storing compressed bytes.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdlib.h> 
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include "TightDataPointStorageI.h"
+#include "sz.h"
+#include "Huffman.h"
+//#include "rw.h"
+
+/**
+ * Number of bits by which a value of the given integer type exceeds
+ * exactByteSize bytes: (bit width of dataType) - exactByteSize*8.
+ * Returns 0 when dataType is not one of the SZ_INT8..SZ_UINT64 codes.
+ */
+int computeRightShiftBits(int exactByteSize, int dataType)
+{
+	int typeBits;
+
+	switch(dataType)
+	{
+	case SZ_INT8:
+	case SZ_UINT8:
+		typeBits = 8;
+		break;
+	case SZ_INT16:
+	case SZ_UINT16:
+		typeBits = 16;
+		break;
+	case SZ_INT32:
+	case SZ_UINT32:
+		typeBits = 32;
+		break;
+	case SZ_INT64:
+	case SZ_UINT64:
+		typeBits = 64;
+		break;
+	default:
+		//unknown type code: no shift
+		return 0;
+	}
+	return typeBits - exactByteSize*8;
+}
+
+/**
+ * Map a 2-bit size code (0..3) to a size in bytes (1, 2, 4, 8).
+ * Any other code yields 0.
+ */
+int convertDataTypeSizeCode(int dataTypeSizeCode)
+{
+	if(dataTypeSizeCode < 0 || dataTypeSizeCode > 3)
+		return 0;
+	return 1 << dataTypeSizeCode;
+}
+
+/**
+ * Map a size in bytes (1, 2, 4, 8) to its 2-bit code already shifted into
+ * bits 2-3 of the flag byte: 1 -> 0 (0000), 2 -> 4 (0100), 4 -> 8 (1000), 8 -> 12 (1100).
+ * Any other size yields 0.
+ */
+int convertDataTypeSize(int dataTypeSize)
+{
+	int code;
+	for(code = 0; code < 4; code++)
+	{
+		if(dataTypeSize == (1 << code))
+			return code << 2;
+	}
+	return 0;
+}
+
+/**
+ * Allocate a TightDataPointStorageI and reset the fields listed below to
+ * zero/NULL. Fields not listed here are left as malloc returned them.
+ * The malloc result is not checked.
+ */
+void new_TightDataPointStorageI_Empty(TightDataPointStorageI **this)
+{
+	TightDataPointStorageI *store = (TightDataPointStorageI*)malloc(sizeof(TightDataPointStorageI));
+	*this = store;
+
+	//scalars
+	store->dataSeriesLength = 0;
+	store->allSameData = 0;
+	store->isLossless = 0;
+	store->exactDataNum = 0;
+	store->exactByteSize = 0;
+	store->intervals = 0;
+	store->realPrecision = 0;
+	store->minValue = 0;
+
+	//buffers and their sizes
+	store->typeArray = NULL; //its size is dataSeriesLength/4 (or xxx/4+1)
+	store->typeArray_size = 0;
+	store->exactDataBytes = NULL;
+	store->exactDataBytes_size = 0;
+}
+
+/**
+ * Parse a flat compressed byte stream into a freshly allocated TightDataPointStorageI.
+ *
+ * The structure's typeArray and exactDataBytes are set to point INTO flatBytes
+ * (no copy is made), so flatBytes must outlive the structure.
+ *
+ * Side effects: sets exe_params->SZ_SIZE_TYPE from the flag byte, allocates
+ * confparams_dec if it is NULL and fills it from the stream, and sets
+ * confparams_dec->maxRangeRadius. On a version mismatch the process prints a
+ * message and calls exit(0).
+ *
+ * flatBytesLength is not read in this function, so no bounds checking is done.
+ *
+ * Returns the error-bound mode, which in this function is always ABS.
+ */
+int new_TightDataPointStorageI_fromFlatBytes(TightDataPointStorageI **this, unsigned char* flatBytes, size_t flatBytesLength)
+{
+	new_TightDataPointStorageI_Empty(this);
+	size_t i, index = 0;
+	char version[3];
+	for (i = 0; i < 3; i++)
+		version[i] = flatBytes[index++]; //3
+	unsigned char sameRByte = flatBytes[index++]; //1
+	if(checkVersion2(version)!=1)
+	{
+		//wrong version
+		printf("Wrong version: \nCompressed-data version (%d.%d.%d)\n",version[0], version[1], version[2]);
+		printf("Current sz version: (%d.%d.%d)\n", versionNumber[0], versionNumber[1], versionNumber[2]);
+		printf("Please double-check if the compressed data (or file) is correct.\n");
+		exit(0);
+	}
+	//flag byte: bit 0 = all-same data, bits 2-3 = data size code, bit 4 = lossless, bit 6 = 8-byte size fields
+	int same = sameRByte & 0x01;
+	//conf_params->szMode = (sameRByte & 0x06)>>1;
+	int dataByteSizeCode = (sameRByte & 0x0C)>>2;
+	//NOTE(review): the return value of this call is discarded - dataTypeSize is not stored here
+	convertDataTypeSizeCode(dataByteSizeCode); //in bytes
+	(*this)->isLossless = (sameRByte & 0x10)>>4;
+
+	exe_params->SZ_SIZE_TYPE = ((sameRByte & 0x40)>>6)==1?8:4;
+	int errorBoundMode = ABS;
+	
+	//lazily create the decompression-side parameter block
+	if(confparams_dec==NULL)
+	{
+		confparams_dec = (sz_params*)malloc(sizeof(sz_params));
+		memset(confparams_dec, 0, sizeof(sz_params));
+	}	
+	convertBytesToSZParams(&(flatBytes[index]), confparams_dec);
+	/*sz_params* params = convertBytesToSZParams(&(flatBytes[index]));
+	int mode = confparams_dec->szMode;
+	int losslessCompressor = confparams_dec->losslessCompressor;
+	if(confparams_dec!=NULL)
+		free(confparams_dec);
+	confparams_dec = params;
+	confparams_dec->szMode = mode;
+	confparams_dec->losslessCompressor = losslessCompressor;*/
+	
+	index += MetaDataByteLength; //20	
+	
+	//exactByteSize is present in the stream only when the data is not all-same
+	if(same==0)
+		(*this)->exactByteSize = flatBytes[index++]; //1
+	
+	unsigned char dsLengthBytes[8];
+	for (i = 0; i < exe_params->SZ_SIZE_TYPE; i++)
+		dsLengthBytes[i] = flatBytes[index++];
+	(*this)->dataSeriesLength = bytesToSize(dsLengthBytes);// ST
+	if((*this)->isLossless==1)
+	{
+		//lossless stream: only the header fields above are parsed
+		//(*this)->exactMidBytes = flatBytes+8;
+		return errorBoundMode;
+	}
+	else if(same==1)
+	{
+		//all-same stream: the payload follows the data length directly
+		(*this)->allSameData = 1;
+		(*this)->exactDataBytes = &(flatBytes[index]);
+		return errorBoundMode;
+	}
+	else
+		(*this)->allSameData = 0;
+
+	unsigned char byteBuf[8];
+
+	for (i = 0; i < 4; i++)
+		byteBuf[i] = flatBytes[index++];
+	int max_quant_intervals = bytesToInt_bigEndian(byteBuf);// 4	
+
+	confparams_dec->maxRangeRadius = max_quant_intervals/2;
+
+	//errorBoundMode was set to ABS above and never changed; this is a sanity guard
+	if(errorBoundMode>=PW_REL)
+	{
+		printf("Error: errorBoundMode>=PW_REL in new_TightDataPointStorageI_fromFlatBytes!! Wrong...\n");
+		exit(0);
+	}
+
+	for (i = 0; i < 4; i++)
+		byteBuf[i] = flatBytes[index++];
+	(*this)->intervals = bytesToInt_bigEndian(byteBuf);// 4	
+
+	for (i = 0; i < 8; i++)
+		byteBuf[i] = flatBytes[index++];
+	(*this)->minValue = bytesToLong_bigEndian(byteBuf); //8
+		
+	for (i = 0; i < 8; i++)
+		byteBuf[i] = flatBytes[index++];
+	(*this)->realPrecision = bytesToDouble(byteBuf);//8
+	
+	for (i = 0; i < exe_params->SZ_SIZE_TYPE; i++)
+		byteBuf[i] = flatBytes[index++];
+	(*this)->typeArray_size = bytesToSize(byteBuf);// ST		
+
+	for (i = 0; i < exe_params->SZ_SIZE_TYPE; i++)
+		byteBuf[i] = flatBytes[index++];
+	(*this)->exactDataNum = bytesToSize(byteBuf);// ST
+	
+	for (i = 0; i < exe_params->SZ_SIZE_TYPE; i++)
+		byteBuf[i] = flatBytes[index++];
+	(*this)->exactDataBytes_size = bytesToSize(byteBuf);// ST		
+
+
+	//typeArray aliases the input buffer (no copy)
+	(*this)->typeArray = &flatBytes[index];
+	//retrieve the number of states (i.e., stateNum)
+	(*this)->allNodes = bytesToInt_bigEndian((*this)->typeArray); //the first 4 bytes store the stateNum
+	(*this)->stateNum = ((*this)->allNodes+1)/2;		
+
+	index+=(*this)->typeArray_size;
+	
+	//exactDataBytes also aliases the input buffer when present
+	if((*this)->exactDataBytes_size > 0)
+	{	
+		(*this)->exactDataBytes = &flatBytes[index];
+		index+=(*this)->exactDataBytes_size*sizeof(char);	
+	}
+	else
+		(*this)->exactDataBytes = NULL;	
+	return errorBoundMode;
+}
+
+/**
+ * Build a TightDataPointStorageI on the compression side.
+ *
+ * type           : dataSeriesLength quantization codes; Huffman-encoded into typeArray
+ * exactDataBytes : exactDataBytes_size bytes, stored by pointer (not copied)
+ * dataType       : one of SZ_INT8..SZ_UINT64; selects dataTypeSize (1, 2, 4 or 8).
+ *                  For any other value dataTypeSize is left unset.
+ *
+ * The malloc result is not checked.
+ * */
+void new_TightDataPointStorageI(TightDataPointStorageI **this,
+		size_t dataSeriesLength, size_t exactDataNum, int byteSize, 
+		int* type, unsigned char* exactDataBytes, size_t exactDataBytes_size,
+		double realPrecision, long minValue, int intervals, int dataType) 
+{
+	TightDataPointStorageI *store = (TightDataPointStorageI *)malloc(sizeof(TightDataPointStorageI));
+	*this = store;
+
+	//flags and scalar settings
+	store->allSameData = 0;
+	store->isLossless = 0;
+	store->realPrecision = realPrecision;
+	store->minValue = minValue;
+	store->intervals = intervals;
+
+	//element width in bytes, derived from the SZ type code
+	if(dataType == SZ_INT8 || dataType == SZ_UINT8)
+		store->dataTypeSize = 1;
+	else if(dataType == SZ_INT16 || dataType == SZ_UINT16)
+		store->dataTypeSize = 2;
+	else if(dataType == SZ_INT32 || dataType == SZ_UINT32)
+		store->dataTypeSize = 4;
+	else if(dataType == SZ_INT64 || dataType == SZ_UINT64)
+		store->dataTypeSize = 8;
+
+	store->dataSeriesLength = dataSeriesLength;
+	store->exactDataNum = exactDataNum;
+	store->exactByteSize = byteSize;
+
+	//Huffman-encode the quantization codes (the tree is sized to 2*intervals states)
+	const int treeStates = 2*intervals;
+	HuffmanTree* tree = createHuffmanTree(treeStates);
+	encode_withTree(tree, type, dataSeriesLength, &store->typeArray, &store->typeArray_size);
+	SZ_ReleaseHuffman(tree);
+
+	//buffer taken over by pointer
+	store->exactDataBytes = exactDataBytes;
+	store->exactDataBytes_size = exactDataBytes_size;
+}
+
+/**
+ * Serialize a (non-constant) TightDataPointStorageI into the caller-supplied
+ * buffer `bytes`.
+ *
+ * Field order: version (3) | sameByte (1) | SZ params (MetaDataByteLength) |
+ * exactByteSize (1) | dataSeriesLength (ST) | max_quant_intervals (4) |
+ * intervals (4) | minValue (8) | realPrecision (8) | typeArray_size (ST) |
+ * exactDataNum (ST) | exactDataBytes_size (ST) | typeArray | exactDataBytes.
+ * ST is exe_params->SZ_SIZE_TYPE. The caller must size `bytes` accordingly.
+ */
+void convertTDPStoBytes_int(TightDataPointStorageI* tdps, unsigned char* bytes, unsigned char sameByte)
+{
+	unsigned char scratch[8] = {0,0,0,0,0,0,0,0};	//staging area for one encoded field at a time
+	size_t pos = 0, v;
+	const size_t sizeLen = exe_params->SZ_SIZE_TYPE;	//ST: 4 or 8 bytes
+
+	//version is copied element by element (each element is converted to one byte)
+	for(v = 0; v < 3; v++)
+		bytes[pos++] = versionNumber[v];
+	bytes[pos++] = sameByte;
+
+	convertSZParamsToBytes(confparams_cpr, bytes + pos);
+	pos += MetaDataByteLength;
+
+	bytes[pos++] = tdps->exactByteSize;
+
+	sizeToBytes(scratch, tdps->dataSeriesLength);
+	memcpy(bytes + pos, scratch, sizeLen);
+	pos += sizeLen;
+
+	intToBytes_bigEndian(scratch, confparams_cpr->max_quant_intervals);
+	memcpy(bytes + pos, scratch, 4);
+	pos += 4;
+
+	intToBytes_bigEndian(scratch, tdps->intervals);
+	memcpy(bytes + pos, scratch, 4);
+	pos += 4;
+
+	longToBytes_bigEndian(scratch, tdps->minValue);
+	memcpy(bytes + pos, scratch, 8);
+	pos += 8;
+
+	doubleToBytes(scratch, tdps->realPrecision);
+	memcpy(bytes + pos, scratch, 8);
+	pos += 8;
+
+	sizeToBytes(scratch, tdps->typeArray_size);
+	memcpy(bytes + pos, scratch, sizeLen);
+	pos += sizeLen;
+
+	sizeToBytes(scratch, tdps->exactDataNum);
+	memcpy(bytes + pos, scratch, sizeLen);
+	pos += sizeLen;
+
+	sizeToBytes(scratch, tdps->exactDataBytes_size);
+	memcpy(bytes + pos, scratch, sizeLen);
+	pos += sizeLen;
+
+	//variable-length payload sections
+	memcpy(bytes + pos, tdps->typeArray, tdps->typeArray_size);
+	pos += tdps->typeArray_size;
+
+	memcpy(bytes + pos, tdps->exactDataBytes, tdps->exactDataBytes_size);
+}
+
+/**
+ * Allocate a flat byte stream for an integer TDPS and serialize it.
+ *
+ * On return *bytes points to a malloc'ed buffer and *size holds its length.
+ *  - allSameData == 1: header + data length + the exactDataBytes payload only;
+ *  - otherwise: full layout produced by convertTDPStoBytes_int. If the
+ *    error-bound mode is >= PW_REL a message is printed and exit(0) is called.
+ *
+ * The malloc result is not checked.
+ */
+void convertTDPStoFlatBytes_int(TightDataPointStorageI *tdps, unsigned char** bytes, size_t *size)
+{
+	unsigned char lengthField[8];
+	size_t pos = 0, n;
+
+	//big-endian data-series length, 4 or 8 bytes wide
+	if(exe_params->SZ_SIZE_TYPE==4)
+		intToBytes_bigEndian(lengthField, tdps->dataSeriesLength);
+	else
+		longToBytes_bigEndian(lengthField, tdps->dataSeriesLength);
+
+	//flag byte: bit 0 all-same, szMode shifted by 1, 0x10 lossless,
+	//data-type size code from convertDataTypeSize, 0x40 for 8-byte size fields
+	unsigned char flags = (tdps->allSameData==1) ? (unsigned char)1 : (unsigned char)0;
+	flags = flags | (confparams_cpr->szMode << 1);
+	if(tdps->isLossless)
+		flags = (unsigned char) (flags | 0x10);
+	flags = (unsigned char) (flags | convertDataTypeSize(tdps->dataTypeSize));
+	if(exe_params->SZ_SIZE_TYPE==8)
+		flags = (unsigned char) (flags | 0x40);
+
+	if(tdps->allSameData!=1)
+	{
+		//general case: delegate the layout to convertTDPStoBytes_int
+		if(confparams_cpr->errorBoundMode>=PW_REL)
+		{
+			printf("Error: errorBoundMode >= PW_REL!! can't be...\n");
+			exit(0);
+		}
+
+		size_t fullLength = 3 + 1 + MetaDataByteLength + 1 + exe_params->SZ_SIZE_TYPE + 4 + 4 + 8 + 8
+				+ exe_params->SZ_SIZE_TYPE + exe_params->SZ_SIZE_TYPE + exe_params->SZ_SIZE_TYPE
+				+ tdps->typeArray_size + tdps->exactDataBytes_size;
+
+		*bytes = (unsigned char *)malloc(sizeof(unsigned char)*fullLength);
+		convertTDPStoBytes_int(tdps, *bytes, flags);
+		*size = fullLength;
+		return;
+	}
+
+	//constant series: short layout
+	size_t shortLength = 3 + 1 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + tdps->exactDataBytes_size;
+	unsigned char *out = (unsigned char *)malloc(sizeof(unsigned char)*shortLength);
+	*bytes = out;
+
+	for(n = 0; n < 3; n++)
+		out[pos++] = versionNumber[n];
+	out[pos++] = flags;
+
+	convertSZParamsToBytes(confparams_cpr, out + pos);
+	pos += MetaDataByteLength;
+
+	for(n = 0; n < exe_params->SZ_SIZE_TYPE; n++)
+		out[pos++] = lengthField[n];
+
+	for(n = 0; n < tdps->exactDataBytes_size; n++)
+		out[pos++] = tdps->exactDataBytes[n];
+
+	*size = shortLength;
+}
+
+/**
+ * Serialize an integer TDPS into the caller-supplied buffer `bytes`
+ * (no allocation here) and report the number of bytes produced in *size.
+ *  - allSameData == 1: header + data length + the exactDataBytes payload only;
+ *  - otherwise: full layout produced by convertTDPStoBytes_int. If the
+ *    error-bound mode is >= PW_REL a message is printed and exit(0) is called.
+ *
+ * Note: unlike convertTDPStoFlatBytes_int, the flag byte built here does not
+ * include the data-type size code.
+ */
+void convertTDPStoFlatBytes_int_args(TightDataPointStorageI *tdps, unsigned char* bytes, size_t *size)
+{
+	unsigned char lengthField[8];
+	size_t pos = 0, n;
+
+	//big-endian data-series length, 4 or 8 bytes wide
+	if(exe_params->SZ_SIZE_TYPE==4)
+		intToBytes_bigEndian(lengthField, tdps->dataSeriesLength);
+	else
+		longToBytes_bigEndian(lengthField, tdps->dataSeriesLength);
+
+	//flag byte: bit 0 all-same, szMode shifted by 1, 0x10 lossless, 0x40 for 8-byte size fields
+	unsigned char flags = (tdps->allSameData==1) ? (unsigned char)1 : (unsigned char)0;
+	flags = flags | (confparams_cpr->szMode << 1);
+	if(tdps->isLossless)
+		flags = (unsigned char) (flags | 0x10);
+	if(exe_params->SZ_SIZE_TYPE==8)
+		flags = (unsigned char) (flags | 0x40);
+
+	if(tdps->allSameData!=1)
+	{
+		//general case: delegate the layout to convertTDPStoBytes_int
+		if(confparams_cpr->errorBoundMode>=PW_REL)
+		{
+			printf("Error: errorBoundMode>=PW_REL!! can't be....\n");
+			exit(0);
+		}
+
+		size_t fullLength = 3 + 1 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + 1 + 4 + 4 + 8 + 8
+				+ exe_params->SZ_SIZE_TYPE + exe_params->SZ_SIZE_TYPE + exe_params->SZ_SIZE_TYPE
+				+ tdps->typeArray_size + tdps->exactDataBytes_size;
+
+		convertTDPStoBytes_int(tdps, bytes, flags);
+		*size = fullLength;
+		return;
+	}
+
+	//constant series: short layout
+	size_t shortLength = 3 + 1 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + tdps->exactDataBytes_size;
+
+	for(n = 0; n < 3; n++)
+		bytes[pos++] = versionNumber[n];
+	bytes[pos++] = flags;
+
+	convertSZParamsToBytes(confparams_cpr, bytes + pos);
+	pos += MetaDataByteLength;
+
+	for(n = 0; n < exe_params->SZ_SIZE_TYPE; n++)
+		bytes[pos++] = lengthField[n];
+
+	for(n = 0; n < tdps->exactDataBytes_size; n++)
+		bytes[pos++] = tdps->exactDataBytes[n];
+
+	*size = shortLength;
+}
+
+/**
+ * Release an integer TDPS together with its typeArray and exactDataBytes.
+ * Members that are NULL are simply skipped (free(NULL) does nothing).
+ */
+void free_TightDataPointStorageI(TightDataPointStorageI *tdps)
+{
+	free(tdps->typeArray);
+	free(tdps->exactDataBytes);
+	free(tdps);
+}
+
+/**
+ * Release only the structure itself; typeArray and exactDataBytes are NOT freed.
+ * This suits structures built by new_TightDataPointStorageI_fromFlatBytes,
+ * whose array members point into the caller's flat byte buffer.
+ */
+void free_TightDataPointStorageI2(TightDataPointStorageI *tdps)
+{
+	free(tdps);
+}
+
+
--- a/deps/SZ/sz/src/TypeManager.c
+++ b/deps/SZ/sz/src/TypeManager.c
@ -0,0 +1,503 @@
+/**
+ *  @file TypeManager.c
+ *  @author Sheng Di
+ *  @date May, 2016
+ *  @brief TypeManager is used to manage the type array: parsing of the bytes and other types in between.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "DynamicByteArray.h"
+#include "sz.h"
+
+//int convertIntArray2ByteArray_fast_8b()
+
+/**
+ * Pack an array of 0/1 flags into bits, most-significant bit first.
+ * Only elements equal to 1 set a bit; any other value is stored as a 0 bit.
+ * @param intArray one flag per byte
+ * @param intArrayLength number of flags in intArray
+ * @param result receives a newly malloc'ed buffer (caller frees), or NULL when intArrayLength is 0
+ * @return number of packed bytes, i.e. intArrayLength/8 rounded up
+ */
+size_t convertIntArray2ByteArray_fast_1b(unsigned char* intArray, size_t intArrayLength, unsigned char **result)
+{
+	size_t nBytes = intArrayLength/8 + (intArrayLength%8 != 0 ? 1 : 0);
+	size_t pos = 0, b;
+	int bit;
+
+	*result = nBytes > 0 ? (unsigned char*)malloc(nBytes*sizeof(unsigned char)) : NULL;
+
+	for(b = 0; b < nBytes; b++)
+	{
+		int packed = 0;
+		/* walk from bit 7 down to bit 0, stopping early on the last partial byte */
+		for(bit = 7; bit >= 0 && pos < intArrayLength; bit--, pos++)
+		{
+			if(intArray[pos] == 1)
+				packed |= 1 << bit;
+		}
+		(*result)[b] = (unsigned char)packed;
+	}
+	return nBytes;
+}
+
+/**
+ * Pack an array of 0/1 flags into bits (most-significant bit first), writing
+ * into a buffer supplied by the caller instead of allocating one.
+ * Only elements equal to 1 set a bit; any other value is stored as a 0 bit.
+ * @param intArray one flag per byte
+ * @param intArrayLength number of flags in intArray
+ * @param result destination buffer; must hold at least intArrayLength/8 (rounded up) bytes
+ * @return number of bytes written
+ */
+size_t convertIntArray2ByteArray_fast_1b_to_result(unsigned char* intArray, size_t intArrayLength, unsigned char *result)
+{
+	size_t nBytes = intArrayLength/8 + (intArrayLength%8 != 0 ? 1 : 0);
+	size_t pos = 0, b;
+	int bit;
+
+	for(b = 0; b < nBytes; b++)
+	{
+		int packed = 0;
+		/* walk from bit 7 down to bit 0, stopping early on the last partial byte */
+		for(bit = 7; bit >= 0 && pos < intArrayLength; bit--, pos++)
+		{
+			if(intArray[pos] == 1)
+				packed |= 1 << bit;
+		}
+		result[b] = (unsigned char)packed;
+	}
+	return nBytes;
+}
+
+/**
+ * Unpack a bit stream (most-significant bit first) into one 0/1 flag per byte.
+ * This is the inverse of convertIntArray2ByteArray_fast_1b.
+ * Prints an error and terminates the process if byteArray holds fewer than
+ * intArrayLength bits.
+ * @param intArrayLength number of flags to extract
+ * @param byteArray packed input bits
+ * @param byteArrayLength number of bytes available in byteArray
+ * @param intArray receives a newly malloc'ed array of intArrayLength flags (caller frees),
+ *        or NULL when intArrayLength is 0
+ */
+void convertByteArray2IntArray_fast_1b(size_t intArrayLength, unsigned char* byteArray, size_t byteArrayLength, unsigned char **intArray)
+{
+	size_t n;
+
+	if(intArrayLength > byteArrayLength*8)
+	{
+		printf("Error: intArrayLength > byteArrayLength*8\n");
+		printf("intArrayLength=%zu, byteArrayLength = %zu", intArrayLength, byteArrayLength);
+		exit(0);
+	}
+	if(intArrayLength == 0)
+	{
+		/* nothing to decode: never touch byteArray or the (absent) output */
+		*intArray = NULL;
+		return;
+	}
+	*intArray = (unsigned char*)malloc(intArrayLength*sizeof(unsigned char));
+
+	/* Drive the loop by the output index. The previous version iterated over
+	 * byteArrayLength-1 full bytes, which wrapped around for an empty input
+	 * and overran the output when more bytes than needed were supplied. */
+	for(n = 0; n < intArrayLength; n++)
+		(*intArray)[n] = (byteArray[n/8] >> (7 - n%8)) & 0x01;
+}
+
+/**
+ * Pack 2-bit codes (values 0..3), four per byte, first code in the two
+ * most-significant bits:
+ * [01|10|11|00|....]-->[01|10|11|00][....]
+ * Prints an error and terminates the process if a code is larger than 3.
+ * @param timeStepType one code per byte
+ * @param timeStepTypeLength number of codes
+ * @param result receives a newly malloc'ed buffer (caller frees), or NULL when there are no codes
+ * @return number of packed bytes
+ */
+size_t convertIntArray2ByteArray_fast_2b(unsigned char* timeStepType, size_t timeStepTypeLength, unsigned char **result)
+{
+	size_t outIdx, slot, inIdx = 0;
+	size_t byteLength = timeStepTypeLength*2/8;
+	if(timeStepTypeLength%4 != 0)
+		byteLength++;
+
+	*result = byteLength > 0 ? (unsigned char*)malloc(byteLength*sizeof(unsigned char)) : NULL;
+
+	for(outIdx = 0; outIdx < byteLength; outIdx++)
+	{
+		int packed = 0;
+		for(slot = 0; slot < 4 && inIdx < timeStepTypeLength; slot++, inIdx++)
+		{
+			int type = timeStepType[inIdx];
+			if(type > 3)
+			{
+				printf("Error: wrong timestep type...: type[%zu]=%d\n", inIdx, type);
+				exit(0);
+			}
+			/* slot 0 lands in bits 7..6, slot 3 in bits 1..0 */
+			packed |= type << (6-slot*2);
+		}
+		(*result)[outIdx] = (unsigned char)packed;
+	}
+	return byteLength;
+}
+
+/**
+ * Pack 2-bit codes, four per byte (first code in the two most-significant
+ * bits), into a caller-supplied buffer.
+ * No range validation is performed: each input byte is shifted into place as is.
+ * @param timeStepType one code per byte
+ * @param timeStepTypeLength number of codes
+ * @param result destination buffer; must hold the returned number of bytes
+ * @return number of bytes written
+ */
+size_t convertIntArray2ByteArray_fast_2b_inplace(unsigned char* timeStepType, size_t timeStepTypeLength, unsigned char *result)
+{
+	size_t outIdx, slot, inIdx = 0;
+	size_t byteLength = timeStepTypeLength*2/8;
+	if(timeStepTypeLength%4 != 0)
+		byteLength++;
+
+	for(outIdx = 0; outIdx < byteLength; outIdx++)
+	{
+		int packed = 0;
+		for(slot = 0; slot < 4 && inIdx < timeStepTypeLength; slot++, inIdx++)
+		{
+			unsigned char type = timeStepType[inIdx];
+			/* slot 0 lands in bits 7..6, slot 3 in bits 1..0 */
+			packed |= type << (6-(slot<<1));
+		}
+		result[outIdx] = (unsigned char)packed;
+	}
+	return byteLength;
+}
+
+/**
+ * Unpack 2-bit codes (four per byte, first code in the two most-significant
+ * bits) into one code per byte. Inverse of convertIntArray2ByteArray_fast_2b.
+ * Prints an error and terminates the process if byteArray holds fewer than
+ * stepLength codes.
+ * @param stepLength number of codes to extract
+ * @param byteArray packed input
+ * @param byteArrayLength number of bytes available in byteArray
+ * @param intArray receives a newly malloc'ed array of stepLength codes (caller frees),
+ *        or NULL when stepLength is 0
+ */
+void convertByteArray2IntArray_fast_2b(size_t stepLength, unsigned char* byteArray, size_t byteArrayLength, unsigned char **intArray)
+{
+	size_t n;
+
+	if(stepLength > byteArrayLength*4)
+	{
+		printf("Error: stepLength > byteArray.length*4\n");
+		printf("stepLength=%zu, byteArray.length=%zu\n", stepLength, byteArrayLength);
+		exit(0);
+	}
+	if(stepLength == 0)
+	{
+		/* previously a non-empty byteArray made the loop store through this NULL */
+		*intArray = NULL;
+		return;
+	}
+	*intArray = (unsigned char*)malloc(stepLength*sizeof(unsigned char));
+
+	/* code n lives in byte n/4, at bit offset 6, 4, 2 or 0 */
+	for(n = 0; n < stepLength; n++)
+		(*intArray)[n] = (byteArray[n/4] >> (6 - 2*(n%4))) & 0x03;
+}
+
+/**
+ * Pack 3-bit codes (values 0..7) into a contiguous bit stream, first code in
+ * the most-significant bits. Eight codes occupy exactly three bytes.
+ * @param timeStepType one code per byte
+ * @param timeStepTypeLength number of codes
+ * @param result receives a newly malloc'ed buffer (caller frees), or NULL when there are no codes
+ * @return number of packed bytes
+ */
+size_t convertIntArray2ByteArray_fast_3b(unsigned char* timeStepType, size_t timeStepTypeLength, unsigned char **result)
+{
+	size_t i = 0, k = 0, byteLength = 0, n = 0;
+	if(timeStepTypeLength%8==0)
+		byteLength = timeStepTypeLength*3/8;
+	else
+		byteLength = timeStepTypeLength*3/8+1;
+
+	if(byteLength==0)
+	{
+		/* Empty input: the trailing flush below used to store through this NULL. */
+		*result = NULL;
+		return 0;
+	}
+	*result = (unsigned char*)malloc(byteLength*sizeof(unsigned char));
+
+	int tmp = 0;
+	for(n = 0;n<timeStepTypeLength;n++)
+	{
+		/* position of this code inside its group of eight (24 bits = 3 bytes) */
+		k = n%8;
+		switch(k)
+		{
+		case 0:
+			tmp = tmp | (timeStepType[n] << 5);
+			break;
+		case 1:
+			tmp = tmp | (timeStepType[n] << 2);
+			break;
+		case 2:
+			/* code straddles bytes 0 and 1: top two bits finish byte 0 */
+			tmp = tmp | (timeStepType[n] >> 1);
+			(*result)[i++] = (unsigned char)tmp;
+			tmp = 0 | (timeStepType[n] << 7);
+			break;
+		case 3:
+			tmp = tmp | (timeStepType[n] << 4);
+			break;
+		case 4:
+			tmp = tmp | (timeStepType[n] << 1);
+			break;
+		case 5:
+			/* code straddles bytes 1 and 2: top bit finishes byte 1 */
+			tmp = tmp | (timeStepType[n] >> 2);
+			(*result)[i++] = (unsigned char)tmp;
+			tmp = 0 | (timeStepType[n] << 6);
+			break;
+		case 6:
+			tmp = tmp | (timeStepType[n] << 3);
+			break;
+		case 7:
+			tmp = tmp | (timeStepType[n] << 0);
+			(*result)[i++] = (unsigned char)tmp;
+			tmp = 0;
+			break;
+		}
+	}
+	if(k!=7) //flush the partially filled last byte
+		(*result)[i] = (unsigned char)tmp;
+
+	return byteLength;
+}
+
+/**
+ * Unpack a stream of 3-bit codes (first code in the most-significant bits)
+ * into one code per byte. Inverse of convertIntArray2ByteArray_fast_3b.
+ * Prints an error and terminates the process if byteArray holds fewer than
+ * stepLength codes.
+ * @param stepLength number of codes to extract
+ * @param byteArray packed input
+ * @param byteArrayLength number of bytes available in byteArray
+ * @param intArray receives a newly malloc'ed array of stepLength codes (caller frees),
+ *        or NULL when stepLength is 0
+ */
+void convertByteArray2IntArray_fast_3b(size_t stepLength, unsigned char* byteArray, size_t byteArrayLength, unsigned char **intArray)
+{
+	size_t n;
+
+	if(stepLength > byteArrayLength*8/3)
+	{
+		printf("Error: stepLength > byteArray.length*8/3, impossible case unless bugs elsewhere.\n");
+		printf("stepLength=%zu, byteArray.length=%zu\n", stepLength, byteArrayLength);
+		exit(0);		
+	}
+	if(stepLength == 0)
+	{
+		/* nothing to decode: do not touch byteArray, which may be empty */
+		*intArray = NULL;
+		return;
+	}
+	*intArray = (unsigned char*)malloc(stepLength*sizeof(unsigned char));
+
+	for(n = 0; n < stepLength; n++)
+	{
+		size_t bitPos = n*3;
+		size_t idx = bitPos/8;
+		unsigned int offset = (unsigned int)(bitPos%8);
+		/* 16-bit window: current byte in the high half, next byte in the low half */
+		unsigned int window = (unsigned int)byteArray[idx] << 8;
+		/* Fetch the following byte only when the code really straddles it; the
+		 * length check above guarantees that byte exists. The previous version
+		 * prefetched it after every 8th code and read one byte past the end. */
+		if(offset > 5)
+			window |= byteArray[idx+1];
+		(*intArray)[n] = (unsigned char)((window >> (13 - offset)) & 0x07);
+	}
+}
+
+/**
+ * Compute how far a value of resiBitLength bits must be shifted left to be
+ * placed at bit position k of the output stream.
+ * @param k number of bits already written to the stream
+ * @param resiBitLength width in bits of the value to place
+ * @return shift amount; negative when the value does not fit in the current byte
+ */
+inline int getLeftMovingSteps(size_t k, unsigned char resiBitLength)
+{
+	size_t usedBits = k%8;	/* bits already occupied in the current byte */
+	return 8 - usedBits - resiBitLength;
+}
+
+/**
+ * Pack nbEle values of a fixed bit width into a contiguous bit stream.
+ * @param timeStepType the values to pack (the resiMidBits), one per byte
+ * @param resiBitLength width in bits of every value; when 0 nothing is packed
+ * @param nbEle number of values in timeStepType
+ * @param bytes receives the packed stream produced by convertDBAtoBytes
+ * @return number of bytes in the packed stream
+ */
+size_t convertIntArray2ByteArray_fast_dynamic(unsigned char* timeStepType, unsigned char resiBitLength, size_t nbEle, unsigned char **bytes)
+{
+	DynamicByteArray* buffer;
+	size_t idx, bitPos = 0, total;
+	int pending = 0, shift = 0;
+
+	new_DBA(&buffer, 1024);
+	if(resiBitLength != 0)
+	{
+		for(idx = 0; idx < nbEle; idx++, bitPos += resiBitLength)
+		{
+			int value = timeStepType[idx];
+			shift = getLeftMovingSteps(bitPos, resiBitLength);
+			if(shift > 0)
+			{
+				/* value fits with room to spare: keep accumulating */
+				pending |= value << shift;
+			}
+			else if(shift == 0)
+			{
+				/* value exactly completes the current byte */
+				pending |= value;
+				addDBA_Data(buffer, (unsigned char)pending);
+				pending = 0;
+			}
+			else
+			{
+				/* value straddles two bytes: emit the high part, carry the low part */
+				pending |= value >> (-shift);
+				addDBA_Data(buffer, (unsigned char)pending);
+				pending = value << (8+shift);
+			}
+		}
+	}
+	/* flush the partially filled last byte, if any */
+	if(shift != 0)
+		addDBA_Data(buffer, (unsigned char)pending);
+	convertDBAtoBytes(buffer, bytes);
+	total = buffer->size;
+	free_DBA(buffer);
+	return total;
+}
+
+/**
+ * Pack values of varying bit widths into a contiguous bit stream.
+ * Entries whose width is 0 are skipped and consume no value from timeStepType.
+ * @param timeStepType the values to pack (the resiMidBits), one per non-zero width
+ * @param resiBitLength width in bits for each element
+ * @param resiBitLengthLength number of entries in resiBitLength
+ * @param bytes receives the packed stream produced by convertDBAtoBytes
+ * @return number of bytes in the packed stream
+ */
+size_t convertIntArray2ByteArray_fast_dynamic2(unsigned char* timeStepType, unsigned char* resiBitLength, size_t resiBitLengthLength, unsigned char **bytes)
+{
+	DynamicByteArray* buffer;
+	size_t src = 0, idx, bitPos = 0, total;
+	int pending = 0, shift = 0;
+
+	new_DBA(&buffer, 1024);
+	for(idx = 0; idx < resiBitLengthLength; idx++)
+	{
+		unsigned char width = resiBitLength[idx];
+		if(width != 0)
+		{
+			int value = timeStepType[src++];
+			shift = getLeftMovingSteps(bitPos, width);
+			if(shift > 0)
+			{
+				/* value fits with room to spare: keep accumulating */
+				pending |= value << shift;
+			}
+			else if(shift == 0)
+			{
+				/* value exactly completes the current byte */
+				pending |= value;
+				addDBA_Data(buffer, (unsigned char)pending);
+				pending = 0;
+			}
+			else
+			{
+				/* value straddles two bytes: emit the high part, carry the low part */
+				pending |= value >> (-shift);
+				addDBA_Data(buffer, (unsigned char)pending);
+				pending = value << (8+shift);
+			}
+			bitPos += width;
+		}
+	}
+	/* flush the partially filled last byte, if any */
+	if(shift != 0)
+		addDBA_Data(buffer, (unsigned char)pending);
+	convertDBAtoBytes(buffer, bytes);
+	total = buffer->size;
+	free_DBA(buffer);
+	return total;
+}
+
+/**
+ * Number of bits needed to represent dataLength, i.e. the word width minus
+ * its count of leading zeros. The word width (32 or 64) follows
+ * exe_params->SZ_SIZE_TYPE (4 selects the 32-bit path, anything else 64-bit).
+ * @param dataLength value to measure
+ * @return required number of bits
+ */
+int computeBitNumRequired(size_t dataLength)
+{
+	return exe_params->SZ_SIZE_TYPE==4
+		? 32 - numberOfLeadingZeros_Int(dataLength)
+		: 64 - numberOfLeadingZeros_Long(dataLength);
+}
+
+/**
+ * Expand a run-length encoded bit array.
+ * The input is a sequence of (state, count) pairs: one state bit followed by
+ * a validLength-bit run length read with extractBytes(). Each pair appends
+ * `count` copies of `state` to the output.
+ * @param result receives a newly malloc'ed array of totalLength ints (caller frees);
+ *        set to NULL if the allocation fails
+ * @param bytes encoded input
+ * @param bytesLength number of bytes in the input
+ * @param totalLength capacity of the output array; runs are truncated to it
+ * @param validLength number of bits used for each run length
+ */
+void decompressBitArraybySimpleLZ77(int** result, unsigned char* bytes, size_t bytesLength, size_t totalLength, int validLength)
+{
+	size_t pairLength = (bytesLength*8)/(validLength+1);
+	size_t pair, j, bitPos = 0, outPos = 0;
+
+	*result = (int*)malloc(sizeof(int)*totalLength);
+	if(*result == NULL)
+		return;
+
+	/* Decode and expand each pair in a single pass. The previous version staged
+	 * all pairs in a variable-length stack array, which could exhaust the stack
+	 * for large inputs, and it expanded runs without checking totalLength. */
+	for(pair = 0; pair < pairLength; pair++)
+	{
+		int state = (bytes[bitPos/8] >> (8-1-(int)(bitPos%8))) & 0x01;
+		bitPos++;
+
+		int numResult = extractBytes(bytes, bitPos, validLength);
+		size_t num = numResult;
+		bitPos = bitPos + validLength;
+
+		/* clamp to the allocation so corrupt run lengths cannot overflow it */
+		for(j = 0; j < num && outPos < totalLength; j++)
+			(*result)[outPos++] = state;
+	}
+}
--- a/deps/SZ/sz/src/VarSet.c
+++ b/deps/SZ/sz/src/VarSet.c
@ -0,0 +1,254 @@
+/**
+ *  @file VarSet.c
+ *  @author Sheng Di
+ *  @date July, 2016
+ *  @brief VarSet manages the linked list of SZ_Variable entries (the variable set): adding, searching, deleting and freeing them.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "VarSet.h"
+#include "sz.h"
+
+/**
+ * Destroy a variable node but leave its original data buffer (v->data) alone.
+ * Frees the name, the compressed bytes, the multistep state and the node itself.
+ * @param v node to destroy; must not be NULL
+ */
+void free_Variable_keepOriginalData(SZ_Variable* v)
+{
+	/* free(NULL) is a no-op, so plain buffers need no guard */
+	free(v->varName);
+	free(v->compressedBytes);
+	/* free_multisteps dereferences its argument, so this guard stays */
+	if(v->multisteps != NULL)
+		free_multisteps(v->multisteps);
+	free(v);
+}
+
+/**
+ * Destroy a variable node but leave its compressed bytes (v->compressedBytes) alone.
+ * Frees the name, the original data, the multistep state and the node itself.
+ * @param v node to destroy; must not be NULL
+ * @deprecated
+ * */
+void free_Variable_keepCompressedBytes(SZ_Variable* v)
+{
+	/* free(NULL) is a no-op, so plain buffers need no guard */
+	free(v->varName);
+	free(v->data);
+	/* free_multisteps dereferences its argument, so this guard stays */
+	if(v->multisteps != NULL)
+		free_multisteps(v->multisteps);
+	free(v);
+}
+
+/**
+ * Destroy a variable node and everything it points to: the name, the original
+ * data, the compressed bytes and the multistep state.
+ * @param v node to destroy; must not be NULL
+ */
+void free_Variable_all(SZ_Variable* v)
+{
+	/* free(NULL) is a no-op, so plain buffers need no guard */
+	free(v->varName);
+	free(v->data);
+	free(v->compressedBytes);
+	/* free_multisteps dereferences its argument, so this guard stays */
+	if(v->multisteps != NULL)
+		free_multisteps(v->multisteps);
+	free(v);
+}
+
+/**
+ * Append a variable to the global variable set (sz_varset), creating the set
+ * with its dummy header node on first use.
+ * The name is copied; the data pointer is stored as is (not copied).
+ * For SZ_FLOAT and SZ_DOUBLE a zero-filled history buffer of the full data
+ * length is allocated in the variable's multistep state.
+ * @param var_id identifier stored on the variable
+ * @param varName NUL-terminated name (copied)
+ * @param dataType SZ_FLOAT, SZ_DOUBLE or another type code
+ * @param data pointer to the original data
+ * @param errBoundMode error bound mode stored on the variable
+ * @param absErrBound absolute error bound
+ * @param relBoundRatio relative bound ratio
+ * @param pwRelBoundRatio point-wise relative bound ratio
+ * @param r5 r4 r3 r2 r1 dimension sizes passed to computeDataLength
+ */
+void SZ_batchAddVar(int var_id, char* varName, int dataType, void* data, 
+			int errBoundMode, double absErrBound, double relBoundRatio, double pwRelBoundRatio, 
+			size_t r5, size_t r4, size_t r3, size_t r2, size_t r1)
+{
+	SZ_Variable* node;
+	size_t nameBytes, elemCount, histBytes = 0;
+
+	/* lazily create the set; the header is a dummy node that only carries `next` */
+	if(sz_varset == NULL)
+	{
+		SZ_VarSet* set = (SZ_VarSet*)malloc(sizeof(SZ_VarSet));
+		set->header = (SZ_Variable*)malloc(sizeof(SZ_Variable));
+		set->header->next = NULL;
+		set->lastVar = set->header;
+		set->count = 0;
+		sz_varset = set;
+	}
+
+	node = (SZ_Variable*)malloc(sizeof(SZ_Variable));
+	memset(node, 0, sizeof(SZ_Variable));
+
+	nameBytes = strlen(varName)+1;
+	node->varName = (char*)malloc(nameBytes);
+	memcpy(node->varName, varName, nameBytes);
+
+	node->var_id = var_id;
+	node->dataType = dataType;
+	node->data = data;
+	node->r5 = r5;
+	node->r4 = r4;
+	node->r3 = r3;
+	node->r2 = r2;
+	node->r1 = r1;
+	node->errBoundMode = errBoundMode;
+	node->absErrBound = absErrBound;
+	node->relBoundRatio = relBoundRatio;
+	node->pwRelBoundRatio = pwRelBoundRatio;
+
+	node->multisteps = (sz_multisteps*)malloc(sizeof(sz_multisteps));
+	memset(node->multisteps, 0, sizeof(sz_multisteps));
+
+	elemCount = computeDataLength(r5, r4, r3, r2, r1);
+	if(dataType == SZ_FLOAT)
+		histBytes = sizeof(float)*elemCount;
+	else if(dataType == SZ_DOUBLE)
+		histBytes = sizeof(double)*elemCount;
+	/* other data types keep the NULL hist_data left by the memset above */
+	if(dataType == SZ_FLOAT || dataType == SZ_DOUBLE)
+	{
+		node->multisteps->hist_data = malloc(histBytes);
+		memset(node->multisteps->hist_data, 0, histBytes);
+	}
+	node->compressedBytes = NULL;
+	node->next = NULL;
+
+	/* link at the tail */
+	sz_varset->count ++;
+	sz_varset->lastVar->next = node;
+	sz_varset->lastVar = node;
+}
+
+/**
+ * Remove the variable with the given id from the global variable set.
+ * @param var_id identifier to look for
+ * @return the status returned by SZ_batchDelVar_ID_vset
+ */
+int SZ_batchDelVar_ID(int var_id)
+{
+	return SZ_batchDelVar_ID_vset(sz_varset, var_id);
+}
+
+/**
+ * Remove the variable with the given name from the global variable set.
+ * @param varName NUL-terminated name to look for
+ * @return the status returned by SZ_batchDelVar_vset
+ */
+int SZ_batchDelVar(char* varName)
+{
+	return SZ_batchDelVar_vset(sz_varset, varName);
+}
+
+/**
+ * Unlink and destroy the first variable in vset whose id equals var_id.
+ * The variable's original data buffer is kept (free_Variable_keepOriginalData).
+ * @param vset variable set to modify; NULL is treated as "not found"
+ * @param var_id identifier to look for
+ * @return SZ_SCES if a variable was removed, SZ_NSCS otherwise
+ */
+int SZ_batchDelVar_ID_vset(SZ_VarSet* vset, int var_id)
+{
+	if(vset == NULL)
+		return SZ_NSCS;
+
+	SZ_Variable* prev = vset->header;
+	SZ_Variable* cur = prev->next;
+	while(cur != NULL)
+	{
+		if(cur->var_id == var_id)
+		{
+			prev->next = cur->next;
+			/* Fix the tail pointer BEFORE freeing: the previous version read
+			 * cur->next after the node had already been released. */
+			if(cur->next == NULL) //cur is the last variable
+				vset->lastVar = prev;
+			//free_Variable_all(cur);
+			free_Variable_keepOriginalData(cur);
+			vset->count --;
+			return SZ_SCES;
+		}
+		prev = cur;
+		cur = cur->next;
+	}
+
+	return SZ_NSCS;
+}
+
+/**
+ * Unlink and destroy the first variable in vset whose name equals varName.
+ * The variable's original data buffer is kept (free_Variable_keepOriginalData).
+ * @param vset variable set to modify; NULL is treated as "not found"
+ * @param varName NUL-terminated name to look for
+ * @return SZ_SCES if a variable was removed, SZ_NSCS otherwise
+ */
+int SZ_batchDelVar_vset(SZ_VarSet* vset, char* varName)
+{
+	if(vset == NULL)
+		return SZ_NSCS;
+
+	SZ_Variable* prev = vset->header;
+	SZ_Variable* cur = prev->next;
+	while(cur != NULL)
+	{
+		if(strcmp(cur->varName, varName) == 0)
+		{
+			prev->next = cur->next;
+			/* Keep the tail pointer valid when the last node goes away; the
+			 * previous version left lastVar pointing at freed memory, so the
+			 * next append would write into it. */
+			if(cur->next == NULL)
+				vset->lastVar = prev;
+			//free_Variable_all(cur);
+			free_Variable_keepOriginalData(cur);
+			vset->count --;
+			return SZ_SCES;
+		}
+		prev = cur;
+		cur = cur->next;
+	}
+
+	return SZ_NSCS;
+}
+
+/**
+ * Look up a variable by name in the global variable set.
+ * @param varName NUL-terminated name to look for
+ * @return the first matching variable, or NULL if there is none or the set
+ *         has not been created yet
+ */
+SZ_Variable* SZ_searchVar(char* varName)
+{
+	SZ_Variable* p;
+
+	/* sz_varset stays NULL until the first SZ_batchAddVar call */
+	if(sz_varset == NULL)
+		return NULL;
+
+	for(p = sz_varset->header->next; p != NULL; p = p->next)
+	{
+		if(strcmp(p->varName, varName) == 0)
+			return p;
+	}
+	return NULL;
+}
+
+/**
+ * Fetch the data pointer and the five dimension sizes of a named variable.
+ * @param varName NUL-terminated name to look for
+ * @param r5 r4 r3 r2 r1 receive the variable's dimension sizes; left untouched
+ *        when the variable is not found
+ * @return the variable's data pointer, or NULL when no variable has that name
+ */
+void* SZ_getVarData(char* varName, size_t *r5, size_t *r4, size_t *r3, size_t *r2, size_t *r1)
+{
+	SZ_Variable* v = SZ_searchVar(varName);
+	/* SZ_searchVar returns NULL for unknown names; do not dereference it */
+	if(v == NULL)
+		return NULL;
+	*r5 = v->r5;
+	*r4 = v->r4;
+	*r3 = v->r3;
+	*r2 = v->r2;
+	*r1 = v->r1;
+	return (void*)v->data;
+}
+
+/**
+ * Destroy the global variable set.
+ * @param mode SZ_MAINTAIN_VAR_DATA (keep each variable's original data) or
+ *        SZ_DESTROY_WHOLE_VARSET (free the original data as well)
+ * */
+void SZ_freeVarSet(int mode)
+{
+	free_VarSet_vset(sz_varset, mode);
+	/* Reset the global so a later SZ_batchAddVar builds a fresh set instead of
+	 * appending to the memory that was just released. */
+	sz_varset = NULL;
+}
+
+//free_VarSet will completely destroy the SZ_VarSet, so don't do it until you really don't need it any more!
+/**
+ * Destroy a variable set: every variable, the dummy header node and the set itself.
+ * @param vset set to destroy; NULL is ignored
+ * @param mode SZ_MAINTAIN_VAR_DATA (keep each variable's original data) or
+ *        SZ_DESTROY_WHOLE_VARSET (free the original data as well); with any
+ *        other value the nodes are only unlinked, not freed
+ * */
+void free_VarSet_vset(SZ_VarSet *vset, int mode)
+{
+	if(vset==NULL)
+		return;
+	SZ_Variable *head = vset->header;
+	while(head->next!=NULL)
+	{
+		/* pop the first real node off the list, then release it */
+		SZ_Variable *victim = head->next;
+		head->next = victim->next;
+		if(mode==SZ_MAINTAIN_VAR_DATA)
+			free_Variable_keepOriginalData(victim);
+		else if(mode==SZ_DESTROY_WHOLE_VARSET)
+			free_Variable_all(victim);
+	}
+	/* do not leave the global pointing at the set being released */
+	if(vset == sz_varset)
+		sz_varset = NULL;
+	/* Free the header of the set passed in. The previous version freed
+	 * sz_varset->header, which is wrong whenever vset is not the global set. */
+	free(vset->header);
+	free(vset);
+}
+
+/**
+ * Release a multistep state object together with its history buffer.
+ * @param multisteps object to destroy; dereferenced unconditionally, so it must not be NULL
+ */
+void free_multisteps(sz_multisteps* multisteps)
+{
+	/* free(NULL) is a no-op, so an unset history buffer needs no guard */
+	free(multisteps->hist_data);
+	free(multisteps);
+}
+
+/**
+ * Test whether an id occurs in a list of ids.
+ * @param cur_var_id id to look for
+ * @param var_ids list of ids
+ * @param var_count number of entries in var_ids
+ * @return 1 if cur_var_id is among the first var_count entries, 0 otherwise
+ */
+inline int checkVarID(unsigned char cur_var_id, unsigned char* var_ids, int var_count)
+{
+	int idx = 0;
+	while(idx < var_count)
+	{
+		if(var_ids[idx] == cur_var_id)
+			return 1;
+		idx++;
+	}
+	return 0;
+}
+
+/**
+ * Look up a variable by id in the global variable set.
+ * @param var_id identifier to look for
+ * @return the first matching variable, or NULL if there is none or the set
+ *         has not been created yet
+ */
+SZ_Variable* SZ_getVariable(int var_id)
+{
+	SZ_Variable* p;
+
+	/* sz_varset stays NULL until the first SZ_batchAddVar call */
+	if(sz_varset == NULL)
+		return NULL;
+
+	for(p = sz_varset->header->next; p != NULL; p = p->next)
+	{
+		if(var_id == p->var_id)
+			return p;
+	}
+	return NULL;
+}
--- a/deps/SZ/sz/src/callZlib.c
+++ b/deps/SZ/sz/src/callZlib.c
@ -0,0 +1,527 @@
+/**
+ *  @file callZlib.c
+ *  @author Sheng Di
+ *  @date June, 2016
+ *  @brief gzip compressor code: the interface to call zlib
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <zlib.h>
+#include <sz.h>
+
+/* memLevel argument passed to deflateInit2() in this file: 8, unless
+ * MAX_MEM_LEVEL is smaller, in which case MAX_MEM_LEVEL is used. */
+#if MAX_MEM_LEVEL >= 8
+#define DEF_MEM_LEVEL 8
+#else
+#define DEF_MEM_LEVEL MAX_MEM_LEVEL
+#endif
+
+
+/* Report a zlib failure on stderr and make the ENCLOSING function return SZ_NSCS.
+ * Z_OK and Z_STREAM_END count as success; any other code triggers the return.
+ * The early return skips whatever cleanup follows the macro invocation. */
+#define CHECK_ERR(err, msg) { \
+    if (err != Z_OK && err != Z_STREAM_END) { \
+        fprintf(stderr, "%s error: %d\n", msg, err); \
+        return SZ_NSCS; \
+    } \
+}
+
+/**
+ * Check whether two leading bytes match one of the zlib header byte pairs
+ * recognised by this library.
+ * @param magic1 first byte of the stream
+ * @param magic2 second byte of the stream
+ * @return 1 if the pair is recognised, 0 otherwise
+ */
+int isZlibFormat(unsigned char magic1, unsigned char magic2)
+{
+	switch(magic1)
+	{
+	case 104:
+		/* accepted second bytes when the first byte is 104 */
+		return (magic2==5 || magic2==129 || magic2==222) ? 1 : 0;
+	case 120:
+		/* accepted second bytes when the first byte is 120 */
+		return (magic2==1 || magic2==94 || magic2==156 || magic2==218) ? 1 : 0;
+	default:
+		return 0;
+	}
+}
+
+/* zlib_compress() is only valid for medium-size data compression. */
+/**
+ * One-shot compression with zlib's compress2().
+ * The output buffer is sized with deflateBound() and allocated here.
+ * Prints an error and terminates the process if compress2() fails.
+ * @param data input bytes
+ * @param dataLength number of input bytes
+ * @param compressBytes receives the newly malloc'ed output buffer (caller frees)
+ * @param level zlib compression level
+ * @return number of compressed bytes written
+ */
+unsigned long zlib_compress(unsigned char* data, unsigned long dataLength, unsigned char** compressBytes, int level)
+{
+	/* zero-initialised stream, used only to ask zlib for an output size bound */
+	z_stream boundStream = {0};
+
+	boundStream.next_in = data;
+	boundStream.avail_in = dataLength;
+#ifdef MAXSEG_64K
+	/* Check for source > 64K on 16-bit machine: */
+	if ((uLong)boundStream.avail_in != dataLength) return Z_BUF_ERROR;
+#endif
+
+	unsigned long compressedSize = deflateBound(&boundStream, dataLength);
+	*compressBytes = (unsigned char*)malloc(sizeof(unsigned char)*compressedSize);
+
+	/* compress2 takes the capacity in compressedSize and overwrites it with the actual size */
+	int status = compress2(*compressBytes, &compressedSize, data, dataLength, level);
+	if(status != Z_OK)
+	{
+		printf("Error: err_code=%d; the reason may be your data size is too large (>=2^32), which cannot be compressed by standalone zlib_compress. Sol: inflace_init, ....\n", status);
+		exit(0);
+	}
+	return compressedSize;
+}
+
+/**
+ * One-shot deflate of `data` into a newly allocated buffer sized by deflateBound().
+ * The window size is 14 bits, or 15 when confparams_cpr->szMode is SZ_BEST_COMPRESSION.
+ * @param data input bytes
+ * @param dataLength number of input bytes
+ * @param compressBytes receives the malloc'ed output buffer
+ * @param level zlib compression level
+ * @return stream.total_out on success. On failure the zlib error code is returned
+ *         through the unsigned long return type.
+ * NOTE(review): on the failure paths the buffer stored in *compressBytes is not
+ * freed here, and callers cannot easily tell an error code from a size - confirm
+ * how callers handle this.
+ */
+unsigned long zlib_compress2(unsigned char* data, unsigned long dataLength, unsigned char** compressBytes, int level)
+{
+	unsigned long outSize;
+	
+	z_stream stream = {0};
+    int err;
+
+    stream.next_in = data;
+    stream.avail_in = dataLength;
+#ifdef MAXSEG_64K
+    /* Check for source > 64K on 16-bit machine: */
+    if ((uLong)stream.avail_in != dataLength) return Z_BUF_ERROR;
+#endif
+
+    /* NOTE(review): deflateBound() is called before deflateInit2() has set up the stream */
+    uLong estCmpLen = deflateBound(&stream, dataLength);
+	*compressBytes = (unsigned char*)malloc(sizeof(unsigned char)*estCmpLen);
+
+    stream.next_out = *compressBytes;
+    stream.avail_out = estCmpLen;
+    //stream.avail_out = dataLength*10;
+    //if ((uLong)stream.avail_out != dataLength*10) return Z_BUF_ERROR;
+
+    stream.zalloc = (alloc_func)0;
+    stream.zfree = (free_func)0;
+    stream.opaque = (voidpf)0;
+//	stream.data_type = Z_TEXT;
+
+    //err = deflateInit(&stream, level); //default  windowBits == 15.
+    int windowBits = 14; //8-15
+    if(confparams_cpr->szMode==SZ_BEST_COMPRESSION)
+		windowBits = 15;
+	
+    err = deflateInit2(&stream, level, Z_DEFLATED, windowBits, DEF_MEM_LEVEL,
+                         Z_DEFAULT_STRATEGY);//Z_FIXED); //Z_DEFAULT_STRATEGY
+    if (err != Z_OK) return err;
+
+    /* whole input in a single call; anything but Z_STREAM_END means it did not finish */
+    err = deflate(&stream, Z_FINISH);
+    if (err != Z_STREAM_END) {
+        deflateEnd(&stream);
+        return err == Z_OK ? Z_BUF_ERROR : err;
+    }
+
+    err = deflateEnd(&stream);
+    
+    outSize = stream.total_out;
+    return outSize;
+}
+
+/**
+ * One-shot deflate of `data` into a buffer supplied by the caller.
+ * The available output space is declared as dataLength bytes, so compressBytes
+ * must provide at least that much room; if the compressed stream does not fit,
+ * deflate() does not reach Z_STREAM_END and an error code is returned.
+ * The window size is 14 bits, or 15 when confparams_cpr->szMode is SZ_BEST_COMPRESSION.
+ * @param data input bytes
+ * @param dataLength number of input bytes (also used as the output capacity)
+ * @param compressBytes caller-owned output buffer
+ * @param level zlib compression level
+ * @return stream.total_out on success. On failure the zlib error code is returned
+ *         through the unsigned long return type.
+ */
+unsigned long zlib_compress3(unsigned char* data, unsigned long dataLength, unsigned char* compressBytes, int level)
+{
+	unsigned long outSize = 0;
+
+	z_stream stream = {0};
+    int err;
+
+    stream.next_in = data;
+    stream.avail_in = dataLength;
+#ifdef MAXSEG_64K
+    /* Check for source > 64K on 16-bit machine: */
+    if ((uLong)stream.avail_in != dataLength) return Z_BUF_ERROR;
+#endif
+
+    stream.next_out = compressBytes;
+    /* output capacity is assumed to equal the input size */
+    stream.avail_out = dataLength;
+    stream.zalloc = (alloc_func)0;
+    stream.zfree = (free_func)0;
+    stream.opaque = (voidpf)0;
+
+    //err = deflateInit(&stream, level); //default  windowBits == 15.
+    int windowBits = 14; //8-15
+    if(confparams_cpr->szMode==SZ_BEST_COMPRESSION)
+		windowBits = 15;
+
+    err = deflateInit2(&stream, level, Z_DEFLATED, windowBits, DEF_MEM_LEVEL,
+                         Z_DEFAULT_STRATEGY);//Z_FIXED); //Z_DEFAULT_STRATEGY
+    if (err != Z_OK) return err;
+
+    /* whole input in a single call; anything but Z_STREAM_END means it did not finish */
+    err = deflate(&stream, Z_FINISH);
+    if (err != Z_STREAM_END) {
+        deflateEnd(&stream);
+        return err == Z_OK ? Z_BUF_ERROR : err;
+    }
+
+    err = deflateEnd(&stream);
+
+    outSize = stream.total_out;
+    return outSize;
+}
+
+/**
+ * Deflate `data` in chunks of at most SZ_ZLIB_BUFFER_SIZE bytes into a newly
+ * allocated buffer sized by deflateBound().
+ * The window size is 14 bits, or 15 when confparams_cpr->szMode is SZ_BEST_COMPRESSION.
+ * @param data input bytes
+ * @param dataLength number of input bytes
+ * @param compressBytes receives the malloc'ed output buffer (caller frees)
+ * @param level zlib compression level
+ * @return number of compressed bytes, or SZ_NSCS (converted to unsigned long)
+ *         after a zlib error has been reported on stderr
+ */
+unsigned long zlib_compress4(unsigned char* data, unsigned long dataLength, unsigned char** compressBytes, int level)
+{
+    z_stream c_stream = {0}; /* compression stream */
+    int err = 0;
+
+    c_stream.zalloc = (alloc_func)0;
+    c_stream.zfree = (free_func)0;
+    c_stream.opaque = (voidpf)0;
+
+    int windowBits = 14; //8-15
+    if(confparams_cpr->szMode==SZ_BEST_COMPRESSION)
+		windowBits = 15;
+    
+    err = deflateInit2(&c_stream, level, Z_DEFLATED, windowBits, DEF_MEM_LEVEL,
+                         Z_DEFAULT_STRATEGY);//Z_FIXED); //Z_DEFAULT_STRATEGY
+    CHECK_ERR(err, "deflateInit");
+
+    uLong estCmpLen = deflateBound(&c_stream, dataLength);
+	*compressBytes = (unsigned char*)malloc(sizeof(unsigned char)*estCmpLen);	
+
+    c_stream.next_in  = data;
+    c_stream.next_out = *compressBytes;
+
+    while (c_stream.total_in < dataLength && c_stream.total_out < estCmpLen) {
+        /* Offer zlib only what really remains. The previous version always
+         * announced SZ_ZLIB_BUFFER_SIZE bytes of input and output, so the last
+         * chunk made deflate() read past the end of `data`. */
+        uLong inLeft = dataLength - c_stream.total_in;
+        uLong outLeft = estCmpLen - c_stream.total_out;
+        c_stream.avail_in = (uInt)(inLeft < SZ_ZLIB_BUFFER_SIZE ? inLeft : SZ_ZLIB_BUFFER_SIZE);
+        c_stream.avail_out = (uInt)(outLeft < SZ_ZLIB_BUFFER_SIZE ? outLeft : SZ_ZLIB_BUFFER_SIZE);
+        err = deflate(&c_stream, Z_NO_FLUSH);
+        if (err != Z_OK && err != Z_STREAM_END) {
+            fprintf(stderr, "%s error: %d\n", "deflate", err);
+            (void)deflateEnd(&c_stream); /* release zlib's internal state */
+            return SZ_NSCS;
+        }
+    }
+    /* Finish the stream, still forcing small buffers: */
+    for (;;) {
+        c_stream.avail_out = 1;
+        err = deflate(&c_stream, Z_FINISH);
+        if (err == Z_STREAM_END) break;
+        if (err != Z_OK) {
+            fprintf(stderr, "%s error: %d\n", "deflate", err);
+            (void)deflateEnd(&c_stream); /* release zlib's internal state */
+            return SZ_NSCS;
+        }
+    }
+
+    err = deflateEnd(&c_stream);
+    CHECK_ERR(err, "deflateEnd");
+    
+    return c_stream.total_out;	
+}
+
+/**
+ * Deflate `data` by feeding it to zlib in chunks of SZ_ZLIB_BUFFER_SIZE bytes.
+ * The output buffer is sized by deflateBound() and allocated here; the default
+ * deflateInit() settings are used.
+ * @param data input bytes
+ * @param dataLength number of input bytes
+ * @param compressBytes receives the malloc'ed output buffer
+ * @param level zlib compression level
+ * @return strm.total_out, or the deflateInit() error code (through the unsigned
+ *         long return type) if initialisation fails
+ * NOTE(review): the return value of deflate() is not checked inside the loop.
+ */
+unsigned long zlib_compress5(unsigned char* data, unsigned long dataLength, unsigned char** compressBytes, int level)
+{
+	int ret, flush;
+	unsigned have;
+	z_stream strm;
+	unsigned char* in = data;
+
+	/* allocate deflate state */
+	strm.zalloc = Z_NULL;
+	strm.zfree = Z_NULL;
+	strm.opaque = Z_NULL;
+	ret = deflateInit(&strm, level);
+	//int windowBits = 15;
+    //ret = deflateInit2(&strm, level, Z_DEFLATED, windowBits, DEF_MEM_LEVEL, Z_DEFAULT_STRATEGY);//Z_FIXED); //Z_DEFAULT_STRATEGY
+
+	if (ret != Z_OK)
+		return ret;
+
+	/* p_size: input offered so far (rounded up to whole chunks); av_in: size of the current chunk */
+	size_t p_size = 0, av_in = 0;
+    uLong estCmpLen = deflateBound(&strm, dataLength);
+   	*compressBytes = (unsigned char*)malloc(sizeof(unsigned char)*estCmpLen);	
+	unsigned char* out = *compressBytes; 
+
+	/* compress until end of file */
+	do {		
+		p_size += SZ_ZLIB_BUFFER_SIZE;
+		if(p_size>=dataLength)
+		{
+			/* last chunk: only the remaining bytes, and tell zlib to finish */
+			av_in = dataLength - (p_size - SZ_ZLIB_BUFFER_SIZE);
+			flush = Z_FINISH;
+		}
+		else
+		{
+			av_in = SZ_ZLIB_BUFFER_SIZE;
+			flush = Z_NO_FLUSH;
+		}
+		strm.avail_in = av_in;
+		strm.next_in = in;
+
+		/* run deflate() on input until output buffer not full, finish
+		   compression if all of source has been read in */
+		do {
+			strm.avail_out = SZ_ZLIB_BUFFER_SIZE;
+			strm.next_out = out;
+			ret = deflate(&strm, flush);    /* no bad return value */
+
+			/* advance the write position by the bytes just produced */
+			have = SZ_ZLIB_BUFFER_SIZE - strm.avail_out;
+			out += have;
+		} while (strm.avail_out == 0);
+
+		in+=av_in;
+
+		/* done when last data in file processed */
+	} while (flush != Z_FINISH);
+
+	/* clean up and return */
+	(void)deflateEnd(&strm);	
+	
+	return strm.total_out;	
+}
+
+/**
+ * One-shot decompression with zlib's uncompress().
+ * Prints an error and terminates the process if uncompress() fails.
+ * @param compressBytes compressed input
+ * @param cmpSize number of compressed bytes
+ * @param oriData receives a newly malloc'ed buffer of targetOriSize bytes (caller frees)
+ * @param targetOriSize capacity of the output buffer
+ * @return number of decompressed bytes
+ */
+unsigned long zlib_uncompress(unsigned char* compressBytes, unsigned long cmpSize, unsigned char** oriData, unsigned long targetOriSize)
+{
+	/* uncompress() takes the capacity here and overwrites it with the actual size */
+	unsigned long producedSize = targetOriSize;
+	int status;
+
+	*oriData = (unsigned char*)malloc(sizeof(unsigned char)*targetOriSize);
+	status = uncompress(*oriData, &producedSize, compressBytes, cmpSize);
+	if(status == Z_OK)
+		return producedSize;
+
+	printf("Error: Zlib decompression error; status=%d\n", status);
+	exit(0);
+}
+
+/**
+ * One-shot inflate of a complete zlib stream into a newly allocated buffer.
+ * @param compressBytes compressed input
+ * @param cmpSize number of compressed bytes
+ * @param oriData receives a malloc'ed buffer of targetOriSize bytes
+ * @param targetOriSize capacity of the output buffer
+ * @return stream.total_out on success; SZ_NSCS or a zlib error code (both
+ *         through the unsigned long return type) on failure
+ * NOTE(review): the buffer stored in *oriData is not freed on the failure paths -
+ * confirm whether callers release it.
+ */
+unsigned long zlib_uncompress2 (unsigned char* compressBytes, unsigned long cmpSize, unsigned char** oriData, unsigned long targetOriSize)
+{
+    z_stream stream = {0};
+
+	unsigned long outSize;
+	*oriData = (unsigned char*)malloc(sizeof(unsigned char)*targetOriSize);
+
+    stream.zalloc = Z_NULL;
+    stream.zfree = Z_NULL;
+    stream.opaque = Z_NULL;
+//	stream.data_type = Z_TEXT;
+
+    stream.next_in = compressBytes;
+    stream.avail_in = cmpSize;
+    /* Check for source > 64K on 16-bit machine: */
+    /* (more generally: detects cmpSize not fitting into avail_in) */
+    if ((unsigned long)stream.avail_in != cmpSize) 
+    {
+		printf("Error: zlib_uncompress2: stream.avail_in != cmpSize");
+		//exit(1);
+		return SZ_NSCS; //-1
+	}
+
+    stream.next_out = *oriData;
+    stream.avail_out = targetOriSize;
+    //if ((uLong)stream.avail_out != *destLen) return Z_BUF_ERROR;
+
+    int err = inflateInit(&stream);
+    //int windowBits = 15;
+    //int err = inflateInit2(&stream, windowBits);
+    if (err != Z_OK)
+    {
+		printf("Error: zlib_uncompress2: err != Z_OK\n");
+		return SZ_NSCS;
+	}
+
+    /* whole stream in a single call; anything but Z_STREAM_END is a failure */
+    err = inflate(&stream, Z_FINISH);
+    if (err != Z_STREAM_END) {
+        inflateEnd(&stream);
+        /* a dictionary request, or running out of input, is reported as corrupt data */
+        if (err == Z_NEED_DICT || (err == Z_BUF_ERROR && stream.avail_in == 0))
+            return Z_DATA_ERROR;
+        return err;
+    }
+    outSize = stream.total_out;
+    inflateEnd(&stream);
+    return outSize;
+}
+
+/**
+ * Inflate a zlib stream in steps of SZ_ZLIB_BUFFER_SIZE bytes into a buffer
+ * that starts at targetOriSize bytes and may be reallocated.
+ * Prints an error and terminates the process if inflate() fails or the
+ * reallocation fails.
+ * @param compressBytes compressed input
+ * @param cmpSize number of compressed bytes; not used by this function
+ * @param oriData receives the malloc'ed (possibly realloc'ed) output buffer
+ * @param targetOriSize initial capacity of the output buffer
+ * @return z_strm.total_out, or SZ_NSCS (through CHECK_ERR) if inflateInit or
+ *         inflateEnd reports an error
+ * NOTE(review): avail_in is set to SZ_ZLIB_BUFFER_SIZE on every pass regardless
+ * of cmpSize, so zlib may be told more input is available than exists - confirm.
+ * NOTE(review): avail_out is reset to SZ_ZLIB_BUFFER_SIZE on every pass, and the
+ * growth path reallocates to `nalloc` (starting from 65536*4) independently of
+ * targetOriSize; this looks able to shrink the buffer - confirm before relying on it.
+ */
+unsigned long zlib_uncompress3(unsigned char* compressBytes, unsigned long cmpSize, unsigned char** oriData, unsigned long targetOriSize)
+{
+	int status;
+	z_stream z_strm; /* decompression stream */
+	
+	size_t nalloc = 65536*4;
+
+	*oriData = (unsigned char*)malloc(sizeof(unsigned char)*targetOriSize);		
+	memset(&z_strm, 0, sizeof(z_strm));
+
+
+    /*d_stream.zalloc = (alloc_func)0;
+    d_stream.zfree = (free_func)0;
+    d_stream.opaque = (voidpf)0;*/
+
+	z_strm.next_in  = compressBytes;
+	z_strm.avail_in = 0;
+	z_strm.next_out = *oriData;
+	z_strm.avail_out = targetOriSize;
+	
+	status = inflateInit(&z_strm);
+	CHECK_ERR(status, "inflateInit");
+	
+	do{
+		z_strm.avail_in = z_strm.avail_out = SZ_ZLIB_BUFFER_SIZE; /* force small buffers */		
+		/* Uncompress some data */
+		status = inflate(&z_strm, Z_SYNC_FLUSH);
+		
+		/* Check if we are done uncompressing data */
+		if (Z_STREAM_END==status)
+			break;  /*done*/				
+
+		if (Z_OK!=status) {
+			(void)inflateEnd(&z_strm);
+			printf("Error: inflate() failed\n");
+			exit(0);
+		}	
+		else
+		{
+			/* If we're not done and just ran out of buffer space, get more */
+			if(0 == z_strm.avail_out) {
+				void *new_outbuf;         /* Pointer to new output buffer */
+
+				/* Allocate a buffer twice as big */
+				nalloc *= 2;
+				if(NULL == (new_outbuf = realloc(*oriData, nalloc))) {
+					(void)inflateEnd(&z_strm);
+					printf("Error: memory allocation failed for deflate uncompression\n");
+					exit(0);
+				} /* end if */
+				*oriData = new_outbuf;
+
+				/* Update pointers to buffer for next set of uncompressed data */
+				z_strm.next_out = (*oriData) + z_strm.total_out;
+				z_strm.avail_out = (uInt)(nalloc - z_strm.total_out);
+			} /* end if */			
+		} /* end else*/
+	}while(status==Z_OK);
+
+	status = inflateEnd(&z_strm);
+	CHECK_ERR(status, "inflateEnd");
+
+	return z_strm.total_out;
+}
+
+/**
+ * Inflate a zlib stream by feeding the input to zlib in chunks of
+ * SZ_ZLIB_BUFFER_SIZE bytes, writing into a newly allocated buffer.
+ * @param compressBytes compressed input
+ * @param cmpSize number of compressed bytes
+ * @param oriData receives a malloc'ed buffer of targetOriSize bytes
+ * @param targetOriSize size of the allocated output buffer
+ * @return strm.total_out, or a zlib error code (through the unsigned long
+ *         return type) if inflateInit() or inflate() fails
+ * NOTE(review): each inflate() call is offered SZ_ZLIB_BUFFER_SIZE bytes of output
+ * space without reference to targetOriSize - confirm the buffer is always large enough.
+ */
+unsigned long zlib_uncompress4(unsigned char* compressBytes, unsigned long cmpSize, unsigned char** oriData, unsigned long targetOriSize)
+{
+    int ret;
+    unsigned int have;
+    z_stream strm;
+    unsigned char *in = compressBytes;
+    unsigned char *out;
+
+	*oriData = (unsigned char*)malloc(sizeof(unsigned char)*targetOriSize);		
+	out = *oriData;
+
+    /* allocate inflate state */
+    strm.zalloc = Z_NULL;
+    strm.zfree = Z_NULL;
+    strm.opaque = Z_NULL;
+    strm.avail_in = 0;
+    strm.next_in = Z_NULL;
+    ret = inflateInit(&strm);
+    if (ret != Z_OK)
+	{
+        return ret;
+	}
+
+	/* p_size: input offered so far (rounded up to whole chunks); av_in: size of the current chunk */
+	size_t p_size = 0, av_in = 0;
+    /* decompress until deflate stream ends or end of file */
+    do {
+		p_size += SZ_ZLIB_BUFFER_SIZE;
+		if(p_size>cmpSize)
+			av_in = cmpSize - (p_size - SZ_ZLIB_BUFFER_SIZE);
+		else
+			av_in = SZ_ZLIB_BUFFER_SIZE;
+		strm.avail_in = av_in;
+        
+        /* input exhausted before the stream ended */
+        if (strm.avail_in == 0)
+            break;
+        strm.next_in = in;
+
+        /* run inflate() on input until output buffer not full */
+        do {
+            strm.avail_out = SZ_ZLIB_BUFFER_SIZE;
+            strm.next_out = out;
+            ret = inflate(&strm, Z_NO_FLUSH);
+            //assert(ret != Z_STREAM_ERROR);  /* state not clobbered */
+            switch (ret) {
+            case Z_NEED_DICT:
+                ret = Z_DATA_ERROR;     /* and fall through */
+            case Z_DATA_ERROR:
+            case Z_MEM_ERROR:
+                (void)inflateEnd(&strm);
+                return ret;
+            }
+            /* advance the write position by the bytes just produced */
+            have = SZ_ZLIB_BUFFER_SIZE - strm.avail_out;
+            
+            out += have;
+
+        } while (strm.avail_out == 0);
+		
+		in+=av_in;
+        /* done when inflate() says it's done */
+    } while (ret != Z_STREAM_END);
+
+    /* clean up and return */
+    (void)inflateEnd(&strm);
+    
+    return strm.total_out;	
+}
+
+/**
+ * Inflate at most the first 65536 bytes of a zlib stream into a newly
+ * allocated 65536-byte buffer. A decoding error is not fatal: whatever was
+ * produced before the error is kept.
+ * @param compressBytes compressed input
+ * @param cmpSize number of compressed bytes
+ * @param oriData receives the malloc'ed 65536-byte output buffer (caller frees)
+ * @return number of bytes produced, or SZ_NSCS (through CHECK_ERR) if
+ *         inflateInit or inflateEnd reports an error
+ */
+unsigned long zlib_uncompress65536bytes(unsigned char* compressBytes, unsigned long cmpSize, unsigned char** oriData)
+{
+	int err;
+	unsigned long targetOriSize = 65536;
+	z_stream d_stream = {0}; /* decompression stream */
+
+	*oriData = (unsigned char*)malloc(sizeof(unsigned char)*targetOriSize);
+
+    d_stream.zalloc = (alloc_func)0;
+    d_stream.zfree = (free_func)0;
+    d_stream.opaque = (voidpf)0;
+
+	d_stream.next_in  = compressBytes;
+	d_stream.avail_in = 0;
+	d_stream.next_out = *oriData;
+
+	err = inflateInit(&d_stream);
+	CHECK_ERR(err, "inflateInit");
+
+	while (d_stream.total_out < targetOriSize && d_stream.total_in < cmpSize) {
+		/* Offer zlib only what really remains on both sides. The previous version
+		 * always announced SZ_ZLIB_BUFFER_SIZE bytes, which let inflate() read past
+		 * the input and, for large buffer sizes, write past the 65536-byte output. */
+		unsigned long inLeft = cmpSize - d_stream.total_in;
+		unsigned long outLeft = targetOriSize - d_stream.total_out;
+		d_stream.avail_in = (uInt)(inLeft < SZ_ZLIB_BUFFER_SIZE ? inLeft : SZ_ZLIB_BUFFER_SIZE);
+		d_stream.avail_out = (uInt)(outLeft < SZ_ZLIB_BUFFER_SIZE ? outLeft : SZ_ZLIB_BUFFER_SIZE);
+		//err = inflate(&d_stream, Z_NO_FLUSH);
+		err = inflate(&d_stream, Z_SYNC_FLUSH);
+		/* stop on end of stream, on errors, and on Z_NEED_DICT (which would
+		 * otherwise repeat forever without making progress) */
+		if (err != Z_OK)
+			break;
+	}
+	
+	if(err<0)
+	{
+		/* keep the partial output, but release zlib's internal state first */
+		unsigned long partial = d_stream.total_out;
+		(void)inflateEnd(&d_stream);
+		return partial;
+	}
+	err = inflateEnd(&d_stream);
+	
+	CHECK_ERR(err, "inflateEnd");
+
+	return d_stream.total_out;
+}
+
+/**
+ * Inflate a zlib stream in steps of at most SZ_ZLIB_BUFFER_SIZE bytes into a
+ * newly allocated buffer of targetOriSize bytes.
+ * @param compressBytes compressed input
+ * @param cmpSize number of compressed bytes
+ * @param oriData receives the malloc'ed output buffer (caller frees)
+ * @param targetOriSize capacity of the output buffer
+ * @return number of bytes produced, or SZ_NSCS (converted to unsigned long)
+ *         after a zlib error has been reported on stderr
+ */
+unsigned long zlib_uncompress5(unsigned char* compressBytes, unsigned long cmpSize, unsigned char** oriData, unsigned long targetOriSize)
+{
+	int err;
+	z_stream d_stream = {0}; /* decompression stream */
+
+	*oriData = (unsigned char*)malloc(sizeof(unsigned char)*targetOriSize);		
+
+    d_stream.zalloc = (alloc_func)0;
+    d_stream.zfree = (free_func)0;
+    d_stream.opaque = (voidpf)0;
+
+	d_stream.next_in  = compressBytes;
+	d_stream.avail_in = 0;
+	d_stream.next_out = *oriData;
+
+	err = inflateInit(&d_stream);
+	CHECK_ERR(err, "inflateInit");
+
+	while (d_stream.total_out < targetOriSize && d_stream.total_in < cmpSize) {
+		/* Offer zlib only what really remains on both sides. The previous version
+		 * always announced SZ_ZLIB_BUFFER_SIZE bytes, which let inflate() read past
+		 * the input and write past the end of the output buffer. */
+		unsigned long inLeft = cmpSize - d_stream.total_in;
+		unsigned long outLeft = targetOriSize - d_stream.total_out;
+		d_stream.avail_in = (uInt)(inLeft < SZ_ZLIB_BUFFER_SIZE ? inLeft : SZ_ZLIB_BUFFER_SIZE);
+		d_stream.avail_out = (uInt)(outLeft < SZ_ZLIB_BUFFER_SIZE ? outLeft : SZ_ZLIB_BUFFER_SIZE);
+		//err = inflate(&d_stream, Z_NO_FLUSH);
+		err = inflate(&d_stream, Z_SYNC_FLUSH);
+		if (err == Z_STREAM_END) break;
+		if (err != Z_OK) {
+			fprintf(stderr, "%s error: %d\n", "inflate", err);
+			(void)inflateEnd(&d_stream); /* release zlib's internal state */
+			return SZ_NSCS;
+		}
+	}
+	
+	err = inflateEnd(&d_stream);
+	
+	CHECK_ERR(err, "inflateEnd");
+
+	return d_stream.total_out;
+}
--- a/deps/SZ/sz/src/conf.c
+++ b/deps/SZ/sz/src/conf.c
@ -0,0 +1,459 @@
+/**
+ *  @file   conf.c
+ *  @author Sheng Di (sdi1@anl.gov or disheng222@gmail.com)
+ *  @date   2015.
+ *  @brief  Configuration loading functions for the SZ library.
+ *  (C) 2015 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <math.h>
+#include "string.h"
+#include "sz.h"
+#include "iniparser.h"
+#include "Huffman.h"
+#include "pastri.h"
+
+/*-------------------------------------------------------------------------*/
+/**
+    @brief      It reads the configuration given in the configuration file.
+    @return     integer         1 if successful.
+
+    This function reads the configuration given in the SZ configuration
+    file and sets other required parameters.
+
+ **/
+ 
+/*struct node_t *pool;
+node *qqq;
+node *qq;
+int n_nodes = 0, qend;
+unsigned long **code;
+unsigned char *cout;
+int n_inode;*/ 
+ 
+/* Round base up to the next power of two by smearing the highest set bit of
+ * (base-1) into every lower position (shifts 1..16, i.e. 32 bits are covered).
+ * A base that is already a power of two is returned unchanged; base 0 yields 0. */
+unsigned int roundUpToPowerOf2(unsigned int base)
+{
+  unsigned int smeared = base - 1;
+  unsigned int shift;
+
+  for (shift = 1; shift <= 16; shift <<= 1)
+    smeared |= smeared >> shift;
+
+  return smeared + 1;
+} 
+ 
+/* Store the number of quantization intervals, and half of it as the radius,
+ * in the global exe_params. */
+void updateQuantizationInfo(int quant_intervals)
+{
+	const int halfIntervals = quant_intervals/2;
+
+	exe_params->intvCapacity = quant_intervals;
+	exe_params->intvRadius = halfIntervals;
+} 
+ 
+double computeABSErrBoundFromPSNR(double psnr, double threshold, double value_range)
+{
+	double v1 = psnr + 10 * log10(1-2.0/3.0*threshold);
+	double v2 = v1/(-20);
+	double v3 = pow(10, v2);
+	return value_range * v3;
+} 
+
+double computeABSErrBoundFromNORM_ERR(double normErr, size_t nbEle)
+{
+	return sqrt(3.0/nbEle)*normErr;
+} 
+
+ 
+/*-------------------------------------------------------------------------*/
+/**
+ * Load the SZ configuration into the global confparams_cpr / exe_params
+ * structures (both are freshly allocated here), and detect the endianness of
+ * the host (sysEndianType).
+ *
+ * @param sz_cfgFile path of the configuration file; when NULL the built-in
+ *        default settings are used and no file is read
+ * @return the status of loading conf. file: SZ_SCES (success) or SZ_NSCS (error)
+ * */
+int SZ_ReadConf(const char* sz_cfgFile) {
+    // Check access to SZ configuration file and load dictionary
+    //record the setting in confparams_cpr
+    confparams_cpr = (sz_params*)malloc(sizeof(sz_params));    
+    exe_params = (sz_exedata*)malloc(sizeof(sz_exedata));
+    if(confparams_cpr == NULL || exe_params == NULL)
+        return SZ_NSCS; //out of memory: nothing below may touch the structures
+    
+    int x = 1;
+    char sol_name[256];
+    char *modeBuf;
+    char *errBoundMode;
+    char *endianTypeString;
+    dictionary *ini;
+    char *par;
+
+	//the lowest-addressed byte of the integer 1 is 1 only on little-endian hosts
+	char *y = (char*)&x;
+	
+	if(*y==1)
+		sysEndianType = LITTLE_ENDIAN_SYSTEM;
+	else //=0
+		sysEndianType = BIG_ENDIAN_SYSTEM;
+    
+    confparams_cpr->plus_bits = 3;
+    
+    if(sz_cfgFile == NULL)
+    {
+		//no configuration file: fall back to the built-in defaults
+		dataEndianType = LITTLE_ENDIAN_DATA;
+		confparams_cpr->sol_ID = SZ;
+		confparams_cpr->max_quant_intervals = 65536;
+		confparams_cpr->maxRangeRadius = confparams_cpr->max_quant_intervals/2;
+				
+		exe_params->intvCapacity = confparams_cpr->maxRangeRadius*2;
+		exe_params->intvRadius = confparams_cpr->maxRangeRadius;
+		
+		confparams_cpr->quantization_intervals = 0;
+		exe_params->optQuantMode = 1;
+		confparams_cpr->predThreshold = 0.99;
+		confparams_cpr->sampleDistance = 100;
+		
+		confparams_cpr->szMode = SZ_BEST_COMPRESSION;
+		confparams_cpr->losslessCompressor = ZSTD_COMPRESSOR; //other option: GZIP_COMPRESSOR;
+		if(confparams_cpr->losslessCompressor==ZSTD_COMPRESSOR)
+			confparams_cpr->gzipMode = 3; //fast mode
+		else
+			confparams_cpr->gzipMode = 1; //high speed mode
+		
+		confparams_cpr->errorBoundMode = PSNR;
+		confparams_cpr->psnr = 90;
+		confparams_cpr->absErrBound = 1E-4;
+		confparams_cpr->relBoundRatio = 1E-4;
+		confparams_cpr->accelerate_pw_rel_compression = 1;
+		
+		confparams_cpr->pw_relBoundRatio = 1E-3;
+		confparams_cpr->segment_size = 36;
+		
+		confparams_cpr->pwr_type = SZ_PWR_MIN_TYPE;
+		
+		confparams_cpr->snapshotCmprStep = 5;
+		
+		confparams_cpr->withRegression = SZ_WITH_LINEAR_REGRESSION;
+	
+		confparams_cpr->randomAccess = 0; //0: no random access , 1: support random access
+	
+		confparams_cpr->protectValueRange = 0;
+	
+		return SZ_SCES;
+	}
+    
+    if (access(sz_cfgFile, F_OK) != 0)
+    {
+        printf("[SZ] Configuration file NOT accessible.\n");
+        return SZ_NSCS;
+    }
+    
+    //printf("[SZ] Reading SZ configuration file (%s) ...\n", sz_cfgFile);    
+    ini = iniparser_load(sz_cfgFile);
+    if (ini == NULL)
+    {
+        printf("[SZ] Iniparser failed to parse the conf. file.\n");
+        return SZ_NSCS;
+    }
+
+	//from here on, every return path must release the dictionary
+	endianTypeString = iniparser_getstring(ini, "ENV:dataEndianType", "LITTLE_ENDIAN_DATA");
+	if(strcmp(endianTypeString, "LITTLE_ENDIAN_DATA")==0)
+		dataEndianType = LITTLE_ENDIAN_DATA;
+	else if(strcmp(endianTypeString, "BIG_ENDIAN_DATA")==0)
+		dataEndianType = BIG_ENDIAN_DATA;
+	else
+	{
+		printf("Error: Wrong dataEndianType: please set it correctly in sz.config.\n");
+		iniparser_freedict(ini);
+		return SZ_NSCS;
+	}
+
+	// Reading/setting detection parameters
+	
+	//the default is NULL, so a missing key must not be handed to "%s" as-is
+	par = iniparser_getstring(ini, "ENV:sol_name", NULL);
+	snprintf(sol_name, 256, "%s", par != NULL ? par : "(null)");
+	
+    if(strcmp(sol_name, "SZ")==0)
+		confparams_cpr->sol_ID = SZ;
+	else if(strcmp(sol_name, "PASTRI")==0)
+		confparams_cpr->sol_ID = PASTRI;
+	else if(strcmp(sol_name, "SZ_Transpose")==0)
+		confparams_cpr->sol_ID = SZ_Transpose;
+	else{
+		printf("[SZ] Error: wrong solution name (please check sz.config file), sol=%s\n", sol_name);
+		iniparser_freedict(ini);
+		return SZ_NSCS;
+	}
+	
+	if(confparams_cpr->sol_ID==SZ || confparams_cpr->sol_ID==SZ_Transpose)
+	{
+		int max_quant_intervals = iniparser_getint(ini, "PARAMETER:max_quant_intervals", 65536);
+		confparams_cpr->max_quant_intervals = max_quant_intervals;
+		
+		int quantization_intervals = (int)iniparser_getint(ini, "PARAMETER:quantization_intervals", 0);
+		confparams_cpr->quantization_intervals = quantization_intervals;
+		if(quantization_intervals>0)
+		{
+			//a fixed interval count was requested: it overrides max_quant_intervals
+			updateQuantizationInfo(quantization_intervals);
+			confparams_cpr->max_quant_intervals = max_quant_intervals = quantization_intervals;
+			exe_params->optQuantMode = 0;
+		}
+		else //==0
+		{
+			confparams_cpr->maxRangeRadius = max_quant_intervals/2;
+
+			exe_params->intvCapacity = confparams_cpr->maxRangeRadius*2;
+			exe_params->intvRadius = confparams_cpr->maxRangeRadius;
+			
+			exe_params->optQuantMode = 1;
+		}
+		
+		if(quantization_intervals%2!=0)
+		{
+			printf("Error: quantization_intervals must be an even number!\n");
+			iniparser_freedict(ini);
+			return SZ_NSCS;
+		}
+		
+		confparams_cpr->predThreshold = (float)iniparser_getdouble(ini, "PARAMETER:predThreshold", 0);
+		confparams_cpr->sampleDistance = (int)iniparser_getint(ini, "PARAMETER:sampleDistance", 0);
+		
+		modeBuf = iniparser_getstring(ini, "PARAMETER:szMode", NULL);
+		if(modeBuf==NULL)
+		{
+			printf("[SZ] Error: Null szMode setting (please check sz.config file)\n");
+			iniparser_freedict(ini);
+			return SZ_NSCS;					
+		}
+		else if(strcmp(modeBuf, "SZ_BEST_SPEED")==0)
+			confparams_cpr->szMode = SZ_BEST_SPEED;
+		else if(strcmp(modeBuf, "SZ_DEFAULT_COMPRESSION")==0)
+			confparams_cpr->szMode = SZ_DEFAULT_COMPRESSION;
+		else if(strcmp(modeBuf, "SZ_BEST_COMPRESSION")==0)
+			confparams_cpr->szMode = SZ_BEST_COMPRESSION;
+		else
+		{
+			printf("[SZ] Error: Wrong szMode setting (please check sz.config file)\n");
+			iniparser_freedict(ini);
+			return SZ_NSCS;	
+		}
+		
+		modeBuf = iniparser_getstring(ini, "PARAMETER:losslessCompressor", "ZSTD_COMPRESSOR");
+		if(strcmp(modeBuf, "GZIP_COMPRESSOR")==0)
+			confparams_cpr->losslessCompressor = GZIP_COMPRESSOR;
+		else if(strcmp(modeBuf, "ZSTD_COMPRESSOR")==0)
+			confparams_cpr->losslessCompressor = ZSTD_COMPRESSOR;
+		else
+		{
+			printf("[SZ] Error: Wrong losslessCompressor setting (please check sz.config file)\n");
+			printf("No Such a lossless compressor: %s\n", modeBuf);
+			iniparser_freedict(ini);
+			return SZ_NSCS;	
+		}		
+		
+		modeBuf = iniparser_getstring(ini, "PARAMETER:withLinearRegression", "YES");
+		if(strcmp(modeBuf, "YES")==0 || strcmp(modeBuf, "yes")==0)
+			confparams_cpr->withRegression = SZ_WITH_LINEAR_REGRESSION;
+		else
+			confparams_cpr->withRegression = SZ_NO_REGRESSION;
+		
+		modeBuf = iniparser_getstring(ini, "PARAMETER:gzipMode", "Gzip_BEST_SPEED");
+		if(modeBuf==NULL)
+		{
+			printf("[SZ] Error: Null Gzip mode setting (please check sz.config file)\n");
+			iniparser_freedict(ini);
+			return SZ_NSCS;					
+		}		
+		else if(strcmp(modeBuf, "Gzip_NO_COMPRESSION")==0)
+			confparams_cpr->gzipMode = 0;
+		else if(strcmp(modeBuf, "Gzip_BEST_SPEED")==0)
+			confparams_cpr->gzipMode = 1;
+		else if(strcmp(modeBuf, "Gzip_BEST_COMPRESSION")==0)
+			confparams_cpr->gzipMode = 9;
+		else if(strcmp(modeBuf, "Gzip_DEFAULT_COMPRESSION")==0)
+			confparams_cpr->gzipMode = -1;
+		else
+		{
+			printf("[SZ] Error: Wrong gzip Mode (please check sz.config file)\n");
+			iniparser_freedict(ini);
+			return SZ_NSCS;
+		}
+		
+		//the zstd level is stored in the same gzipMode field and therefore
+		//overrides whatever the gzipMode key selected just above
+		modeBuf = iniparser_getstring(ini, "PARAMETER:zstdMode", "Zstd_HIGH_SPEED");		
+		if(modeBuf==NULL)
+		{
+			printf("[SZ] Error: Null Zstd mode setting (please check sz.config file)\n");
+			iniparser_freedict(ini);
+			return SZ_NSCS;					
+		}		
+		else if(strcmp(modeBuf, "Zstd_BEST_SPEED")==0)
+			confparams_cpr->gzipMode = 1;
+		else if(strcmp(modeBuf, "Zstd_HIGH_SPEED")==0)
+			confparams_cpr->gzipMode = 3;
+		else if(strcmp(modeBuf, "Zstd_HIGH_COMPRESSION")==0)
+			confparams_cpr->gzipMode = 19;
+		else if(strcmp(modeBuf, "Zstd_BEST_COMPRESSION")==0)
+			confparams_cpr->gzipMode = 22;			
+		else if(strcmp(modeBuf, "Zstd_DEFAULT_COMPRESSION")==0)
+			confparams_cpr->gzipMode = 3;
+		else
+		{
+			printf("[SZ] Error: Wrong zstd Mode (please check sz.config file)\n");
+			iniparser_freedict(ini);
+			return SZ_NSCS;
+		}		
+		
+		modeBuf = iniparser_getstring(ini, "PARAMETER:protectValueRange", "YES");
+		if(strcmp(modeBuf, "YES")==0)
+			confparams_cpr->protectValueRange = 1;
+		else
+			confparams_cpr->protectValueRange = 0;
+		
+		confparams_cpr->randomAccess = (int)iniparser_getint(ini, "PARAMETER:randomAccess", 0);
+		
+		//TODO
+		confparams_cpr->snapshotCmprStep = (int)iniparser_getint(ini, "PARAMETER:snapshotCmprStep", 5);
+				
+		errBoundMode = iniparser_getstring(ini, "PARAMETER:errorBoundMode", NULL);
+		if(errBoundMode==NULL)
+		{
+			printf("[SZ] Error: Null error bound setting (please check sz.config file)\n");
+			iniparser_freedict(ini);
+			return SZ_NSCS;				
+		}
+		else if(strcmp(errBoundMode,"ABS")==0||strcmp(errBoundMode,"abs")==0)
+			confparams_cpr->errorBoundMode=ABS;
+		else if(strcmp(errBoundMode, "REL")==0||strcmp(errBoundMode,"rel")==0)
+			confparams_cpr->errorBoundMode=REL;
+		else if(strcmp(errBoundMode, "VR_REL")==0||strcmp(errBoundMode, "vr_rel")==0)
+			confparams_cpr->errorBoundMode=REL; //VR_REL is accepted as another spelling of REL
+		else if(strcmp(errBoundMode, "ABS_AND_REL")==0||strcmp(errBoundMode, "abs_and_rel")==0)
+			confparams_cpr->errorBoundMode=ABS_AND_REL;
+		else if(strcmp(errBoundMode, "ABS_OR_REL")==0||strcmp(errBoundMode, "abs_or_rel")==0)
+			confparams_cpr->errorBoundMode=ABS_OR_REL;
+		else if(strcmp(errBoundMode, "PW_REL")==0||strcmp(errBoundMode, "pw_rel")==0)
+			confparams_cpr->errorBoundMode=PW_REL;
+		else if(strcmp(errBoundMode, "PSNR")==0||strcmp(errBoundMode, "psnr")==0)
+			confparams_cpr->errorBoundMode=PSNR;
+		else if(strcmp(errBoundMode, "ABS_AND_PW_REL")==0||strcmp(errBoundMode, "abs_and_pw_rel")==0)
+			confparams_cpr->errorBoundMode=ABS_AND_PW_REL;
+		else if(strcmp(errBoundMode, "ABS_OR_PW_REL")==0||strcmp(errBoundMode, "abs_or_pw_rel")==0)
+			confparams_cpr->errorBoundMode=ABS_OR_PW_REL;
+		else if(strcmp(errBoundMode, "REL_AND_PW_REL")==0||strcmp(errBoundMode, "rel_and_pw_rel")==0)
+			confparams_cpr->errorBoundMode=REL_AND_PW_REL;
+		else if(strcmp(errBoundMode, "REL_OR_PW_REL")==0||strcmp(errBoundMode, "rel_or_pw_rel")==0)
+			confparams_cpr->errorBoundMode=REL_OR_PW_REL;
+		else if(strcmp(errBoundMode, "NORM")==0||strcmp(errBoundMode, "norm")==0)
+			confparams_cpr->errorBoundMode=NORM;
+		else
+		{
+			printf("[SZ] Error: Wrong error bound mode (please check sz.config file)\n");
+			iniparser_freedict(ini);
+			return SZ_NSCS;
+		}
+		
+		confparams_cpr->absErrBound = (double)iniparser_getdouble(ini, "PARAMETER:absErrBound", 0);
+		confparams_cpr->relBoundRatio = (double)iniparser_getdouble(ini, "PARAMETER:relBoundRatio", 0);
+		confparams_cpr->psnr = (double)iniparser_getdouble(ini, "PARAMETER:psnr", 0);
+		confparams_cpr->normErr = (double)iniparser_getdouble(ini, "PARAMETER:normErr", 0);
+		confparams_cpr->pw_relBoundRatio = (double)iniparser_getdouble(ini, "PARAMETER:pw_relBoundRatio", 0);
+		confparams_cpr->segment_size = (int)iniparser_getint(ini, "PARAMETER:segment_size", 0);
+		confparams_cpr->accelerate_pw_rel_compression = (int)iniparser_getint(ini, "PARAMETER:accelerate_pw_rel_compression", 1);
+		
+		modeBuf = iniparser_getstring(ini, "PARAMETER:pwr_type", "MIN");
+		
+		if(strcmp(modeBuf, "MIN")==0)
+			confparams_cpr->pwr_type = SZ_PWR_MIN_TYPE;
+		else if(strcmp(modeBuf, "AVG")==0)
+			confparams_cpr->pwr_type = SZ_PWR_AVG_TYPE;
+		else if(strcmp(modeBuf, "MAX")==0)
+			confparams_cpr->pwr_type = SZ_PWR_MAX_TYPE;
+		else if(modeBuf!=NULL)
+		{
+			printf("[SZ] Error: Wrong pwr_type setting (please check sz.config file).\n");
+			iniparser_freedict(ini);
+			return SZ_NSCS;	
+		}
+		else //by default
+			confparams_cpr->pwr_type = SZ_PWR_AVG_TYPE;
+    
+		//initialization for Huffman encoding
+		//SZ_Reset();	
+	}
+	else if(confparams_cpr->sol_ID == PASTRI)
+	{//load parameters for PSTRI
+		pastri_par.bf[0] = (int)iniparser_getint(ini, "PARAMETER:basisFunction_0", 0);		
+		pastri_par.bf[1] = (int)iniparser_getint(ini, "PARAMETER:basisFunction_1", 0);		
+		pastri_par.bf[2] = (int)iniparser_getint(ini, "PARAMETER:basisFunction_2", 0);		
+		pastri_par.bf[3] = (int)iniparser_getint(ini, "PARAMETER:basisFunction_3", 0);
+		pastri_par.numBlocks = (int)iniparser_getint(ini, "PARAMETER:numBlocks", 0);		
+		confparams_cpr->absErrBound = pastri_par.originalEb = (double)iniparser_getdouble(ini, "PARAMETER:absErrBound", 1E-3);
+	}
+	
+    iniparser_freedict(ini);
+    return SZ_SCES;
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+    @brief      It reads and tests the configuration given.
+    @return     integer         1 if successful.
+
+    This function reads the configuration file. Then test that the
+    configuration parameters are correct (including directories).
+
+ **/
+/*-------------------------------------------------------------------------*/
+int SZ_LoadConf(const char* sz_cfgFile) {
+    /* delegate to SZ_ReadConf and collapse every failure into SZ_NSCS */
+    if (SZ_ReadConf(sz_cfgFile) == SZ_SCES)
+        return SZ_SCES;
+
+    printf("[SZ] ERROR: Impossible to read configuration.\n");
+    return SZ_NSCS;
+}
+
+/* Returns 1 when the first three bytes of version equal the first three bytes
+ * of the global versionNumber, 0 otherwise. */
+int checkVersion(char* version)
+{
+	int matched = 0;
+	while(matched < 3 && version[matched] == versionNumber[matched])
+		matched++;
+	return matched == 3;
+}
+
+/* Pack a three-part version into one integer: major*10000 + minor*100 + revision. */
+inline int computeVersion(int major, int minor, int revision)
+{
+	int packed = major*10000;
+	packed += minor*100;
+	packed += revision;
+	return packed;
+}
+
+/* Version compatibility check: data written by version 2.1.8 or newer is
+ * always accepted; older data must match versionNumber exactly (checkVersion). */
+int checkVersion2(char* version)
+{
+	const int firstTolerantVersion = 20108; /* 2.1.8 in computeVersion() encoding */
+	int encoded = computeVersion(version[0], version[1], version[2]);
+
+	if(encoded >= firstTolerantVersion)
+		return 1;
+	return checkVersion(version);
+}
+
+/* Allocate the global sz_tsc metadata structure with all fields zeroed.
+ * If the allocation fails, sz_tsc is left NULL and an error is printed. */
+void initSZ_TSC()
+{
+	/* calloc zero-fills, and unlike malloc+memset never writes through NULL */
+	sz_tsc = (sz_tsc_metadata*)calloc(1, sizeof(sz_tsc_metadata));
+	if(sz_tsc == NULL)
+	{
+		printf("[SZ] Error: failed to allocate the sz_tsc metadata.\n");
+		return;
+	}
+	/*sprintf(sz_tsc->metadata_filename, "sz_tsc_metainfo.txt");
+	sz_tsc->metadata_file = fopen(sz_tsc->metadata_filename, "wb");
+	if (sz_tsc->metadata_file == NULL)
+	{
+		printf("Failed to open sz_tsc_metainfo.txt file for writing metainfo.\n");
+		exit(1);
+	}
+	fputs("#metadata of the time-step based compression\n", sz_tsc->metadata_file);	*/
+}
+
+/*double fabs(double value)
+{
+	if(value<0)
+		return -value;
+	else
+		return value;
+}*/
--- a/deps/SZ/sz/src/dataCompression.c
+++ b/deps/SZ/sz/src/dataCompression.c
@ -0,0 +1,980 @@
+/**
+ *  @file dataCompression.c
+ *  @author Sheng Di, Dingwen Tao, Xin Liang, Xiangyu Zou, Tao Lu, Wen Xia, Xuan Wang, Weizhe Zhang
+ *  @date April, 2016
+ *  @brief Compression Technique for double array
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include "sz.h"
+#include "DynamicByteArray.h"
+#include "DynamicIntArray.h"
+#include "TightDataPointStorageD.h"
+#include "CompressElement.h"
+#include "dataCompression.h"
+
+/* Number of bytes (1, 2, 4 or 8) chosen for a value range of the given size:
+ * up to 2^8 -> 1, up to 2^16 -> 2, up to 2^32 -> 4, anything larger -> 8. */
+int computeByteSizePerIntValue(long valueRangeSize)
+{
+	if(valueRangeSize>4294967296) //2^32
+		return 8;
+	if(valueRangeSize>65536)
+		return 4;
+	if(valueRangeSize>256)
+		return 2;
+	return 1;
+}
+
+/**
+ * Scan an integer array and report its minimum and its value range.
+ *
+ * @param oriData the array, whose element type is selected by dataType
+ * @param dataType one of the SZ_UINT8 ... SZ_INT64 type codes; for any other
+ *        code the range is reported as 0 and 0 is returned
+ * @param size number of elements in oriData
+ * @param valueRangeSize (output) max - min
+ * @return the minimum value (0 when size is 0)
+ *
+ * The elements are read through fixed-width types so that the element width
+ * does not depend on the platform's sizes of long/char. min and max are kept in
+ * a long, so SZ_UINT64 values beyond LONG_MAX are not representable here.
+ * */
+long computeRangeSize_int(void* oriData, int dataType, size_t size, int64_t* valueRangeSize)
+{
+	/* i, size, data_, min and max are presumably referenced by name inside the
+	 * computeMinMax macro (defined elsewhere) -- keep these identifiers */
+	size_t i = 0;
+	long max = 0, min = 0;
+
+	if(size==0)
+	{
+		/* nothing to scan: do not read data[0] */
+		*valueRangeSize = 0;
+		return 0;
+	}
+
+	if(dataType==SZ_UINT8)
+	{
+		uint8_t* data = (uint8_t*)oriData;
+		uint8_t data_; 
+		min = data[0], max = min;
+		computeMinMax(data);
+	}
+	else if(dataType == SZ_INT8)
+	{
+		int8_t* data = (int8_t*)oriData;
+		int8_t data_;
+		min = data[0], max = min;
+		computeMinMax(data);
+	}
+	else if(dataType == SZ_UINT16)
+	{
+		uint16_t* data = (uint16_t*)oriData;
+		uint16_t data_; 
+		min = data[0], max = min;
+		computeMinMax(data);
+	}
+	else if(dataType == SZ_INT16)
+	{ 
+		int16_t* data = (int16_t*)oriData;
+		int16_t data_; 
+		min = data[0], max = min;
+		computeMinMax(data);
+	}
+	else if(dataType == SZ_UINT32)
+	{
+		uint32_t* data = (uint32_t*)oriData;
+		uint32_t data_; 
+		min = data[0], max = min;
+		computeMinMax(data);
+	}
+	else if(dataType == SZ_INT32)
+	{
+		int32_t* data = (int32_t*)oriData;
+		int32_t data_; 
+		min = data[0], max = min;
+		computeMinMax(data);
+	}
+	else if(dataType == SZ_UINT64)
+	{
+		uint64_t* data = (uint64_t*)oriData;
+		uint64_t data_; 
+		min = data[0], max = min;
+		computeMinMax(data);
+	}
+	else if(dataType == SZ_INT64)
+	{
+		int64_t* data = (int64_t *)oriData;
+		int64_t data_; 
+		min = data[0], max = min;
+		computeMinMax(data);
+	}
+
+	*valueRangeSize = max - min;
+	return min;	
+}
+
+/* Scan size floats (at least one element is read unconditionally): returns the
+ * minimum, stores max-min in *valueRangeSize and min + range/2 in *medianValue. */
+float computeRangeSize_float(float* oriData, size_t size, float* valueRangeSize, float* medianValue)
+{
+	float lowest = oriData[0];
+	float highest = lowest;
+	size_t idx = 1;
+
+	while(idx < size)
+	{
+		float cur = oriData[idx++];
+		if(cur < lowest)
+			lowest = cur;
+		else if(cur > highest)
+			highest = cur;
+	}
+
+	*valueRangeSize = highest - lowest;
+	*medianValue = lowest + *valueRangeSize/2;
+	return lowest;
+}
+
+/**
+ * Scan a float array: value range, signs and smallest-magnitude non-zero value.
+ *
+ * @param oriData the data (at least one element is read unconditionally)
+ * @param size number of elements
+ * @param valueRangeSize (output) max - min
+ * @param medianValue (output) min + range/2
+ * @param signs (in/out) signs[i] is set to 1 for every negative element; the
+ *        entries of non-negative elements are left untouched
+ * @param positive (in/out) set to false as soon as a negative element is seen;
+ *        never set to true here
+ * @param nearZero (output) the non-zero element with the smallest magnitude,
+ *        or 0 when every element is 0
+ * @return the minimum value
+ * */
+float computeRangeSize_float_MSST19(float* oriData, size_t size, float* valueRangeSize, float* medianValue, unsigned char * signs, bool* positive, float* nearZero)
+{
+    size_t i = 0;
+    float min = oriData[0];
+    float max = min;
+    *nearZero = min;
+
+    /* the scan below starts at index 1, so element 0 needs its sign recorded here */
+    if(min <0){
+        signs[0] = 1;
+        *positive = false;
+    }
+
+    for(i=1;i<size;i++)
+    {
+        float data = oriData[i];
+        if(data <0){
+            signs[i] = 1;
+            *positive = false;
+        }
+        /* a nearZero of 0 means "no non-zero value seen yet" (e.g. the array
+         * starts with 0); any non-zero value must be allowed to replace it */
+        if(data != 0 && (*nearZero == 0 || fabsf(data) < fabsf(*nearZero))){
+            *nearZero = data;
+        }
+        if(min>data)
+            min = data;
+        else if(max<data)
+            max = data;
+    }
+
+    *valueRangeSize = max - min;
+    *medianValue = min + *valueRangeSize/2;
+    return min;
+}
+
+/* Scan size doubles (at least one element is read unconditionally): returns the
+ * minimum, stores max-min in *valueRangeSize and min + range/2 in *medianValue. */
+double computeRangeSize_double(double* oriData, size_t size, double* valueRangeSize, double* medianValue)
+{
+	double lowest = oriData[0];
+	double highest = lowest;
+	size_t idx = 1;
+
+	while(idx < size)
+	{
+		double cur = oriData[idx++];
+		if(cur < lowest)
+			lowest = cur;
+		else if(cur > highest)
+			highest = cur;
+	}
+
+	*valueRangeSize = highest - lowest;
+	*medianValue = lowest + *valueRangeSize/2;
+	return lowest;
+}
+
+/**
+ * Scan a double array: value range, signs and smallest-magnitude non-zero value.
+ *
+ * @param oriData the data (at least one element is read unconditionally)
+ * @param size number of elements
+ * @param valueRangeSize (output) max - min
+ * @param medianValue (output) min + range/2
+ * @param signs (in/out) signs[i] is set to 1 for every negative element; the
+ *        entries of non-negative elements are left untouched
+ * @param positive (in/out) set to false as soon as a negative element is seen;
+ *        never set to true here
+ * @param nearZero (output) the non-zero element with the smallest magnitude,
+ *        or 0 when every element is 0
+ * @return the minimum value
+ * */
+double computeRangeSize_double_MSST19(double* oriData, size_t size, double* valueRangeSize, double* medianValue, unsigned char * signs, bool* positive, double* nearZero)
+{
+    size_t i = 0;
+    double min = oriData[0];
+    double max = min;
+    *nearZero = min;
+
+    /* the scan below starts at index 1, so element 0 needs its sign recorded here */
+    if(min <0){
+        signs[0] = 1;
+        *positive = false;
+    }
+
+    for(i=1;i<size;i++)
+    {
+        double data = oriData[i];
+        if(data <0){
+            signs[i] = 1;
+            *positive = false;
+        }
+        /* a nearZero of 0 means "no non-zero value seen yet" (e.g. the array
+         * starts with 0); any non-zero value must be allowed to replace it */
+        if(data != 0 && (*nearZero == 0 || fabs(data) < fabs(*nearZero))){
+            *nearZero = data;
+        }
+        if(min>data)
+            min = data;
+        else if(max<data)
+            max = data;
+    }
+
+    *valueRangeSize = max - min;
+    *medianValue = min + *valueRangeSize/2;
+    return min;
+}
+
+/* Minimum, value range and mid-point of a sub-block of a (up to) 5-D float
+ * array stored with r1 as the fastest-varying dimension. The sub-block covers
+ * the inclusive index ranges [s5,e5] x [s4,e4] x [s3,e3] x [s2,e2] x [s1,e1].
+ * r5 is not used by the index computation. */
+float computeRangeSize_float_subblock(float* oriData, float* valueRangeSize, float* medianValue,
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1,
+size_t s5, size_t s4, size_t s3, size_t s2, size_t s1,
+size_t e5, size_t e4, size_t e3, size_t e2, size_t e1)
+{
+	/* element strides of dimensions 2..5 (dimension 1 has stride 1) */
+	const size_t stride2 = r1;
+	const size_t stride3 = r2*r1;
+	const size_t stride4 = r3*r2*r1;
+	const size_t stride5 = r4*r3*r2*r1;
+	size_t a, b, c, d, e;
+
+	float lowest = oriData[s5*stride5 + s4*stride4 + s3*stride3 + s2*stride2 + s1];
+	float highest = lowest;
+
+	for (a = s5; a <= e5; a++)
+	for (b = s4; b <= e4; b++)
+	for (c = s3; c <= e3; c++)
+	for (d = s2; d <= e2; d++)
+	for (e = s1; e <= e1; e++)
+	{
+		float cur = oriData[a*stride5 + b*stride4 + c*stride3 + d*stride2 + e];
+		if (cur < lowest)
+			lowest = cur;
+		else if (cur > highest)
+			highest = cur;
+	}
+
+	*valueRangeSize = highest - lowest;
+	*medianValue = lowest + *valueRangeSize/2;
+	return lowest;
+}
+
+
+/* Minimum, value range and mid-point of a sub-block of a (up to) 5-D double
+ * array stored with r1 as the fastest-varying dimension. The sub-block covers
+ * the inclusive index ranges [s5,e5] x [s4,e4] x [s3,e3] x [s2,e2] x [s1,e1].
+ * r5 is not used by the index computation. */
+double computeRangeSize_double_subblock(double* oriData, double* valueRangeSize, double* medianValue,
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1,
+size_t s5, size_t s4, size_t s3, size_t s2, size_t s1,
+size_t e5, size_t e4, size_t e3, size_t e2, size_t e1)
+{
+	/* element strides of dimensions 2..5 (dimension 1 has stride 1) */
+	const size_t stride2 = r1;
+	const size_t stride3 = r2*r1;
+	const size_t stride4 = r3*r2*r1;
+	const size_t stride5 = r4*r3*r2*r1;
+	size_t a, b, c, d, e;
+
+	double lowest = oriData[s5*stride5 + s4*stride4 + s3*stride3 + s2*stride2 + s1];
+	double highest = lowest;
+
+	for (a = s5; a <= e5; a++)
+	for (b = s4; b <= e4; b++)
+	for (c = s3; c <= e3; c++)
+	for (d = s2; d <= e2; d++)
+	for (e = s1; e <= e1; e++)
+	{
+		double cur = oriData[a*stride5 + b*stride4 + c*stride3 + d*stride2 + e];
+		if (cur < lowest)
+			lowest = cur;
+		else if (cur > highest)
+			highest = cur;
+	}
+
+	*valueRangeSize = highest - lowest;
+	*medianValue = lowest + *valueRangeSize/2;
+	return lowest;
+}
+
+
+/* Smaller of two doubles; b is returned when a is not strictly smaller. */
+double min_d(double a, double b)
+{
+	return (a<b) ? a : b;
+}
+
+/* Larger of two doubles; b is returned when a is not strictly larger. */
+double max_d(double a, double b)
+{
+	return (a>b) ? a : b;
+}
+
+/* Smaller of two floats; b is returned when a is not strictly smaller. */
+float min_f(float a, float b)
+{
+	return (a<b) ? a : b;
+}
+
+/* Larger of two floats; b is returned when a is not strictly larger. */
+float max_f(float a, float b)
+{
+	return (a>b) ? a : b;
+}
+
+double getRealPrecision_double(double valueRangeSize, int errBoundMode, double absErrBound, double relBoundRatio, int *status)
+{
+	int state = SZ_SCES;
+	double precision = 0;
+	if(errBoundMode==ABS||errBoundMode==ABS_OR_PW_REL||errBoundMode==ABS_AND_PW_REL)
+		precision = absErrBound; 
+	else if(errBoundMode==REL||errBoundMode==REL_OR_PW_REL||errBoundMode==REL_AND_PW_REL)
+		precision = relBoundRatio*valueRangeSize;
+	else if(errBoundMode==ABS_AND_REL)
+		precision = min_d(absErrBound, relBoundRatio*valueRangeSize);
+	else if(errBoundMode==ABS_OR_REL)
+		precision = max_d(absErrBound, relBoundRatio*valueRangeSize);
+	else if(errBoundMode==PW_REL)
+		precision = 0;
+	else
+	{
+		printf("Error: error-bound-mode is incorrect!\n");
+		state = SZ_BERR;
+	}
+	*status = state;
+	return precision;
+}
+
+double getRealPrecision_float(float valueRangeSize, int errBoundMode, double absErrBound, double relBoundRatio, int *status)
+{
+	int state = SZ_SCES;
+	double precision = 0;
+	if(errBoundMode==ABS||errBoundMode==ABS_OR_PW_REL||errBoundMode==ABS_AND_PW_REL)
+		precision = absErrBound; 
+	else if(errBoundMode==REL||errBoundMode==REL_OR_PW_REL||errBoundMode==REL_AND_PW_REL)
+		precision = relBoundRatio*valueRangeSize;
+	else if(errBoundMode==ABS_AND_REL)
+		precision = min_f(absErrBound, relBoundRatio*valueRangeSize);
+	else if(errBoundMode==ABS_OR_REL)
+		precision = max_f(absErrBound, relBoundRatio*valueRangeSize);
+	else if(errBoundMode==PW_REL)
+		precision = 0;
+	else
+	{
+		printf("Error: error-bound-mode is incorrect!\n");
+		state = SZ_BERR;
+	}
+	*status = state;
+	return precision;
+}
+
+double getRealPrecision_int(long valueRangeSize, int errBoundMode, double absErrBound, double relBoundRatio, int *status)
+{
+	int state = SZ_SCES;
+	double precision = 0;
+	if(errBoundMode==ABS||errBoundMode==ABS_OR_PW_REL||errBoundMode==ABS_AND_PW_REL)
+		precision = absErrBound; 
+	else if(errBoundMode==REL||errBoundMode==REL_OR_PW_REL||errBoundMode==REL_AND_PW_REL)
+		precision = relBoundRatio*valueRangeSize;
+	else if(errBoundMode==ABS_AND_REL)
+		precision = min_f(absErrBound, relBoundRatio*valueRangeSize);
+	else if(errBoundMode==ABS_OR_REL)
+		precision = max_f(absErrBound, relBoundRatio*valueRangeSize);
+	else if(errBoundMode==PW_REL)
+		precision = -1;
+	else
+	{
+		printf("Error: error-bound-mode is incorrect!\n");
+		state = SZ_BERR;
+	}
+	*status = state;
+	return precision;
+}
+
+/* Reverse the order of 8 bytes in place (byte-order swap of a 64-bit value). */
+void symTransform_8bytes(unsigned char data[8])
+{
+	int front, back;
+	for(front = 0, back = 7; front < back; front++, back--)
+	{
+		unsigned char held = data[front];
+		data[front] = data[back];
+		data[back] = held;
+	}
+}
+
+/* Swap the two bytes in place (byte-order swap of a 16-bit value). */
+inline void symTransform_2bytes(unsigned char data[2])
+{
+	const unsigned char first = data[0];
+	const unsigned char second = data[1];
+	data[0] = second;
+	data[1] = first;
+}
+
+/* Reverse the order of 4 bytes in place (byte-order swap of a 32-bit value). */
+inline void symTransform_4bytes(unsigned char data[4])
+{
+	int front, back;
+	for(front = 0, back = 3; front < back; front++, back--)
+	{
+		unsigned char held = data[front];
+		data[front] = data[back];
+		data[back] = held;
+	}
+}
+
+/* Store (tgtValue - minValue), truncated to 8 bits, into bytes. */
+inline void compressInt8Value(int8_t tgtValue, int8_t minValue, int byteSize, unsigned char* bytes)
+{
+	const uint8_t offset = (uint8_t)(tgtValue - minValue);
+	memcpy(bytes, &offset, (size_t)byteSize); //byteSize==1
+}
+
+/* Store the byteSize low-order bytes of (tgtValue - minValue), most
+ * significant byte first, into bytes. */
+inline void compressInt16Value(int16_t tgtValue, int16_t minValue, int byteSize, unsigned char* bytes)
+{
+	unsigned char bigEndian[2];
+	const uint16_t offset = tgtValue - minValue;
+
+	int16ToBytes_bigEndian(bigEndian, offset);
+	/* the low-order bytes sit at the end of the big-endian image */
+	memcpy(bytes, &bigEndian[2 - byteSize], byteSize);
+}
+
+/* Store the byteSize low-order bytes of (tgtValue - minValue), most
+ * significant byte first, into bytes. */
+inline void compressInt32Value(int32_t tgtValue, int32_t minValue, int byteSize, unsigned char* bytes)
+{
+	unsigned char bigEndian[4];
+	const uint32_t offset = tgtValue - minValue;
+
+	int32ToBytes_bigEndian(bigEndian, offset);
+	/* the low-order bytes sit at the end of the big-endian image */
+	memcpy(bytes, &bigEndian[4 - byteSize], byteSize);
+}
+
+/* Store the byteSize low-order bytes of (tgtValue - minValue), most
+ * significant byte first, into bytes. */
+inline void compressInt64Value(int64_t tgtValue, int64_t minValue, int byteSize, unsigned char* bytes)
+{
+	unsigned char bigEndian[8];
+	const uint64_t offset = tgtValue - minValue;
+
+	int64ToBytes_bigEndian(bigEndian, offset);
+	/* the low-order bytes sit at the end of the big-endian image */
+	memcpy(bytes, &bigEndian[8 - byteSize], byteSize);
+}
+
+/* Store (tgtValue - minValue), truncated to 8 bits, into bytes. */
+inline void compressUInt8Value(uint8_t tgtValue, uint8_t minValue, int byteSize, unsigned char* bytes)
+{
+	const uint8_t offset = (uint8_t)(tgtValue - minValue);
+	memcpy(bytes, &offset, (size_t)byteSize); //byteSize==1
+}
+
+/* Store the byteSize low-order bytes of (tgtValue - minValue), most
+ * significant byte first, into bytes. */
+inline void compressUInt16Value(uint16_t tgtValue, uint16_t minValue, int byteSize, unsigned char* bytes)
+{
+	unsigned char bigEndian[2];
+	const uint16_t offset = tgtValue - minValue;
+
+	int16ToBytes_bigEndian(bigEndian, offset);
+	/* the low-order bytes sit at the end of the big-endian image */
+	memcpy(bytes, &bigEndian[2 - byteSize], byteSize);
+}
+
+/* Store the byteSize low-order bytes of (tgtValue - minValue), most
+ * significant byte first, into bytes. */
+inline void compressUInt32Value(uint32_t tgtValue, uint32_t minValue, int byteSize, unsigned char* bytes)
+{
+	unsigned char bigEndian[4];
+	const uint32_t offset = tgtValue - minValue;
+
+	int32ToBytes_bigEndian(bigEndian, offset);
+	/* the low-order bytes sit at the end of the big-endian image */
+	memcpy(bytes, &bigEndian[4 - byteSize], byteSize);
+}
+
+/* Store the byteSize low-order bytes of (tgtValue - minValue), most
+ * significant byte first, into bytes. */
+inline void compressUInt64Value(uint64_t tgtValue, uint64_t minValue, int byteSize, unsigned char* bytes)
+{
+	unsigned char bigEndian[8];
+	const uint64_t offset = tgtValue - minValue;
+
+	int64ToBytes_bigEndian(bigEndian, offset);
+	/* the low-order bytes sit at the end of the big-endian image */
+	memcpy(bytes, &bigEndian[8 - byteSize], byteSize);
+}
+
+/* Fill vce for one float: the value is taken relative to medianValue, its full
+ * bit pattern is written big-endian into vce->curBytes, and vce->data receives
+ * the value obtained after clearing the (32 - reqLength) low-order bits, with
+ * the median added back. precision is not used here. */
+inline void compressSingleFloatValue(FloatValueCompressElement *vce, float tgtValue, float precision, float medianValue, 
+		int reqLength, int reqBytesLength, int resiBitsLength)
+{		
+	lfloat bits;
+	int droppedBits = 32 - reqLength;
+	int fullPattern;
+
+	bits.value = tgtValue - medianValue;
+	if(droppedBits<0)
+		droppedBits = 0;
+
+	/* keep the untruncated pattern before the low bits are cleared */
+	fullPattern = bits.ivalue;
+	intToBytes_bigEndian(vce->curBytes, fullPattern);
+
+	bits.ivalue = (bits.ivalue >> droppedBits) << droppedBits;
+
+	vce->data = bits.value+medianValue;
+	vce->curValue = fullPattern;
+	vce->reqBytesLength = reqBytesLength;
+	vce->resiBitsLength = resiBitsLength;
+}
+
+/* Fill vce for one float (no median offset): the full bit pattern is written
+ * big-endian into vce->curBytes, and vce->data receives the value obtained
+ * after clearing the (32 - reqLength) low-order bits. precision is not used. */
+void compressSingleFloatValue_MSST19(FloatValueCompressElement *vce, float tgtValue, float precision, int reqLength, int reqBytesLength, int resiBitsLength)
+{
+    lfloat bits;
+    int droppedBits = 32 - reqLength;
+    int fullPattern;
+
+    bits.value = tgtValue;
+    if(droppedBits<0)
+        droppedBits = 0;
+
+    /* keep the untruncated pattern before the low bits are cleared */
+    fullPattern = bits.ivalue;
+    intToBytes_bigEndian(vce->curBytes, fullPattern);
+
+    bits.ivalue = (bits.ivalue >> droppedBits) << droppedBits;
+
+    vce->data = bits.value;
+    vce->curValue = fullPattern;
+    vce->reqBytesLength = reqBytesLength;
+    vce->resiBitsLength = resiBitsLength;
+}
+
+/* Fill vce for one double (no median offset): the full bit pattern is written
+ * big-endian into vce->curBytes, and vce->data receives the value obtained
+ * after clearing the (64 - reqLength) low-order bits. precision is not used. */
+void compressSingleDoubleValue_MSST19(DoubleValueCompressElement *vce, double tgtValue, double precision, int reqLength, int reqBytesLength, int resiBitsLength)
+{
+    ldouble bits;
+    int droppedBits = 64 - reqLength;
+    long fullPattern;
+
+    bits.value = tgtValue;
+    if(droppedBits<0)
+        droppedBits = 0;
+
+    /* keep the untruncated pattern before the low bits are cleared */
+    fullPattern = bits.lvalue;
+    longToBytes_bigEndian(vce->curBytes, fullPattern);
+
+    bits.lvalue = (bits.lvalue >> droppedBits) << droppedBits;
+
+    vce->data = bits.value;
+    vce->curValue = fullPattern;
+    vce->reqBytesLength = reqBytesLength;
+    vce->resiBitsLength = resiBitsLength;
+}
+
+/* Fill vce for one double: the value is taken relative to medianValue, its full
+ * bit pattern is written big-endian into vce->curBytes, and vce->data receives
+ * the value obtained after clearing the (64 - reqLength) low-order bits, with
+ * the median added back. precision is not used here. */
+void compressSingleDoubleValue(DoubleValueCompressElement *vce, double tgtValue, double precision, double medianValue, 
+		int reqLength, int reqBytesLength, int resiBitsLength)
+{		
+	ldouble bits;
+	int droppedBits = 64 - reqLength;
+	long fullPattern;
+
+	bits.value = tgtValue - medianValue;
+	if(droppedBits<0)
+		droppedBits = 0;
+
+	/* keep the untruncated pattern before the low bits are cleared */
+	fullPattern = bits.lvalue;
+	longToBytes_bigEndian(vce->curBytes, fullPattern);
+
+	bits.lvalue = (bits.lvalue >> droppedBits)<<droppedBits;
+
+	vce->data = bits.value+medianValue;
+	vce->curValue = fullPattern;
+	vce->reqBytesLength = reqBytesLength;
+	vce->resiBitsLength = resiBitsLength;
+}
+
+/* Number of leading bytes shared by preBytes and curBytes, capped at 3. */
+int compIdenticalLeadingBytesCount_double(unsigned char* preBytes, unsigned char* curBytes)
+{
+	int same = 0;
+	/* the result never exceeds 3, so comparing past the third byte is pointless */
+	while(same < 3 && preBytes[same] == curBytes[same])
+		same++;
+	return same;
+}
+
+/* Number of leading bytes shared by preBytes and curBytes, capped at 3. */
+inline int compIdenticalLeadingBytesCount_float(unsigned char* preBytes, unsigned char* curBytes)
+{
+	int same = 0;
+	/* the result never exceeds 3, so the fourth byte need not be compared */
+	while(same < 3 && preBytes[same] == curBytes[same])
+		same++;
+	return same;
+}
+
+//TODO double-check the correctness...
+/* Append one exactly-stored value described by lce: its leading-byte count goes
+ * to exactLeadNumArray, its mid bytes (if any) to exactMidByteArray, and its
+ * residual bits (if their length is non-zero) to resiBitArray. */
+inline void addExactData(DynamicByteArray *exactMidByteArray, DynamicIntArray *exactLeadNumArray, 
+		DynamicIntArray *resiBitArray, LossyCompressionElement *lce)
+{
+	int pos;
+	unsigned char* midBytes;
+	int midBytesCount;
+	int residualBitCount;
+
+	addDIA_Data(exactLeadNumArray, lce->leadingZeroBytes);
+
+	midBytes = lce->integerMidBytes;
+	midBytesCount = lce->integerMidBytes_Length;
+	residualBitCount = lce->resMidBitsLength;
+
+	if(midBytes!=NULL)
+	{
+		for(pos = 0; pos < midBytesCount; pos++)
+			addDBA_Data(exactMidByteArray, midBytes[pos]);
+	}
+	if(residualBitCount!=0)
+		addDIA_Data(resiBitArray, lce->residualMidBits);
+}
+
+/**
+ * @deprecated
+ * Build the prediction coefficients for the given number of layers.
+ * Coefficients are only defined for dimension 1 with 1, 2 or 3 layers; in every
+ * other case *coeff_array is left untouched and 0 is returned.
+ *
+ * @param layers number of prediction layers
+ * @param dimension dimensionality of the data (1..3 accepted)
+ * @param coeff_array (output) receives a malloc'ed array owned by the caller
+ * @param status (output) SZ_SCES, or SZ_DERR when dimension is not 1, 2 or 3
+ * @return: the length of the coefficient array.
+ * */
+int getPredictionCoefficients(int layers, int dimension, int **coeff_array, int *status)
+{
+	size_t size = 0;
+	/* success unless the dimension turns out to be unsupported below */
+	*status = SZ_SCES;
+	switch(dimension)
+	{
+		case 1:
+			switch(layers)
+			{
+				case 1:
+					*coeff_array = (int*)malloc(sizeof(int));
+					(*coeff_array)[0] = 1;
+					size = 1;
+					break;
+				case 2:
+					*coeff_array = (int*)malloc(2*sizeof(int));
+					(*coeff_array)[0] = 2;
+					(*coeff_array)[1] = -1;
+					size = 2;
+					break;
+				case 3:
+					*coeff_array = (int*)malloc(3*sizeof(int));
+					(*coeff_array)[0] = 3;
+					(*coeff_array)[1] = -3;
+					(*coeff_array)[2] = 1;
+					size = 3; /* report the three coefficients just stored */
+					break;
+			}	
+			break;
+		case 2:
+		case 3:
+			/* accepted dimensions for which no coefficients are defined */
+			break;
+		default:
+			printf("Error: dimension must be no greater than 3 in the current version.\n");
+			*status = SZ_DERR;
+	}
+	return size;
+}
+
+/**
+ * Compute the edge length of a square block for a segment of segmentSize elements.
+ * The result is the smallest i >= 1 with i*i > segmentSize, except that the search
+ * never goes past segmentSize itself (so 2 yields 2, and values below 2 yield 1).
+ * This is roughly (int)(sqrt(segmentSize)+1).
+ * */
+int computeBlockEdgeSize_2D(int segmentSize)
+{
+	int edge = 1;
+	while(edge < segmentSize && edge*edge <= segmentSize)
+		edge++;
+	return edge;
+}
+
+/**
+ * Compute the edge length of a cubic block for a segment of segmentSize elements.
+ * The result is the smallest i >= 1 with i*i*i > segmentSize, except that the search
+ * never goes past segmentSize itself (so 2 yields 2, and values below 2 yield 1).
+ * This is roughly (int)(pow(segmentSize, 1.0/3)+1).
+ * */
+int computeBlockEdgeSize_3D(int segmentSize)
+{
+	int edge = 1;
+	while(edge < segmentSize && edge*edge*edge <= segmentSize)
+		edge++;
+	return edge;
+}
+
+//convert random-access version based bytes to output bytes
+/**
+ * Write the stream header into raBytes: the 3 bytes of versionNumber, one flag byte,
+ * then the parameters serialized by convertSZParamsToBytes.
+ *
+ * Flag byte: 0x80 is always set (regression-based compression mode), 0x40 when
+ * exe_params->SZ_SIZE_TYPE is 8, 0x02 for random access, 0x04 for value-range protection.
+ *
+ * @param raBytes output buffer, must be large enough for the header
+ * @return number of header bytes accounted for: 4 plus MetaDataByteLength (SZ_FLOAT) or
+ *         MetaDataByteLength_double (SZ_DOUBLE); for any other data type just 4
+ * */
+int initRandomAccessBytes(unsigned char* raBytes)
+{
+	int pos = 0;
+	unsigned char flags = 0x80; //indicating this is regression-based compression mode
+
+	while(pos < 3) //3
+	{
+		raBytes[pos] = versionNumber[pos];
+		pos++;
+	}
+
+	if(exe_params->SZ_SIZE_TYPE==8)
+		flags |= 0x40; // 01000000, the 6th bit
+	if(confparams_cpr->randomAccess)
+		flags |= 0x02; // 00000010, random access
+	if(confparams_cpr->protectValueRange)
+		flags |= 0x04; //00000100, protect value range
+	raBytes[pos++] = flags;
+
+	convertSZParamsToBytes(confparams_cpr, &(raBytes[pos]));
+	if(confparams_cpr->dataType==SZ_FLOAT)
+		pos += MetaDataByteLength;
+	else if(confparams_cpr->dataType==SZ_DOUBLE)
+		pos += MetaDataByteLength_double;
+
+	return pos;
+}
+
+//The following functions are float-precision version of dealing with the unpredictable data points 
+/**
+ * Derive the required bit length for oriData from its value range and the requested
+ * precision, then write into decData each value with the low-order bits beyond that
+ * length cleared (truncation is applied to value - median).
+ *
+ * @param oriData input values (nbEle elements)
+ * @param precision requested precision, forwarded to computeReqLength_float
+ * @param nbEle number of elements
+ * @param reqBytesLength output: reqLength / 8
+ * @param resiBitsLength output: reqLength % 8
+ * @param medianValue output of computeRangeSize_float; also passed to computeReqLength_float
+ * @param decData output: the truncated values (nbEle elements)
+ * @return the required length in bits
+ * */
+int generateLossyCoefficients_float(float* oriData, double precision, size_t nbEle, int* reqBytesLength, int* resiBitsLength, float* medianValue, float* decData)
+{
+	float rangeSize;
+	short radExpo;
+	int reqLength;
+	int droppedBits;
+	size_t idx;
+
+	computeRangeSize_float(oriData, nbEle, &rangeSize, medianValue);
+	radExpo = getExponent_float(rangeSize/2);
+	computeReqLength_float(precision, radExpo, &reqLength, medianValue);
+
+	*reqBytesLength = reqLength/8;
+	*resiBitsLength = reqLength%8;
+
+	//number of low-order bits cleared in every value; constant for the whole array
+	droppedBits = 32 - reqLength;
+	if(droppedBits < 0)
+		droppedBits = 0;
+
+	for(idx = 0; idx < nbEle; idx++)
+	{
+		lfloat buf;
+		buf.value = oriData[idx] - *medianValue;
+		buf.ivalue = (buf.ivalue >> droppedBits) << droppedBits;
+		decData[idx] = buf.value + *medianValue;
+	}
+	return reqLength;
+}
+		
+/**
+ * Compress every float in oriData with compressSingleFloatValue and collect the
+ * per-value leading-byte counts, middle bytes and residual bits into three arrays.
+ *
+ * @param float* oriData: inplace argument (input / output); each element is replaced
+ *        by the value produced by the compression step (vce->data)
+ * @param precision, reqLength, reqBytesLength, resiBitsLength, medianValue:
+ *        forwarded to the per-value compression helpers
+ * @param nbEle number of elements in oriData
+ * @param leadArray, midArray, resiArray: outputs filled by convertDIAtoInts /
+ *        convertDBAtoBytes from the collected data
+ * @return the number of bytes collected in the middle-byte array
+ * */		
+int compressExactDataArray_float(float* oriData, double precision, size_t nbEle, unsigned char** leadArray, unsigned char** midArray, unsigned char** resiArray, 
+int reqLength, int reqBytesLength, int resiBitsLength, float medianValue)
+{
+	size_t idx;
+	size_t midBytesCount;
+	unsigned char prevBytes[4] = {0,0,0,0};
+	DynamicIntArray *leadNums;
+	DynamicByteArray *midBytes;
+	DynamicIntArray *resiBits;
+	FloatValueCompressElement *valueElem;
+	LossyCompressionElement *lossyElem;
+
+	//growable buffers that collect the per-value output
+	new_DIA(&leadNums, DynArrayInitLen);
+	new_DBA(&midBytes, DynArrayInitLen);
+	new_DIA(&resiBits, DynArrayInitLen);
+
+	//scratch elements reused for every value
+	valueElem = (FloatValueCompressElement*)malloc(sizeof(FloatValueCompressElement));
+	lossyElem = (LossyCompressionElement*)malloc(sizeof(LossyCompressionElement));
+
+	for(idx = 0; idx < nbEle; idx++)
+	{
+		compressSingleFloatValue(valueElem, oriData[idx], precision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+		updateLossyCompElement_Float(valueElem->curBytes, prevBytes, reqBytesLength, resiBitsLength, lossyElem);
+		//the current bytes become the reference for the next value
+		memcpy(prevBytes, valueElem->curBytes, 4);
+		addExactData(midBytes, leadNums, resiBits, lossyElem);
+		oriData[idx] = valueElem->data;
+	}
+
+	convertDIAtoInts(leadNums, leadArray);
+	convertDBAtoBytes(midBytes, midArray);
+	convertDIAtoInts(resiBits, resiArray);
+	midBytesCount = midBytes->size;
+
+	free(valueElem);
+	free(lossyElem);
+	free_DIA(leadNums);
+	free_DBA(midBytes);
+	free_DIA(resiBits);
+
+	return midBytesCount;
+}
+
+/**
+ * Rebuild nbEle float values from the three arrays (leading-byte counts, middle
+ * bytes, residual bits); appears to be the inverse of compressExactDataArray_float.
+ *
+ * @param leadNum one entry per value: how many leading bytes are copied from the previous value
+ * @param exactMidBytes stream of the bytes that are not shared with the previous value
+ * @param residualMidBits bit-packed residuals, reqLength%8 bits per value
+ * @param nbEle number of values to rebuild
+ * @param reqLength number of significant bits kept per value
+ * @param medianValue added back to every rebuilt value
+ * @param decData output: receives a malloc'ed array of nbEle floats (nothing here frees it,
+ *        so presumably the caller does)
+ *
+ * NOTE(review): the malloc result is not checked and leadNum entries are not validated
+ * against the 4-byte buffers.
+ * */
+void decompressExactDataArray_float(unsigned char* leadNum, unsigned char* exactMidBytes, unsigned char* residualMidBits, size_t nbEle, int reqLength, float medianValue, float** decData)
+{
+	*decData = (float*)malloc(nbEle*sizeof(float));
+	// k: bit cursor into the residual stream, p: byte index into residualMidBits,
+	// l: index into leadNum, curByteIndex: index into exactMidBytes
+	size_t i = 0, j = 0, k = 0, l = 0, p = 0, curByteIndex = 0;
+	float exactData = 0;
+	unsigned char preBytes[4] = {0,0,0,0};
+	unsigned char curBytes[4];
+	int resiBits; 
+	unsigned char leadingNum;		
+	
+	int reqBytesLength = reqLength/8;
+	int resiBitsLength = reqLength%8;
+	
+	for(i = 0; i<nbEle;i++)
+	{
+		// compute resiBits
+		resiBits = 0;
+		if (resiBitsLength != 0) {
+			int kMod8 = k % 8;
+			int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+			if (rightMovSteps > 0) {
+				// the residual lies inside the current byte and does not reach its last bit
+				int code = getRightMovingCode(kMod8, resiBitsLength);
+				resiBits = (residualMidBits[p] & code) >> rightMovSteps;
+			} else if (rightMovSteps < 0) {
+				// the residual straddles two bytes: take the high part from byte p,
+				// then the low part from byte p+1
+				int code1 = getLeftMovingCode(kMod8);
+				int code2 = getRightMovingCode(kMod8, resiBitsLength);
+				int leftMovSteps = -rightMovSteps;
+				rightMovSteps = 8 - leftMovSteps;
+				resiBits = (residualMidBits[p] & code1) << leftMovSteps;
+				p++;
+				resiBits = resiBits
+						| ((residualMidBits[p] & code2) >> rightMovSteps);
+			} else // rightMovSteps == 0
+			{
+				// the residual ends exactly at the byte boundary: consume the byte
+				int code = getRightMovingCode(kMod8, resiBitsLength);
+				resiBits = (residualMidBits[p] & code);
+				p++;
+			}
+			k += resiBitsLength;
+		}
+
+		// recover the exact data	
+		memset(curBytes, 0, 4);
+		leadingNum = leadNum[l++];
+		// leading bytes are shared with the previous value
+		memcpy(curBytes, preBytes, leadingNum);
+		// remaining whole bytes come from the middle-byte stream
+		for (j = leadingNum; j < reqBytesLength; j++)
+			curBytes[j] = exactMidBytes[curByteIndex++];
+		if (resiBitsLength != 0) {
+			// residual bits occupy the top bits of the byte after the whole bytes
+			unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+			curBytes[reqBytesLength] = resiByte;
+		}
+
+		exactData = bytesToFloat(curBytes);
+		(*decData)[i] = exactData + medianValue;
+		// the rebuilt bytes become the reference for the next value
+		memcpy(preBytes,curBytes,4);
+	}	
+}
+
+//double-precision version of dealing with unpredictable data points in sz 2.0
+/**
+ * Derive the required bit length for oriData from its value range and the requested
+ * precision, then write into decData each value with the low-order bits beyond that
+ * length cleared (truncation is applied to value - median).
+ *
+ * @param oriData input values (nbEle elements)
+ * @param precision requested precision, forwarded to computeReqLength_double
+ * @param nbEle number of elements
+ * @param reqBytesLength output: reqLength / 8
+ * @param resiBitsLength output: reqLength % 8
+ * @param medianValue output of computeRangeSize_double; also passed to computeReqLength_double
+ * @param decData output: the truncated values (nbEle elements)
+ * @return the required length in bits
+ * */
+int generateLossyCoefficients_double(double* oriData, double precision, size_t nbEle, int* reqBytesLength, int* resiBitsLength, double* medianValue, double* decData)
+{
+	double rangeSize;
+	short radExpo;
+	int reqLength;
+	int droppedBits;
+	size_t idx;
+
+	computeRangeSize_double(oriData, nbEle, &rangeSize, medianValue);
+	radExpo = getExponent_double(rangeSize/2);
+	computeReqLength_double(precision, radExpo, &reqLength, medianValue);
+
+	*reqBytesLength = reqLength/8;
+	*resiBitsLength = reqLength%8;
+
+	//number of low-order bits cleared in every value; constant for the whole array
+	droppedBits = 64 - reqLength;
+	if(droppedBits < 0)
+		droppedBits = 0;
+
+	for(idx = 0; idx < nbEle; idx++)
+	{
+		ldouble buf;
+		buf.value = oriData[idx] - *medianValue;
+		buf.lvalue = (buf.lvalue >> droppedBits) << droppedBits;
+		decData[idx] = buf.value + *medianValue;
+	}
+	return reqLength;
+}
+		
+/**
+ * Compress every double in oriData with compressSingleDoubleValue and collect the
+ * per-value leading-byte counts, middle bytes and residual bits into three arrays.
+ *
+ * @param double* oriData: inplace argument (input / output); each element is replaced
+ *        by the value produced by the compression step (vce->data)
+ * @param precision, reqLength, reqBytesLength, resiBitsLength, medianValue:
+ *        forwarded to the per-value compression helpers
+ * @param nbEle number of elements in oriData
+ * @param leadArray, midArray, resiArray: outputs filled by convertDIAtoInts /
+ *        convertDBAtoBytes from the collected data
+ * @return the number of bytes collected in the middle-byte array
+ * */		
+int compressExactDataArray_double(double* oriData, double precision, size_t nbEle, unsigned char** leadArray, unsigned char** midArray, unsigned char** resiArray, 
+int reqLength, int reqBytesLength, int resiBitsLength, double medianValue)
+{
+	size_t idx;
+	size_t midBytesCount;
+	unsigned char prevBytes[8] = {0,0,0,0,0,0,0,0};
+	DynamicIntArray *leadNums;
+	DynamicByteArray *midBytes;
+	DynamicIntArray *resiBits;
+	DoubleValueCompressElement *valueElem;
+	LossyCompressionElement *lossyElem;
+
+	//growable buffers that collect the per-value output
+	new_DIA(&leadNums, DynArrayInitLen);
+	new_DBA(&midBytes, DynArrayInitLen);
+	new_DIA(&resiBits, DynArrayInitLen);
+
+	//scratch elements reused for every value
+	valueElem = (DoubleValueCompressElement*)malloc(sizeof(DoubleValueCompressElement));
+	lossyElem = (LossyCompressionElement*)malloc(sizeof(LossyCompressionElement));
+
+	for(idx = 0; idx < nbEle; idx++)
+	{
+		compressSingleDoubleValue(valueElem, oriData[idx], precision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+		updateLossyCompElement_Double(valueElem->curBytes, prevBytes, reqBytesLength, resiBitsLength, lossyElem);
+		//the current bytes become the reference for the next value
+		memcpy(prevBytes, valueElem->curBytes, 8);
+		addExactData(midBytes, leadNums, resiBits, lossyElem);
+		oriData[idx] = valueElem->data;
+	}
+
+	convertDIAtoInts(leadNums, leadArray);
+	convertDBAtoBytes(midBytes, midArray);
+	convertDIAtoInts(resiBits, resiArray);
+	midBytesCount = midBytes->size;
+
+	free(valueElem);
+	free(lossyElem);
+	free_DIA(leadNums);
+	free_DBA(midBytes);
+	free_DIA(resiBits);
+
+	return midBytesCount;
+}
+
+/**
+ * Rebuild nbEle double values from the three arrays (leading-byte counts, middle
+ * bytes, residual bits); appears to be the inverse of compressExactDataArray_double.
+ *
+ * @param leadNum one entry per value: how many leading bytes are copied from the previous value
+ * @param exactMidBytes stream of the bytes that are not shared with the previous value
+ * @param residualMidBits bit-packed residuals, reqLength%8 bits per value
+ * @param nbEle number of values to rebuild
+ * @param reqLength number of significant bits kept per value
+ * @param medianValue added back to every rebuilt value
+ * @param decData output: receives a malloc'ed array of nbEle doubles (nothing here frees it,
+ *        so presumably the caller does)
+ *
+ * NOTE(review): the malloc result is not checked and leadNum entries are not validated
+ * against the 8-byte buffers.
+ * */
+void decompressExactDataArray_double(unsigned char* leadNum, unsigned char* exactMidBytes, unsigned char* residualMidBits, size_t nbEle, int reqLength, double medianValue, double** decData)
+{
+	*decData = (double*)malloc(nbEle*sizeof(double));
+	// k: bit cursor into the residual stream, p: byte index into residualMidBits,
+	// l: index into leadNum, curByteIndex: index into exactMidBytes
+	size_t i = 0, j = 0, k = 0, l = 0, p = 0, curByteIndex = 0;
+	double exactData = 0;
+	unsigned char preBytes[8] = {0,0,0,0,0,0,0,0};
+	unsigned char curBytes[8];
+	int resiBits; 
+	unsigned char leadingNum;		
+	
+	int reqBytesLength = reqLength/8;
+	int resiBitsLength = reqLength%8;
+	
+	for(i = 0; i<nbEle;i++)
+	{
+		// compute resiBits
+		resiBits = 0;
+		if (resiBitsLength != 0) {
+			int kMod8 = k % 8;
+			int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+			if (rightMovSteps > 0) {
+				// the residual lies inside the current byte and does not reach its last bit
+				int code = getRightMovingCode(kMod8, resiBitsLength);
+				resiBits = (residualMidBits[p] & code) >> rightMovSteps;
+			} else if (rightMovSteps < 0) {
+				// the residual straddles two bytes: take the high part from byte p,
+				// then the low part from byte p+1
+				int code1 = getLeftMovingCode(kMod8);
+				int code2 = getRightMovingCode(kMod8, resiBitsLength);
+				int leftMovSteps = -rightMovSteps;
+				rightMovSteps = 8 - leftMovSteps;
+				resiBits = (residualMidBits[p] & code1) << leftMovSteps;
+				p++;
+				resiBits = resiBits
+						| ((residualMidBits[p] & code2) >> rightMovSteps);
+			} else // rightMovSteps == 0
+			{
+				// the residual ends exactly at the byte boundary: consume the byte
+				int code = getRightMovingCode(kMod8, resiBitsLength);
+				resiBits = (residualMidBits[p] & code);
+				p++;
+			}
+			k += resiBitsLength;
+		}
+
+		// recover the exact data	
+		memset(curBytes, 0, 8);
+		leadingNum = leadNum[l++];
+		// leading bytes are shared with the previous value
+		memcpy(curBytes, preBytes, leadingNum);
+		// remaining whole bytes come from the middle-byte stream
+		for (j = leadingNum; j < reqBytesLength; j++)
+			curBytes[j] = exactMidBytes[curByteIndex++];
+		if (resiBitsLength != 0) {
+			// residual bits occupy the top bits of the byte after the whole bytes
+			unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+			curBytes[reqBytesLength] = resiByte;
+		}
+
+		exactData = bytesToDouble(curBytes);
+		(*decData)[i] = exactData + medianValue;
+		// the rebuilt bytes become the reference for the next value
+		memcpy(preBytes,curBytes,8);
+	}
+}
--- a/deps/SZ/sz/src/dictionary.c
+++ b/deps/SZ/sz/src/dictionary.c
@ -0,0 +1,398 @@
+/*-------------------------------------------------------------------------*/
+/**
+   @file    dictionary.c
+   @author  N. Devillard
+   @brief   Implements a dictionary for string variables.
+
+   This module implements a simple dictionary object, i.e. a list
+   of string/string associations. This object is useful to store e.g.
+   information retrieved from a configuration file (ini files).
+*/
+/*--------------------------------------------------------------------------*/
+
+/*---------------------------------------------------------------------------
+                                Includes
+ ---------------------------------------------------------------------------*/
+#include "dictionary.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+/** Maximum value size for integers and doubles. */
+#define MAXVALSZ    1024
+
+/** Minimal allocated number of entries in a dictionary */
+#define DICTMINSZ   128
+
+/** Invalid key token */
+#define DICT_INVALID_KEY    ((char*)-1)
+
+/*---------------------------------------------------------------------------
+                            Private functions
+ ---------------------------------------------------------------------------*/
+
+/*
+ * Move a heap block into a new zero-filled block of twice the size.
+ * 'size' is the current allocated size in bytes. On success the old block is
+ * freed and the new one returned; on allocation failure NULL is returned and
+ * the old block is left untouched.
+ */
+static void * mem_double(void * ptr, int size)
+{
+    void * grown = calloc(2*size, 1);
+
+    if (grown!=NULL) {
+        memcpy(grown, ptr, size);
+        free(ptr);
+    }
+    return grown ;
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Duplicate a string
+  @param    s String to duplicate (may be NULL)
+  @return   Pointer to a newly allocated copy, to be freed with free();
+            NULL if s is NULL or the allocation fails
+
+  Stand-in for strdup() on systems that do not provide it.
+ */
+/*--------------------------------------------------------------------------*/
+static char * xstrdup(const char * s)
+{
+    char * copy ;
+    size_t bytes ;
+
+    if (s==NULL)
+        return NULL ;
+    bytes = strlen(s)+1 ;       /* include the terminating NUL */
+    copy = (char*)malloc(bytes) ;
+    if (copy==NULL)
+        return NULL ;
+    memcpy(copy, s, bytes);
+    return copy ;
+}
+
+/*---------------------------------------------------------------------------
+                            Function codes
+ ---------------------------------------------------------------------------*/
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Compute the hash key for a string.
+  @param    key     Character string to use for key (must not be NULL).
+  @return   An unsigned int on at least 32 bits.
+
+  Each character is added to the running hash, which is then mixed with
+  shifts and xors; a final mixing step is applied after the last character.
+  Different keys can still share a hash, which is why the dictionary also
+  stores the key and compares it as a last resort.
+ */
+/*--------------------------------------------------------------------------*/
+unsigned dictionary_hash(const char * key)
+{
+    unsigned        h = 0 ;
+    int             remaining = strlen(key) ;
+    const char *    p = key ;
+
+    /* Per-character mixing */
+    while (remaining>0) {
+        h += (unsigned)*p ;
+        h += (h<<10);
+        h ^= (h>>6) ;
+        p++ ;
+        remaining-- ;
+    }
+    /* Final mixing */
+    h += (h <<3);
+    h ^= (h >>11);
+    h += (h <<15);
+    return h ;
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Create a new dictionary object.
+  @param    size    Optional initial size of the dictionary.
+  @return   A newly allocated dictionary object, or NULL if any allocation
+            fails.
+
+  This function allocates a new dictionary object of given size and returns
+  it. If you do not know in advance (roughly) the number of entries in the
+  dictionary, give size=0. Sizes below DICTMINSZ are raised to DICTMINSZ.
+  The result must be released with dictionary_del().
+ */
+/*--------------------------------------------------------------------------*/
+dictionary * dictionary_new(int size)
+{
+    dictionary  *   d ;
+
+    /* If no size was specified, allocate space for DICTMINSZ */
+    if (size<DICTMINSZ) size=DICTMINSZ ;
+
+    d = (dictionary *)calloc(1, sizeof(dictionary)) ;
+    if (d==NULL) {
+        return NULL;
+    }
+    d->size = size ;
+    d->val  = (char **)calloc(size, sizeof(char*));
+    d->key  = (char **)calloc(size, sizeof(char*));
+    d->hash = (unsigned int *)calloc(size, sizeof(unsigned));
+    if (d->val==NULL || d->key==NULL || d->hash==NULL) {
+        /* Never hand out a half-built dictionary: every other function
+           indexes these arrays without checking them. free(NULL) is a
+           no-op, so whichever allocations succeeded are released. */
+        free(d->val);
+        free(d->key);
+        free(d->hash);
+        free(d);
+        return NULL;
+    }
+    return d ;
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Delete a dictionary object
+  @param    d   dictionary object to deallocate (NULL is accepted and ignored).
+  @return   void
+
+  Releases every stored key and value, the three internal arrays and the
+  dictionary object itself.
+ */
+/*--------------------------------------------------------------------------*/
+void dictionary_del(dictionary * d)
+{
+    int     slot ;
+
+    if (d==NULL) return ;
+    /* free(NULL) is a no-op, so empty slots need no special treatment */
+    for (slot=0 ; slot<d->size ; slot++) {
+        free(d->key[slot]);
+        free(d->val[slot]);
+    }
+    free(d->val);
+    free(d->key);
+    free(d->hash);
+    free(d);
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Get a value from a dictionary.
+  @param    d       dictionary object to search.
+  @param    key     Key to look for in the dictionary.
+  @param    def     Default value to return if key not found.
+  @return   A pointer to an internally allocated character string, or 'def'.
+
+  This function locates a key in a dictionary and returns a pointer to its
+  value, or the passed 'def' pointer if no such key can be found in
+  dictionary. 'def' is also returned when 'd' or 'key' is NULL. The returned
+  character pointer points to data internal to the dictionary object, you
+  should not try to free it or modify it. Note that a key stored with a NULL
+  value is found and yields NULL, not 'def'.
+ */
+/*--------------------------------------------------------------------------*/
+char * dictionary_get(dictionary * d, const char * key, char * def)
+{
+    unsigned    hash ;
+    int         i ;
+
+    /* Nothing to search, or nothing to search for: same guard as
+       dictionary_set, instead of dereferencing a NULL pointer */
+    if (d==NULL || key==NULL)
+        return def ;
+
+    hash = dictionary_hash(key);
+    for (i=0 ; i<d->size ; i++) {
+        if (d->key[i]==NULL)
+            continue ;
+        /* Compare hash first (cheap), then the string itself to rule out
+           hash collisions */
+        if (hash==d->hash[i] && !strcmp(key, d->key[i])) {
+            return d->val[i] ;
+        }
+    }
+    return def ;
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Set a value in a dictionary.
+  @param    d       dictionary object to modify.
+  @param    key     Key to modify or add.
+  @param    val     Value to add (copied; may be NULL).
+  @return   int     0 if Ok, anything else otherwise
+
+  If the given key is found in the dictionary, the associated value is
+  replaced by the provided one. If the key cannot be found in the
+  dictionary, it is added to it.
+
+  It is Ok to provide a NULL value for val, but NULL values for the dictionary
+  or the key are considered as errors: the function will return immediately
+  in such a case.
+
+  Notice that if you dictionary_set a variable to NULL, a call to
+  dictionary_get will return a NULL value: the variable will be found, and
+  its value (NULL) is returned. The key itself stays in the dictionary until
+  dictionary_unset is called for it.
+
+  This function returns non-zero in case of failure, including when memory
+  cannot be allocated; in that case the entries already stored in the
+  dictionary are left unchanged.
+ */
+/*--------------------------------------------------------------------------*/
+int dictionary_set(dictionary * d, const char * key, const char * val)
+{
+    int             i ;
+    unsigned        hash ;
+    char        *   key_copy ;
+    char        *   val_copy = NULL ;
+    char        **  new_val ;
+    char        **  new_key ;
+    unsigned int *  new_hash ;
+
+    if (d==NULL || key==NULL) return -1 ;
+
+    /* Copy the value first, so that an allocation failure is detected
+       before anything stored in the dictionary is touched */
+    if (val!=NULL) {
+        val_copy = xstrdup(val);
+        if (val_copy==NULL) return -1 ;
+    }
+
+    /* Compute hash for this key */
+    hash = dictionary_hash(key) ;
+    /* Find if value is already in dictionary */
+    if (d->n>0) {
+        for (i=0 ; i<d->size ; i++) {
+            if (d->key[i]==NULL)
+                continue ;
+            /* Same hash value and same key */
+            if (hash==d->hash[i] && !strcmp(key, d->key[i])) {
+                /* Found a value: replace it and return */
+                free(d->val[i]);
+                d->val[i] = val_copy ;
+                return 0 ;
+            }
+        }
+    }
+
+    /* Add a new value */
+    key_copy = xstrdup(key);
+    if (key_copy==NULL) {
+        free(val_copy);
+        return -1 ;
+    }
+
+    /* See if dictionary needs to grow */
+    if (d->n==d->size) {
+        /* Reached maximum size: reallocate dictionary. Each array is only
+           replaced once its larger copy exists; mem_double leaves the old
+           block untouched on failure, so a failed grow loses nothing. An
+           array that was already doubled simply keeps its spare room
+           while d->size stays as it was. */
+        new_val = (char **)mem_double(d->val, d->size * sizeof(char*)) ;
+        if (new_val!=NULL) d->val = new_val ;
+        new_key = (new_val!=NULL)
+                ? (char **)mem_double(d->key, d->size * sizeof(char*))
+                : NULL ;
+        if (new_key!=NULL) d->key = new_key ;
+        new_hash = (new_key!=NULL)
+                ? (unsigned int *)mem_double(d->hash, d->size * sizeof(unsigned))
+                : NULL ;
+        if (new_hash!=NULL) d->hash = new_hash ;
+        if (new_hash==NULL) {
+            /* Cannot grow dictionary */
+            free(key_copy);
+            free(val_copy);
+            return -1 ;
+        }
+        /* Double size */
+        d->size *= 2 ;
+    }
+
+    /* Insert key in the first empty slot. Start at d->n and wrap at
+       d->size. Because d->n < d->size this will necessarily
+       terminate. */
+    for (i=d->n ; d->key[i] ; ) {
+        if(++i == d->size) i = 0;
+    }
+    /* Store the copies made above */
+    d->key[i]  = key_copy ;
+    d->val[i]  = val_copy ;
+    d->hash[i] = hash;
+    d->n ++ ;
+    return 0 ;
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Delete a key in a dictionary
+  @param    d       dictionary object to modify.
+  @param    key     Key to remove.
+  @return   void
+
+  This function deletes a key in a dictionary and frees the stored key and
+  value. Nothing is done if the key cannot be found, or if 'd' or 'key' is
+  NULL.
+ */
+/*--------------------------------------------------------------------------*/
+void dictionary_unset(dictionary * d, const char * key)
+{
+    unsigned    hash ;
+    int         i ;
+
+    /* Same guard as dictionary_set: never dereference a NULL dictionary */
+    if (d == NULL || key == NULL) {
+        return;
+    }
+
+    hash = dictionary_hash(key);
+    for (i=0 ; i<d->size ; i++) {
+        if (d->key[i]==NULL)
+            continue ;
+        /* Compare hash first, then the string to rule out hash collisions */
+        if (hash==d->hash[i] && !strcmp(key, d->key[i])) {
+            /* Found key */
+            break ;
+        }
+    }
+    if (i>=d->size)
+        /* Key not found */
+        return ;
+
+    free(d->key[i]);
+    d->key[i] = NULL ;
+    /* free(NULL) is a no-op, so a key stored without value is fine here */
+    free(d->val[i]);
+    d->val[i] = NULL ;
+    d->hash[i] = 0 ;
+    d->n -- ;
+    return ;
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Dump a dictionary to an opened file pointer.
+  @param    d   Dictionary to dump
+  @param    out Opened file pointer.
+  @return   void
+
+  Prints one line per stored key, formatted as the key followed by a tab and
+  the value in square brackets; keys without value show UNDEF. An empty
+  dictionary prints a single "empty dictionary" line. Nothing is printed if
+  either argument is NULL. It is Ok to provide stdout or stderr as output
+  file pointers.
+ */
+/*--------------------------------------------------------------------------*/
+void dictionary_dump(dictionary * d, FILE * out)
+{
+    int             slot ;
+    const char *    shown ;
+
+    if (d==NULL || out==NULL) return ;
+    if (d->n<1) {
+        fprintf(out, "empty dictionary\n");
+        return ;
+    }
+    for (slot=0 ; slot<d->size ; slot++) {
+        if (d->key[slot]==NULL)
+            continue ;
+        shown = d->val[slot] ? d->val[slot] : "UNDEF" ;
+        fprintf(out, "%20s\t[%s]\n", d->key[slot], shown);
+    }
+}
+
+
+/* Test code: built only when TESTDIC is defined. Stores NVALS keys, reads
+   them back, removes them again and reports any inconsistency on stdout. */
+#ifdef TESTDIC
+#define NVALS 20000
+int main(int argc, char *argv[])
+{
+    dictionary  *   dict ;
+    char            keybuf[90] ;
+    int             k ;
+
+    /* Allocate dictionary */
+    printf("allocating...\n");
+    dict = dictionary_new(0);
+
+    /* Set values in dictionary */
+    printf("setting %d values...\n", NVALS);
+    k = 0 ;
+    while (k<NVALS) {
+        sprintf(keybuf, "%04d", k);
+        dictionary_set(dict, keybuf, "salut");
+        k++ ;
+    }
+
+    /* Every key stored above must be found again */
+    printf("getting %d values...\n", NVALS);
+    k = 0 ;
+    while (k<NVALS) {
+        sprintf(keybuf, "%04d", k);
+        if (dictionary_get(dict, keybuf, DICT_INVALID_KEY)==DICT_INVALID_KEY) {
+            printf("cannot get value for key [%s]\n", keybuf);
+        }
+        k++ ;
+    }
+
+    /* Remove all keys; the entry count must drop back to zero */
+    printf("unsetting %d values...\n", NVALS);
+    k = 0 ;
+    while (k<NVALS) {
+        sprintf(keybuf, "%04d", k);
+        dictionary_unset(dict, keybuf);
+        k++ ;
+    }
+    if (dict->n != 0) {
+        printf("error deleting values\n");
+    }
+
+    printf("deallocating...\n");
+    dictionary_del(dict);
+    return 0 ;
+}
+#endif
+/* vim: set ts=4 et sw=4 tw=75 */
--- a/deps/SZ/sz/src/exafelSZ.c
+++ b/deps/SZ/sz/src/exafelSZ.c
@ -0,0 +1,597 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "sz.h"
+
+//Fill in the derived fields of pr from the user-supplied ones:
+//  binnedRows / binnedCols: rows and cols divided by binSize, rounded up
+//                           (assumes integer arithmetic - TODO confirm the field types)
+//  peakRadius: (peakSize-1)/2
+//panels is accepted for a uniform signature but is not used here.
+void exafelSZ_params_process(exafelSZ_params*pr, size_t panels, size_t rows, size_t cols){
+  pr->binnedRows=(rows+pr->binSize-1)/pr->binSize;
+  pr->binnedCols=(cols+pr->binSize-1)/pr->binSize;
+  
+  pr->peakRadius=(pr->peakSize-1)/2;
+}
+
+//Validate the parameters needed for decompression. Each failed check prints an
+//error message and then hits assert(0):
+//  - calibPanel must not be NULL
+//  - binSize >= 1, tolerance >= 0, szDim in 1..3
+//  - peakSize must be odd
+//  - panels, rows and cols must all be >= 1
+//NOTE(review): the checks rely on assert(0) alone; if asserts are compiled out
+//(NDEBUG) the function only prints and returns normally.
+void exafelSZ_params_checkDecomp(exafelSZ_params*pr, size_t panels, size_t rows, size_t cols){
+  if(pr->calibPanel==NULL){
+    printf("ERROR: calibPanel is NULL : calibPanel=%ld\n",(long)pr->calibPanel);
+    assert(0);
+  }
+  if(pr->binSize<1 || pr->tolerance<0 || pr->szDim<1 || pr->szDim>3){
+    printf("ERROR: Something wrong with the following:\n");
+    printf("binSize=%d\n",(int)pr->binSize);
+    //tolerance is printed after a cast to int, so a fractional value shows truncated
+    printf("tolerance=%d\n",(int)pr->tolerance);
+    printf("szDim=%d\n",(int)pr->szDim);
+    assert(0);
+  }
+  //peakRadius is derived as (peakSize-1)/2, hence the odd-size requirement
+  if(!(pr->peakSize%2)){
+    printf("ERROR: peakSize = %d cannot be even. It must be odd!\n",(int)pr->peakSize);
+    assert(0);
+  }  
+  //if(nEvents<1 || panels<1 || rows<1 || cols<1){
+  if(panels<1 || rows<1 || cols<1){
+    printf("ERROR: Something wrong with the following:\n");
+    printf("panels=%d\n",(int)panels);
+    printf("rows=%d\n",(int)rows);
+    printf("cols=%d\n",(int)cols);
+    assert(0);
+  }
+}
+
+//Validate the parameters needed for compression: the three peak coordinate arrays
+//must be present, and everything checked for decompression must hold as well.
+void exafelSZ_params_checkComp(exafelSZ_params*pr, size_t panels, size_t rows, size_t cols){
+  int peaksMissing = !pr->peaksSegs || !pr->peaksRows || !pr->peaksCols;
+
+  if(peaksMissing){
+    printf("ERROR: One or more of the following are NULL : peaksSegs , peaksRows , peaksCols\n");
+    assert(0);
+  }
+  //the remaining requirements are shared with decompression
+  exafelSZ_params_checkDecomp(pr, panels, rows, cols);
+}
+
+//Print the configuration held in pr to stdout: first the user-supplied fields
+//(binSize, tolerance, szDim, peakSize), then the derived ones (binnedRows,
+//binnedCols, peakRadius).
+//NOTE(review): the format specifiers assume int / long sized fields - confirm
+//against the exafelSZ_params declaration.
+void exafelSZ_params_print(exafelSZ_params*pr){
+  printf("Configuration (exafelSZ_params) :\n");
+  printf("binSize: %d\n",pr->binSize);
+  printf("tolerance:%e\n",pr->tolerance);
+  printf("szDim:%d\n",pr->szDim);
+  printf("peakSize:%d\n",pr->peakSize);
+  //printf("nEvents:%d\n",pr->nEvents);
+  //printf("panels:%d\n",pr->panels);
+  //printf("rows:%d\n",pr->rows);
+  //printf("cols:%d\n",pr->cols);
+  printf("\n");
+  printf("CALCULATED VARIABLES\n");
+  printf("binnedRows:%ld\n",pr->binnedRows);
+  printf("binnedCols:%ld\n",pr->binnedCols);
+  printf("peakRadius:%d\n",pr->peakRadius);
+  printf("\n");
+  // Stream-based version of the same report, kept for reference (disabled):
+  // outs<<"Configuration (exafelSZ_params) : "<<endl;
+  // outs<<"SMOOTHING: NO"<<"  (ROI and RONI are NOT replaced by local avg values)"<<endl;
+  // outs<<"binSize:"<<binSize<<endl;
+  // outs<<"tolerance:"<<tolerance<<endl;
+  // outs<<"szDim:"<<szDim<<endl;
+  // outs<<"peakSize:"<<peakSize<<endl;
+  // outs<<"nEvents:"<<nEvents<<" (# of events per batch)"<<endl;
+  // outs<<"panels:"<<panels<<" (Panels per event)"<<endl;
+  // outs<<"rows:"<<rows<<" (Rows per panel)"<<endl;
+  // outs<<"cols:"<<cols<<" (Columns per panel)"<<endl;
+  // outs<<endl;
+  // outs<<"CALCULATED VARIABLES"<<endl;
+  // outs<<"binnedRows:"<<binnedRows<<" (Rows per panel after binning)"<<endl;
+  // outs<<"binnedCols:"<<binnedCols<<" (Columns per panel after binning)"<<endl;
+  // outs<<"peakRadius:"<<peakRadius<<" (Peak radius = (peakSize-1)/2 )"<<endl;
+  // outs<<endl;
+}
+
+//*********************************************************************************
+//*********************************************************************************
+//*********************************************************************************
+
+//Index Calculator
+//Flatten the 4D index (i3,i2,i1,i0) of an array whose three fastest dimensions
+//have extents size2, size1, size0 (i0 varies fastest).
+//The arithmetic is done in size_t: with plain int the product overflows for
+//arrays of more than INT_MAX elements before the result is widened.
+static inline size_t calcIdx_4D(int i3, int i2, int i1, int i0, int size2, int size1, int size0){ 
+  return (size_t)i0+(size_t)size0*((size_t)i1+(size_t)size1*((size_t)i2+(size_t)size2*(size_t)i3));
+}
+//Flatten the 3D index (i2,i1,i0) of an array whose two fastest dimensions have
+//extents size1, size0 (i0 varies fastest). Computed in size_t so that large
+//arrays do not overflow int arithmetic.
+static inline size_t calcIdx_3D(int i2, int i1, int i0, int size1, int size0){ 
+  return (size_t)i0+(size_t)size0*((size_t)i1+(size_t)size1*(size_t)i2);
+}
+//Flatten the 2D index (i1,i0) of an array whose fastest dimension has extent
+//size0. Computed in size_t so that large arrays do not overflow int arithmetic.
+static inline size_t calcIdx_2D(int i1, int i0, int size0){ 
+  return (size_t)i0+(size_t)size0*(size_t)i1;
+}
+
+unsigned char * exafelSZ_Compress(void* _pr,
+                       void* _origData,
+                       size_t r4, size_t r3, size_t r2, size_t r1,
+                       size_t *compressedSize)
+{
+  //printf("COMPRESS\n"); *compressedSize=0; return NULL;
+  size_t nEvents,panels,rows,cols;
+  if(r4==0)
+    nEvents=1;
+  else
+    nEvents=r4;
+  panels=r1;
+  rows=r2;
+  cols=r3;
+  //printf("AMG : exafelSZ_Compress : nEvents,panels,rows,cols = %d , %d , %d , %d\n",nEvents,panels,rows,cols);
+
+  float *origData=(float*)_origData;
+  exafelSZ_params *pr=(exafelSZ_params*)_pr;  
+
+  exafelSZ_params_process(pr, panels, rows, cols);
+  exafelSZ_params_checkComp(pr, panels, rows, cols); 
+  //exafelSZ_params_print(pr);  
+
+  uint8_t *roiM=(uint8_t*)malloc(nEvents*panels*rows*cols) ;
+  float *roiData=(float*)malloc(nEvents*panels*rows*cols*sizeof(float)) ;
+  float *binnedData=(float*)malloc(nEvents*panels*pr->binnedRows*pr->binnedCols*sizeof(float)) ;
+  //float *binnedData=(float*)malloc(nEvents*panels*rows*cols*sizeof(float)) ;
+  
+  size_t e,p,r,c,pk,ri,ci,br,bc,roii,bi;
+  /*
+  printf("AMG : exafelSZ_Compress : pr->numPeaks = %d\n",pr->numPeaks);
+  printf("S:\n");
+  for(e=0;e<pr->numPeaks;e++)
+    printf("%d ",pr->peaksSegs[e]);
+  printf("\nR:\n");
+  for(e=0;e<pr->numPeaks;e++)
+    printf("%d ",pr->peaksRows[e]);
+  printf("\nC:\n");
+  for(e=0;e<pr->numPeaks;e++)
+    printf("%d ",pr->peaksCols[e]);
+  printf("\n");
+  */
+
+  //Generate the ROI mask: NOTE: 0 means affirmative in ROI mask! This comes from the python scripts!
+  //First, initialize with calibration panel:
+  for(e=0;e<nEvents;e++){ //Event
+    for(p=0;p<panels;p++){ //Panel
+      for(r=0;r<rows;r++){ //Row
+        for(c=0;c<cols;c++){ //Column
+          //roiM[calcIdx_4D(e,p,r,c,panels,rows,cols)]=pr->calibPanel[calcIdx_2D(r,c,cols)]; //calibPanel is a single segment copied over all the event(image)
+          roiM[calcIdx_4D(e,p,r,c,panels,rows,cols)]=pr->calibPanel[calcIdx_3D(p,r,c,rows,cols)];  //calibPanel is as big as the event(image) itself
+        }
+      }
+    }
+  }
+  //uint64_t peaksBytePos=0; //Position in the peaks buffer
+  //Now process the peaks and generate the mask:
+  uint64_t nPeaksTotal=0;  //Total number of peaks
+  for(e=0;e<nEvents;e++){ //Event
+    //uint64_t nPeaks=*(uint64_t*)(&pr->peaks[peaksBytePos]);
+    //peaksBytePos+=8;
+
+    //peaksBytePos+=8;//Skip the second one! This is due to the problem in Python.
+
+    nPeaksTotal+=pr->numPeaks;
+    for(pk=0;pk<pr->numPeaks;pk++){
+      //uint16_t p_=*(uint16_t*)(&pr->peaks[peaksBytePos]); //Panel for the current peak
+      //peaksBytePos+=2;
+      //uint16_t r_=*(uint16_t*)(&pr->peaks[peaksBytePos]); //Row for the current peak
+      //peaksBytePos+=2;
+      //uint16_t c_=*(uint16_t*)(&pr->peaks[peaksBytePos]); //Col for the current peak
+      //peaksBytePos+=2;
+      
+      uint16_t p_=pr->peaksSegs[pk];
+      uint16_t r_=pr->peaksRows[pk];
+      uint16_t c_=pr->peaksCols[pk];
+
+      if(p_>=panels){
+        printf("ERROR: Peak coordinate out of bounds: Panel=%d, Valid range: 0,%d\n",(int)p_,(int)panels-1);
+        assert(0);
+        printf("Skipping this peak...\n");
+        continue;
+      }
+      if(r_>=rows){
+        printf("ERROR: Peak coordinate out of bounds: Row=%d, Valid range: 0,%d\n",(int)r_,(int)rows-1);
+        assert(0);
+        printf("Skipping this peak...\n");
+        continue;
+      }
+      if(c_>=cols){
+        printf("ERROR: Peak coordinate out of bounds: Col=%d, Valid range: 0,%d\n",(int)c_,(int)cols-1);
+        assert(0);
+        printf("Skipping this peak...\n");
+        continue;
+      }
+      
+      for(ri=r_-pr->peakRadius;ri<=r_+pr->peakRadius;ri++){  //ri: row index. Just a temporary variable.
+        for(ci=c_-pr->peakRadius;ci<=c_+pr->peakRadius;ci++){  //ci: column index. Just a temporary variable.
+          if(ri<rows && ci<cols){  //Check whether inside the bounds or not
+            roiM[calcIdx_4D(e,p_,ri,ci,panels,rows,cols)]=0;
+          }
+        }
+      }
+    }
+  }
+  
+  //Save ROI:
+  uint64_t roiSavedCount=0;
+  for(e=0;e<nEvents;e++){ //Event
+    for(p=0;p<panels;p++){ //Panel
+      for(r=0;r<rows;r++){ //Row
+        for(c=0;c<cols;c++){ //Column
+          if(!roiM[calcIdx_4D(e,p,r,c,panels,rows,cols)]){
+            roiData[roiSavedCount]=origData[calcIdx_4D(e,p,r,c,panels,rows,cols)];
+            roiSavedCount++;
+          }
+          
+          //AMG: Replace ROI and RONI pixels with avg values!
+          
+        }
+      }
+    }
+  }
+  
+  //Binning:
+  for(e=0;e<nEvents;e++){ //Event
+    for(p=0;p<panels;p++){  //Panel
+      for(r=0;r<pr->binnedRows;r++){ //Row of the binnedData
+        for(c=0;c<pr->binnedCols;c++){ //Column of the binnedData
+          float sum=0;
+          int nPts=0;
+          for(br=0;br<pr->binSize;br++) //Bin Row (from origData)
+            for(bc=0;bc<pr->binSize;bc++) //Bin Column (from origData)
+              if(r*pr->binSize+br<rows && c*pr->binSize+bc<cols){
+                // cout<<p<<" "<<r<<" "<<c<<" "<<br<<" "<<bc<<" "<<r*pr->binSize+br<<" "<<c*pr->binSize+bc<<endl;
+                sum+=origData[calcIdx_4D(e,p,r*pr->binSize+br,c*pr->binSize+bc,panels,rows,cols)];
+                nPts++;
+              }
+          // cout<<"p:"<<p<<" r:"<<r<<" c:"<<c<<" nPts:"<<nPts<<endl;
+          binnedData[calcIdx_4D(e,p,r,c,panels,pr->binnedRows,pr->binnedCols)]=sum/nPts;
+        }
+      }
+    }
+  }
+
+  //Additional compression using SZ:    
+  size_t szCompressedSize=0;
+  unsigned char* szComp;
+   
+  switch(pr->szDim){
+    case 1:
+      // szComp=sz_compress_3D(binnedData, 0, 0, nEvents * panels * pr->binnedRows * pr->binnedCols, pr->tolerance, szCompressedSize); //1D
+      szComp=SZ_compress_args(SZ_FLOAT, binnedData, &szCompressedSize, ABS, pr->tolerance, 0, 0, 0, 0,0,0, nEvents * panels * pr->binnedRows * pr->binnedCols);
+      break;
+    case 2:
+      // szComp=sz_compress_3D(binnedData, 0, nEvents * panels * pr->binnedRows, pr->binnedCols, pr->tolerance, szCompressedSize); //2D
+      szComp=SZ_compress_args(SZ_FLOAT, binnedData, &szCompressedSize, ABS, pr->tolerance, 0, 0, 0, 0,0, nEvents * panels * pr->binnedRows, pr->binnedCols);
+      break;
+    case 3:
+      // szComp=sz_compress_3D(binnedData, nEvents * panels, pr->binnedRows, pr->binnedCols, pr->tolerance, szCompressedSize); //3D
+      szComp=SZ_compress_args(SZ_FLOAT, binnedData, &szCompressedSize, ABS, pr->tolerance, 0, 0, 0, 0, nEvents * panels, pr->binnedRows, pr->binnedCols);
+      break;
+    default:
+      printf("ERROR: Wrong szDim : %d It must be 1,2 or 3.\n",(int)pr->szDim);
+      assert(0);
+  }
+  
+  /*      
+  Compressed buffer format: (Types are indicated in parenthesis)
+    WRITE: nPeaksTotal(uint64_t) (Total number of peaks in this batch)
+    for(e=0;e<nEvents;e++){  (e for "event")
+      WRITE: nPeaks[e]  (uint64_t) (Number of peaks in this event)
+      for(p=0;p<nPeaks;p++){  (p for "peak")
+       nPeaks{
+         WRITE: peak[e][p] (uint16_t x 3)
+       }
+    }
+    WRITE: roiSavedCount  (uint64_t) (How many pixels there are in the ROI data.)
+       (roiSavedCount is the same # as # of 0's in ROI mask.) 
+       (NOTE:0 means affirmative in ROI mask!)
+    for(roii=0;roii<roiSavedCount;roii++){  (roii for "ROI data index")
+      WRITE: ROI_data[roii]  (float, 32-bit)
+    }
+    WRITE: szCompressedSize  (uint64_t) (Compressed data size from SZ.)
+    WRITE: szComp (unsigned char x SZ_compressed_buffer_size)  (Compressed data from SZ.)
+    
+    NOTE: Calibration panel is not saved. It should be handled by the user.
+    
+    SUMMARY:
+    nPeaksTotal : 8 bytes : (1 x uint64_t)
+    peaks : (8 x nEvents + nPeaksTotal x 3 x 2) bytes : (nEvents x (nPeaks + nPeaks x 3 x uint16_t))
+    roiSavedCount : 8 Bytes : (1 x uint64_t)
+    ROI_data : roiSavedCount x 4 : roiSavedCount x float 
+    szCompressedSize : 8 : uint64_t
+    szComp : szComp x 1 : szComp x (unsigned char)
+  */
+  (*compressedSize)=8+nEvents*8+nPeaksTotal*(2+2+2)+8+roiSavedCount*4+8+szCompressedSize;
+  //compressedBuffer=new uint8_t[(*compressedSize)];
+  uint8_t * compressedBuffer=(uint8_t*)malloc(*compressedSize);
+  uint64_t bytePos;
+  
+  bytePos=0;
+  //*(uint64_t*)(&compressedBuffer[bytePos])=nEvents;
+  //bytePos+=8;
+  *(uint64_t*)(&compressedBuffer[bytePos])=nPeaksTotal;
+  bytePos+=8;
+  // cout<<endl;
+  // cout<<"COMPRESS:"<<endl;
+  // cout<<"nPeaksTotal="<<nPeaksTotal<<endl;
+  // cout<<"bytePos="<<bytePos<<endl;
+  //printf("\nCOMPRESS:\n");
+  //printf("nPeaksTotal=%d\n",nPeaksTotal);
+  //printf("bytePos=%d\n",bytePos);
+  
+  //peaksBytePos=0;
+  for(e=0;e<nEvents;e++){
+    //uint64_t nPeaks=*(uint64_t*)(&pr->peaks[peaksBytePos]);
+    //peaksBytePos+=8;
+    ////peaksBytePos+=8;//Skip the second one. This is due to the error in Python!
+    
+    //*(uint64_t*)(&compressedBuffer[bytePos])=nPeaks;
+    *(uint64_t*)(&compressedBuffer[bytePos])=pr->numPeaks;
+    bytePos+=8;
+    //for(pk=0;pk<nPeaks;pk++){
+    for(pk=0;pk<pr->numPeaks;pk++){
+      //*(uint16_t*)(&compressedBuffer[bytePos])=*(uint16_t*)(&pr->peaks[peaksBytePos]); //Panel for the current peak
+      //bytePos+=2;
+      //peaksBytePos+=2;
+      //*(uint16_t*)(&compressedBuffer[bytePos])=*(uint16_t*)(&pr->peaks[peaksBytePos]); //Row for the current peak
+      //bytePos+=2;
+      //peaksBytePos+=2;      
+      //*(uint16_t*)(&compressedBuffer[bytePos])=*(uint16_t*)(&pr->peaks[peaksBytePos]); //Column for the current peak
+      //bytePos+=2;
+      //peaksBytePos+=2;
+
+      *(uint16_t*)(&compressedBuffer[bytePos])=pr->peaksSegs[pk]; //Panel for the current peak
+      bytePos+=2;
+      *(uint16_t*)(&compressedBuffer[bytePos])=pr->peaksRows[pk]; //Row for the current peak
+      bytePos+=2;
+      *(uint16_t*)(&compressedBuffer[bytePos])=pr->peaksCols[pk]; //Column for the current peak
+      bytePos+=2;
+    }
+  }
+  // cout<<"peaks"<<endl;
+  // cout<<"bytePos="<<bytePos<<endl;
+  //printf("peaks\n");
+  //printf("bytePos=%d\n",bytePos);
+
+  *(uint64_t*)(&compressedBuffer[bytePos])=roiSavedCount;
+  bytePos+=8;
+  // cout<<"roiSavedCount="<<roiSavedCount<<endl;
+  // cout<<"bytePos="<<bytePos<<endl;
+  // cout<<"roiData"<<endl;
+  //printf("roiSavedCount=%d\n",roiSavedCount);
+  //printf("bytePos=%d\n",bytePos);
+  //printf("roiData\n");
+  for(roii=0;roii<roiSavedCount;roii++){
+    *(float*)(&compressedBuffer[bytePos])=roiData[roii];
+    // cout<<roiData[roii]<<",";
+    bytePos+=4;
+  }
+  // cout<<"bytePos="<<bytePos<<endl;
+  //printf("bytePos=%d\n",bytePos);
+  *(uint64_t*)(&compressedBuffer[bytePos])=szCompressedSize;
+  bytePos+=8;
+  // cout<<"szCompressedSize="<<szCompressedSize<<endl;
+  // cout<<"bytePos="<<bytePos<<endl;
+  //printf("szCompressedSize=%d\n",szCompressedSize);
+  //printf("bytePos=%d\n",bytePos);
+  for(bi=0;bi<szCompressedSize;bi++){  //bi for "byte index"
+    *(unsigned char*)(&compressedBuffer[bytePos])=szComp[bi];
+    bytePos+=1;
+  }
+  // cout<<"szComp"<<endl;
+  // cout<<"bytePos="<<bytePos<<endl;
+  //printf("szComp\n");
+  //printf("bytePos=%d\n",bytePos);
+  
+  if(bytePos!=(*compressedSize)){
+    printf("ERROR: bytePos = %ld != %ld = compressedSize\n",(long)bytePos,(long)compressedSize);
+    assert(0);
+  }
+  
+  free(szComp);
+  free(roiM);
+  free(roiData);
+  free(binnedData);
+  // delete [] roiM;
+  // delete [] roiData;
+  // delete [] binnedData;
+  
+  return compressedBuffer;
+}
+
+/*
+ * Rebuild a batch of float images from a buffer with this layout:
+ *   nPeaksTotal      : 8 bytes (uint64_t)
+ *   peaks            : for each event, nPeaks (uint64_t) followed by
+ *                      nPeaks x (panel,row,col) as 3 x uint16_t
+ *   roiSavedCount    : 8 bytes (uint64_t)
+ *   ROI_data         : roiSavedCount x float
+ *   szCompressedSize : 8 bytes (uint64_t)
+ *   szComp           : szCompressedSize bytes handed to SZ_decompress
+ *
+ * _pr               : exafelSZ_params (calibPanel, binSize, peakRadius, szDim, ...)
+ * _compressedBuffer : the buffer described above
+ * r4                : number of events (0 is treated as 1)
+ * r3, r2, r1        : cols, rows, panels
+ * compressedSize    : not consulted here; the sizes embedded in the buffer
+ *                     are trusted as-is
+ *
+ * Returns a malloc'ed array of nEvents*panels*rows*cols floats, or NULL when
+ * memory cannot be allocated or no SZ output is available (invalid szDim or
+ * SZ_decompress returning NULL).
+ */
+void* exafelSZ_Decompress(void *_pr,
+                         unsigned char*_compressedBuffer,
+                         size_t r4, size_t r3, size_t r2, size_t r1,
+                         size_t compressedSize)
+{
+  size_t nEvents,panels,rows,cols;
+  if(r4==0)
+    nEvents=1;
+  else
+    nEvents=r4;
+  panels=r1;
+  rows=r2;
+  cols=r3;
+  (void)compressedSize;
+
+  uint8_t *compressedBuffer=(uint8_t *)_compressedBuffer;
+  exafelSZ_params *pr=(exafelSZ_params *)_pr;
+  exafelSZ_params_process(pr, panels, rows, cols);
+  exafelSZ_params_checkDecomp(pr, panels, rows, cols);
+
+  size_t nPixels=nEvents*panels*rows*cols;
+  float *decompressedBuffer=(float*)malloc(nPixels*sizeof(float));
+  uint8_t *roiM=(uint8_t*)malloc(nPixels);
+  //malloc(0) may legally return NULL, so only a non-empty request counts as a failure
+  if(nPixels!=0 && (decompressedBuffer==NULL || roiM==NULL)){
+    printf("ERROR: exafelSZ_Decompress: memory allocation failed\n");
+    free(decompressedBuffer);
+    free(roiM);
+    return NULL;
+  }
+  size_t e,p,r,c,pk,ri,ci,br,bc;
+
+  //Walk the header fields in the order they were written:
+  uint64_t bytePos=0;
+  uint64_t nPeaksTotal=*(uint64_t*)(&compressedBuffer[bytePos]);
+  bytePos += 8;
+
+  uint8_t *peaks=(uint8_t*)(&compressedBuffer[bytePos]);
+  bytePos += (8 * nEvents + nPeaksTotal * 3 * 2);
+
+  uint64_t roiSavedCount=*(uint64_t*)(&compressedBuffer[bytePos]);
+  bytePos+=8;
+
+  float *roiData=(float*)(&compressedBuffer[bytePos]);
+  bytePos+=(roiSavedCount*4);
+
+  uint64_t szCompressedSize=*(uint64_t*)(&compressedBuffer[bytePos]);
+  bytePos+=8;
+
+  unsigned char *szComp=(unsigned char*)(&compressedBuffer[bytePos]);
+
+  //Generate the ROI mask: NOTE: 0 means affirmative in ROI mask! This comes from the python scripts!
+  //First, initialize with calibration panel:
+  for(e=0;e<nEvents;e++){ //Event
+    for(p=0;p<panels;p++){ //Panel
+      for(r=0;r<rows;r++){ //Row
+        for(c=0;c<cols;c++){ //Column
+          if(calcIdx_2D(r,c,cols)<0 ||calcIdx_2D(r,c,cols)>=rows*cols){
+            printf("ERROR: calcIdx_2D(r,c,cols) = calcIdx_2D(%d,%d,%d) = %d",(int)r,(int)c,(int)cols,(int)calcIdx_2D(r,c,cols));
+            //The argument is cast as a whole so that it really is the long that %ld expects
+            printf("       is NOT in the correct range: [0,%ld]",(long)(rows*cols-1));
+            assert(0);
+          }
+          if(calcIdx_4D(e,p,r,c,panels,rows,cols)<0 ||calcIdx_4D(e,p,r,c,panels,rows,cols)>=nEvents*panels*rows*cols){
+            printf("ERROR: calcIdx_4D(e,p,r,c,panels,rows,cols) = calcIdx_4D(%d,%d,%d,%d,%d,%d,%d) = %d",(int)e,(int)p,(int)r,(int)c,(int)panels,(int)rows,(int)cols,(int)calcIdx_4D(e,p,r,c,panels,rows,cols));
+            assert(0);
+          }
+          //calibPanel is as big as the event(image) itself
+          roiM[calcIdx_4D(e,p,r,c,panels,rows,cols)]=pr->calibPanel[calcIdx_3D(p,r,c,rows,cols)];
+        }
+      }
+    }
+  }
+
+  //Now process the peaks and clear the mask around each of them:
+  uint64_t peaksBytePos=0; //Position in the peaks buffer
+  for(e=0;e<nEvents;e++){ //Event
+    uint64_t nPeaks=*(uint64_t*)(&peaks[peaksBytePos]);
+    peaksBytePos+=8;
+
+    for(pk=0;pk<nPeaks;pk++){
+      uint16_t p_=*(uint16_t*)(&peaks[peaksBytePos]); //Panel for the current peak
+      peaksBytePos+=2;
+      uint16_t r_=*(uint16_t*)(&peaks[peaksBytePos]); //Row for the current peak
+      peaksBytePos+=2;
+      uint16_t c_=*(uint16_t*)(&peaks[peaksBytePos]); //Col for the current peak
+      peaksBytePos+=2;
+
+      if(p_>=panels){
+        printf("ERROR: Peak coordinate out of bounds: Panel=%d, Valid range: 0,%d\n",(int)p_,(int)panels-1);
+        assert(0);
+        printf("Skipping this peak...\n");
+        continue;
+      }
+      if(r_>=rows){
+        printf("ERROR: Peak coordinate out of bounds: Row=%d, Valid range: 0,%d\n",(int)r_,(int)rows-1);
+        assert(0);
+        printf("Skipping this peak...\n");
+        continue;
+      }
+      if(c_>=cols){
+        printf("ERROR: Peak coordinate out of bounds: Col=%d, Valid range: 0,%d\n",(int)c_,(int)cols-1);
+        assert(0);
+        printf("Skipping this peak...\n");
+        continue;
+      }
+
+      //NOTE(review): ri/ci are size_t, so the start value wraps when the peak is
+      //closer to the edge than peakRadius. The mask must be bit-identical to the
+      //one built while compressing, so the loop bounds are intentionally left
+      //exactly as they were - confirm against exafelSZ_Compress before changing.
+      for(ri=r_-pr->peakRadius;ri<=r_+pr->peakRadius;ri++){  //ri: row index
+        for(ci=c_-pr->peakRadius;ci<=c_+pr->peakRadius;ci++){  //ci: column index
+          if(ri>=0 && ri<rows && ci>=0 && ci<cols){  //Check whether inside bounds or not
+            roiM[calcIdx_4D(e,p_,ri,ci,panels,rows,cols)]=0;
+          }
+        }
+      }
+    }
+  }
+
+  //De-compress using SZ. szDecomp starts as NULL so that an invalid szDim is
+  //detectable even when assert() is compiled out.
+  float* szDecomp=NULL;
+  size_t _szCompressedSize=szCompressedSize;
+  switch(pr->szDim){
+    case 1:
+      szDecomp=SZ_decompress(SZ_FLOAT,szComp,_szCompressedSize,0,0,0,0, nEvents * panels * pr->binnedRows * pr->binnedCols);
+      break;
+    case 2:
+      szDecomp=SZ_decompress(SZ_FLOAT,szComp,_szCompressedSize,0,0,0, nEvents * panels * pr->binnedRows, pr->binnedCols);
+      break;
+    case 3:
+      szDecomp=SZ_decompress(SZ_FLOAT,szComp,_szCompressedSize,0,0,nEvents * panels, pr->binnedRows, pr->binnedCols);
+      break;
+    default:
+      printf("ERROR: Wrong szDim : %d It must be 1,2 or 3.\n",(int)pr->szDim);
+      assert(0);
+  }
+  if(szDecomp==NULL){
+    //Nothing to de-bin from: release what was allocated and report failure
+    free(roiM);
+    free(decompressedBuffer);
+    return NULL;
+  }
+
+  //De-binning: every binned value is replicated over its binSize x binSize cell
+  for(e=0;e<nEvents;e++)//Event
+    for(p=0;p<panels;p++)  //Panel
+      for(r=0;r<pr->binnedRows;r++) //Row of the binnedData
+        for(c=0;c<pr->binnedCols;c++) //Column of the binnedData
+            for(br=0;br<pr->binSize;br++) //Bin Row (from origData)
+              for(bc=0;bc<pr->binSize;bc++) //Bin Column (from origData)
+                if(r*pr->binSize+br<rows && c*pr->binSize+bc<cols){
+                  decompressedBuffer[calcIdx_4D(e,p,r*pr->binSize+br,c*pr->binSize+bc,panels,rows,cols)] = szDecomp[calcIdx_4D(e,p,r,c,panels,pr->binnedRows,pr->binnedCols)];
+                }
+  //Restore ROI: pixels whose mask is 0 take the next stored ROI value, in scan order
+  uint64_t current=0;
+  for(e=0;e<nEvents;e++)//Event
+    for(p=0;p<panels;p++)  //Panel
+      for(r=0;r<rows;r++) //Row
+        for(c=0;c<cols;c++) //Column
+          if(!roiM[calcIdx_4D(e,p,r,c,panels,rows,cols)]){
+            decompressedBuffer[calcIdx_4D(e,p,r,c,panels,rows,cols)]=roiData[current];
+            current++;
+          }
+  free(roiM);
+  free(szDecomp);
+
+  return ((void*)decompressedBuffer);
+}
+
+#ifdef __cplusplus
+}
+#endif
--- a/deps/SZ/sz/src/iniparser.c
+++ b/deps/SZ/sz/src/iniparser.c
@ -0,0 +1,774 @@
+
+/*-------------------------------------------------------------------------*/
+/**
+   @file    iniparser.c
+   @author  N. Devillard
+   @brief   Parser for ini files.
+*/
+/*--------------------------------------------------------------------------*/
+/*---------------------------- Includes ------------------------------------*/
+#include <ctype.h>
+#include "iniparser.h"
+
+/*---------------------------- Defines -------------------------------------*/
+#define ASCIILINESZ         (1024)
+#define INI_INVALID_KEY     ((char*)-1)
+
+/*---------------------------------------------------------------------------
+                        Private to this module
+ ---------------------------------------------------------------------------*/
+/**
+ * This enum stores the status for each parsed line (internal use only).
+ */
+/* Enumerators take the default values 0..5 in declaration order. */
+/* NOTE(review): the per-value meanings below are read off the names only; */
+/* the code that assigns them is outside this section - confirm there.     */
+typedef enum _line_status_ {
+    LINE_UNPROCESSED,   /* presumably: line not examined yet */
+    LINE_ERROR,         /* presumably: line could not be parsed */
+    LINE_EMPTY,         /* presumably: blank line */
+    LINE_COMMENT,       /* presumably: comment line */
+    LINE_SECTION,       /* presumably: section header line */
+    LINE_VALUE          /* presumably: key/value line */
+} line_status ;
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Convert a string to lowercase.
+  @param    s   String to convert.
+  @return   ptr to statically allocated string.
+
+  This function returns a pointer to a statically allocated string
+  containing a lowercased version of the input string. Do not free
+  or modify the returned string! Since the returned string is statically
+  allocated, it will be modified at each function call (not re-entrant).
+ */
+/*--------------------------------------------------------------------------*/
+static char * strlwc(const char * s)
+{
+    /* Shared result buffer: overwritten by every call, hence not re-entrant. */
+    static char l[ASCIILINESZ+1];
+    int i ;
+
+    if (s==NULL) return NULL ;
+    memset(l, 0, ASCIILINESZ+1);
+    /* At most ASCIILINESZ characters are converted; longer input is cut. */
+    for (i=0 ; s[i] && i<ASCIILINESZ ; i++) {
+        /* tolower() is only defined for unsigned-char values (or EOF), so go
+           through unsigned char: a plain char >= 0x80 may be negative. */
+        l[i] = (char)tolower((unsigned char)s[i]);
+    }
+    l[ASCIILINESZ]=(char)0;
+    return l ;
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Remove blanks at the beginning and the end of a string.
+  @param    s   String to parse.
+  @return   ptr to statically allocated string.
+
+  This function returns a pointer to a statically allocated string,
+  which is identical to the input string, except that all blank
+  characters at the end and the beg. of the string have been removed.
+  Do not free or modify the returned string! Since the returned string
+  is statically allocated, it will be modified at each function call
+  (not re-entrant).
+ */
+/*--------------------------------------------------------------------------*/
+static char * strstrip(const char * s)
+{
+    /* Shared result buffer: overwritten by every call, hence not re-entrant. */
+    static char l[ASCIILINESZ+1];
+    char * last;
+
+    if (s==NULL) return NULL ;
+
+    /* Skip leading blanks. isspace() is only defined for unsigned-char values
+       (or EOF), so convert first: a plain char >= 0x80 may be negative. */
+    while (*s && isspace((unsigned char)*s)) s++;
+    memset(l, 0, ASCIILINESZ+1);
+    /* The buffer was zeroed and holds ASCIILINESZ+1 bytes, so the copy is
+       always NUL-terminated even when the input is truncated. */
+    strncpy(l, s, ASCIILINESZ);
+    /* Walk back from the end over trailing blanks. */
+    last = l + strlen(l);
+    while (last > l && isspace((unsigned char)*(last-1)))
+        last -- ;
+    *last = (char)0;
+    return (char*)l ;
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Get number of sections in a dictionary
+  @param    d   Dictionary to examine
+  @return   int Number of sections found in dictionary
+
+  This function returns the number of sections found in a dictionary.
+  The test to recognize sections is done on the string stored in the
+  dictionary: a section name is given as "section" whereas a key is
+  stored as "section:key", thus the test looks for entries that do not
+  contain a colon.
+
+  This clearly fails in the case a section name contains a colon, but
+  this should simply be avoided.
+
+  This function returns -1 in case of error.
+ */
+/*--------------------------------------------------------------------------*/
+int iniparser_getnsec(dictionary * d)
+{
+    int idx ;
+    int count = 0 ;
+
+    if (d==NULL)
+        return -1 ;
+    /* A used slot whose key holds no ':' separator is counted as a section. */
+    for (idx=0 ; idx<d->size ; idx++) {
+        if (d->key[idx]!=NULL && strchr(d->key[idx], ':')==NULL)
+            count++ ;
+    }
+    return count ;
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Get name for section n in a dictionary.
+  @param    d   Dictionary to examine
+  @param    n   Section number (from 0 to nsec-1).
+  @return   Pointer to char string
+
+  This function locates the n-th section in a dictionary and returns
+  its name as a pointer to a string statically allocated inside the
+  dictionary. Do not free or modify the returned string!
+
+  This function returns NULL in case of error.
+ */
+/*--------------------------------------------------------------------------*/
+char * iniparser_getsecname(dictionary * d, int n)
+{
+    int idx ;
+    int remaining ;
+
+    if (d==NULL || n<0)
+        return NULL ;
+    /* Count down over the section entries (keys without ':') until the
+       requested one is reached. */
+    remaining = n ;
+    for (idx=0 ; idx<d->size ; idx++) {
+        char * k = d->key[idx] ;
+        if (k==NULL || strchr(k, ':')!=NULL)
+            continue ;
+        if (remaining==0)
+            return k ;
+        remaining-- ;
+    }
+    /* Fewer than n+1 sections in the dictionary. */
+    return NULL ;
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Dump a dictionary to an opened file pointer.
+  @param    d   Dictionary to dump.
+  @param    f   Opened file pointer to dump to.
+  @return   void
+
+  This function prints out the contents of a dictionary, one element by
+  line, onto the provided file pointer. It is OK to specify @c stderr
+  or @c stdout as output files. This function is meant for debugging
+  purposes mostly.
+ */
+/*--------------------------------------------------------------------------*/
+void iniparser_dump(dictionary * d, FILE * f)
+{
+    int idx ;
+
+    if (d==NULL || f==NULL)
+        return ;
+    /* One line per used slot; entries without a value are shown as UNDEF. */
+    for (idx=0 ; idx<d->size ; idx++) {
+        const char * k = d->key[idx] ;
+        if (k==NULL)
+            continue ;
+        if (d->val[idx]==NULL)
+            fprintf(f, "[%s]=UNDEF\n", k);
+        else
+            fprintf(f, "[%s]=[%s]\n", k, d->val[idx]);
+    }
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Save a dictionary to a loadable ini file
+  @param    d   Dictionary to dump
+  @param    f   Opened file pointer to dump to
+  @return   void
+
+  This function dumps a given dictionary into a loadable ini file.
+  It is Ok to specify @c stderr or @c stdout as output files.
+ */
+/*--------------------------------------------------------------------------*/
+void iniparser_dump_ini(dictionary * d, FILE * f)
+{
+    int     i ;
+    int     nsec ;
+    char *  secname ;
+
+    if (d==NULL || f==NULL) return ;
+
+    nsec = iniparser_getnsec(d);
+    if (nsec<1) {
+        /* No section in file: dump all keys as they are */
+        for (i=0 ; i<d->size ; i++) {
+            if (d->key[i]==NULL)
+                continue ;
+            /* Passing NULL to %s is undefined, so an entry without a value is
+               written with an empty right-hand side, the same way
+               iniparser_dumpsection_ini() does it. */
+            fprintf(f, "%s = %s\n", d->key[i], d->val[i] ? d->val[i] : "");
+        }
+        return ;
+    }
+    /* Otherwise emit the sections one by one, in dictionary order. */
+    for (i=0 ; i<nsec ; i++) {
+        secname = iniparser_getsecname(d, i) ;
+        iniparser_dumpsection_ini(d, secname, f) ;
+    }
+    fprintf(f, "\n");
+    return ;
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Save a dictionary section to a loadable ini file
+  @param    d   Dictionary to dump
+  @param    s   Section name of dictionary to dump
+  @param    f   Opened file pointer to dump to
+  @return   void
+
+  This function dumps a given section of a given dictionary into a loadable ini
+  file.  It is Ok to specify @c stderr or @c stdout as output files.
+ */
+/*--------------------------------------------------------------------------*/
+void iniparser_dumpsection_ini(dictionary * d, char * s, FILE * f)
+{
+    int     j ;
+    size_t  seclen ;
+
+    if (d==NULL || f==NULL) return ;
+    /* Also rejects s==NULL: a NULL entry is never found. */
+    if (! iniparser_find_entry(d, s)) return ;
+
+    seclen  = strlen(s);
+    fprintf(f, "\n[%s]\n", s);
+    for (j=0 ; j<d->size ; j++) {
+        if (d->key[j]==NULL)
+            continue ;
+        /* A key belongs to the section when it starts with "<s>:". The prefix
+           is tested directly rather than by formatting "<s>:" into a fixed
+           buffer, which could overflow for a very long section name. */
+        if (!strncmp(d->key[j], s, seclen) && d->key[j][seclen]==':') {
+            fprintf(f,
+                    "%-30s = %s\n",
+                    d->key[j]+seclen+1,
+                    d->val[j] ? d->val[j] : "");
+        }
+    }
+    fprintf(f, "\n");
+    return ;
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Get the number of keys in a section of a dictionary.
+  @param    d   Dictionary to examine
+  @param    s   Section name of dictionary to examine
+  @return   Number of keys in section
+ */
+/*--------------------------------------------------------------------------*/
+int iniparser_getsecnkeys(dictionary * d, char * s)
+{
+    size_t  seclen ;
+    int     nkeys ;
+    int     j ;
+
+    nkeys = 0;
+
+    if (d==NULL) return nkeys;
+    /* Also rejects s==NULL: a NULL entry is never found. */
+    if (! iniparser_find_entry(d, s)) return nkeys;
+
+    seclen  = strlen(s);
+
+    for (j=0 ; j<d->size ; j++) {
+        if (d->key[j]==NULL)
+            continue ;
+        /* Count keys of the form "<s>:...". The prefix is tested directly
+           rather than by formatting "<s>:" into a fixed buffer, which could
+           overflow for a very long section name. */
+        if (!strncmp(d->key[j], s, seclen) && d->key[j][seclen]==':')
+            nkeys++;
+    }
+
+    return nkeys;
+
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Get the keys in a section of a dictionary.
+  @param    d   Dictionary to examine
+  @param    s   Section name of dictionary to examine
+  @return   pointer to a malloc'ed array of pointers to the key strings
+
+  This function queries a dictionary and finds all keys in a given section.
+  Each pointer in the returned char pointer-to-pointer is pointing to
+  a string allocated in the dictionary; do not free or modify them.
+
+  This function returns NULL in case of error.
+ */
+/*--------------------------------------------------------------------------*/
+char ** iniparser_getseckeys(dictionary * d, char * s)
+{
+
+    char **keys;
+
+    int i, j ;
+    size_t  seclen ;
+    int     nkeys ;
+
+    keys = NULL;
+
+    if (d==NULL) return keys;
+    /* Also rejects s==NULL: a NULL entry is never found. */
+    if (! iniparser_find_entry(d, s)) return keys;
+
+    nkeys = iniparser_getsecnkeys(d, s);
+
+    /* The array itself is allocated here; its elements point at strings that
+       live in the dictionary. */
+    keys = (char**) malloc(nkeys*sizeof(char*));
+    if (keys==NULL) return NULL;
+
+    seclen  = strlen(s);
+
+    i = 0;
+
+    for (j=0 ; j<d->size && i<nkeys ; j++) {
+        if (d->key[j]==NULL)
+            continue ;
+        /* Same "<s>:" prefix test as iniparser_getsecnkeys(), done without a
+           fixed-size scratch buffer that a long section name could overflow. */
+        if (!strncmp(d->key[j], s, seclen) && d->key[j][seclen]==':') {
+            keys[i] = d->key[j];
+            i++;
+        }
+    }
+
+    return keys;
+
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Get the string associated to a key
+  @param    d       Dictionary to search
+  @param    key     Key string to look for
+  @param    def     Default value to return if key not found.
+  @return   pointer to statically allocated character string
+
+  This function queries a dictionary for a key. A key as read from an
+  ini file is given as "section:key". If the key cannot be found,
+  the pointer passed as 'def' is returned.
+  The returned char pointer is pointing to a string allocated in
+  the dictionary, do not free or modify it.
+ */
+/*--------------------------------------------------------------------------*/
+char * iniparser_getstring(dictionary * d, const char * key, char * def)
+{
+    if (d!=NULL && key!=NULL) {
+        /* The lookup is done with the lower-cased form of the key. */
+        return dictionary_get(d, strlwc(key), def);
+    }
+    /* Nothing to search with: hand back the caller's default. */
+    return def ;
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Get the string associated to a key, convert to an int
+  @param    d Dictionary to search
+  @param    key Key string to look for
+  @param    notfound Value to return in case of error
+  @return   integer
+
+  This function queries a dictionary for a key. A key as read from an
+  ini file is given as "section:key". If the key cannot be found,
+  the notfound value is returned.
+
+  Supported values for integers include the usual C notation
+  so decimal, octal (starting with 0) and hexadecimal (starting with 0x)
+  are supported. Examples:
+
+  "42"      ->  42
+  "042"     ->  34 (octal -> decimal)
+  "0x42"    ->  66 (hexa  -> decimal)
+
+  Warning: the conversion may overflow in various ways. Conversion is
+  totally outsourced to strtol(), see the associated man page for overflow
+  handling.
+
+  Credits: Thanks to A. Becker for suggesting strtol()
+ */
+/*--------------------------------------------------------------------------*/
+int iniparser_getint(dictionary * d, const char * key, int notfound)
+{
+    char    *   str ;
+
+    str = iniparser_getstring(d, key, INI_INVALID_KEY);
+    /* An entry may exist with a NULL value; strtol() must not be handed NULL,
+       so that case is reported as "not found" too. */
+    if (str==NULL || str==INI_INVALID_KEY) return notfound ;
+    /* Base 0: strtol() accepts decimal, octal (0...) and hex (0x...). */
+    return (int)strtol(str, NULL, 0);
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Get the string associated to a key, convert to a long
+  @param    d Dictionary to search
+  @param    key Key string to look for
+  @param    notfound Value to return in case of error
+  @return   long
+
+  Credits: This function is based entirely on iniparser_getint and was
+  slightly modified to return long instead of int.
+ */
+/*--------------------------------------------------------------------------*/
+long iniparser_getlint(dictionary * d, const char * key, int notfound)
+{
+    char    *   str ;
+
+    str = iniparser_getstring(d, key, INI_INVALID_KEY);
+    /* An entry may exist with a NULL value; strtol() must not be handed NULL,
+       so that case is reported as "not found" too. */
+    if (str==NULL || str==INI_INVALID_KEY) return notfound ;
+    /* Base 0: strtol() accepts decimal, octal (0...) and hex (0x...). */
+    return strtol(str, NULL, 0);
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Get the string associated to a key, convert to a double
+  @param    d Dictionary to search
+  @param    key Key string to look for
+  @param    notfound Value to return in case of error
+  @return   double
+
+  This function queries a dictionary for a key. A key as read from an
+  ini file is given as "section:key". If the key cannot be found,
+  the notfound value is returned.
+ */
+/*--------------------------------------------------------------------------*/
+double iniparser_getdouble(dictionary * d, const char * key, double notfound)
+{
+    char    *   str ;
+
+    str = iniparser_getstring(d, key, INI_INVALID_KEY);
+    /* An entry may exist with a NULL value; atof() must not be handed NULL,
+       so that case is reported as "not found" too. */
+    if (str==NULL || str==INI_INVALID_KEY) return notfound ;
+    return atof(str);
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Get the string associated to a key, convert to a boolean
+  @param    d Dictionary to search
+  @param    key Key string to look for
+  @param    notfound Value to return in case of error
+  @return   integer
+
+  This function queries a dictionary for a key. A key as read from an
+  ini file is given as "section:key". If the key cannot be found,
+  the notfound value is returned.
+
+  A true boolean is found if one of the following is matched:
+
+  - A string starting with 'y'
+  - A string starting with 'Y'
+  - A string starting with 't'
+  - A string starting with 'T'
+  - A string starting with '1'
+
+  A false boolean is found if one of the following is matched:
+
+  - A string starting with 'n'
+  - A string starting with 'N'
+  - A string starting with 'f'
+  - A string starting with 'F'
+  - A string starting with '0'
+
+  The notfound value returned if no boolean is identified, does not
+  necessarily have to be 0 or 1.
+ */
+/*--------------------------------------------------------------------------*/
+int iniparser_getboolean(dictionary * d, const char * key, int notfound)
+{
+    char    *   c ;
+    int         ret ;
+
+    c = iniparser_getstring(d, key, INI_INVALID_KEY);
+    /* An entry may exist with a NULL value; there is no first character to
+       inspect then, so that case is reported as "not found" too. */
+    if (c==NULL || c==INI_INVALID_KEY) return notfound ;
+    /* Only the first character decides: y/Y/t/T/1 => 1, n/N/f/F/0 => 0. */
+    if (c[0]=='y' || c[0]=='Y' || c[0]=='1' || c[0]=='t' || c[0]=='T') {
+        ret = 1 ;
+    } else if (c[0]=='n' || c[0]=='N' || c[0]=='0' || c[0]=='f' || c[0]=='F') {
+        ret = 0 ;
+    } else {
+        ret = notfound ;
+    }
+    return ret;
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Finds out if a given entry exists in a dictionary
+  @param    ini     Dictionary to search
+  @param    entry   Name of the entry to look for
+  @return   integer 1 if entry exists, 0 otherwise
+
+  Finds out if a given entry exists in the dictionary. Since sections
+  are stored as keys with NULL associated values, this is the only way
+  of querying for the presence of sections in a dictionary.
+ */
+/*--------------------------------------------------------------------------*/
+int iniparser_find_entry(
+    dictionary  *   ini,
+    const char  *   entry
+)
+{
+    /* The entry exists exactly when the lookup does not yield the sentinel. */
+    return iniparser_getstring(ini, entry, INI_INVALID_KEY) != INI_INVALID_KEY ;
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Set an entry in a dictionary.
+  @param    ini     Dictionary to modify.
+  @param    entry   Entry to modify (entry name)
+  @param    val     New value to associate to the entry.
+  @return   int 0 if Ok, -1 otherwise.
+
+  If the given entry can be found in the dictionary, it is modified to
+  contain the provided value. If it cannot be found, -1 is returned.
+  It is Ok to set val to NULL.
+ */
+/*--------------------------------------------------------------------------*/
+int iniparser_set(dictionary * ini, const char * entry, const char * val)
+{
+    /*
+     * The entry name is passed through strlwc() before being handed to
+     * dictionary_set(), whose return code is forwarded unchanged.
+     * NOTE(review): strlwc() presumably lower-cases into a buffer it owns;
+     * confirm its lifetime/thread-safety against its definition.
+     */
+    return dictionary_set(ini, strlwc(entry), val) ;
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Delete an entry in a dictionary
+  @param    ini     Dictionary to modify
+  @param    entry   Entry to delete (entry name)
+  @return   void
+
+  If the given entry can be found, it is deleted from the dictionary.
+ */
+/*--------------------------------------------------------------------------*/
+void iniparser_unset(dictionary * ini, const char * entry)
+{
+    /*
+     * The entry name is passed through strlwc() before the deletion is
+     * delegated to dictionary_unset(); nothing is reported to the caller.
+     */
+    dictionary_unset(ini, strlwc(entry));
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Load a single line from an INI file
+  @param    input_line  Input line, may be concatenated multi-line input
+  @param    section     Output space to store section
+  @param    key         Output space to store key
+  @param    value       Output space to store value
+  @return   line_status value
+ */
+/*--------------------------------------------------------------------------*/
+static line_status iniparser_line(
+    const char * input_line,
+    char * section,
+    char * key,
+    char * value)
+{
+    line_status sta ;
+    char        line[ASCIILINESZ+1];
+    int         len ;
+
+    /*
+     * Work on a local, stripped copy of the input. The buffer is zeroed
+     * first and at most ASCIILINESZ bytes are copied, so the copy stays
+     * NUL-terminated even when the stripped input is truncated.
+     * Note that strstrip() is evaluated twice on the same input.
+     */
+    memset(line, 0, ASCIILINESZ + 1);
+    len = (int)strlen(strstrip(input_line));
+    if (len > ASCIILINESZ)
+        len = ASCIILINESZ;
+    strncpy(line, strstrip(input_line), len);
+    len = (int)strlen(line);
+
+    /*
+     * Classify the line. The order of the branches matters: the sscanf()
+     * patterns below overlap, and the first one that matches wins.
+     */
+    sta = LINE_UNPROCESSED ;
+    if (len<1) {
+        /* Empty line */
+        sta = LINE_EMPTY ;
+    } else if (line[0]=='#' || line[0]==';') {
+        /* Comment line */
+        sta = LINE_COMMENT ;
+    } else if (line[0]=='[' && line[len-1]==']') {
+        /* Section name: text between '[' and the first ']' */
+        sscanf(line, "[%[^]]", section);
+        /*
+         * NOTE(review): copying the helper results back into the same
+         * buffer assumes strstrip()/strlwc() return separate storage,
+         * not a pointer into their argument; confirm.
+         */
+        strcpy(section, strstrip(section));
+        strcpy(section, strlwc(section));
+        sta = LINE_SECTION ;
+    } else if (sscanf (line, "%[^=] = \"%[^\"]\"", key, value) == 2
+           ||  sscanf (line, "%[^=] = '%[^\']'",   key, value) == 2
+           ||  sscanf (line, "%[^=] = %[^;#]",     key, value) == 2) {
+        /*
+         * Usual key=value, with or without comments. Tried in order:
+         * double-quoted value, single-quoted value, then a bare value
+         * that ends at the first ';' or '#'.
+         */
+        strcpy(key, strstrip(key));
+        strcpy(key, strlwc(key));
+        strcpy(value, strstrip(value));
+        /*
+         * sscanf cannot handle '' or "" as empty values
+         * this is done here
+         */
+        if (!strcmp(value, "\"\"") || (!strcmp(value, "''"))) {
+            value[0]=0 ;
+        }
+        sta = LINE_VALUE ;
+    } else if (sscanf(line, "%[^=] = %[;#]", key, value)==2
+           ||  sscanf(line, "%[^=] %[=]", key, value) == 2) {
+        /*
+         * Special cases:
+         * key=
+         * key=;
+         * key=#
+         * All of them yield an empty value for the key.
+         */
+        strcpy(key, strstrip(key));
+        strcpy(key, strlwc(key));
+        value[0]=0 ;
+        sta = LINE_VALUE ;
+    } else {
+        /* Generate syntax error */
+        sta = LINE_ERROR ;
+        /* Diagnostic on stdout showing both the raw input and the stripped copy */
+        printf("===== > %s   ===> %s\n", input_line, line);
+    }
+    return sta ;
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Parse an ini file and return an allocated dictionary object
+  @param    ininame Name of the ini file to read.
+  @return   Pointer to newly allocated dictionary
+
+  This is the parser for ini files. This function is called, providing
+  the name of the file to be read. It returns a dictionary object that
+  should not be accessed directly, but through accessor functions
+  instead.
+
+  The returned dictionary must be freed using iniparser_freedict().
+ */
+/*--------------------------------------------------------------------------*/
+dictionary * iniparser_load(const char * ininame)
+{
+    FILE * in ;
+
+    char line    [ASCIILINESZ+1] ;
+    char section [ASCIILINESZ+1] ;
+    char key     [ASCIILINESZ+1] ;
+    /* "section:key" may need two full-length fields plus ':' and the NUL */
+    char tmp     [(ASCIILINESZ * 2) + 2] ;
+    char val     [ASCIILINESZ+1] ;
+
+    int  last=0 ;       /* offset in line[] where the next fgets() appends */
+    int  len ;          /* index of the last character currently in line[] */
+    int  lineno=0 ;     /* physical line counter, used in diagnostics      */
+    int  errs=0;        /* <0: allocation failure, >0: syntax errors        */
+
+    dictionary * dict ;
+
+    if ((in=fopen(ininame, "r"))==NULL) {
+        fprintf(stderr, "iniparser: cannot open %s\n", ininame);
+        return NULL ;
+    }
+
+    dict = dictionary_new(0) ;
+    if (!dict) {
+        fclose(in);
+        return NULL ;
+    }
+
+    /* Clear the whole buffers, including their final byte */
+    memset(line,    0, sizeof(line));
+    memset(section, 0, sizeof(section));
+    memset(key,     0, sizeof(key));
+    memset(tmp,     0, sizeof(tmp));
+    memset(val,     0, sizeof(val));
+    last=0 ;
+
+    while (fgets(line+last, ASCIILINESZ-last, in)!=NULL) {
+        lineno++ ;
+        len = (int)strlen(line)-1;
+        /* len<0 happens when fgets() stored an empty string (e.g. a
+         * leading NUL byte in the file); never index line[-1]. */
+        if (len<=0)
+            continue;
+        /* Safety check against buffer overflows. A final line that simply
+         * lacks its trailing newline is not an overflow, so only complain
+         * when more input is still pending. */
+        if (line[len]!='\n' && !feof(in)) {
+            fprintf(stderr,
+                    "iniparser: input line too long in %s (%d)\n",
+                    ininame,
+                    lineno);
+            dictionary_del(dict);
+            fclose(in);
+            return NULL ;
+        }
+        /* Get rid of \n and spaces at end of line.
+         * isspace() requires a value representable as unsigned char. */
+        while ((len>=0) &&
+                ((line[len]=='\n') || (isspace((unsigned char)line[len])))) {
+            line[len]=0 ;
+            len-- ;
+        }
+        /* Line was entirely whitespace: keep the index inside the buffer */
+        if (len<0) {
+            len = 0 ;
+        }
+        /* Detect multi-line */
+        if (line[len]=='\\') {
+            /* Multi-line value: the next read overwrites the backslash */
+            last=len ;
+            continue ;
+        } else {
+            last=0 ;
+        }
+        switch (iniparser_line(line, section, key, val)) {
+            case LINE_EMPTY:
+            case LINE_COMMENT:
+            break ;
+
+            case LINE_SECTION:
+            errs = dictionary_set(dict, section, NULL);
+            break ;
+
+            case LINE_VALUE:
+            /* Bounded formatting: tmp is sized for the worst case, and
+             * snprintf() guarantees termination regardless. */
+            snprintf(tmp, sizeof(tmp), "%s:%s", section, key);
+            errs = dictionary_set(dict, tmp, val) ;
+            break ;
+
+            case LINE_ERROR:
+            fprintf(stderr, "iniparser: syntax error in %s (%d):\n",
+                    ininame,
+                    lineno);
+            fprintf(stderr, "-> %s\n", line);
+            /* NOTE(review): a later successful dictionary_set() assigns
+             * errs again, so only a syntax error that is not followed by
+             * a section/value line makes the load fail. Kept as-is to
+             * preserve existing behavior for callers. */
+            errs++ ;
+            break;
+
+            default:
+            break ;
+        }
+        memset(line, 0, sizeof(line));
+        last=0;
+        if (errs<0) {
+            fprintf(stderr, "iniparser: memory allocation failure\n");
+            break ;
+        }
+    }
+    if (errs) {
+        dictionary_del(dict);
+        dict = NULL ;
+    }
+    fclose(in);
+    return dict ;
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Free all memory associated to an ini dictionary
+  @param    d Dictionary to free
+  @return   void
+
+  Free all memory associated to an ini dictionary.
+  It is mandatory to call this function before the dictionary object
+  gets out of the current context.
+ */
+/*--------------------------------------------------------------------------*/
+void iniparser_freedict(dictionary * d)
+{
+    /* Thin wrapper: all of the release work is delegated to dictionary_del(). */
+    dictionary_del(d);
+}
+
+/* vim: set ts=4 et sw=4 tw=75 */
--- a/deps/SZ/sz/src/pastri.c
+++ b/deps/SZ/sz/src/pastri.c
@ -0,0 +1,87 @@
+#include "pastri.h"
+#include "pastriD.h"
+#include "pastriF.h"
+
+//Reads the seven whitespace-separated PaSTRI parameters (bf[0..3], originalEb,
+//dataSize, numBlocks) from the text file paramsFilename into *paramsPtr.
+//On an unreadable or malformed file an error is printed and assert(0) fires.
+void SZ_pastriReadParameters(char paramsFilename[512],pastri_params *paramsPtr){
+  FILE *paramsF;
+  int fieldsRead;
+
+  paramsF=fopen(paramsFilename,"r");
+  if(paramsF==NULL){
+    printf("ERROR: Parameters file cannot be opened.\n");
+    printf("Filename: %s\n",paramsFilename);
+    assert(0);
+    return; //Reached only when assert() is compiled out (NDEBUG): never touch the NULL stream.
+  }
+
+  fieldsRead=fscanf(paramsF,"%d %d %d %d %lf %d %d",&paramsPtr->bf[0],&paramsPtr->bf[1],&paramsPtr->bf[2],&paramsPtr->bf[3],&paramsPtr->originalEb,&paramsPtr->dataSize,&paramsPtr->numBlocks);
+  //printf("Params: %d %d %d %d %.3e %d\n",paramsPtr->bf[0],paramsPtr->bf[1],paramsPtr->bf[2],paramsPtr->bf[3],paramsPtr->originalEb,paramsPtr->numBlocks);
+  fclose(paramsF);
+
+  //A short read would leave some fields unset; report it instead of continuing silently.
+  if(fieldsRead!=7){
+    printf("ERROR: Parameters file is malformed (read %d of 7 fields).\n",fieldsRead);
+    printf("Filename: %s\n",paramsFilename);
+    assert(0);
+  }
+}
+
+void SZ_pastriPreprocessParameters(pastri_params *p){
+  int k;
+
+  //Derive the four index ranges from the bf values: (bf+1)*(bf+2)/2 each.
+  for(k=0;k<4;k++){
+    p->idxRange[k]=(p->bf[k]+1)*(p->bf[k]+2)/2;
+  }
+
+  //Sub-block count comes from ranges 0 and 1, sub-block size from ranges 2 and 3;
+  //a block holds sbNum sub-blocks of sbSize elements.
+  p->sbNum=p->idxRange[0]*p->idxRange[1];
+  p->sbSize=p->idxRange[2]*p->idxRange[3];
+  p->bSize=p->sbSize*p->sbNum;
+
+  //Slightly tightened error bound, used to absorb rounding errors. It has almost no effect on compression rate/ratios.
+  p->usedEb=p->originalEb*0.999;
+}
+
+//Compresses p->numBlocks consecutive blocks from originalBuf into a newly
+//allocated buffer returned through *compressedBufP. The output starts with a
+//raw copy of *p, followed by the per-block compressed streams; the total
+//number of bytes written is stored in *compressedBytes.
+//If the output buffer cannot be allocated, *compressedBufP is NULL and
+//*compressedBytes is 0.
+void SZ_pastriCompressBatch(pastri_params *p,unsigned char *originalBuf, unsigned char** compressedBufP,size_t *compressedBytes){
+  size_t blockBytes=(size_t)p->bSize*(size_t)p->dataSize; //uncompressed size of one block
+  size_t bytePos=0; //Current byte pos in the outBuf
+  int bytes; //bytes for this block
+  int i;
+
+  //Reserve room for the parameter header in addition to the raw data size;
+  //the header is copied into the same buffer below.
+  //NOTE(review): this assumes a compressed block never exceeds its raw size - confirm against pastri_*_Compress.
+  (*compressedBufP) = (unsigned char*)calloc(sizeof(pastri_params)+(size_t)p->numBlocks*blockBytes,sizeof(char));
+  if((*compressedBufP)==NULL){
+    *compressedBytes=0;
+    return;
+  }
+
+  memcpy(*compressedBufP, p, sizeof(pastri_params));
+  bytePos+=sizeof(pastri_params);
+
+  for(i=0;i<p->numBlocks;i++){
+    bytes=0; //stays 0 when dataSize is neither 8 nor 4, so bytePos is never advanced by garbage
+    if(p->dataSize==8){
+      pastri_double_Compress(originalBuf + ((size_t)i*blockBytes),p,(*compressedBufP) + bytePos,&bytes);
+    }else if(p->dataSize==4){
+      pastri_float_Compress(originalBuf + ((size_t)i*blockBytes),p,(*compressedBufP) + bytePos,&bytes);
+    }
+    bytePos+=bytes;
+    //printf("bytes:%d\n",bytes);
+  }
+  *compressedBytes=bytePos;
+  //printf("totalBytesWritten:%d\n",*compressedBytes);
+}
+
+//Decompresses a buffer produced by SZ_pastriCompressBatch. The leading
+//pastri_params header is copied into *p, a buffer of
+//numBlocks*bSize*dataSize bytes is allocated and returned through
+//*decompressedBufP, and that size is stored in *decompressedBytes.
+//If the output buffer cannot be allocated, *decompressedBufP is NULL and
+//*decompressedBytes is 0.
+void SZ_pastriDecompressBatch(unsigned char*compressedBuf, pastri_params *p, unsigned char** decompressedBufP ,size_t *decompressedBytes){
+  size_t bytePos=0; //Current byte pos in the compressed input
+  size_t blockBytes; //uncompressed size of one block
+  size_t totalBytes; //uncompressed size of all blocks
+  int bytes; //bytes consumed for this block
+  int i;
+
+  memcpy(p, compressedBuf, sizeof(pastri_params));
+  bytePos+=sizeof(pastri_params);
+
+  //Sizes are derived from the header just read; computed in size_t to avoid int overflow.
+  blockBytes=(size_t)p->bSize*(size_t)p->dataSize;
+  totalBytes=(size_t)p->numBlocks*blockBytes;
+
+  (*decompressedBufP) = (unsigned char*)malloc(totalBytes*sizeof(char));
+  if((*decompressedBufP)==NULL){
+    *decompressedBytes=0;
+    return;
+  }
+
+  for(i=0;i<p->numBlocks;i++){
+    bytes=0; //stays 0 when dataSize is neither 8 nor 4, so bytePos is never advanced by garbage
+    if(p->dataSize==8){
+      pastri_double_Decompress(compressedBuf + bytePos,p->dataSize,p,(*decompressedBufP) + ((size_t)i*blockBytes),&bytes);
+    }else if(p->dataSize==4){
+      pastri_float_Decompress(compressedBuf + bytePos,p->dataSize,p,(*decompressedBufP) + ((size_t)i*blockBytes),&bytes);
+    }
+
+    bytePos += bytes;
+    //printf("bytes:%d\n",bytes);
+  }
+  //printf("totalBytesRead:%d\n",bytePos);
+  *decompressedBytes=totalBytes;
+}
+
+//Runs the per-block check routine on every block of the batch, comparing the
+//original data with the decompressed data at the same block offset.
+void SZ_pastriCheckBatch(pastri_params *p,unsigned char*originalBuf,unsigned char*decompressedBuf){
+  int blk=0;
+  while(blk<p->numBlocks){
+    //Dispatch on element width; any other width is skipped.
+    switch(p->dataSize){
+      case 8:
+        pastri_double_Check(originalBuf+(blk*p->bSize*p->dataSize),p->dataSize,decompressedBuf+(blk*p->bSize*p->dataSize),p);
+        break;
+      case 4:
+        pastri_float_Check(originalBuf+(blk*p->bSize*p->dataSize),p->dataSize,decompressedBuf+(blk*p->bSize*p->dataSize),p);
+        break;
+      default:
+        break;
+    }
+    blk++;
+  }
+}
--- a/deps/SZ/sz/src/rw.c
+++ b/deps/SZ/sz/src/rw.c
--- a/deps/SZ/sz/src/rw_interface.F90
+++ b/deps/SZ/sz/src/rw_interface.F90
@ -0,0 +1,205 @@
+!  @file   rw_interface.F90
+!  @author Sheng Di (disheng222@gmail.com)
+!  @date   Aug., 2014
+!  @ Mathematics and Computer Science (MCS)
+!  @ Argonne National Laboratory, Lemont, USA.
+!  @brief  The key Fortran binding file to connect C language and Fortran (Fortran part)
+
+
+! Fortran-side wrappers around the C file I/O helpers (writeByteFile,
+! writeFloatFile, writeDoubleFile, checkFileSizeC, readByteFile,
+! readFloatFile, readDoubleFile). The generic names writeData and readData
+! select the specific routine from the type and rank of the arguments.
+MODULE RW
+	use :: ISO_C_BINDING
+
+	INTERFACE writeData
+		MODULE PROCEDURE WriteData_inBinary_d1_INTEGER_K1
+		MODULE PROCEDURE WriteData_inBinary_d1_REAL_K4
+		MODULE PROCEDURE WriteData_inBinary_d2_REAL_K4
+		MODULE PROCEDURE WriteData_inBinary_d3_REAL_K4
+		MODULE PROCEDURE WriteData_inBinary_d4_REAL_K4
+		MODULE PROCEDURE WriteData_inBinary_d5_REAL_K4
+		MODULE PROCEDURE WriteData_inBinary_d1_REAL_K8
+		MODULE PROCEDURE WriteData_inBinary_d2_REAL_K8
+		MODULE PROCEDURE WriteData_inBinary_d3_REAL_K8
+		MODULE PROCEDURE WriteData_inBinary_d4_REAL_K8
+		MODULE PROCEDURE WriteData_inBinary_d5_REAL_K8
+	END INTERFACE writeData
+
+	INTERFACE readData
+		MODULE PROCEDURE readByteData
+		MODULE PROCEDURE readFloatData
+		MODULE PROCEDURE readDoubleData
+	END INTERFACE readData
+
+	CONTAINS
+
+	!Bytes here could be an "allocatable" array, so it requires an extra "byteLength" to indicate the length (can't use size(Bytes))
+	SUBROUTINE WriteData_inBinary_d1_INTEGER_K1(Bytes, byteLength, FILE_PATH)
+		implicit none
+		INTEGER(KIND=1), DIMENSION(:) :: Bytes
+		CHARACTER(LEN=*) :: FILE_PATH
+		INTEGER(KIND=C_SIZE_T) :: byteLength
+
+		CALL writeByteFile(Bytes, byteLength, FILE_PATH, len(trim(FILE_PATH)))
+	END SUBROUTINE WriteData_inBinary_d1_INTEGER_K1
+
+!write data in binary for K4 data
+!The C routines receive the element count through a size_t pointer, so the
+!default-kind nbEle is converted to C_SIZE_T before each call.
+
+	SUBROUTINE WriteData_inBinary_d1_REAL_K4(VAR, nbEle, FILE_PATH)
+		implicit none
+		REAL(KIND=4), DIMENSION(:) :: VAR
+		CHARACTER(LEN=*) :: FILE_PATH
+		INTEGER :: nbEle
+		INTEGER(KIND=C_SIZE_T) :: nbEleC
+
+		nbEleC = INT(nbEle, KIND=C_SIZE_T)
+		CALL writeFloatFile(VAR, nbEleC, FILE_PATH, len(trim(FILE_PATH)))
+	END SUBROUTINE WriteData_inBinary_d1_REAL_K4
+
+	SUBROUTINE WriteData_inBinary_d2_REAL_K4(VAR, nbEle, FILE_PATH)
+		implicit none
+		REAL(KIND=4), DIMENSION(:,:) :: VAR
+		CHARACTER(LEN=*) :: FILE_PATH
+		INTEGER :: nbEle
+		INTEGER(KIND=C_SIZE_T) :: nbEleC
+
+		nbEleC = INT(nbEle, KIND=C_SIZE_T)
+		CALL writeFloatFile(RESHAPE(VAR,(/nbEle/)), nbEleC, FILE_PATH, len(trim(FILE_PATH)))
+	END SUBROUTINE WriteData_inBinary_d2_REAL_K4
+
+	SUBROUTINE WriteData_inBinary_d3_REAL_K4(VAR, nbEle, FILE_PATH)
+		implicit none
+		REAL(KIND=4), DIMENSION(:,:,:) :: VAR
+		CHARACTER(LEN=*) :: FILE_PATH
+		INTEGER :: nbEle
+		INTEGER(KIND=C_SIZE_T) :: nbEleC
+
+		nbEleC = INT(nbEle, KIND=C_SIZE_T)
+		CALL writeFloatFile(RESHAPE(VAR,(/nbEle/)), nbEleC, FILE_PATH, len(trim(FILE_PATH)))
+	END SUBROUTINE WriteData_inBinary_d3_REAL_K4
+
+	SUBROUTINE WriteData_inBinary_d4_REAL_K4(VAR, nbEle, FILE_PATH)
+		implicit none
+		REAL(KIND=4), DIMENSION(:,:,:,:) :: VAR
+		CHARACTER(LEN=*) :: FILE_PATH
+		INTEGER :: nbEle
+		INTEGER(KIND=C_SIZE_T) :: nbEleC
+
+		nbEleC = INT(nbEle, KIND=C_SIZE_T)
+		CALL writeFloatFile(RESHAPE(VAR,(/nbEle/)), nbEleC, FILE_PATH, len(trim(FILE_PATH)))
+	END SUBROUTINE WriteData_inBinary_d4_REAL_K4
+
+	SUBROUTINE WriteData_inBinary_d5_REAL_K4(VAR, nbEle, FILE_PATH)
+		implicit none
+		REAL(KIND=4), DIMENSION(:,:,:,:,:) :: VAR
+		CHARACTER(LEN=*) :: FILE_PATH
+		INTEGER :: nbEle
+		INTEGER(KIND=C_SIZE_T) :: nbEleC
+
+		nbEleC = INT(nbEle, KIND=C_SIZE_T)
+		CALL writeFloatFile(RESHAPE(VAR,(/nbEle/)), nbEleC, FILE_PATH, len(trim(FILE_PATH)))
+	END SUBROUTINE WriteData_inBinary_d5_REAL_K4
+
+!write data in binary for K8 data
+
+	SUBROUTINE WriteData_inBinary_d1_REAL_K8(VAR, nbEle, FILE_PATH)
+		implicit none
+		REAL(KIND=8), DIMENSION(:) :: VAR
+		CHARACTER(LEN=*) :: FILE_PATH
+		INTEGER :: nbEle
+		INTEGER(KIND=C_SIZE_T) :: nbEleC
+
+		nbEleC = INT(nbEle, KIND=C_SIZE_T)
+		CALL writeDoubleFile(VAR, nbEleC, FILE_PATH, len(trim(FILE_PATH)))
+	END SUBROUTINE WriteData_inBinary_d1_REAL_K8
+
+	SUBROUTINE WriteData_inBinary_d2_REAL_K8(VAR, nbEle, FILE_PATH)
+		implicit none
+		REAL(KIND=8), DIMENSION(:,:) :: VAR
+		CHARACTER(LEN=*) :: FILE_PATH
+		INTEGER :: nbEle
+		INTEGER(KIND=C_SIZE_T) :: nbEleC
+
+		nbEleC = INT(nbEle, KIND=C_SIZE_T)
+		CALL writeDoubleFile(RESHAPE(VAR,(/nbEle/)), nbEleC, FILE_PATH, len(trim(FILE_PATH)))
+	END SUBROUTINE WriteData_inBinary_d2_REAL_K8
+
+	!Unlike its siblings this routine takes no element count (signature kept
+	!for existing callers), so the count is taken from the array itself.
+	SUBROUTINE WriteData_inBinary_d3_REAL_K8(VAR, FILE_PATH)
+		implicit none
+		REAL(KIND=8), DIMENSION(:,:,:) :: VAR
+		CHARACTER(LEN=*) :: FILE_PATH
+		INTEGER :: nbEle
+		INTEGER(KIND=C_SIZE_T) :: nbEleC
+
+		nbEle = size(VAR)
+		nbEleC = INT(nbEle, KIND=C_SIZE_T)
+		CALL writeDoubleFile(RESHAPE(VAR,(/nbEle/)), nbEleC, FILE_PATH, len(trim(FILE_PATH)))
+	END SUBROUTINE WriteData_inBinary_d3_REAL_K8
+
+	SUBROUTINE WriteData_inBinary_d4_REAL_K8(VAR, nbEle, FILE_PATH)
+		implicit none
+		REAL(KIND=8), DIMENSION(:,:,:,:) :: VAR
+		CHARACTER(LEN=*) :: FILE_PATH
+		INTEGER :: nbEle
+		INTEGER(KIND=C_SIZE_T) :: nbEleC
+
+		nbEleC = INT(nbEle, KIND=C_SIZE_T)
+		CALL writeDoubleFile(RESHAPE(VAR,(/nbEle/)), nbEleC, FILE_PATH, len(trim(FILE_PATH)))
+	END SUBROUTINE WriteData_inBinary_d4_REAL_K8
+
+	SUBROUTINE WriteData_inBinary_d5_REAL_K8(VAR, nbEle, FILE_PATH)
+		implicit none
+		REAL(KIND=8), DIMENSION(:,:,:,:,:) :: VAR
+		CHARACTER(LEN=*) :: FILE_PATH
+		INTEGER :: nbEle
+		INTEGER(KIND=C_SIZE_T) :: nbEleC
+
+		nbEleC = INT(nbEle, KIND=C_SIZE_T)
+		CALL writeDoubleFile(RESHAPE(VAR,(/nbEle/)), nbEleC, FILE_PATH, len(trim(FILE_PATH)))
+	END SUBROUTINE WriteData_inBinary_d5_REAL_K8
+
+!Check file size
+	SUBROUTINE checkFileSize(FILE_PATH, BYTESIZE)
+		implicit none
+		CHARACTER(LEN=*) :: FILE_PATH
+		INTEGER(kind=C_SIZE_T) :: BYTESIZE
+
+		CALL checkFileSizeC(FILE_PATH, len(trim(FILE_PATH)), BYTESIZE)
+	END SUBROUTINE checkFileSize
+
+!Read data
+!Each reader sizes a temporary buffer from the file size, lets the C routine
+!fill it, then copies the elements into the newly allocated result array.
+	SUBROUTINE readByteData(FILE_PATH, Bytes, outSize)
+		implicit none
+		INTEGER(KIND=1), DIMENSION(:), allocatable :: temp
+		INTEGER(KIND=1), DIMENSION(:), allocatable :: Bytes
+		CHARACTER(LEN=*) :: FILE_PATH
+		INTEGER(kind=C_SIZE_T) :: COUNTER
+		INTEGER(kind=C_SIZE_T), intent(out) :: outSize !in bytes
+
+		CALL checkFileSize(FILE_PATH, outSize)
+		allocate(temp(outSize))
+
+		CALL readByteFile(FILE_PATH, len(trim(FILE_PATH)), temp, outSize)
+		allocate(Bytes(outSize))
+		DO COUNTER=1,outSize,1
+			Bytes(COUNTER) = temp(COUNTER)
+		END DO
+		deallocate(temp)
+	END SUBROUTINE readByteData
+
+	SUBROUTINE readFloatData(FILE_PATH, VAR, nbEle)
+		implicit none
+		REAL(KIND=4), DIMENSION(:), allocatable :: temp
+		REAL(KIND=4), DIMENSION(:), allocatable :: VAR
+		CHARACTER(LEN=*) :: FILE_PATH
+		INTEGER(kind=C_SIZE_T) :: COUNTER, fileSize
+		INTEGER(kind=C_SIZE_T), intent(out) :: nbEle
+
+		CALL checkFileSize(FILE_PATH, fileSize)
+		nbEle = fileSize/4
+		allocate(temp(nbEle))
+
+		CALL readFloatFile(FILE_PATH, len(trim(FILE_PATH)), temp, nbEle)
+		allocate(VAR(nbEle))
+		!Copy nbEle elements: fileSize is a byte count and would overrun both arrays
+		DO COUNTER=1,nbEle,1
+			VAR(COUNTER) = temp(COUNTER)
+		END DO
+		deallocate(temp)
+	END SUBROUTINE readFloatData
+
+	SUBROUTINE readDoubleData(FILE_PATH, VAR, nbEle)
+		implicit none
+		REAL(KIND=8), DIMENSION(:), allocatable :: temp
+		REAL(KIND=8), DIMENSION(:), allocatable :: VAR
+		CHARACTER(LEN=*) :: FILE_PATH
+		INTEGER(kind=C_SIZE_T) :: COUNTER, fileSize
+		INTEGER(kind=C_SIZE_T), intent(out) :: nbEle
+
+		CALL checkFileSize(FILE_PATH, fileSize)
+		nbEle = fileSize/8
+		allocate(temp(nbEle))
+
+		CALL readDoubleFile(FILE_PATH, len(trim(FILE_PATH)), temp, nbEle)
+		allocate(VAR(nbEle))
+		!Copy nbEle elements: fileSize is a byte count and would overrun both arrays
+		DO COUNTER=1,nbEle,1
+			VAR(COUNTER) = temp(COUNTER)
+		END DO
+		deallocate(temp)
+	END SUBROUTINE readDoubleData
+
+END MODULE RW
--- a/deps/SZ/sz/src/rwf.c
+++ b/deps/SZ/sz/src/rwf.c
@ -0,0 +1,96 @@
+/**
+ *  @file rwf.c
+ *  @author Sheng Di
+ *  @date April, 2015
+ *  @brief I/O interface for Fortran
+ *  (C) 2015 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "rw.h"
+
+/* Fortran entry point: stores the size reported by checkFileSize() for the
+ * given path in *filesize. The path arrives as *len characters without a
+ * terminator, so a NUL-terminated copy is built first. */
+void checkfilesizec_(char *srcFilePath, int *len, size_t *filesize)
+{
+	int status;
+	int k = 0;
+	char path[*len+1];
+	while(k<*len)
+	{
+		path[k]=srcFilePath[k];
+		k++;
+	}
+	path[*len]='\0';
+	*filesize = checkFileSize(path, &status);
+}
+
+/* Fortran entry point: reads the file named by the first *len characters of
+ * srcFilePath via readByteData() and copies *byteLength bytes into the
+ * caller-provided bytes buffer. If nothing could be read, *byteLength is set
+ * to 0 and bytes is left untouched. */
+void readbytefile_(char *srcFilePath, int *len, unsigned char *bytes, size_t *byteLength)
+{
+	int ierr;
+	char s[*len+1];
+	/* Fortran strings carry no terminator: copy and NUL-terminate. */
+	memcpy(s, srcFilePath, (size_t)*len);
+	s[*len]='\0';
+	unsigned char *tmp_bytes = readByteData(s, byteLength, &ierr);
+	if(tmp_bytes==NULL)
+	{
+		/* Never hand a NULL source to memcpy(). */
+		*byteLength = 0;
+		return;
+	}
+	memcpy(bytes, tmp_bytes, *byteLength);
+	free(tmp_bytes);
+}
+
+/* Fortran entry point: reads the file named by the first *len characters of
+ * srcFilePath via readDoubleData() and copies *nbEle doubles into the
+ * caller-provided data buffer. If nothing could be read, *nbEle is set to 0
+ * and data is left untouched. */
+void readdoublefile_(char *srcFilePath, int *len, double *data, size_t *nbEle)
+{
+	int ierr;
+	char s[*len+1];
+	/* Fortran strings carry no terminator: copy and NUL-terminate. */
+	memcpy(s, srcFilePath, (size_t)*len);
+	s[*len]='\0';
+	double *tmp_data = readDoubleData(s, nbEle, &ierr);
+	if(tmp_data==NULL)
+	{
+		/* Never hand a NULL source to memcpy(). */
+		*nbEle = 0;
+		return;
+	}
+	/* *nbEle counts elements; memcpy() needs a byte count. */
+	memcpy(data, tmp_data, (*nbEle)*sizeof(double));
+	free(tmp_data);
+}
+
+/* Fortran entry point: reads the file named by the first *len characters of
+ * srcFilePath via readFloatData() and copies *nbEle floats into the
+ * caller-provided data buffer. If nothing could be read, *nbEle is set to 0
+ * and data is left untouched. */
+void readfloatfile_(char *srcFilePath, int *len, float *data, size_t *nbEle)
+{
+	int ierr;
+	char s[*len+1];
+	/* Fortran strings carry no terminator: copy and NUL-terminate. */
+	memcpy(s, srcFilePath, (size_t)*len);
+	s[*len]='\0';
+	float *tmp_data = readFloatData(s, nbEle, &ierr);
+	if(tmp_data==NULL)
+	{
+		/* Never hand a NULL source to memcpy(). */
+		*nbEle = 0;
+		return;
+	}
+	/* *nbEle counts elements; memcpy() needs a byte count. */
+	memcpy(data, tmp_data, (*nbEle)*sizeof(float));
+	free(tmp_data);
+}
+
+/* Fortran entry point: writes *byteLength bytes to the file named by the
+ * first *len characters of tgtFilePath, via writeByteData(). */
+void writebytefile_(unsigned char *bytes, size_t *byteLength, char *tgtFilePath, int *len)
+{
+	int ierr;
+	char path[*len+1];
+	/* Build a NUL-terminated copy of the unterminated Fortran string. */
+	memcpy(path, tgtFilePath, (size_t)*len);
+	path[*len]='\0';
+	writeByteData(bytes, *byteLength, path, &ierr);
+}
+
+/* Fortran entry point: writes *nbEle doubles to the file named by the first
+ * *len characters of tgtFilePath, via writeDoubleData(). */
+void writedoublefile_(double *data, size_t *nbEle, char *tgtFilePath, int *len)
+{
+	int ierr;
+	char path[*len+1];
+	/* Build a NUL-terminated copy of the unterminated Fortran string. */
+	memcpy(path, tgtFilePath, (size_t)*len);
+	path[*len]='\0';
+	writeDoubleData(data, *nbEle, path, &ierr);
+}
+
+/* Fortran entry point: writes *nbEle floats to the file named by the first
+ * *len characters of tgtFilePath, via writeFloatData(). */
+void writefloatfile_(float *data, size_t *nbEle, char *tgtFilePath, int *len)
+{
+	int ierr;
+	char path[*len+1];
+	/* Build a NUL-terminated copy of the unterminated Fortran string. */
+	memcpy(path, tgtFilePath, (size_t)*len);
+	path[*len]='\0';
+	writeFloatData(data, *nbEle, path, &ierr);
+}
--- a/deps/SZ/sz/src/sz.c
+++ b/deps/SZ/sz/src/sz.c
--- a/deps/SZ/sz/src/sz_double.c
+++ b/deps/SZ/sz/src/sz_double.c
--- a/Show More
+++ b/Show More