diff --git a/GNUmakefile b/GNUmakefile
index 1aaa223bd76359296fb017e6adc56ae1180464c0..a015b7d50312eeb9fcdec21885800b4ee3adc057 100644
--- a/GNUmakefile
+++ b/GNUmakefile
@@ -63,11 +63,11 @@ BLOSC_CPPFLAGS := \
 BASE_CPPFLAGS := $(BLOSC_CPPFLAGS) -I. -D__STDC_CONSTANT_MACROS
 
 FEDIR = frontend
-FE_CFLAGS := -g -fPIC -O3 -fopenmp
+FE_CFLAGS := -g -fPIC -O3 -fopenmp -std=gnu99
 FE_CPPFLAGS := $(BASE_CPPFLAGS) -Ithirdparty/sqlite -DGENERICIO_NO_MPI
 
 MPIDIR = mpi
-MPI_CFLAGS := -g -O3 -fopenmp
+MPI_CFLAGS := -g -O3 -fopenmp -std=gnu99
 MPI_CPPFLAGS := $(BASE_CPPFLAGS)
 
 $(FEDIR):
@@ -169,7 +169,11 @@ BLOSC_O := \
 	thirdparty/SZ/sz/src/sz_float_pwr.o \
 	thirdparty/SZ/sz/src/sz_double_pwr.o \
 	thirdparty/SZ/sz/src/szd_float_pwr.o \
-	thirdparty/SZ/sz/src/szd_double_pwr.o
+	thirdparty/SZ/sz/src/szd_double_pwr.o \
+	thirdparty/SZ/sz/src/sz_double_ts.o \
+	thirdparty/SZ/sz/src/sz_float_ts.o \
+	thirdparty/SZ/sz/src/szd_double_ts.o \
+	thirdparty/SZ/sz/src/szd_float_ts.o
 
 FE_BLOSC_O := $(addprefix $(FEDIR)/,$(BLOSC_O))
 
diff --git a/GenericIO.cxx b/GenericIO.cxx
index c76b63b662b0154428e1480b7c129307687c1ac3..14f030f8824a8e28e47e35398c8fe1035f292f87 100644
--- a/GenericIO.cxx
+++ b/GenericIO.cxx
@@ -515,6 +515,11 @@ void GenericIO::write() {
          blosc_initialized = true;
        }
 
+       if (!sz_initialized) {
+         SZ_Init(NULL);
+         sz_initialized = true;
+       }
+
 #ifdef _OPENMP
        blosc_set_nthreads(omp_get_max_threads());
   }
diff --git a/thirdparty/SZ/COPYRIGHT.txt b/thirdparty/SZ/COPYRIGHT.txt
new file mode 100644
index 0000000000000000000000000000000000000000..de90efe3c09def6f7a3bdd1d0dac7a0b11d82716
--- /dev/null
+++ b/thirdparty/SZ/COPYRIGHT.txt
@@ -0,0 +1,32 @@
+Copyright © 2016 , UChicago Argonne, LLC
+All Rights Reserved
+[SZ, Version 1.3]
+Sheng Di
+Dingwen Tao
+Franck Cappello
+Argonne National Laboratory
+
+OPEN SOURCE LICENSE (license number: SF-16-105)
+ 
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+ 
+1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.  Software changes, modifications, or derivative works, should be noted with comments and the author and organization's name.
+ 
+2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+ 
+3. Neither the names of UChicago Argonne, LLC or the Department of Energy nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
+ 
+4. The software and the end-user documentation included with the redistribution, if any, must include the following acknowledgment:
+ 
+   "This product includes software produced by UChicago Argonne, LLC under Contract No. DE-AC02-06CH11357 with the Department of Energy."
+ 
+******************************************************************************************************
+DISCLAIMER
+ 
+THE SOFTWARE IS SUPPLIED "AS IS" WITHOUT WARRANTY OF ANY KIND.
+ 
+NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT OF ENERGY, NOR UCHICAGO ARGONNE, LLC, NOR ANY OF THEIR EMPLOYEES, MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY INFORMATION, DATA, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS.
+ 
+***************************************************************************************************
+
+Contact: Sheng Di (sdi1@anl.gov), Franck Cappello(cappello@mcs.anl.gov)
diff --git a/thirdparty/SZ/sz/include/ByteToolkit.h b/thirdparty/SZ/sz/include/ByteToolkit.h
new file mode 100644
index 0000000000000000000000000000000000000000..1d0ca485ce42b96b9064443238ce1fdb74acfd38
--- /dev/null
+++ b/thirdparty/SZ/sz/include/ByteToolkit.h
@@ -0,0 +1,80 @@
+/**
+ *  @file ByteToolkit.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for ByteToolkit.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _ByteToolkit_H
+#define _ByteToolkit_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h>
+#include <stdint.h> /* uint16_t, uint32_t, uint64_t, int64_t used by the prototypes below */
+
+//ByteToolkit.c
+
+unsigned short bytesToUInt16_bigEndian(unsigned char* bytes);
+unsigned int bytesToUInt32_bigEndian(unsigned char* bytes);
+unsigned long bytesToUInt64_bigEndian(unsigned char* b); // NOTE(review): unsigned long is only 32 bits on LLP64 targets -- confirm supported platforms
+
+short bytesToInt16_bigEndian(unsigned char* bytes);
+int bytesToInt32_bigEndian(unsigned char* bytes);
+long bytesToInt64_bigEndian(unsigned char* b); // NOTE(review): same width caveat as bytesToUInt64_bigEndian
+int bytesToInt_bigEndian(unsigned char* bytes);
+
+void intToBytes_bigEndian(unsigned char *b, unsigned int num);
+
+void int64ToBytes_bigEndian(unsigned char *b, uint64_t num);
+void int32ToBytes_bigEndian(unsigned char *b, uint32_t num);
+void int16ToBytes_bigEndian(unsigned char *b, uint16_t num);
+
+long bytesToLong_bigEndian(unsigned char* b);
+void longToBytes_bigEndian(unsigned char *b, unsigned long num);
+long doubleToOSEndianLong(double value);
+int floatToOSEndianInt(float value);
+short getExponent_float(float value);
+short getPrecisionReqLength_float(float precision);
+short getExponent_double(double value);
+short getPrecisionReqLength_double(double precision);
+unsigned char numberOfLeadingZeros_Int(int i);
+unsigned char numberOfLeadingZeros_Long(long i);
+unsigned char getLeadingNumbers_Int(int v1, int v2);
+unsigned char getLeadingNumbers_Long(long v1, long v2);
+short bytesToShort(unsigned char* bytes);
+void shortToBytes(unsigned char* b, short value);
+int bytesToInt(unsigned char* bytes);
+long bytesToLong(unsigned char* bytes);
+float bytesToFloat(unsigned char* bytes);
+void floatToBytes(unsigned char *b, float num);
+double bytesToDouble(unsigned char* bytes);
+void doubleToBytes(unsigned char *b, double num);
+int extractBytes(unsigned char* byteArray, size_t k, int validLength);
+int getMaskRightCode(int m);
+int getLeftMovingCode(int kMod8);
+int getRightMovingSteps(int kMod8, int resiBitLength);
+int getRightMovingCode(int kMod8, int resiBitLength);
+short* convertByteDataToShortArray(unsigned char* bytes, size_t byteLength);
+unsigned short* convertByteDataToUShortArray(unsigned char* bytes, size_t byteLength);
+
+void convertShortArrayToBytes(short* states, size_t stateLength, unsigned char* bytes);
+void convertUShortArrayToBytes(unsigned short* states, size_t stateLength, unsigned char* bytes);
+void convertIntArrayToBytes(int* states, size_t stateLength, unsigned char* bytes);
+void convertUIntArrayToBytes(unsigned int* states, size_t stateLength, unsigned char* bytes);
+void convertLongArrayToBytes(int64_t* states, size_t stateLength, unsigned char* bytes);
+void convertULongArrayToBytes(uint64_t* states, size_t stateLength, unsigned char* bytes);
+
+size_t bytesToSize(unsigned char* bytes);
+void sizeToBytes(unsigned char* outBytes, size_t size);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _ByteToolkit_H  ----- */
+
diff --git a/thirdparty/SZ/sz/include/CompressElement.h b/thirdparty/SZ/sz/include/CompressElement.h
new file mode 100644
index 0000000000000000000000000000000000000000..e1538b2316a9dce11bafd6bd84770a7bbbd69a33
--- /dev/null
+++ b/thirdparty/SZ/sz/include/CompressElement.h
@@ -0,0 +1,76 @@
+/**
+ *  @file CompressElement.h
+ *  @author Sheng Di
+ *  @date April, 2016
+ *  @brief Header file for Compress Elements such as DoubleValueCompressElement.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stddef.h> /* size_t, used by decompressGroupIDArray(); <stdint.h> alone does not guarantee it */
+#include <stdint.h>
+
+#ifndef _CompressElement_H
+#define _CompressElement_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct DoubleValueCompressElement
+{
+	double data;
+	long curValue; // NOTE(review): paired with an 8-byte curBytes; long is 32 bits on LLP64 targets -- confirm
+	unsigned char curBytes[8]; //big_endian
+	int reqBytesLength;
+	int resiBitsLength;
+} DoubleValueCompressElement;
+
+typedef struct FloatValueCompressElement
+{
+	float data;
+	int curValue;
+	unsigned char curBytes[4]; //big_endian
+	int reqBytesLength;
+	int resiBitsLength;
+} FloatValueCompressElement;
+
+typedef struct LossyCompressionElement
+{
+	int leadingZeroBytes; //0,1,2,or 3
+	unsigned char integerMidBytes[8];
+	int integerMidBytes_Length; //they are mid_bits actually
+	//char curBytes[8];
+	//int curBytes_Length; //4 for single_precision or 8 for double_precision
+	int resMidBitsLength;
+	int residualMidBits;
+} LossyCompressionElement;
+
+char* decompressGroupIDArray(unsigned char* bytes, size_t dataLength);
+
+short computeGroupNum_float(float value);
+short computeGroupNum_double(double value);
+
+void listAdd_double(double last3CmprsData[3], double value);
+void listAdd_float(float last3CmprsData[3], float value);
+void listAdd_int(int64_t last3CmprsData[3], int64_t value);
+void listAdd_float_group(float *groups, int *flags, char groupNum, float oriValue, float decValue, char* curGroupID);
+void listAdd_double_group(double *groups, int *flags, char groupNum, double oriValue, double decValue, char* curGroupID);
+
+int validPrediction_double(double minErr, double precision);
+int validPrediction_float(float minErr, float precision);
+double* generateGroupErrBounds(int errorBoundMode, double realPrecision, double pwrErrBound);
+int generateGroupMaxIntervalCount(double* groupErrBounds);
+
+void new_LossyCompressionElement(LossyCompressionElement *lce, int leadingNum, unsigned char* intMidBytes,
+		int intMidBytes_Length, int resiMidBitsLength, int resiBits);
+void updateLossyCompElement_Double(unsigned char* curBytes, unsigned char* preBytes,
+		int reqBytesLength, int resiBitsLength,  LossyCompressionElement *lce);
+void updateLossyCompElement_Float(unsigned char* curBytes, unsigned char* preBytes,
+		int reqBytesLength, int resiBitsLength,  LossyCompressionElement *lce);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _CompressElement_H  ----- */
diff --git a/thirdparty/SZ/sz/include/DynamicByteArray.h b/thirdparty/SZ/sz/include/DynamicByteArray.h
new file mode 100644
index 0000000000000000000000000000000000000000..717097940fc9f7772382d7b7ebc8934b697df34b
--- /dev/null
+++ b/thirdparty/SZ/sz/include/DynamicByteArray.h
@@ -0,0 +1,36 @@
+/**
+ *  @file DynamicByteArray.h
+ *  @author Sheng Di
+ *  @date April, 2016
+ *  @brief Header file for Dynamic Byte Array.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _DynamicByteArray_H
+#define _DynamicByteArray_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h>
+typedef struct DynamicByteArray // dynamic byte array handled by the *_DBA / *DBA_Data functions declared below
+{	
+	unsigned char* array; // backing storage
+	size_t size; // presumably the number of bytes currently stored -- confirm in DynamicByteArray.c
+	size_t capacity; // presumably the allocated length of array (cf. new_DBA's cap argument) -- confirm
+} DynamicByteArray;
+
+void new_DBA(DynamicByteArray **dba, size_t cap);
+void convertDBAtoBytes(DynamicByteArray *dba, unsigned char** bytes);
+void free_DBA(DynamicByteArray *dba);
+unsigned char getDBA_Data(DynamicByteArray *dba, size_t pos);
+void addDBA_Data(DynamicByteArray *dba, unsigned char value);
+void memcpyDBA_Data(DynamicByteArray *dba, unsigned char* data, size_t length);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _DynamicByteArray_H  ----- */
diff --git a/thirdparty/SZ/sz/include/DynamicDoubleArray.h b/thirdparty/SZ/sz/include/DynamicDoubleArray.h
new file mode 100644
index 0000000000000000000000000000000000000000..9a3ef4b6a4d74fa2a54f6b6ea5ceb82b2bed6e53
--- /dev/null
+++ b/thirdparty/SZ/sz/include/DynamicDoubleArray.h
@@ -0,0 +1,36 @@
+/**
+ *  @file DynamicDoubleArray.h
+ *  @author Sheng Di
+ *  @date April, 2016
+ *  @brief Header file for Dynamic Double Array.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _DynamicDoubleArray_H
+#define _DynamicDoubleArray_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h>
+
+typedef struct DynamicDoubleArray // dynamic array of doubles handled by the *_DDA / *DDA_Data functions declared below
+{	
+	double* array; // backing storage
+	size_t size; // presumably the number of doubles currently stored -- confirm in DynamicDoubleArray.c
+	size_t capacity; // a count, not a value: size_t like new_DDA's cap argument and the Byte/Float/Int array structs (was double)
+} DynamicDoubleArray;
+
+void new_DDA(DynamicDoubleArray **dda, size_t cap);
+void convertDDAtoDoubles(DynamicDoubleArray *dba, double **data);
+void free_DDA(DynamicDoubleArray *dda);
+double getDDA_Data(DynamicDoubleArray *dda, size_t pos);
+void addDDA_Data(DynamicDoubleArray *dda, double value);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _DynamicDoubleArray_H  ----- */
diff --git a/thirdparty/SZ/sz/include/DynamicFloatArray.h b/thirdparty/SZ/sz/include/DynamicFloatArray.h
new file mode 100644
index 0000000000000000000000000000000000000000..2770f7860bc1da9eab4a478599537e7c29ec4a7f
--- /dev/null
+++ b/thirdparty/SZ/sz/include/DynamicFloatArray.h
@@ -0,0 +1,35 @@
+/**
+ *  @file DynamicFloatArray.h
+ *  @author Sheng Di
+ *  @date April, 2016
+ *  @brief Header file for Dynamic Float Array.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _DynamicFloatArray_H
+#define _DynamicFloatArray_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h>
+typedef struct DynamicFloatArray // dynamic array of floats handled by the *_DFA / *DFA_Data functions declared below
+{	
+	float* array; // backing storage
+	size_t size; // presumably the number of floats currently stored -- confirm in DynamicFloatArray.c
+	size_t capacity; // presumably the allocated element count (cf. new_DFA's cap argument) -- confirm
+} DynamicFloatArray;
+
+void new_DFA(DynamicFloatArray **dfa, size_t cap);
+void convertDFAtoFloats(DynamicFloatArray *dfa, float **data);
+void free_DFA(DynamicFloatArray *dfa);
+float getDFA_Data(DynamicFloatArray *dfa, size_t pos);
+void addDFA_Data(DynamicFloatArray *dfa, float value);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _DynamicFloatArray_H  ----- */
diff --git a/thirdparty/SZ/sz/include/DynamicIntArray.h b/thirdparty/SZ/sz/include/DynamicIntArray.h
new file mode 100644
index 0000000000000000000000000000000000000000..c821c5712728ff816ad0643d9bd1a2bbfaff8e85
--- /dev/null
+++ b/thirdparty/SZ/sz/include/DynamicIntArray.h
@@ -0,0 +1,35 @@
+/**
+ *  @file DynamicIntArray.h
+ *  @author Sheng Di
+ *  @date April, 2016
+ *  @brief Header file for Dynamic Int Array.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _DynamicIntArray_H
+#define _DynamicIntArray_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h>
+typedef struct DynamicIntArray // dynamic array handled by the *_DIA / *DIA_Data functions declared below
+{	
+	unsigned char* array; // one byte per element is enough, so this is not an int*; NOTE(review): addDIA_Data/getDIA_Data use int, so values outside 0..255 presumably do not round-trip -- confirm
+	size_t size; // presumably the number of elements currently stored -- confirm in DynamicIntArray.c
+	size_t capacity; // presumably the allocated element count (cf. new_DIA's cap argument) -- confirm
+} DynamicIntArray;
+
+void new_DIA(DynamicIntArray **dia, size_t cap);
+void convertDIAtoInts(DynamicIntArray *dia, unsigned char **data);
+void free_DIA(DynamicIntArray *dia);
+int getDIA_Data(DynamicIntArray *dia, size_t pos);
+void addDIA_Data(DynamicIntArray *dia, int value);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _DynamicIntArray_H  ----- */
diff --git a/thirdparty/SZ/sz/include/Huffman.h b/thirdparty/SZ/sz/include/Huffman.h
new file mode 100644
index 0000000000000000000000000000000000000000..3f87cad6c62a52fdd520ab3ec3866d8aed980d19
--- /dev/null
+++ b/thirdparty/SZ/sz/include/Huffman.h
@@ -0,0 +1,71 @@
+/**
+ *  @file Huffman.h
+ *  @author Sheng Di
+ *  @date Aug., 2016
+ *  @brief Header file for the Huffman tree encoder/decoder.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _Huffman_H
+#define _Huffman_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stddef.h> /* size_t (node_t.freq and the length parameters below) */
+
+//Note: when changing the following settings, intvCapacity in sz.h should be changed as well.
+//#define allNodes 131072
+//#define stateNum 65536
+
+typedef struct node_t {
+	struct node_t *left, *right;
+	size_t freq;
+	char t; //in_node:0; otherwise:1
+	unsigned int c;
+} *node; // note: node is a pointer typedef (struct node_t *)
+
+typedef struct HuffmanTree {
+	int stateNum;
+	int allNodes;
+	struct node_t* pool;
+	node *qqq, *qq; //the root node of the HuffmanTree is qq[1]
+	int n_nodes; //n_nodes is for compression
+	int qend;
+	unsigned long **code;
+	unsigned char *cout;
+	int n_inode; //n_inode is for decompression
+} HuffmanTree;
+
+HuffmanTree* createHuffmanTree(int stateNum);
+HuffmanTree* createDefaultHuffmanTree(void); // (void): an empty parameter list is not a prototype in C
+
+node new_node(HuffmanTree *huffmanTree, size_t freq, unsigned int c, node a, node b);
+node new_node2(HuffmanTree *huffmanTree, unsigned int c, unsigned char t);
+void qinsert(HuffmanTree *huffmanTree, node n);
+node qremove(HuffmanTree *huffmanTree);
+void build_code(HuffmanTree *huffmanTree, node n, int len, unsigned long out1, unsigned long out2);
+void init(HuffmanTree *huffmanTree, int *s, size_t length);
+void encode(HuffmanTree *huffmanTree, int *s, size_t length, unsigned char *out, size_t *outSize);
+void decode(unsigned char *s, size_t targetLength, node t, int *out);
+void pad_tree_uchar(HuffmanTree* huffmanTree, unsigned char* L, unsigned char* R, unsigned int* C, unsigned char* t, unsigned int i, node root);
+void pad_tree_ushort(HuffmanTree* huffmanTree, unsigned short* L, unsigned short* R, unsigned int* C, unsigned char* t, unsigned int i, node root);
+void pad_tree_uint(HuffmanTree* huffmanTree, unsigned int* L, unsigned int* R, unsigned int* C, unsigned char* t, unsigned int i, node root);
+unsigned int convert_HuffTree_to_bytes_anyStates(HuffmanTree* huffmanTree, int nodeCount, unsigned char** out);
+void unpad_tree_uchar(HuffmanTree* huffmanTree, unsigned char* L, unsigned char* R, unsigned int* C, unsigned char *t, unsigned int i, node root);
+void unpad_tree_ushort(HuffmanTree* huffmanTree, unsigned short* L, unsigned short* R, unsigned int* C, unsigned char* t, unsigned int i, node root);
+void unpad_tree_uint(HuffmanTree* huffmanTree, unsigned int* L, unsigned int* R, unsigned int* C, unsigned char* t, unsigned int i, node root);
+node reconstruct_HuffTree_from_bytes_anyStates(HuffmanTree *huffmanTree, unsigned char* bytes, int nodeCount);
+
+void encode_withTree(HuffmanTree* huffmanTree, int *s, size_t length, unsigned char **out, size_t *outSize);
+void decode_withTree(HuffmanTree* huffmanTree, unsigned char *s, size_t targetLength, int *out);
+
+void SZ_ReleaseHuffman(HuffmanTree* huffmanTree);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/thirdparty/SZ/sz/include/TightDataPointStorageD.h b/thirdparty/SZ/sz/include/TightDataPointStorageD.h
new file mode 100644
index 0000000000000000000000000000000000000000..4fc5be82efefa902dee98e8f131d0f420daf29f2
--- /dev/null
+++ b/thirdparty/SZ/sz/include/TightDataPointStorageD.h
@@ -0,0 +1,92 @@
+/**
+ *  @file TightDataPointStorageD.h
+ *  @author Sheng Di
+ *  @date April, 2016
+ *  @brief Header file for the tight data point storage (TDPS).
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _TightDataPointStorageD_H
+#define _TightDataPointStorageD_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stddef.h> /* size_t, used throughout the struct and prototypes below */
+
+typedef struct TightDataPointStorageD
+{
+	size_t dataSeriesLength;
+	int allSameData;
+	double realPrecision;
+	double medianValue;
+	char reqLength;
+	char radExpo; //used to compute reqLength based on segmented precisions in "pw_rel_compression"; NOTE(review): the constructors below take unsigned char radExpo -- confirm signedness
+
+	int stateNum;
+	int allNodes;
+
+	size_t exactDataNum;
+	double reservedValue;
+	
+	unsigned char* rtypeArray;
+	size_t rtypeArray_size;
+	
+	unsigned char* typeArray; //its size is dataSeriesLength/4 (or xxx/4+1)
+	size_t typeArray_size;
+	
+	unsigned char* leadNumArray; //its size is exactDataNum/4 (or exactDataNum/4+1)
+	size_t leadNumArray_size;
+	
+	unsigned char* exactMidBytes;
+	size_t exactMidBytes_size;
+	
+	unsigned char* residualMidBits;
+	size_t residualMidBits_size;
+	
+	unsigned int intervals;
+	
+	unsigned char isLossless; //a mark to denote whether it's lossless compression (1 is yes, 0 is no)
+	
+	size_t segment_size;
+	
+	unsigned char* pwrErrBoundBytes;
+	int pwrErrBoundBytes_size; // NOTE(review): int here, but the constructors below take size_t pwrErrBoundBytes_size -- possible narrowing; confirm
+} TightDataPointStorageD;
+
+void new_TightDataPointStorageD_Empty(TightDataPointStorageD **self);
+int new_TightDataPointStorageD_fromFlatBytes(TightDataPointStorageD **self, unsigned char* flatBytes, size_t flatBytesLength);
+
+void new_TightDataPointStorageD(TightDataPointStorageD **self,
+		size_t dataSeriesLength, size_t exactDataNum,
+		int* type, unsigned char* exactMidBytes, size_t exactMidBytes_size,
+		unsigned char* leadNumIntArray,  //leadNumIntArray contains readable numbers....
+		unsigned char* resiMidBits, size_t resiMidBits_size,
+		unsigned char resiBitLength,
+		double realPrecision, double medianValue, char reqLength, unsigned int intervals,
+		unsigned char* pwrErrBoundBytes, size_t pwrErrBoundBytes_size, unsigned char radExpo);
+
+void new_TightDataPointStorageD2(TightDataPointStorageD **self,
+		size_t dataSeriesLength, size_t exactDataNum,
+		int* type, unsigned char* exactMidBytes, size_t exactMidBytes_size,
+		unsigned char* leadNumIntArray,  //leadNumIntArray contains readable numbers....
+		unsigned char* resiMidBits, size_t resiMidBits_size,
+		unsigned char* resiBitLength, size_t resiBitLengthSize,
+		double realPrecision, double medianValue, char reqLength, unsigned int intervals,
+		unsigned char* pwrErrBoundBytes, size_t pwrErrBoundBytes_size, unsigned char radExpo);
+
+void convertTDPStoBytes_double(TightDataPointStorageD* tdps, unsigned char* bytes, unsigned char* dsLengthBytes, unsigned char sameByte);
+void convertTDPStoBytes_double_reserve(TightDataPointStorageD* tdps, unsigned char* bytes, unsigned char* dsLengthBytes, unsigned char sameByte);
+void convertTDPStoFlatBytes_double(TightDataPointStorageD *tdps, unsigned char** bytes, size_t *size);
+void convertTDPStoFlatBytes_double_args(TightDataPointStorageD *tdps, unsigned char* bytes, size_t *size);
+
+void free_TightDataPointStorageD(TightDataPointStorageD *tdps);
+void free_TightDataPointStorageD2(TightDataPointStorageD *tdps);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _TightDataPointStorageD_H  ----- */
diff --git a/thirdparty/SZ/sz/include/TightDataPointStorageF.h b/thirdparty/SZ/sz/include/TightDataPointStorageF.h
new file mode 100644
index 0000000000000000000000000000000000000000..eca1717b98a3d8fc53b41f09c48e92e2969d1403
--- /dev/null
+++ b/thirdparty/SZ/sz/include/TightDataPointStorageF.h
@@ -0,0 +1,97 @@
+/**
+ *  @file TightDataPointStorageF.h
+ *  @author Sheng Di and Dingwen Tao
+ *  @date Aug, 2016
+ *  @brief Header file for the tight data point storage (TDPS).
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _TightDataPointStorageF_H
+#define _TightDataPointStorageF_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h> 
+
+typedef struct TightDataPointStorageF // float variant of the tight data point storage; created by the new_TightDataPointStorageF*() functions declared below
+{
+	size_t dataSeriesLength;
+	int allSameData;
+	double realPrecision; //it's used as the pwrErrBoundRatio when errBoundMode==PW_REL
+	float medianValue;
+	char reqLength;
+	char radExpo; //used to compute reqLength based on segmented precisions in "pw_rel_compression"; NOTE(review): the constructors below take unsigned char radExpo -- confirm signedness
+	
+	int stateNum;
+	int allNodes;
+	
+	size_t exactDataNum;
+	float reservedValue;
+	
+	unsigned char* rtypeArray;
+	size_t rtypeArray_size;
+	
+	unsigned char* typeArray; //its size is dataSeriesLength/4 (or xxx/4+1) 
+	size_t typeArray_size;
+	
+	unsigned char* leadNumArray; //its size is exactDataNum/4 (or exactDataNum/4+1)
+	size_t leadNumArray_size;
+	
+	unsigned char* exactMidBytes;
+	size_t exactMidBytes_size;
+	
+	unsigned char* residualMidBits;
+	size_t residualMidBits_size;
+	
+	unsigned int intervals; //quantization_intervals
+	
+	unsigned char isLossless; //a mark to denote whether it's lossless compression (1 is yes, 0 is no)
+	
+	size_t segment_size;
+	
+	unsigned char* pwrErrBoundBytes;
+	int pwrErrBoundBytes_size; // NOTE(review): int here, but the constructors below take size_t pwrErrBoundBytes_size -- possible narrowing; confirm
+	
+} TightDataPointStorageF;
+
+void new_TightDataPointStorageF_Empty(TightDataPointStorageF **self);
+int new_TightDataPointStorageF_fromFlatBytes(TightDataPointStorageF **self, unsigned char* flatBytes, size_t flatBytesLength);
+
+void new_TightDataPointStorageF(TightDataPointStorageF **self,
+		size_t dataSeriesLength, size_t exactDataNum,
+		int* type, unsigned char* exactMidBytes, size_t exactMidBytes_size,
+		unsigned char* leadNumIntArray,  //leadNumIntArray contains readable numbers....
+		unsigned char* resiMidBits, size_t resiMidBits_size,
+		unsigned char resiBitLength,
+		double realPrecision, float medianValue, char reqLength, unsigned int intervals, 
+		unsigned char* pwrErrBoundBytes, size_t pwrErrBoundBytes_size, unsigned char radExpo);
+
+/**
+ * This function is designed for the first version of the point-wise relative error bound (developed by Sheng Di for the TPDS18 paper).
+ * 
+ * */
+void new_TightDataPointStorageF2(TightDataPointStorageF **self,
+		size_t dataSeriesLength, size_t exactDataNum, 
+		int* type, unsigned char* exactMidBytes, size_t exactMidBytes_size,
+		unsigned char* leadNumIntArray,  //leadNumIntArray contains readable numbers....
+		unsigned char* resiMidBits, size_t resiMidBits_size,
+		unsigned char* resiBitLength, size_t resiBitLengthSize, 
+		double realPrecision, float medianValue, char reqLength, unsigned int intervals, 
+		unsigned char* pwrErrBoundBytes, size_t pwrErrBoundBytes_size, unsigned char radExpo);
+
+void convertTDPStoBytes_float(TightDataPointStorageF* tdps, unsigned char* bytes, unsigned char* dsLengthBytes, unsigned char sameByte);
+void convertTDPStoBytes_float_reserve(TightDataPointStorageF* tdps, unsigned char* bytes, unsigned char* dsLengthBytes, unsigned char sameByte);
+void convertTDPStoFlatBytes_float(TightDataPointStorageF *tdps, unsigned char** bytes, size_t *size);
+void convertTDPStoFlatBytes_float_args(TightDataPointStorageF *tdps, unsigned char* bytes, size_t *size);
+
+void free_TightDataPointStorageF(TightDataPointStorageF *tdps);
+void free_TightDataPointStorageF2(TightDataPointStorageF *tdps);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _TightDataPointStorageF_H  ----- */
diff --git a/thirdparty/SZ/sz/include/TightDataPointStorageI.h b/thirdparty/SZ/sz/include/TightDataPointStorageI.h
new file mode 100644
index 0000000000000000000000000000000000000000..b974060ccd9a042dfc71361392131d71b2b6aac1
--- /dev/null
+++ b/thirdparty/SZ/sz/include/TightDataPointStorageI.h
@@ -0,0 +1,65 @@
+/**
+ *  @file TightDataPointStorageI.h
+ *  @author Sheng Di and Dingwen Tao
+ *  @date Aug, 2017
+ *  @brief Header file for the tight data point storage (TDPS).
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _TightDataPointStorageI_H
+#define _TightDataPointStorageI_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h> 
+
+typedef struct TightDataPointStorageI // integer variant of the tight data point storage; created by the new_TightDataPointStorageI*() functions declared below
+{
+	size_t dataSeriesLength;
+	int allSameData;
+	double realPrecision; //it's used as the pwrErrBoundRatio when errBoundMode==PW_REL
+	size_t exactDataNum;
+	long minValue; // NOTE(review): long is 32 bits on LLP64 targets; presumably must hold 64-bit data minima -- confirm
+	int exactByteSize;
+	int dataTypeSize; //the size of data type, e.g., it's 4 when data type is int32_t
+	
+	int stateNum;
+	int allNodes;
+	
+	unsigned char* typeArray; //its size is dataSeriesLength/4 (or xxx/4+1) 
+	size_t typeArray_size;
+	
+	unsigned char* exactDataBytes;
+	size_t exactDataBytes_size;
+	
+	unsigned int intervals; //quantization_intervals; NOTE(review): new_TightDataPointStorageI() below takes int intervals -- confirm signedness
+	
+	unsigned char isLossless; //a mark to denote whether it's lossless compression (1 is yes, 0 is no)
+
+} TightDataPointStorageI;
+
+int computeRightShiftBits(int exactByteSize, int dataType);
+int convertDataTypeSizeCode(int dataTypeSizeCode);
+int convertDataTypeSize(int dataTypeSize);
+
+void new_TightDataPointStorageI_Empty(TightDataPointStorageI **self);
+int new_TightDataPointStorageI_fromFlatBytes(TightDataPointStorageI **self, unsigned char* flatBytes, size_t flatBytesLength);
+void new_TightDataPointStorageI(TightDataPointStorageI **self,
+		size_t dataSeriesLength, size_t exactDataNum, int byteSize, 
+		int* type, unsigned char* exactDataBytes, size_t exactDataBytes_size,
+		double realPrecision, long minValue, int intervals, int dataType);
+
+void convertTDPStoBytes_int(TightDataPointStorageI* tdps, unsigned char* bytes, unsigned char* dsLengthBytes, unsigned char sameByte);
+void convertTDPStoFlatBytes_int(TightDataPointStorageI *tdps, unsigned char** bytes, size_t *size);
+void convertTDPStoFlatBytes_int_args(TightDataPointStorageI *tdps, unsigned char* bytes, size_t *size);
+void free_TightDataPointStorageI(TightDataPointStorageI *tdps);
+void free_TightDataPointStorageI2(TightDataPointStorageI *tdps);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _TightDataPointStorageI_H  ----- */
diff --git a/thirdparty/SZ/sz/include/TypeManager.h b/thirdparty/SZ/sz/include/TypeManager.h
new file mode 100644
index 0000000000000000000000000000000000000000..6be71f8c862c78aa155b7ad384f5c5f5203f9ef1
--- /dev/null
+++ b/thirdparty/SZ/sz/include/TypeManager.h
@@ -0,0 +1,38 @@
+/**
+ *  @file TypeManager.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the TypeManager.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _TypeManager_H
+#define _TypeManager_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h>
+#include <stdint.h>
+
+//TypeManager.c
+size_t convertIntArray2ByteArray_fast_1b(unsigned char* intArray, size_t intArrayLength, unsigned char **result);
+void convertByteArray2IntArray_fast_1b(size_t intArrayLength, unsigned char* byteArray, size_t byteArrayLength, unsigned char **intArray);
+size_t convertIntArray2ByteArray_fast_2b(unsigned char* timeStepType, size_t timeStepTypeLength, unsigned char **result);
+void convertByteArray2IntArray_fast_2b(size_t stepLength, unsigned char* byteArray, size_t byteArrayLength, unsigned char **intArray);
+size_t convertIntArray2ByteArray_fast_3b(unsigned char* timeStepType, size_t timeStepTypeLength, unsigned char **result);
+void convertByteArray2IntArray_fast_3b(size_t stepLength, unsigned char* byteArray, size_t byteArrayLength, unsigned char **intArray);
+int getLeftMovingSteps(size_t k, unsigned char resiBitLength);
+size_t convertIntArray2ByteArray_fast_dynamic(unsigned char* timeStepType, unsigned char resiBitLength, size_t nbEle, unsigned char **bytes);
+size_t convertIntArray2ByteArray_fast_dynamic2(unsigned char* timeStepType, unsigned char* resiBitLength, size_t resiBitLengthLength, unsigned char **bytes);
+int computeBitNumRequired(size_t dataLength);
+void decompressBitArraybySimpleLZ77(int** result, unsigned char* bytes, size_t bytesLength, size_t totalLength, int validLength);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _TypeManager_H  ----- */
+
diff --git a/thirdparty/SZ/sz/include/VarSet.h b/thirdparty/SZ/sz/include/VarSet.h
new file mode 100644
index 0000000000000000000000000000000000000000..c88ec431cb85dd3d250ec1f141cc11c1b885897a
--- /dev/null
+++ b/thirdparty/SZ/sz/include/VarSet.h
@@ -0,0 +1,77 @@
+/**
+ *  @file VarSet.h
+ *  @author Sheng Di
+ *  @date July, 2016
+ *  @brief Header file for SZ variables and variable sets (SZ_Variable, SZ_VarSet).
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _VarSet_H
+#define _VarSet_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h>
+
+typedef struct sz_multisteps // multi-time-step state; SZ_Variable holds a pointer to one and free_multisteps() is declared below
+{
+	char compressionType;
+	int predictionMode;
+	int lastSnapshotStep; //the previous snapshot step
+	unsigned int currentStep; //current time step of the execution/simulation
+	void* hist_data; //historical data in past time steps (untyped; element type is not visible from this header)
+} sz_multisteps;
+
+typedef struct SZ_Variable // one variable record; instances are chained through `next` and referenced by SZ_VarSet
+{
+	char* varName; // name used by SZ_searchVar()/SZ_batchDelVar() lookups declared below
+	char compressType; //102 means HZ; 101 means SZ 
+	int dataType; //SZ_FLOAT or SZ_DOUBLE
+	size_t r5; // r5..r1: presumably the sizes of up to five dimensions -- confirm against SZ_batchAddVar()
+	size_t r4;
+	size_t r3;
+	size_t r2;
+	size_t r1;
+	int errBoundMode;
+	double absErrBound;
+	double relBoundRatio;
+	double pwRelBoundRatio;
+	void* data; // untyped; presumably interpreted according to dataType -- confirm
+	sz_multisteps *multisteps;
+	unsigned char* compressedBytes;
+	size_t compressedSize; // presumably the length of compressedBytes in bytes -- confirm
+	struct SZ_Variable* next; // link to another SZ_Variable (presumably the next list entry -- confirm)
+} SZ_Variable;
+
+typedef struct SZ_VarSet // set of SZ_Variable records; see free_VarSet_vset()/SZ_batchDelVar_vset() declared below
+{
+	unsigned short count; // presumably the number of variables in the set -- confirm; the type caps it at USHRT_MAX
+	struct SZ_Variable *header; // presumably the first list entry (or a sentinel) -- confirm in VarSet.c
+	struct SZ_Variable *lastVar; // presumably the most recently added entry -- confirm in VarSet.c
+} SZ_VarSet;
+
+void free_Variable_keepOriginalData(SZ_Variable* v);
+void free_Variable_keepCompressedBytes(SZ_Variable* v);
+void free_Variable_all(SZ_Variable* v);
+void SZ_batchAddVar(char* varName, int dataType, void* data, 
+			int errBoundMode, double absErrBound, double relBoundRatio, double pwRelBoundRatio,
+			size_t r5, size_t r4, size_t r3, size_t r2, size_t r1);
+int SZ_batchDelVar_vset(SZ_VarSet* vset, char* varName);
+int SZ_batchDelVar(char* varName);
+
+SZ_Variable* SZ_searchVar(char* varName);
+void* SZ_getVarData(char* varName, size_t *r5, size_t *r4, size_t *r3, size_t *r2, size_t *r1);
+
+void free_VarSet_vset(SZ_VarSet *vset, int mode);
+void SZ_freeVarSet(int mode);
+
+void free_multisteps(sz_multisteps* multisteps);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _VarSet_H  ----- */
diff --git a/thirdparty/SZ/sz/include/callZlib.h b/thirdparty/SZ/sz/include/callZlib.h
new file mode 100644
index 0000000000000000000000000000000000000000..0622d9809cdcad17eaeacc02942c2810b843a77c
--- /dev/null
+++ b/thirdparty/SZ/sz/include/callZlib.h
@@ -0,0 +1,42 @@
+/**
+ *  @file callZlib.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the callZlib.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _CallZlib_H
+#define _CallZlib_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//Alternative value left disabled: 1048576
+#define SZ_ZLIB_BUFFER_SIZE 65536 //same number as in the name of zlib_uncompress65536bytes declared below
+
+#include <stdio.h>
+
+//callZlib.c - NOTE(review): how the numbered variants differ is not documented in this header; confirm in callZlib.c
+unsigned long zlib_compress(unsigned char* data, unsigned long dataLength, unsigned char** compressBytes, int level); //NOTE(review): presumably returns the compressed size and stores a buffer in *compressBytes - confirm who frees it
+unsigned long zlib_compress2(unsigned char* data, unsigned long dataLength, unsigned char** compressBytes, int level);
+unsigned long zlib_compress3(unsigned char* data, unsigned long dataLength, unsigned char* compressBytes, int level); //only compress variant taking the output as unsigned char* rather than unsigned char**
+unsigned long zlib_compress4(unsigned char* data, unsigned long dataLength, unsigned char** compressBytes, int level);
+unsigned long zlib_compress5(unsigned char* data, unsigned long dataLength, unsigned char** compressBytes, int level);
+
+unsigned long zlib_uncompress4(unsigned char* compressBytes, unsigned long cmpSize, unsigned char** oriData, unsigned long targetOriSize); //NOTE(review): presumably returns the decompressed size and stores a buffer in *oriData - confirm
+unsigned long zlib_uncompress5(unsigned char* compressBytes, unsigned long cmpSize, unsigned char** oriData, unsigned long targetOriSize);
+unsigned long zlib_uncompress(unsigned char* compressBytes, unsigned long cmpSize, unsigned char** oriData, unsigned long targetOriSize);
+unsigned long zlib_uncompress2(unsigned char* compressBytes, unsigned long cmpSize, unsigned char** oriData, unsigned long targetOriSize);
+unsigned long zlib_uncompress3(unsigned char* compressBytes, unsigned long cmpSize, unsigned char** oriData, unsigned long targetOriSize);
+
+unsigned long zlib_uncompress65536bytes(unsigned char* compressBytes, unsigned long cmpSize, unsigned char** oriData); //has no targetOriSize parameter, unlike the other uncompress variants
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _CallZlib_H  ----- */
+
diff --git a/thirdparty/SZ/sz/include/conf.h b/thirdparty/SZ/sz/include/conf.h
new file mode 100644
index 0000000000000000000000000000000000000000..401ce260bfeefbe864bf3e7996849a8e252df50e
--- /dev/null
+++ b/thirdparty/SZ/sz/include/conf.h
@@ -0,0 +1,33 @@
+/**
+ *  @file conf.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the conf.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _Conf_H
+#define _Conf_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h>
+
+//conf.c - NOTE(review): the return-value conventions of the int functions are not documented in this header; confirm in conf.c
+void updateQuantizationInfo(int quant_intervals);
+int SZ_ReadConf(const char* sz_cfgFile);
+int SZ_LoadConf(const char* sz_cfgFile);
+int checkVersion(char* version);
+void initSZ_TSC(void); //(void) makes this a real prototype; an empty list leaves the parameters unspecified in C
+unsigned int roundUpToPowerOf2(unsigned int base);
+double computeABSErrBoundFromPSNR(double psnr, double threshold, double value_range);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _Conf_H  ----- */
+
diff --git a/thirdparty/SZ/sz/include/dataCompression.h b/thirdparty/SZ/sz/include/dataCompression.h
new file mode 100644
index 0000000000000000000000000000000000000000..1eb0f30f06b97412d5e2a9f95887709f75fba668
--- /dev/null
+++ b/thirdparty/SZ/sz/include/dataCompression.h
@@ -0,0 +1,85 @@
+/**
+ *  @file dataCompression.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the dataCompression.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _DataCompression_H
+#define _DataCompression_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "sz.h"
+#include <stdio.h>
+
+/* computeMinMax(data): scans data[1..size-1], lowering min / raising max. Uses i, size, data_, min and max from the caller's scope; the loop starts at index 1, so the caller must initialise min and max beforehand (presumably from data[0]). */
+#define computeMinMax(data) \
+        for(i=1;i<size;i++)\
+        {\
+                data_ = (data)[i];\
+                if(min>data_)\
+                        min = data_;\
+                else if(max<data_)\
+                        max = data_;\
+        }
+
+//dataCompression.c - prototypes only; the fixed-width integer types and the *CompressElement / Dynamic*Array types are presumably provided through "sz.h" (included above)
+int computeByteSizePerIntValue(long valueRangeSize);
+long computeRangeSize_int(void* oriData, int dataType, size_t size, int64_t* valueRangeSize); //returns long while the out-parameter is int64_t* - NOTE(review): confirm both carry the same value on platforms where long is 32 bits
+double computeRangeSize_double(double* oriData, size_t size, double* valueRangeSize, double* medianValue);
+float computeRangeSize_float(float* oriData, size_t size, float* valueRangeSize, float* medianValue);
+float computeRangeSize_double_subblock(double* oriData, double* valueRangeSize, double* medianValue,
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1,
+size_t s5, size_t s4, size_t s3, size_t s2, size_t s1,
+size_t e5, size_t e4, size_t e3, size_t e2, size_t e1); //NOTE(review): returns float although it operates on double data, unlike computeRangeSize_double above - confirm this is intended
+float computeRangeSize_float_subblock(float* oriData, float* valueRangeSize, float* medianValue,
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1,
+size_t s5, size_t s4, size_t s3, size_t s2, size_t s1,
+size_t e5, size_t e4, size_t e3, size_t e2, size_t e1); //NOTE(review): s*/e* are presumably per-dimension start/end indices of the sub-block - confirm inclusivity
+double min_d(double a, double b);
+double max_d(double a, double b);
+float min_f(float a, float b);
+float max_f(float a, float b);
+double getRealPrecision_double(double valueRangeSize, int errBoundMode, double absErrBound, double relBoundRatio, int *status); //NOTE(review): *status presumably receives a status code - confirm its values
+double getRealPrecision_float(float valueRangeSize, int errBoundMode, double absErrBound, double relBoundRatio, int *status);
+double getRealPrecision_int(long valueRangeSize, int errBoundMode, double absErrBound, double relBoundRatio, int *status);
+void symTransform_8bytes(unsigned char data[8]); //array parameters are adjusted to pointers; the [N] sizes are not enforced by the compiler
+void symTransform_2bytes(unsigned char data[2]);
+void symTransform_4bytes(unsigned char data[4]);
+
+void compressInt8Value(int8_t tgtValue, int8_t minValue, int byteSize, unsigned char* bytes); //NOTE(review): 'bytes' presumably must hold at least byteSize bytes - confirm
+void compressInt16Value(int16_t tgtValue, int16_t minValue, int byteSize, unsigned char* bytes);
+void compressInt32Value(int32_t tgtValue, int32_t minValue, int byteSize, unsigned char* bytes);
+void compressInt64Value(int64_t tgtValue, int64_t minValue, int byteSize, unsigned char* bytes);
+
+void compressUInt8Value(uint8_t tgtValue, uint8_t minValue, int byteSize, unsigned char* bytes);
+void compressUInt16Value(uint16_t tgtValue, uint16_t minValue, int byteSize, unsigned char* bytes);
+void compressUInt32Value(uint32_t tgtValue, uint32_t minValue, int byteSize, unsigned char* bytes);
+void compressUInt64Value(uint64_t tgtValue, uint64_t minValue, int byteSize, unsigned char* bytes);
+
+void compressSingleFloatValue(FloatValueCompressElement *vce, float tgtValue, float precision, float medianValue, 
+		int reqLength, int reqBytesLength, int resiBitsLength); //NOTE(review): vce is presumably the output element - confirm
+void compressSingleDoubleValue(DoubleValueCompressElement *vce, double tgtValue, double precision, double medianValue, 
+		int reqLength, int reqBytesLength, int resiBitsLength);
+int compIdenticalLeadingBytesCount_double(unsigned char* preBytes, unsigned char* curBytes);
+int compIdenticalLeadingBytesCount_float(unsigned char* preBytes, unsigned char* curBytes);
+void addExactData(DynamicByteArray *exactMidByteArray, DynamicIntArray *exactLeadNumArray, 
+		DynamicIntArray *resiBitArray, LossyCompressionElement *lce);
+
+int getPredictionCoefficients(int layers, int dimension, int **coeff_array, int *status); //NOTE(review): *coeff_array is presumably allocated by the callee - confirm who frees it
+
+int computeBlockEdgeSize_3D(int segmentSize);
+int computeBlockEdgeSize_2D(int segmentSize);
+int initRandomAccessBytes(unsigned char* raBytes);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _DataCompression_H  ----- */
+
diff --git a/thirdparty/SZ/sz/include/dictionary.h b/thirdparty/SZ/sz/include/dictionary.h
new file mode 100644
index 0000000000000000000000000000000000000000..0cf326ade2751b2e759bdaac7d8ac72b5eefcabf
--- /dev/null
+++ b/thirdparty/SZ/sz/include/dictionary.h
@@ -0,0 +1,172 @@
+
+/*-------------------------------------------------------------------------*/
+/**
+   @file    dictionary.h
+   @author  N. Devillard
+   @brief   Implements a dictionary for string variables.
+
+   This module implements a simple dictionary object, i.e. a list
+   of string/string associations. This object is useful to store e.g.
+   information retrieved from a configuration file (ini files).
+*/
+/*--------------------------------------------------------------------------*/
+
+#ifndef _DICTIONARY_H_
+#define _DICTIONARY_H_
+
+/*---------------------------------------------------------------------------
+                                Includes
+ ---------------------------------------------------------------------------*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+/*---------------------------------------------------------------------------
+                                New types
+ ---------------------------------------------------------------------------*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Dictionary object
+
+  This object contains a list of string/string associations. Each
+  association is identified by a unique string key. Looking up values
+  in the dictionary is sped up by the use of a (hopefully collision-free)
+  hash function.
+ */
+/*-------------------------------------------------------------------------*/
+typedef struct _dictionary_ {
+    int             n ;     /**< Number of entries in dictionary */
+    int             size ;  /**< Storage size */
+    char        **  val ;   /**< List of string values */
+    char        **  key ;   /**< List of string keys */
+    unsigned     *  hash ;  /**< List of hash values for keys */
+} dictionary ;
+
+
+/*---------------------------------------------------------------------------
+                            Function prototypes
+ ---------------------------------------------------------------------------*/
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Compute the hash key for a string.
+  @param    key     Character string to use for key.
+  @return   An unsigned int of at least 32 bits.
+
+  This hash function has been taken from an Article in Dr Dobbs Journal.
+  This is normally a collision-free function, distributing keys evenly.
+  The key is stored anyway in the struct so that collision can be avoided
+  by comparing the key itself in last resort.
+ */
+/*--------------------------------------------------------------------------*/
+unsigned dictionary_hash(const char * key);
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Create a new dictionary object.
+  @param    size    Optional initial size of the dictionary.
+  @return   A newly allocated dictionary object.
+
+  This function allocates a new dictionary object of given size and returns
+  it. If you do not know in advance (roughly) the number of entries in the
+  dictionary, give size=0.
+ */
+/*--------------------------------------------------------------------------*/
+dictionary * dictionary_new(int size);
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Delete a dictionary object
+  @param    vd  dictionary object to deallocate.
+  @return   void
+
+  Deallocate a dictionary object and all memory associated to it.
+ */
+/*--------------------------------------------------------------------------*/
+void dictionary_del(dictionary * vd);
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Get a value from a dictionary.
+  @param    d       dictionary object to search.
+  @param    key     Key to look for in the dictionary.
+  @param    def     Default value to return if key not found.
+  @return   A pointer to an internally allocated character string.
+
+  This function locates a key in a dictionary and returns a pointer to its
+  value, or the passed 'def' pointer if no such key can be found in
+  dictionary. The returned character pointer points to data internal to the
+  dictionary object, you should not try to free it or modify it.
+ */
+/*--------------------------------------------------------------------------*/
+char * dictionary_get(dictionary * d, const char * key, char * def);
+
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Set a value in a dictionary.
+  @param    vd      dictionary object to modify.
+  @param    key     Key to modify or add.
+  @param    val     Value to add.
+  @return   int     0 if Ok, anything else otherwise
+
+  If the given key is found in the dictionary, the associated value is
+  replaced by the provided one. If the key cannot be found in the
+  dictionary, it is added to it.
+
+  It is Ok to provide a NULL value for val, but NULL values for the dictionary
+  or the key are considered as errors: the function will return immediately
+  in such a case.
+
+  Notice that if you dictionary_set a variable to NULL, a call to
+  dictionary_get will return a NULL value: the variable will be found, and
+  its value (NULL) is returned. In other words, setting the variable
+  content to NULL is equivalent to deleting the variable from the
+  dictionary. It is not possible (in this implementation) to have a key in
+  the dictionary without value.
+
+  This function returns non-zero in case of failure.
+ */
+/*--------------------------------------------------------------------------*/
+int dictionary_set(dictionary * vd, const char * key, const char * val);
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Delete a key in a dictionary
+  @param    d       dictionary object to modify.
+  @param    key     Key to remove.
+  @return   void
+
+  This function deletes a key in a dictionary. Nothing is done if the
+  key cannot be found.
+ */
+/*--------------------------------------------------------------------------*/
+void dictionary_unset(dictionary * d, const char * key);
+
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Dump a dictionary to an opened file pointer.
+  @param    d   Dictionary to dump
+  @param    out Opened file pointer.
+  @return   void
+
+  Dumps a dictionary onto an opened file pointer. Key pairs are printed out
+  as @c [Key]=[Value], one per line. It is Ok to provide stdout or stderr as
+  output file pointers.
+ */
+/*--------------------------------------------------------------------------*/
+void dictionary_dump(dictionary * d, FILE * out);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/thirdparty/SZ/sz/include/iniparser.h b/thirdparty/SZ/sz/include/iniparser.h
new file mode 100644
index 0000000000000000000000000000000000000000..4ddb907c2e4af917e82b6503db0c9a00032ad38f
--- /dev/null
+++ b/thirdparty/SZ/sz/include/iniparser.h
@@ -0,0 +1,321 @@
+
+/*-------------------------------------------------------------------------*/
+/**
+   @file    iniparser.h
+   @author  N. Devillard
+   @brief   Parser for ini files.
+*/
+/*--------------------------------------------------------------------------*/
+
+#ifndef _INIPARSER_H_
+#define _INIPARSER_H_
+
+/*---------------------------------------------------------------------------
+                                Includes
+ ---------------------------------------------------------------------------*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+/*
+ * The following #include is necessary on many Unixes but not Linux.
+ * It is not needed for Windows platforms.
+ * Uncomment it if needed.
+ */
+/* #include <unistd.h> */
+
+#include "dictionary.h"
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Get number of sections in a dictionary
+  @param    d   Dictionary to examine
+  @return   int Number of sections found in dictionary
+
+  This function returns the number of sections found in a dictionary.
+  The test to recognize sections is done on the string stored in the
+  dictionary: a section name is given as "section" whereas a key is
+  stored as "section:key", thus the test looks for entries that do not
+  contain a colon.
+
+  This clearly fails in the case a section name contains a colon, but
+  this should simply be avoided.
+
+  This function returns -1 in case of error.
+ */
+/*--------------------------------------------------------------------------*/
+
+int iniparser_getnsec(dictionary * d);
+
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Get name for section n in a dictionary.
+  @param    d   Dictionary to examine
+  @param    n   Section number (from 0 to nsec-1).
+  @return   Pointer to char string
+
+  This function locates the n-th section in a dictionary and returns
+  its name as a pointer to a string statically allocated inside the
+  dictionary. Do not free or modify the returned string!
+
+  This function returns NULL in case of error.
+ */
+/*--------------------------------------------------------------------------*/
+
+char * iniparser_getsecname(dictionary * d, int n);
+
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Save a dictionary to a loadable ini file
+  @param    d   Dictionary to dump
+  @param    f   Opened file pointer to dump to
+  @return   void
+
+  This function dumps a given dictionary into a loadable ini file.
+  It is Ok to specify @c stderr or @c stdout as output files.
+ */
+/*--------------------------------------------------------------------------*/
+
+void iniparser_dump_ini(dictionary * d, FILE * f);
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Save a dictionary section to a loadable ini file
+  @param    d   Dictionary to dump
+  @param    s   Section name of dictionary to dump
+  @param    f   Opened file pointer to dump to
+  @return   void
+
+  This function dumps a given section of a given dictionary into a loadable ini
+  file.  It is Ok to specify @c stderr or @c stdout as output files.
+ */
+/*--------------------------------------------------------------------------*/
+
+void iniparser_dumpsection_ini(dictionary * d, char * s, FILE * f);
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Dump a dictionary to an opened file pointer.
+  @param    d   Dictionary to dump.
+  @param    f   Opened file pointer to dump to.
+  @return   void
+
+  This function prints out the contents of a dictionary, one element per
+  line, onto the provided file pointer. It is OK to specify @c stderr
+  or @c stdout as output files. This function is meant for debugging
+  purposes mostly.
+ */
+/*--------------------------------------------------------------------------*/
+void iniparser_dump(dictionary * d, FILE * f);
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Get the number of keys in a section of a dictionary.
+  @param    d   Dictionary to examine
+  @param    s   Section name of dictionary to examine
+  @return   Number of keys in section
+ */
+/*--------------------------------------------------------------------------*/
+int iniparser_getsecnkeys(dictionary * d, char * s);
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Get the keys in a section of a dictionary.
+  @param    d   Dictionary to examine
+  @param    s   Section name of dictionary to examine
+  @return   pointer to statically allocated character strings
+
+  This function queries a dictionary and finds all keys in a given section.
+  Each pointer in the returned char pointer-to-pointer is pointing to
+  a string allocated in the dictionary; do not free or modify them.
+
+  This function returns NULL in case of error.
+ */
+/*--------------------------------------------------------------------------*/
+char ** iniparser_getseckeys(dictionary * d, char * s);
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Get the string associated to a key
+  @param    d       Dictionary to search
+  @param    key     Key string to look for
+  @param    def     Default value to return if key not found.
+  @return   pointer to statically allocated character string
+
+  This function queries a dictionary for a key. A key as read from an
+  ini file is given as "section:key". If the key cannot be found,
+  the pointer passed as 'def' is returned.
+  The returned char pointer is pointing to a string allocated in
+  the dictionary, do not free or modify it.
+ */
+/*--------------------------------------------------------------------------*/
+char * iniparser_getstring(dictionary * d, const char * key, char * def);
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Get the string associated to a key, convert to an int
+  @param    d Dictionary to search
+  @param    key Key string to look for
+  @param    notfound Value to return in case of error
+  @return   integer
+
+  This function queries a dictionary for a key. A key as read from an
+  ini file is given as "section:key". If the key cannot be found,
+  the notfound value is returned.
+
+  Supported values for integers include the usual C notation
+  so decimal, octal (starting with 0) and hexadecimal (starting with 0x)
+  are supported. Examples:
+
+  - "42"      ->  42
+  - "042"     ->  34 (octal -> decimal)
+  - "0x42"    ->  66 (hexa  -> decimal)
+
+  Warning: the conversion may overflow in various ways. Conversion is
+  totally outsourced to strtol(), see the associated man page for overflow
+  handling.
+
+  Credits: Thanks to A. Becker for suggesting strtol()
+ */
+/*--------------------------------------------------------------------------*/
+int iniparser_getint(dictionary * d, const char * key, int notfound);
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Get the string associated to a key, convert to a long
+  @param    d Dictionary to search
+  @param    key Key string to look for
+  @param    notfound Value to return in case of error
+  @return   long
+
+  Credits: This function is based entirely on iniparser_getint and was
+  slightly modified to return long instead of int.
+ */
+/*--------------------------------------------------------------------------*/
+long iniparser_getlint(dictionary * d, const char * key, int notfound);
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Get the string associated to a key, convert to a double
+  @param    d Dictionary to search
+  @param    key Key string to look for
+  @param    notfound Value to return in case of error
+  @return   double
+
+  This function queries a dictionary for a key. A key as read from an
+  ini file is given as "section:key". If the key cannot be found,
+  the notfound value is returned.
+ */
+/*--------------------------------------------------------------------------*/
+double iniparser_getdouble(dictionary * d, const char * key, double notfound);
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Get the string associated to a key, convert to a boolean
+  @param    d Dictionary to search
+  @param    key Key string to look for
+  @param    notfound Value to return in case of error
+  @return   integer
+
+  This function queries a dictionary for a key. A key as read from an
+  ini file is given as "section:key". If the key cannot be found,
+  the notfound value is returned.
+
+  A true boolean is found if one of the following is matched:
+
+  - A string starting with 'y'
+  - A string starting with 'Y'
+  - A string starting with 't'
+  - A string starting with 'T'
+  - A string starting with '1'
+
+  A false boolean is found if one of the following is matched:
+
+  - A string starting with 'n'
+  - A string starting with 'N'
+  - A string starting with 'f'
+  - A string starting with 'F'
+  - A string starting with '0'
+
+  The notfound value returned if no boolean is identified, does not
+  necessarily have to be 0 or 1.
+ */
+/*--------------------------------------------------------------------------*/
+int iniparser_getboolean(dictionary * d, const char * key, int notfound);
+
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Set an entry in a dictionary.
+  @param    ini     Dictionary to modify.
+  @param    entry   Entry to modify (entry name)
+  @param    val     New value to associate to the entry.
+  @return   int 0 if Ok, -1 otherwise.
+
+  If the given entry can be found in the dictionary, it is modified to
+  contain the provided value. If it cannot be found, -1 is returned.
+  It is Ok to set val to NULL.
+ */
+/*--------------------------------------------------------------------------*/
+int iniparser_set(dictionary * ini, const char * entry, const char * val);
+
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Delete an entry in a dictionary
+  @param    ini     Dictionary to modify
+  @param    entry   Entry to delete (entry name)
+  @return   void
+
+  If the given entry can be found, it is deleted from the dictionary.
+ */
+/*--------------------------------------------------------------------------*/
+void iniparser_unset(dictionary * ini, const char * entry);
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Finds out if a given entry exists in a dictionary
+  @param    ini     Dictionary to search
+  @param    entry   Name of the entry to look for
+  @return   integer 1 if entry exists, 0 otherwise
+
+  Finds out if a given entry exists in the dictionary. Since sections
+  are stored as keys with NULL associated values, this is the only way
+  of querying for the presence of sections in a dictionary.
+ */
+/*--------------------------------------------------------------------------*/
+int iniparser_find_entry(dictionary * ini, const char * entry) ;
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Parse an ini file and return an allocated dictionary object
+  @param    ininame Name of the ini file to read.
+  @return   Pointer to newly allocated dictionary
+
+  This is the parser for ini files. This function is called, providing
+  the name of the file to be read. It returns a dictionary object that
+  should not be accessed directly, but through accessor functions
+  instead.
+
+  The returned dictionary must be freed using iniparser_freedict().
+ */
+/*--------------------------------------------------------------------------*/
+dictionary * iniparser_load(const char * ininame);
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Free all memory associated to an ini dictionary
+  @param    d Dictionary to free
+  @return   void
+
+  Free all memory associated to an ini dictionary.
+  It is mandatory to call this function before the dictionary object
+  gets out of the current context.
+ */
+/*--------------------------------------------------------------------------*/
+void iniparser_freedict(dictionary * d);
+
+#endif
diff --git a/thirdparty/SZ/sz/include/pastri.h b/thirdparty/SZ/sz/include/pastri.h
new file mode 100644
index 0000000000000000000000000000000000000000..b5f2e90ddef1084ff262f88de1556967235913b2
--- /dev/null
+++ b/thirdparty/SZ/sz/include/pastri.h
@@ -0,0 +1,140 @@
+//CHECK:
+//What happens when ECQBits==1, or ECQBits==0 or ECQBits<0?
+//Rounding? Scale originalEb by 0.99?
+
+//Possible improvement: Change GAMESS format: {i i i i d} -> {i}{i}{i}{i}{d}
+//Possible improvement: Optimize bookkeeping bits
+//Possible improvement: Guess the type (C/UC, Sparse/Not)
+//Possible improvement: Get rid of writing/reading some of the indexes to in/out buffers
+//Possible improvement: Get rid of all debug stuff, including Makefile debug flags
+//Possible improvement: Get rid of "compressedBytes"
+//Possible improvement: SparseCompressed, ECQBits=2: 1's and -1's can be represented by just 0 and 1, instead 10 and 11. 
+//Possible improvement: SparseCompressed, ECQBits>2: Again: 1: 10, -1:11, Others: 0XX...XX 
+//Possible improvement: WriteBitsFast: maybe remove some masks?
+//Possible improvement: WriteBitsFast: Get rid of multiple calls!
+//Possible improvement: UCSparse: Indexes use 64 bits. It can be lowered to _1DIdxBits
+//Possible improvement: Parameters: Smaller data sizes may be possible!
+
+
+
+#ifndef PASTRI_H
+#define PASTRI_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <math.h>
+#include <assert.h> //Just for debugging purposes!
+
+//#define DATASIZE 8 //Bytes per input data point.
+//We have only 1 double per data point, so it is 8 bytes.
+
+#define MAX_PS_SIZE 100 //NOTE(review): presumably an upper bound related to the pattern/scale (P & S) data mentioned below - confirm
+#define MAX_BLOCK_SIZE 10000 //NOTE(review): presumably the largest supported pastri_params.bSize - confirm
+#define MAX_BUFSIZE 160000  //Should be a multiple of 8
+#define D_W 0 //Debug switch: Write (input block); tested with a plain if(D_W){...} in pastriD.h
+#define D_R 0 //Debug switch: Read (compressed block)
+#define D_G 0 //Debug switch: General
+#define D_G2 0 //Debug switch: General 2 (a little more detail)
+#define D_C 0 //Debug switch: C
+//DEBUG switch is left undefined (would be: #define DEBUG 1)
+
+//#define BOOKKEEPINGBITS 0 //Currently unused
+//#define BOOKKEEPINGBITS 120 //Includes: mode, indexOffsets, compressedBytes, Pb_, ECQBits_ (8+64+32+8+8) 
+//BOOKKEEPINGBITS is defined here, because if P & S is going to be used, they appear just after the bookkeeping part.
+//This allows us to write P and S directly onto using outBuf.
+  
+
+// IMPORTANT NOTE:
+//Read/Write up to 56 bits.
+//More than that is not supported!
+
+
+/********************************************************************/
+//Datatype Declarations:
+/********************************************************************/
+typedef struct pastri_params{ //Batch-wide PaSTRI parameters, passed by pointer to the SZ_pastri* functions below
+  double originalEb; //Error Bound entered by the user
+  double usedEb; //Error Bound used during compression/decompression (pastriD.h uses it as the non-zero threshold and sets binSize=2*usedEb)
+  
+  int numBlocks; //Number of blocks to be compressed
+  int dataSize; //8(=Double) or 4(=Float)
+  
+  int bf[4]; //Orbital types (basis function types). Typically in range [0,3]
+  int idxRange[4];  //Ranges of indexes. idxRange[i]=(bf[i]+1)*(bf[i]+2)/2;
+  
+  int sbSize; //=idxRange[2]*idxRange[3]; (number of elements per sub-block)
+  int sbNum;  //=idxRange[0]*idxRange[1]; (number of sub-blocks per block)
+  int bSize; //=sbSize*sbNum; (number of elements per block)
+  
+  //uint16_t idxOffset[4]; //Index offset values. No longer used.
+  
+}pastri_params;
+
+//Block-specific stuff:
+typedef struct pastri_blockParams{ //Per-block values; several are filled in by pastri_double_PatternMatch (pastriD.h)
+  uint16_t nonZeros; //Number of elements in the block whose magnitude exceeds usedEb
+  //int ECQ0s; //= p->bSize - numOutliers //OR: p->bSize=ECQ0s+ECQ1s+ECQOthers
+  int ECQ1s;
+  int ECQOthers;
+  int numOutliers; //=ECQ1s+ECQOthers
+  int patternBits; //Set from bitsNeeded_double(|pattern extremum|/binSize+1)+1
+  int scaleBits;
+  double binSize; //Quantization bin width; set to 2*usedEb
+  double scalesBinSize;
+  uint64_t ECQExt;
+  int ECQBits; //NOTE(review): see the open questions about ECQBits==1, 0 or <0 at the top of this file
+  int _1DIdxBits;
+}pastri_blockParams;
+
+typedef union u_UI64I64D{ //Views the same 8 bytes as unsigned integer, signed integer or double (used in pastriD.h to copy a double's sign bit)
+  uint64_t ui64; //Raw bit pattern, unsigned
+  int64_t i64; //Raw bit pattern, signed
+  double d; //Floating-point view
+} u_UI64I64D;
+
+/********************************************************************/
+//Function Prototypes:
+/********************************************************************/
+void SZ_pastriReadParameters(char paramsFilename[512],pastri_params *paramsPtr);
+//Read the basic PaSTRI parameters from a file, specified by paramsFilename. (The [512] is not enforced: the parameter is adjusted to char*.)
+
+void SZ_pastriPreprocessParameters(pastri_params *p);
+//Using basic PaSTRI parameters, generate the others.
+//For example, block and sub-block sizes are generated by using basis function types.
+
+void SZ_pastriCompressBatch(pastri_params *p,unsigned char *originalBuf, unsigned char** compressedBufP,size_t *compressedBytes);
+//INPUTS: p, originalBuf
+//OUTPUTS: compressedBufP, compressedBytes
+//Using the inputs, compressedBufP is allocated and populated by the compressed data. Compressed size is written into compressedBytes.
+//Parameters are also stored at the beginning part of the compressedBuf. NOTE(review): who frees *compressedBufP is not stated here - confirm.
+
+void SZ_pastriDecompressBatch(unsigned char*compressedBuf, pastri_params *p, unsigned char** decompressedBufP ,size_t *decompressedBytes);
+//INPUTS: compressedBuf
+//OUTPUTS: p, decompressedBufP, decompressedBytes
+//First, parameters are read from compressedBuf and written into p.
+//Then, decompressedBufP is allocated and populated by the decompressed data. Decompressed size is written into decompressedBytes.
+
+void SZ_pastriCheckBatch(pastri_params *p,unsigned char*originalBuf,unsigned char*decompressedBuf); 
+//INPUTS: p, originalBuf, decompressedBuf
+//OUTPUTS: None (Just some on-screen messages)
+//Compares originalBuf with decompressedBuf. Checks whether the absolute error condition is satisfied or not.
+
+/********************************************************************/
+//Other Includes:
+/********************************************************************/
+
+
+
+#include "pastriGeneral.h"  //General tools
+#include "pastriD.h"  //Compression/Decompression for Double data
+#include "pastriF.h"  //Compression/Decompression for Float data
+
+
+#endif
+
+
+
+
+
diff --git a/thirdparty/SZ/sz/include/pastriD.h b/thirdparty/SZ/sz/include/pastriD.h
new file mode 100644
index 0000000000000000000000000000000000000000..1a881e685f66e0d92ec45715a232ee2d1350d091
--- /dev/null
+++ b/thirdparty/SZ/sz/include/pastriD.h
@@ -0,0 +1,911 @@
+#ifndef PASTRID_H
+#define PASTRID_H
+
+static inline int64_t pastri_double_quantize(double x, double binSize){
+  //Quantizes x with bins of width binSize: returns x/binSize rounded to the
+  //nearest integer. A bias of 0.5 carrying the sign of the quotient is added
+  //and the cast to int64_t then truncates toward zero.
+  const uint64_t topBit=(uint64_t)0x8000000000000000;
+  u_UI64I64D quotient,bias;
+  
+  quotient.d=x/binSize;
+  bias.d=0.5;
+  
+  //Copy the top bit of the quotient's bit pattern (the sign bit of an
+  //IEEE-754 double) onto the bias, making it +0.5 or -0.5.
+  bias.ui64 |= (quotient.ui64 & topBit);
+  
+  return (int64_t)(quotient.d + bias.d);
+}
+
+static inline void pastri_double_PatternMatch(double*data,pastri_params* p,pastri_blockParams* bp,int64_t* patternQ,int64_t *scalesQ, int64_t* ECQ){
+  //Pattern-scaling analysis of one block: fills patternQ (sbSize values), scalesQ (sbNum values), ECQ (bSize values) and the block statistics in bp.
+  //First, find the extremum point (the element with the largest absolute value):
+  double absExt=0; //Absolute value of Extremum
+  int extIdx=0; //Index of Extremum. Starts at 0 (not -1): when no element is larger in magnitude than 0 the search below never updates it, and -1 would index data[-1].
+  bp->nonZeros=0;
+  int i,sb;
+  for(i=0;i<p->bSize;i++){
+    //nonZeros counts the elements whose magnitude exceeds the error bound usedEb.
+    if(abs_FastD(data[i])>p->usedEb){
+      bp->nonZeros++;
+      //(strict comparison: a magnitude equal to usedEb is not counted)
+    }
+    if(abs_FastD(data[i])>absExt){
+      absExt=abs_FastD(data[i]);
+      extIdx=i;
+    }
+  }
+  int patternIdx; //Starting Index of Pattern
+  patternIdx=(extIdx/p->sbSize)*p->sbSize;
+  
+  double patternExt=data[extIdx];
+  bp->binSize=2*p->usedEb;
+  
+  //patternIdx is the first index of the sub-block that contains the extremum; that sub-block is the pattern.
+  //patternExt keeps the sign of the extremum. Data values are quantized with bins of width 2*usedEb.
+  
+  //Quantize the pattern:
+  
+  //int64_t *patternQ=(int64_t*)(outBuf+15);  //Possible Improvement!
+
+  
+  for(i=0;i<p->sbSize;i++){
+    patternQ[i]=pastri_double_quantize(data[patternIdx+i],bp->binSize);
+    if(D_W){printf("patternQ[%d]=%ld\n",i,patternQ[i]);}
+  }
+  
+  bp->patternBits=bitsNeeded_double((abs_FastD(patternExt)/bp->binSize)+1)+1;
+  bp->scaleBits=bp->patternBits;
+  bp->scalesBinSize=1/(double)(((uint64_t)1<<(bp->scaleBits-1))-1);
+  //Pattern and scale values share one bit width; scalesBinSize is 1/(2^(scaleBits-1)-1).
+  //(Each scale is a block value divided by the block extremum.)
+  if(D_W){printf("scalesBinSize: %.6e\n",bp->scalesBinSize);} //DEBUG
+  
+  //Calculate Scales.
+  //The index part of the input buffer will be reused to hold Scale, Pattern, etc. values.
+  int localExtIdx=extIdx%p->sbSize; //Local extremum index. This is not the actual extremum of the current sb, but rather the index that corresponds to the global (block) extremum.
+  //int64_t *scalesQ=(int64_t*)(outBuf+15+p->sbSize*8);  //Possible Improvement!
+  int patternExtZero=(patternExt==0);
+  //When the extremum is exactly zero, every scale is set to 0 instead of dividing by it.
+  for(sb=0;sb<p->sbNum;sb++){
+    //scale of sub-block sb = (its value at the extremum's local position)/patternExt,
+    //quantized with bin size scalesBinSize.
+    //assert(scales[sb]<=1);
+    scalesQ[sb]=pastri_double_quantize((patternExtZero ? 0 : data[sb*p->sbSize+localExtIdx]/patternExt),bp->scalesBinSize);
+    if(D_W){printf("scalesQ[%d]=%ld\n",sb,scalesQ[sb]);}
+  }
+  //Error corrections: ECQ = quantize(prediction - data), with prediction = scalesQ*patternQ*PS_binSize.
+
+  //int64_t *ECQ=(int64_t*)(outBuf+p->bSize*8); //ECQ is written into outBuf, just be careful when handling it.
+
+  //ECQExt: largest |ECQ|. ECQ1s: number of +1/-1 values. ECQOthers: number of values with |ECQ|>=2.
+  bp->ECQExt=0;
+  int _1DIdx;
+  bp->ECQ1s=0;
+  bp->ECQOthers=0;
+  double PS_binSize=bp->scalesBinSize*bp->binSize;
+  for(sb=0;sb<p->sbNum;sb++){
+    for(i=0;i<p->sbSize;i++){
+      _1DIdx=sb*p->sbSize+i;
+      ECQ[_1DIdx]=pastri_double_quantize( (scalesQ[sb]*patternQ[i]*PS_binSize-data[_1DIdx]),bp->binSize );
+      double absECQ=abs_FastD(ECQ[_1DIdx]);
+      if(absECQ > bp->ECQExt)
+        bp->ECQExt=absECQ;
+      //Classify the correction; the counts feed the size estimates in pastri_double_Encode.
+      switch (ECQ[_1DIdx]){
+        case 0:
+          //ECQ0s++; //Currently not needed
+          break;
+        case 1:
+          bp->ECQ1s++;
+          break;
+        case -1:
+          bp->ECQ1s++;
+          break;
+        default:
+          bp->ECQOthers++;
+          break;
+      }
+    }
+  }
+  
+  /*
+  //DEBUG: Self-check. Remove this later.
+  for(sb=0;sb<p->sbNum;sb++){
+    for(i=0;i<p->sbSize;i++){
+      _1DIdx=sb*p->sbSize+i;
+      double decompressed=scalesQ[sb]*patternQ[i]*scalesBinSize*binSize-ECQ[_1DIdx]*binSize;
+      if(abs_FastD(decompressed-data[_1DIdx])>(p->usedEb)){
+        printf("p->usedEb=%.6e\n",p->usedEb);
+        printf("data[%d]=%.6e decompressed[%d]=%.6e diff=%.6e\n",_1DIdx,data[_1DIdx],_1DIdx,decompressed,abs_FastD(data[_1DIdx]-decompressed));
+        assert(0);
+      }
+    }
+  }
+  */
+}
+
+static inline void pastri_double_Encode(double *data,int64_t* patternQ,int64_t* scalesQ,int64_t* ECQ,pastri_params *p,pastri_blockParams* bp,unsigned char* outBuf,int *numOutBytes){
+  bp->ECQBits=bitsNeeded_UI64(bp->ECQExt)+1;
+  bp->_1DIdxBits=bitsNeeded_UI64(p->bSize);
+  //Estimates the size of four candidate layouts, writes the block into outBuf in the layout that is strictly smaller than the other three (CNonSparse when there is no strict winner) and stores that layout's estimated byte count in *numOutBytes.
+  
+  int i;
+  
+  //Encode: 4 options:
+  //Compressed, Sparse ECQ
+  //Compressed, Non-Sparse ECQ
+  //Uncompressed, Sparse Data
+  //Uncompressed, Non-sparse Data
+  
+  unsigned int UCSparseBits;  //Uncompressed, Sparse bits. Just like the original GAMESS data. Includes: mode, nonZeros, {indexes, data}
+  unsigned int UCNonSparseBits;  //Uncompressed, NonSparse bits. Includes: mode, data
+  unsigned int CSparseBits;  //Includes: mode, compressedBytes, patternBits, ECQBits,numOutliers,P, S, {Indexes(Sparse), ECQ}
+  unsigned int CNonSparseBits;  //Includes: mode, compressedBytes, patternBits, ECQBits,P, S, {ECQ}
+  //int BOOKKEEPINGBITS=120; //Includes: mode, compressedBytes, patternBits, ECQBits (8+64+32+8+8) //Moved to much earlier!
+    
+  //Consider: ECQ0s, ECQ1s, ECQOthers. Number of following values in ECQ: {0}, {1,-1}, { val<=-2, val>=2}
+  //ECQ0s is actually not needed, but others are needed.
+
+  UCSparseBits = p->dataSize*(1 + 2 + bp->nonZeros*16);  //64 bits for 4 indexes, 64 bit for data.
+  UCNonSparseBits = p->dataSize*(1 + p->bSize*8);
+  bp->numOutliers=bp->ECQ1s+bp->ECQOthers;
+  if(bp->ECQBits==2){
+    CSparseBits = p->dataSize*(1+4+1+1+2) + bp->patternBits*p->sbSize + bp->scaleBits*p->sbNum + bp->ECQ1s*(1+bp->_1DIdxBits);
+    CNonSparseBits = p->dataSize*(1+4+1+1) + bp->patternBits*p->sbSize + bp->scaleBits*p->sbNum + p->bSize + bp->ECQ1s ;  //Or: ECQ0s+ECQ1s*2;
+  }else{ //ECQBits>2
+    CSparseBits = p->dataSize*(1+4+1+1+2) + bp->patternBits*p->sbSize + bp->scaleBits*p->sbNum + bp->ECQ1s*(2+bp->_1DIdxBits) + bp->ECQOthers*(1+bp->_1DIdxBits+bp->ECQBits);
+    //CNonSparseBits = 8+32+8+8+ patternBits*p->sbSize + scaleBits*p->sbNum + p->bSize + ECQ0s + ECQ1s*3 + ECQOthers*(2+ECQBits);
+    CNonSparseBits = p->dataSize*(1+4+1+1)+ bp->patternBits*p->sbSize + bp->scaleBits*p->sbNum + p->bSize + bp->ECQ1s*2 + bp->ECQOthers*(1+bp->ECQBits);
+  }
+  
+  int UCSparseBytes=(UCSparseBits+7)/8; 
+  int UCNonSparseBytes=(UCNonSparseBits+7)/8; 
+  int CSparseBytes=(CSparseBits+7)/8; 
+  int CNonSparseBytes=(CNonSparseBits+7)/8; 
+  uint64_t bitPos=0;
+  uint64_t bytePos=0;
+  int i0,i1,i2,i3;
+  int _1DIdx;
+  
+  //Layout notes (byte offsets in outBuf): byte 0 is always the mode (0:UCSparse 1:UCNonSparse 2:CSparse 3:CNonSparse).
+  //Compressed modes: bytes 1-4 receive (bitPos+7)/8 after packing, byte 5 patternBits, byte 6 ECQBits;
+  //CSparse additionally stores numOutliers in bytes 7-8. The bit-packed P, S and ECQ values follow.
+  //NOTE(review): multi-byte fields are stored through casted pointers at odd offsets - assumes the target tolerates unaligned access; confirm.
+    
+  if(D_W){printf("ECQ0s:%d ECQ1s:%d ECQOthers:%d Total:%d\n",p->bSize-bp->ECQ1s-bp->ECQOthers,bp->ECQ1s,bp->ECQOthers,p->bSize);} //DEBUG
+  if(D_W){printf("numOutliers:%d\n",bp->numOutliers);} //DEBUG
+  
+  //****************************************************************************************
+  //if(0){ //DEBUG
+  //W:UCSparse
+  if((UCSparseBytes<UCNonSparseBytes) && (UCSparseBytes<CSparseBytes) && (UCSparseBytes<CNonSparseBytes) ){ 
+    //Uncompressed, Sparse bits. Just like the original GAMESS data. Includes: mode, indexOffsets, nonZeros, indexes, data
+    *numOutBytes=UCSparseBytes;
+    if(D_G){printf("UCSparse\n");} //DEBUG
+    if(D_G)printf("ECQBits:%d\n",bp->ECQBits); //DEBUG
+    outBuf[0]=0; //mode
+    
+    //*(uint16_t*)(&outBuf[9])=nonZeros;
+    //bytePos=11;//0:mode, 1-8:indexOffsets 9-10:NonZeros. So start from 11.
+    *(uint16_t*)(&outBuf[1])=bp->nonZeros;
+    bytePos=3;//0:mode, 1-2:NonZeros. So start from 3.
+    
+    for(i0=0;i0<p->idxRange[0];i0++)
+      for(i1=0;i1<p->idxRange[1];i1++)
+        for(i2=0;i2<p->idxRange[2];i2++)
+          for(i3=0;i3<p->idxRange[3];i3++){
+            _1DIdx=p->idxRange[3]*(i2+p->idxRange[2]*(i1+i0*p->idxRange[1]))+i3;
+            if(abs_FastD(data[_1DIdx])>p->usedEb){
+              //*(uint16_t*)(&outBuf[bytePos])=i0+1+p->idxOffset[0];
+              *(uint16_t*)(&outBuf[bytePos])=i0;
+              bytePos+=2;
+              //*(uint16_t*)(&outBuf[bytePos])=i1+1+p->idxOffset[1];
+              *(uint16_t*)(&outBuf[bytePos])=i1;
+              bytePos+=2;
+              //*(uint16_t*)(&outBuf[bytePos])=i2+1+p->idxOffset[2];
+              *(uint16_t*)(&outBuf[bytePos])=i2;
+              bytePos+=2;
+              //*(uint16_t*)(&outBuf[bytePos])=i3+1+p->idxOffset[3];
+              *(uint16_t*)(&outBuf[bytePos])=i3;
+              bytePos+=2;
+              
+              *(double*)(&outBuf[bytePos])=data[_1DIdx];
+              bytePos+=p->dataSize;
+            }
+          }
+    
+    if(D_G)printf("UCSparseBytes:%d \n",UCSparseBytes); //DEBUG
+    
+  //****************************************************************************************
+  //}else if(0){ //DEBUG
+  //W:UCNonSparse
+  }else if((UCNonSparseBytes<UCSparseBytes) && (UCNonSparseBytes<CSparseBytes) && (UCNonSparseBytes<CNonSparseBytes) ){ 
+    //Uncompressed, NonSparse bits. Includes: mode, indexOffsets, data
+    *numOutBytes=UCNonSparseBytes;
+    if(D_G){printf("UCNonSparse\n");} //DEBUG
+    if(D_G)printf("ECQBits:%d\n",bp->ECQBits); //DEBUG
+    outBuf[0]=1; //mode
+    
+    //memcpy(&outBuf[9], &inBuf[p->bSize*8], UCNonSparseBytes-9);
+    memcpy(&outBuf[1], data, p->bSize*p->dataSize);
+    
+    if(D_G)printf("UCNonSparseBytes:%d \n",UCNonSparseBytes); //DEBUG
+    /*
+    for(i=0;i<UCNonSparseBytes-17;i++){
+      printf("%d ",inBuf[p->bSize*8+i]);
+    }
+    printf("\n");
+    for(i=0;i<UCNonSparseBytes-17;i++){
+      printf("%d ",outBuf[17+i]);
+    }
+    printf("\n");
+    */
+  //****************************************************************************************
+  //}else if(1){ //DEBUG
+  //W:CSparse
+  }else if((CSparseBytes<UCNonSparseBytes) && (CSparseBytes<UCSparseBytes) && (CSparseBytes<CNonSparseBytes) ){ 
+    //Includes: mode, indexOffsets, compressedBytes, patternBits, ECQBits,numOutliers,P, S, {Indexes(Sparse), ECQ}
+    *numOutBytes=CSparseBytes;
+    if(D_G){printf("CSparse\n");} //DEBUG
+    if(D_G)printf("ECQBits:%d\n",bp->ECQBits); //DEBUG
+    //if(DEBUG){printf("patternBits:%d _1DIdxBits:%d\n",patternBits,_1DIdxBits);} //DEBUG
+    outBuf[0]=2; //mode
+    
+    ////outBuf bytes [1:8] are indexOffsets, which are already written. outBuf bytes [9:12] are reserved for compressedBytes.
+    //outBuf[13]=patternBits;
+    //outBuf[14]=ECQBits;
+    ////Currently, we are at the end of 15th byte.
+    //*(uint16_t*)(&outBuf[15])=numOutliers;
+    //bitPos=17*8; //Currently, we are at the end of 17th byte.
+    
+    //outBuf bytes [1:4] are reserved for compressedBytes.
+    outBuf[5]=bp->patternBits;
+    outBuf[6]=bp->ECQBits;
+    //Currently, we are at the end of 7th byte.
+    
+    *(uint16_t*)(&outBuf[7])=bp->numOutliers; 
+    //Now, we are at the end of 9th byte.
+    bitPos=9*8; 
+    
+    //if(DEBUG){printf("bitPos_B:%ld\n",bitPos);} //DEBUG
+
+    for(i=0;i<p->sbSize;i++){
+      writeBits_Fast(outBuf,&bitPos,bp->patternBits,patternQ[i]);//Pattern point
+    }
+    //if(DEBUG){printf("bitPos_P:%ld\n",bitPos);} //DEBUG
+    for(i=0;i<p->sbNum;i++){
+      writeBits_Fast(outBuf,&bitPos,bp->scaleBits,scalesQ[i]);//Scale
+    }
+    //if(DEBUG){printf("bitPos_S:%ld\n",bitPos);} //DEBUG
+    //if(DEBUG)printf("ECQBits:%d\n",ECQBits);
+    switch(bp->ECQBits){
+      case 2:
+        for(i=0;i<p->bSize;i++){
+          switch(ECQ[i]){
+            case 0:
+              break;
+            case 1:
+              //if(DEBUG)printf("Index:%d ECQ:%ld Written:0x0\n",i,ECQ[i]); //DEBUG
+              writeBits_Fast(outBuf,&bitPos,bp->_1DIdxBits,i);
+              //writeBits_Fast(outBuf,&bitPos,2,0x10);
+              //writeBits_Fast(outBuf,&bitPos,2,0);//0x00
+              //writeBits_Fast(outBuf,&bitPos,2,0);//0x00
+              writeBits_Fast(outBuf,&bitPos,1,0);//0x00
+              break;
+            case -1:
+              //if(DEBUG)printf("Index:%d ECQ:%ld Written:0x1\n",i,ECQ[i]); //DEBUG
+              writeBits_Fast(outBuf,&bitPos,bp->_1DIdxBits,i);
+              //writeBits_Fast(outBuf,&bitPos,2,0x11);
+              //writeBits_Fast(outBuf,&bitPos,2,1);//0x01
+              //writeBits_Fast(outBuf,&bitPos,1,0);
+              writeBits_Fast(outBuf,&bitPos,1,1);
+              break;
+            default:
+              assert(0);
+              break;
+          }
+        }
+        break;
+      default: //ECQBits>2
+      for(i=0;i<p->bSize;i++){
+        switch(ECQ[i]){
+          case 0:
+            break;
+          case 1:
+            //if(DEBUG)printf("Index:%d ECQ:%ld Written:0x00\n",i,ECQ[i]); //DEBUG
+            writeBits_Fast(outBuf,&bitPos,bp->_1DIdxBits,i);
+            //writeBits_Fast(outBuf,&bitPos,3,0);//0x000
+            //writeBits_Fast(outBuf,&bitPos,1,0);
+            writeBits_Fast(outBuf,&bitPos,1,0);
+            writeBits_Fast(outBuf,&bitPos,1,0);
+            break;
+          case -1:
+            //if(DEBUG)printf("Index:%d ECQ:%ld Written:0x01\n",i,ECQ[i]); //DEBUG
+            writeBits_Fast(outBuf,&bitPos,bp->_1DIdxBits,i);
+            //writeBits_Fast(outBuf,&bitPos,3,1);//0x001
+            //writeBits_Fast(outBuf,&bitPos,1,0);
+            writeBits_Fast(outBuf,&bitPos,1,0);
+            writeBits_Fast(outBuf,&bitPos,1,1);
+            break;
+          default:
+            //if(DEBUG)printf("Index:%d ECQ:%ld Written:0x1 0x%lx\n",i,ECQ[i],ECQ[i]); //DEBUG
+            writeBits_Fast(outBuf,&bitPos,bp->_1DIdxBits,i);
+            //writeBits_Fast(outBuf,&bitPos,2+ECQBits,((uint64_t)0x11<<ECQBits)|ECQ[i]);
+            //writeBits_Fast(outBuf,&bitPos,2+ECQBits,(ECQ[i]&((uint64_t)0x00<<ECQBits))|((uint64_t)0x01<<ECQBits));
+            //writeBits_Fast(outBuf,&bitPos,1,0);
+            writeBits_Fast(outBuf,&bitPos,1,1);
+            writeBits_Fast(outBuf,&bitPos,bp->ECQBits,ECQ[i]);
+            break;
+        }
+      }
+      break;
+    }
+    
+    //if(DEBUG){printf("bitPos_E:%ld\n",bitPos);} //DEBUG
+    if(D_C){if(!((bp->ECQBits>=2)||((bp->ECQBits==1) && (bp->numOutliers==0)))){printf("ERROR: ECQBits:%d numOutliers:%d This should not have happened!\n",bp->ECQBits,bp->numOutliers);assert(0);}} //DEBUG
+          
+
+    uint32_t bytePos=(bitPos+7)/8;
+    //*(uint32_t*)(&outBuf[9])=bytePos;
+    *(uint32_t*)(&outBuf[1])=bytePos;
+    
+    if(D_G)printf("bitPos:%ld CSparseBits:%d bytePos:%d CSparseBytes:%d\n",bitPos,CSparseBits,bytePos,CSparseBytes); //DEBUG
+    if(D_G){assert(bitPos==CSparseBits);}
+    
+  //****************************************************************************************
+  //W:CNonSparse
+  }else { 
+    //Includes: mode, indexOffsets, compressedBytes, patternBits, ECQBits,P, S, {ECQ}
+    *numOutBytes=CNonSparseBytes;
+    if(D_G){printf("CNonSparse\n");} //DEBUG
+    if(D_G)printf("ECQBits:%d\n",bp->ECQBits); //DEBUG
+    //if(DEBUG){printf("patternBits:%d _1DIdxBits:%d\n",patternBits,_1DIdxBits);} //DEBUG
+    outBuf[0]=3; //mode
+    
+    ////outBuf bytes [1:8] are indexOffsets, which are already written. outBuf bytes [9:12] are reserved for compressedBytes.
+    //outBuf[13]=patternBits;
+    //outBuf[14]=ECQBits;
+    //bitPos=15*8; //Currently, we are at the end of 15th byte.
+    
+    //outBuf bytes [1:4] are reserved for compressedBytes.
+    outBuf[5]=bp->patternBits;
+    outBuf[6]=bp->ECQBits;
+    bitPos=7*8; //Currently, we are at the end of 7th byte.
+    
+    //if(DEBUG){printf("bitPos_B:%ld\n",bitPos);} //DEBUG
+
+    for(i=0;i<p->sbSize;i++){
+      writeBits_Fast(outBuf,&bitPos,bp->patternBits,patternQ[i]);//Pattern point
+    }
+    //if(DEBUG){printf("bitPos_P:%ld\n",bitPos);} //DEBUG
+    for(i=0;i<p->sbNum;i++){
+      writeBits_Fast(outBuf,&bitPos,bp->scaleBits,scalesQ[i]);//Scale
+    }
+    //if(DEBUG){printf("bitPos_S:%ld\n",bitPos);} //DEBUG
+    //if(DEBUG)printf("ECQBits:%d\n",ECQBits);
+    switch(bp->ECQBits){
+      case 2:
+        for(i=0;i<p->bSize;i++){
+          switch(ECQ[i]){
+            case 0:
+              //if(DEBUG)printf("Index:%d ECQ:%d Written:0x1\n",i,ECQ[i]); //DEBUG
+              writeBits_Fast(outBuf,&bitPos,1,1);//0x1
+              break;
+            case 1:
+              //if(DEBUG)printf("Index:%d ECQ:%d Written:0x00\n",i,ECQ[i]); //DEBUG
+              //writeBits_Fast(outBuf,&bitPos,2,0);//0x00
+              writeBits_Fast(outBuf,&bitPos,1,0);
+              writeBits_Fast(outBuf,&bitPos,1,0);
+              break;
+            case -1:
+              //if(DEBUG)printf("Index:%d ECQ:%d Written:0x01\n",i,ECQ[i]); //DEBUG
+              //writeBits_Fast(outBuf,&bitPos,2,2); //0x01
+              writeBits_Fast(outBuf,&bitPos,1,0);
+              writeBits_Fast(outBuf,&bitPos,1,1);
+              break;
+            default:
+              assert(0);
+              break;
+          }
+        }
+        break;
+      default: //ECQBits>2
+        //if(DEBUG) printf("AMG_W1:bitPos:%ld\n",bitPos); //DEBUG
+        for(i=0;i<p->bSize;i++){
+          //if(DEBUG){printf("AMG_W3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&outBuf[bitPos/8]));}; //DEBUG
+          //if(DEBUG) printf("AMG_W2:bitPos:%ld\n",bitPos); //DEBUG
+          //if(DEBUG) printf("ECQ[%d]:%ld\n",i,ECQ[i]); //DEBUG
+          switch(ECQ[i]){
+            case 0:
+              //if(DEBUG)printf("Index:%d ECQ:%ld Written:0x1\n",i,ECQ[i]); //DEBUG
+              //if(DEBUG){printf("AMG_WB3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&outBuf[bitPos/8]));}; //DEBUG
+              //temp1=bitPos;
+              writeBits_Fast(outBuf,&bitPos,1,1);  //0x1
+              //wVal=1; writeBits_Fast(outBuf,&bitPos,1,wVal); //0x1
+              //if(DEBUG){printf("AMG_WA3:bitPos:%ld buffer[%ld]=0x%lx\n",temp1,temp1/8,*(uint64_t*)(&outBuf[temp1/8]));}; //DEBUG
+              break;
+            case 1:
+              //if(DEBUG)printf("Index:%d ECQ:%ld Written:0x000\n",i,ECQ[i]); //DEBUG
+              //if(DEBUG){printf("AMG_WB3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&outBuf[bitPos/8]));}; //DEBUG
+              //temp1=bitPos;
+              //writeBits_Fast(outBuf,&bitPos,3,0); //0x000
+              writeBits_Fast(outBuf,&bitPos,1,0);
+              writeBits_Fast(outBuf,&bitPos,1,0);
+              writeBits_Fast(outBuf,&bitPos,1,0);
+              //wVal=0; writeBits_Fast(outBuf,&bitPos,3,wVal); //0x000
+              //if(DEBUG){printf("AMG_WA3:bitPos:%ld buffer[%ld]=0x%lx\n",temp1,temp1/8,*(uint64_t*)(&outBuf[temp1/8]));}; //DEBUG
+              break;
+            case -1:
+              //if(DEBUG)printf("Index:%d ECQ:%ld Written:0x001\n",i,ECQ[i]); //DEBUG
+              //if(DEBUG){printf("AMG_WB3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&outBuf[bitPos/8]));}; //DEBUG
+              //temp1=bitPos;
+              //writeBits_Fast(outBuf,&bitPos,3,8); //0x001
+              writeBits_Fast(outBuf,&bitPos,1,0); 
+              writeBits_Fast(outBuf,&bitPos,1,0); 
+              writeBits_Fast(outBuf,&bitPos,1,1); 
+              //wVal=8; writeBits_Fast(outBuf,&bitPos,3,wVal); //0x001
+              //if(DEBUG){printf("AMG_WA3:bitPos:%ld buffer[%ld]=0x%lx\n",temp1,temp1/8,*(uint64_t*)(&outBuf[temp1/8]));}; //DEBUG
+              break;
+            default:
+              //if(DEBUG)printf("Index:%d ECQ:%ld Written:0x01 0x%lx\n",i,ECQ[i]); //DEBUG
+              //if(DEBUG){printf("AMG_WB3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&outBuf[bitPos/8]));}; //DEBUG
+              //temp1=bitPos;
+              //writeBits_Fast(outBuf,&bitPos,2,2); //0x01
+              writeBits_Fast(outBuf,&bitPos,1,0); 
+              writeBits_Fast(outBuf,&bitPos,1,1); 
+              //wVal=2; writeBits_Fast(outBuf,&bitPos,2,wVal); //0x01
+              writeBits_Fast(outBuf,&bitPos,bp->ECQBits,ECQ[i]);
+              //if(DEBUG){printf("AMG_WA3:bitPos:%ld buffer[%ld]=0x%lx\n",temp1,temp1/8,*(uint64_t*)(&outBuf[temp1/8]));}; //DEBUG
+              break;
+          }
+        }
+        break;
+    }
+    
+    //if(DEBUG){printf("bitPos_E:%ld\n",bitPos);} //DEBUG
+    if(D_C){if(!((bp->ECQBits>=2)||((bp->ECQBits==1) && (bp->numOutliers==0)))){printf("ERROR: ECQBits:%d numOutliers:%d This should not have happened!\n",bp->ECQBits,bp->numOutliers);assert(0);}} //DEBUG
+    
+          
+
+    uint32_t bytePos=(bitPos+7)/8;
+    //*(uint32_t*)(&outBuf[9])=bytePos;
+    *(uint32_t*)(&outBuf[1])=bytePos;
+    
+    if(D_G)printf("bitPos:%ld CNonSparseBits:%d bytePos:%d CNonSparseBytes:%d\n",bitPos,CNonSparseBits,bytePos,CNonSparseBytes); //DEBUG
+    if(D_G){assert(bitPos==CNonSparseBits);}
+    
+  }
+  //for(i=213;i<233;i++)if(DEBUG)printf("AMG_WE:bitPos:%d buffer[%d]=0x%lx\n",i*8,i,*(uint64_t*)(&outBuf[i])); //DEBUG
+  
+}
+static inline int pastri_double_Compress(unsigned char*inBuf,pastri_params *p,unsigned char*outBuf,int *numOutBytes){
+  //Compresses one block of doubles (inBuf, reinterpreted as double*) into outBuf.
+  //The number of bytes produced is reported through numOutBytes. Always returns 0.
+  double *blockData=(double*)inBuf;
+  pastri_blockParams blockInfo; //Per-block values, filled in by the two steps below
+
+  //Scratch buffers shared by the two steps:
+  int64_t quantPattern[MAX_PS_SIZE];
+  int64_t quantScales[MAX_PS_SIZE];
+  int64_t quantEC[MAX_BLOCK_SIZE];
+
+  if(D_G2){
+    printf("Parameters: dataSize:%d\n",p->dataSize);
+    printf("Parameters: bfs:%d %d %d %d originalEb:%.3e\n",p->bf[0],p->bf[1],p->bf[2],p->bf[3],p->usedEb);
+    printf("Parameters: idxRanges:%d %d %d %d\n",p->idxRange[0],p->idxRange[1],p->idxRange[2],p->idxRange[3]);
+    printf("Parameters: sbSize:%d sbNum:%d bSize:%d\n",p->sbSize,p->sbNum,p->bSize);
+  } //DEBUG
+
+  //STEP 0: PREPROCESSING (flattening the block, determining the period, etc.)
+  //Currently not needed.
+
+  //STEP 1: PATTERN MATCH
+  pastri_double_PatternMatch(blockData,p,&blockInfo,quantPattern,quantScales,quantEC);
+
+  //STEP 2: ENCODING(Include QUANTIZE)
+  pastri_double_Encode(blockData,quantPattern,quantScales,quantEC,p,&blockInfo,outBuf,numOutBytes);
+  return 0;
+}
+
+static inline double pastri_double_InverseQuantization(int64_t q, double binSize){
+  return binSize*(double)q; //Dequantize: bin index q times the bin width
+}
+
+static inline void pastri_double_PredictData(pastri_params *p,pastri_blockParams *bp,double *data,int64_t* patternQ,int64_t* scalesQ,int64_t* ECQ){
+  //Reconstruct every sample: dequantized (scale x pattern) prediction minus the dequantized error correction.
+  const double predBinSize=bp->scalesBinSize*bp->binSize;
+  int k;
+  for(k=0;k<p->bSize;k++){
+    data[k]=pastri_double_InverseQuantization(scalesQ[k/p->sbSize]*patternQ[k%p->sbSize],predBinSize) - pastri_double_InverseQuantization(ECQ[k],bp->binSize);
+  }
+}
+
+static inline void pastri_double_Decode(unsigned char*inBuf,pastri_params *p,pastri_blockParams *bp,unsigned char*outBuf,int *numReadBytes,int64_t* patternQ,int64_t* scalesQ,int64_t* ECQ){
+  int j;
+  bp->_1DIdxBits=bitsNeeded_UI64(p->bSize);
+  //double *data=(double*)(outBuf+p->bSize*8);
+  double *data=(double*)(outBuf);
+  int i0,i1,i2,i3;
+  //uint16_t *idx0,*idx1,*idx2,*idx3;
+  int _1DIdx;
+
+  int64_t ECQTemp;
+  uint64_t bytePos=0;
+  uint64_t bitPos=0;
+  uint64_t temp,temp2;
+  //int sb,localIdx;
+
+  
+  //idx0=(uint16_t*)(outBuf           );
+  //idx1=(uint16_t*)(outBuf+p->bSize*2);
+  //idx2=(uint16_t*)(outBuf+p->bSize*4);
+  //idx3=(uint16_t*)(outBuf+p->bSize*6);
+  //p->idxOffset[0]=*(uint32_t*)(&inBuf[1]);
+  //p->idxOffset[1]=*(uint32_t*)(&inBuf[3]);
+  //p->idxOffset[2]=*(uint32_t*)(&inBuf[5]);
+  //p->idxOffset[3]=*(uint32_t*)(&inBuf[7]);
+  /*
+  for(i0=0;i0<p->idxRange[0];i0++)
+    for(i1=0;i1<p->idxRange[1];i1++)
+      for(i2=0;i2<p->idxRange[2];i2++)
+        for(i3=0;i3<p->idxRange[3];i3++){
+            //_1DIdx=i0*p->idxRange[1]*p->idxRange[2]*p->idxRange[3]+i1*p->idxRange[2]*p->idxRange[3]+i2*p->idxRange[3]+i3;
+            _1DIdx=p->idxRange[3]*(i2+p->idxRange[2]*(i1+i0*p->idxRange[1]))+i3;
+            idx0[_1DIdx]=i0+1+p->idxOffset[0];
+            idx1[_1DIdx]=i1+1+p->idxOffset[1];
+            idx2[_1DIdx]=i2+1+p->idxOffset[2];
+            idx3[_1DIdx]=i3+1+p->idxOffset[3];
+        }
+  */
+  
+  //*numOutBytes=p->bSize*16;  
+  
+  //inBuf[0] is "mode"
+  switch(inBuf[0]){
+    //R:UCSparse
+    case 0:
+      if(D_G){printf("\nDC:UCSparse\n");} //DEBUG
+      //bp->nonZeros=*(uint16_t*)(&inBuf[9]);
+      //bytePos=11;
+      bp->nonZeros=*(uint16_t*)(&inBuf[1]);
+      bytePos=3;
+      for(j=0;j<p->bSize;j++){
+          data[j]=0;
+      }
+      for(j=0;j<bp->nonZeros;j++){
+        //i0=*(uint16_t*)(&inBuf[bytePos])-1-p->idxOffset[0]; //i0
+        i0=*(uint16_t*)(&inBuf[bytePos]); //i0
+        bytePos+=2;
+        //i1=*(uint16_t*)(&inBuf[bytePos])-1-p->idxOffset[1]; //i1
+        i1=*(uint16_t*)(&inBuf[bytePos]); //i1
+        bytePos+=2;
+        //i2=*(uint16_t*)(&inBuf[bytePos])-1-p->idxOffset[2]; //i2
+        i2=*(uint16_t*)(&inBuf[bytePos]); //i2
+        bytePos+=2;
+        //i3=*(uint16_t*)(&inBuf[bytePos])-1-p->idxOffset[3]; //i3
+        i3=*(uint16_t*)(&inBuf[bytePos]); //i3
+        bytePos+=2;
+        _1DIdx=p->idxRange[3]*(i2+p->idxRange[2]*(i1+i0*p->idxRange[1]))+i3;
+        data[_1DIdx]=*(double*)(&inBuf[bytePos]);
+        bytePos+=8; 
+      }
+      if(D_G){printf("\nDC:bytePos:%ld\n",bytePos);} //DEBUG
+      break;
+    //R:UCNonSparse
+    case 1:
+      if(D_G){printf("\nDC:UCNonSparse\n");} //DEBUG
+      //memcpy(&outBuf[p->bSize*8], &inBuf[9], p->bSize*8);
+      memcpy(data, &inBuf[1], p->bSize*8);
+      bytePos=p->bSize*8;
+      if(D_G){printf("\nDC:bytePos:%ld\n",bytePos);} //DEBUG
+      break;
+    //R:CSparse
+    case 2:
+      if(D_G){printf("\nDC:CSparse\n");} //DEBUG
+      //for(j=0;j<p->bSize;j++){
+      //  data[j]=0;
+      //}
+      
+      //bp->patternBits=inBuf[13];
+      //bp->ECQBits=inBuf[14];      
+      
+      bp->patternBits=inBuf[5];
+      bp->ECQBits=inBuf[6];
+      
+      if(D_R){printf("bp->patternBits:%d bp->ECQBits:%d bp->_1DIdxBits:%d\n",bp->patternBits,bp->ECQBits,bp->_1DIdxBits);} //DEBUG
+      
+      //bp->numOutliers=*(uint16_t*)(&inBuf[15]);
+      //bitPos=17*8;
+      bp->numOutliers=*(uint16_t*)(&inBuf[7]);
+      bitPos=9*8;
+      if(D_R){printf("bp->numOutliers:%d\n",bp->numOutliers);} //DEBUG
+
+      bp->scalesBinSize=1/(double)(((uint64_t)1<<(bp->patternBits-1))-1);
+  
+      bp->binSize=p->usedEb*2;
+      
+      if(D_R){printf("bp->scalesBinSize:%.6e bp->binSize:%.6e bp->scalesBinSize*bp->binSize:%.6e\n",bp->scalesBinSize,bp->binSize,bp->scalesBinSize*bp->binSize);} //DEBUG
+
+      for(j=0;j<p->sbSize;j++){
+        patternQ[j]=readBits_I64(inBuf,&bitPos,bp->patternBits);//Pattern point
+        if(D_R){printf("R:patternQ[%d]=%ld\n",j,patternQ[j]);}
+      }
+      for(j=0;j<p->sbNum;j++){
+        scalesQ[j]=readBits_I64(inBuf,&bitPos,bp->patternBits);//Scale
+        if(D_R){printf("R:scalesQ[%d]=%ld\n",j,scalesQ[j]);}
+      }
+      
+      /* //Splitting
+      for(j=0;j<p->bSize;j++){
+        data[j]=scalesQ[j/p->sbSize]*patternQ[j%p->sbSize]*bp->scalesBinSize*bp->binSize;
+      }
+      */
+      for(j=0;j<p->bSize;j++){
+        ECQ[j]=0;
+      }
+      switch(bp->ECQBits){
+        case 2:
+          for(j=0;j<bp->numOutliers;j++){
+            //if(DEBUG){printf("readBits_UI64:%ld\n",readBits_UI64(inBuf,&bitPos,bp->_1DIdxBits));} //DEBUG
+            //if(DEBUG){printf("readBits_UI64:%ld\n",readBits_I64(inBuf,&bitPos,2));} //DEBUG
+            
+            _1DIdx=readBits_UI64(inBuf,&bitPos,bp->_1DIdxBits);
+            ECQTemp=readBits_I64(inBuf,&bitPos,1);
+            ECQTemp= ((ECQTemp<<63)>>63)|(uint64_t)0x1;
+            //if(D_R)printf("R:ECQ[%d]: %ld \n",_1DIdx,ECQTemp);
+            //continue;
+            //sb=_1DIdx/p->sbSize; 
+            //localIdx=_1DIdx%p->sbSize;
+            
+            ////data[_1DIdx]-=ECQTemp*bp->binSize;//Splitting
+            ECQ[_1DIdx]=ECQTemp;
+            
+            //if(DEBUG){printf("decompressed[%d]:%.6e\n",_1DIdx,data[_1DIdx]);} //DEBUG
+          }
+          break;
+        default: //bp->ECQBits>2
+          if(D_C){if(!((bp->ECQBits>=2)||((bp->ECQBits==1) && (bp->numOutliers==0)))){printf("ERROR: bp->ECQBits:%d bp->numOutliers:%d This should not have happened!\n",bp->ECQBits,bp->numOutliers);assert(0);}} //DEBUG
+    
+          for(j=0;j<bp->numOutliers;j++){
+            _1DIdx=readBits_UI64(inBuf,&bitPos,bp->_1DIdxBits);
+            //sb=_1DIdx/p->sbSize; 
+            //localIdx=_1DIdx%p->sbSize;
+            temp=readBits_UI64(inBuf,&bitPos,1);
+            //if(DEBUG){printf("temp:%ld\n",temp);} //DEBUG
+            switch(temp){
+              case 0:  //+-1
+                ECQTemp=readBits_I64(inBuf,&bitPos,1);
+                ECQTemp= ((ECQTemp<<63)>>63)|(uint64_t)0x1;
+                //if(DEBUG){printf("_1DIdx:%ld ECQTemp:0x%ld\n",_1DIdx,ECQTemp);} //DEBUG
+                //if(D_R)printf("R:ECQ[%d]: %ld \n",_1DIdx,ECQTemp);
+                break;
+              case 1: //Others
+                ECQTemp=readBits_I64(inBuf,&bitPos,bp->ECQBits);
+                //if(DEBUG){printf("_1DIdx:%ld ECQTemp:0x%ld\n",_1DIdx,ECQTemp);} //DEBUG
+                //if(D_R)printf("R:ECQ[%d]: %ld \n",_1DIdx,ECQTemp);
+                break;
+              //default:
+              //  printf("ERROR: Bad 2-bit value: 0x%lx",temp);
+              // assert(0); //AMG
+              //  break;
+            }
+            
+            //data[_1DIdx]-=ECQTemp*bp->binSize;//Splitting
+            ECQ[_1DIdx]=ECQTemp;
+            
+            //if(DEBUG){printf("decompressed[%d]:%.6e\n",_1DIdx,data[_1DIdx]);} //DEBUG
+          }
+          break;
+      }
+      //static inline uint64_t readBits_UI64(unsigned char* buffer,uint64_t *bitPosPtr,uint64_t numBits){ // numBits must be in range [0:56]
+      //patternQ=(int64_t*)(inBuf+15); 
+      //scalesQ=(int64_t*)(inBuf+15+p->sbSize*8);
+      
+      bytePos=(bitPos+7)/8;
+      if(D_G){printf("\nDC:bytePos:%ld\n",bytePos);} //DEBUG
+      
+      //STEP 2: PREDICT DATA(Includes INVERSE QUANTIZATION)
+      pastri_double_PredictData(p,bp,data,patternQ,scalesQ,ECQ);
+
+      break;
+    //R:CNonSparse
+    case 3:
+      if(D_G){printf("\nDC:CNonSparse\n");} //DEBUG
+      
+      //for(j=0;j<p->bSize;j++){
+      //  data[j]=0;
+      //}
+      
+      //bp->patternBits=inBuf[13];
+      //bp->ECQBits=inBuf[14];
+      
+      bp->patternBits=inBuf[5];
+      bp->ECQBits=inBuf[6];
+      
+      if(D_R){printf("bp->patternBits:%d bp->ECQBits:%d bp->_1DIdxBits:%d\n",bp->patternBits,bp->ECQBits,bp->_1DIdxBits);} //DEBUG
+      
+      //bitPos=15*8;
+      bitPos=7*8;
+
+      bp->scalesBinSize=1/(double)(((uint64_t)1<<(bp->patternBits-1))-1);
+      bp->binSize=p->usedEb*2;
+      
+      if(D_R){printf("bp->scalesBinSize:%.6e bp->binSize:%.6e bp->scalesBinSize*bp->binSize:%.6e\n",bp->scalesBinSize,bp->binSize,bp->scalesBinSize*bp->binSize);} //DEBUG
+
+      for(j=0;j<p->sbSize;j++){
+        patternQ[j]=readBits_I64(inBuf,&bitPos,bp->patternBits);//Pattern point
+        if(D_R){printf("R:patternQ[%d]=%ld\n",j,patternQ[j]);}
+      }
+      for(j=0;j<p->sbNum;j++){
+        scalesQ[j]=readBits_I64(inBuf,&bitPos,bp->patternBits);//Scale
+        if(D_R){printf("R:scalesQ[%d]=%ld\n",j,scalesQ[j]);}
+      }
+      /* //Splitting
+      for(j=0;j<p->bSize;j++){
+        data[j]=scalesQ[j/p->sbSize]*patternQ[j%p->sbSize]*bp->scalesBinSize*bp->binSize;
+        //if(DEBUG){printf("DC:PS[%d]=%.6e\n",j,data[j]);}
+      }
+      */
+      switch(bp->ECQBits){
+        case 2:
+          for(j=0;j<p->bSize;j++){
+            //if(DEBUG){printf("readBits_UI64:%ld\n",readBits_UI64(inBuf,&bitPos,bp->_1DIdxBits));} //DEBUG
+            //if(DEBUG){printf("readBits_UI64:%ld\n",readBits_I64(inBuf,&bitPos,2));} //DEBUG
+            //_1DIdx=readBits_UI64(inBuf,&bitPos,bp->_1DIdxBits);
+            temp=readBits_UI64(inBuf,&bitPos,1);
+            switch(temp){
+              case 0:
+                ECQTemp=readBits_I64(inBuf,&bitPos,1);
+                ECQTemp= ((ECQTemp<<63)>>63)|(uint64_t)0x1;
+                break;
+              case 1:
+                ECQTemp=0;
+                break;
+              default:
+                assert(0);
+                break;
+            }
+            
+            //if(DEBUG){printf("_1DIdx:%ld ECQTemp:0x%ld\n",_1DIdx,ECQTemp);} //DEBUG
+            //continue;
+            //sb=_1DIdx/p->sbSize; 
+            //localIdx=_1DIdx%p->sbSize;
+            
+            //data[j]-=ECQTemp*bp->binSize; //Splitting
+            ECQ[j]=ECQTemp;
+            
+            //if(DEBUG){printf("decompressed[%d]:%.6e\n",_1DIdx,data[_1DIdx]);} //DEBUG
+          }
+          break;
+        default: //bp->ECQBits>2
+          //if(DEBUG)printf("AMG_R1:bitPos: %ld\n",bitPos);
+          
+          for(j=0;j<p->bSize;j++){
+            //if(DEBUG){printf("AMG_R3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&inBuf[bitPos/8]));}; //DEBUG
+            //if(DEBUG)printf("AMG_R2:bitPos: %ld\n",bitPos);
+
+            //if(DEBUG){printf("readBits_UI64:%ld\n",readBits_UI64(inBuf,&bitPos,bp->_1DIdxBits));} //DEBUG
+            //if(DEBUG){printf("readBits_UI64:%ld\n",readBits_I64(inBuf,&bitPos,2));} //DEBUG
+            //_1DIdx=readBits_UI64(inBuf,&bitPos,bp->_1DIdxBits);
+            temp=readBits_UI64(inBuf,&bitPos,1);
+            //if(DEBUG){printf("AMG_R3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&inBuf[bitPos/8]));}; //DEBUG
+            switch(temp){
+              case 0:
+                //if(DEBUG)printf("Read:0");
+                temp2=readBits_UI64(inBuf,&bitPos,1);
+                switch(temp2){
+                  case 0:
+                    //if(DEBUG)printf("0");
+                    ECQTemp=readBits_I64(inBuf,&bitPos,1);
+                    //if(DEBUG){printf("AMG_R3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&inBuf[bitPos/8]));}; //DEBUG
+                    //if(DEBUG)printf("R:ECQTemp:%ld\n",ECQTemp);
+                    ECQTemp= ((ECQTemp<<63)>>63)|(uint64_t)0x1;
+                    //if(DEBUG)printf("R:ECQ[%d]: %ld\n",j,ECQTemp);
+                    break;
+                  case 1:
+                    //if(DEBUG)printf("1\n");
+                    ECQTemp=readBits_I64(inBuf,&bitPos,bp->ECQBits);
+                    //if(DEBUG){printf("AMG_R3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&inBuf[bitPos/8]));}; //DEBUG
+                    //if(DEBUG)printf("R:ECQ[%d]: %ld\n",j,ECQTemp);
+                    break;
+                  default:
+                    assert(0);
+                    break;
+                }
+                break;
+              case 1:
+                //if(DEBUG)printf("Read:1\n");
+                ECQTemp=0;
+                //if(DEBUG)printf("R:ECQ[%d]: %ld\n",j,ECQTemp);
+                break;
+              default:
+                assert(0);
+                break;
+            }
+            
+            //if(DEBUG){printf("_1DIdx:%ld ECQTemp:0x%ld\n",_1DIdx,ECQTemp);} //DEBUG
+            //continue;
+            //sb=_1DIdx/p->sbSize; 
+            //localIdx=_1DIdx%p->sbSize;
+            
+            //data[j]-=ECQTemp*bp->binSize; //Splitting
+            ECQ[j]=ECQTemp;
+            
+            //if(DEBUG){printf("DC:data[%d]:%.6e\n",j,data[j]);} //DEBUG
+          }
+          break;
+      }
+      //static inline uint64_t readBits_UI64(unsigned char* buffer,uint64_t *bitPosPtr,uint64_t numBits){ // numBits must be in range [0:56]
+      //patternQ=(int64_t*)(inBuf+15); 
+      //scalesQ=(int64_t*)(inBuf+15+p->sbSize*8);
+      bytePos=(bitPos+7)/8;
+      if(D_G){printf("\nDC:bytePos:%ld\n",bytePos);} //DEBUG
+      
+      //STEP 2: PREDICT DATA(Includes INVERSE QUANTIZATION)
+      pastri_double_PredictData(p,bp,data,patternQ,scalesQ,ECQ);
+      break;
+      
+    default:
+      assert(0);
+      break;
+  } 
+  (*numReadBytes)=bytePos;
+}
+
+static inline void pastri_double_Decompress(unsigned char*inBuf,int dataSize,pastri_params *p,unsigned char*outBuf,int *numReadBytes){
+  /* Decompress one block from inBuf into outBuf. All of the work (reading the
+   * bit stream, then predicting the data with inverse quantization) is done
+   * by pastri_double_Decode, which also stores the number of bytes it read in
+   * *numReadBytes. This wrapper only provides the per-block scratch storage
+   * on the stack; dataSize is not read here. */
+  pastri_blockParams blockParams;
+  int64_t quantPattern[MAX_PS_SIZE];
+  int64_t quantScales[MAX_PS_SIZE];
+  int64_t quantErrCorr[MAX_BLOCK_SIZE];
+
+  pastri_double_Decode(inBuf,p,&blockParams,outBuf,numReadBytes,quantPattern,quantScales,quantErrCorr);
+}
+
+/*
+ * pastri_double_Check - debug comparison of an original block against its
+ * decompressed counterpart (inBuf vs Decompressed).
+ *
+ * inBuf    : original block, read as p->bSize doubles
+ * dataSize : not read by this function
+ * DC       : decompressed block, read as p->bSize doubles
+ * p        : parameters; bSize (element count) and usedEb (error bound) are read
+ *
+ * For every element whose absolute difference exceeds p->usedEb a diagnostic
+ * line is printed and assert(0) is executed: with assertions enabled the
+ * program stops at the first violation, with them compiled out the remaining
+ * elements are still checked.
+ *
+ * Returns 0 in every case in which the function returns; the value does not
+ * tell whether a violation was seen.
+ *
+ * Only the data values are compared. The disabled comparison of the index
+ * arrays idx0..idx3 that used to sit in this function is not carried along.
+ */
+static inline int pastri_double_Check(unsigned char*inBuf,int dataSize,unsigned char*DC,pastri_params *p){
+  //Both buffers are viewed as arrays of double.
+  const double *original=(const double*)inBuf;
+  const double *restored=(const double*)DC;
+  int idx=0;
+
+  //Comparing Data:
+  while(idx<p->bSize){
+    if(abs_FastD(original[idx]-restored[idx])>p->usedEb){
+      //The message calls the bound "originalEb"; the value compared and printed is p->usedEb.
+      printf("|data[%d]-data_dc[%d]|>originalEb : %.3e - %.3e = %.3e > %.3e\n",idx,idx,original[idx],restored[idx],abs_FastD(original[idx]-restored[idx]),p->usedEb);
+      assert(0);
+    }
+    idx++;
+  }
+
+  return 0;
+}
+
+
+#endif
diff --git a/thirdparty/SZ/sz/include/pastriF.h b/thirdparty/SZ/sz/include/pastriF.h
new file mode 100644
index 0000000000000000000000000000000000000000..08c9c140d80ee6d600234e3e99286d3e5c35c627
--- /dev/null
+++ b/thirdparty/SZ/sz/include/pastriF.h
@@ -0,0 +1,911 @@
+#ifndef PASTRIF_H
+#define PASTRIF_H
+
+static inline int64_t pastri_float_quantize(float x, float binSize){
+  /* Round x/binSize to the nearest integer, halves away from zero: 0.5 with
+   * the quotient's sign is added and the cast to int64_t truncates the sum.
+   * The sign is transplanted by OR-ing bit 63 of the quotient's .ui64 view
+   * into 0.5 (presumes .d is a 64-bit double overlaying .ui64 - confirm
+   * against the u_UI64I64D declaration). */
+  const float ratio=x/binSize;
+  u_UI64I64D ratioBits;
+  u_UI64I64D signedHalf;
+
+  ratioBits.d=ratio;
+  signedHalf.d=0.5;
+  signedHalf.ui64 |= (ratioBits.ui64 & (uint64_t)0x8000000000000000);
+
+  return (int64_t)(ratio + signedHalf.d);
+}
+
+/*
+ * pastri_float_PatternMatch - derive the quantized pattern, scales and error
+ * corrections (ECQ) for one block of floats.
+ *
+ * data     : p->bSize input values, addressed as p->sbNum sub-blocks of p->sbSize
+ * p        : parameters; bSize, sbSize, sbNum and usedEb are read
+ * bp       : receives nonZeros, binSize, patternBits, scaleBits, scalesBinSize,
+ *            ECQExt, ECQ1s and ECQOthers
+ * patternQ : out, p->sbSize quantized values of the sub-block holding the extremum
+ * scalesQ  : out, p->sbNum quantized per-sub-block scale factors
+ * ECQ      : out, quantized (prediction - data), one entry per sb*sbSize+i index
+ *
+ * Steps:
+ *   1. Locate the element with the largest magnitude (the extremum) and count
+ *      the elements whose magnitude exceeds p->usedEb.
+ *   2. Quantize the sub-block containing the extremum with bin size 2*usedEb;
+ *      this becomes the pattern.
+ *   3. For every sub-block, quantize the ratio of its element at the
+ *      extremum's local offset to the extremum; these are the scales.
+ *   4. Quantize (scale*pattern*PS_binSize - data) with bin size 2*usedEb into
+ *      ECQ and gather the ECQ statistics stored in bp.
+ *
+ * Requires p->bSize >= 1 and p->sbSize >= 1 (data[0] may be read and sbSize is
+ * used as a divisor).
+ */
+static inline void pastri_float_PatternMatch(float*data,pastri_params* p,pastri_blockParams* bp,int64_t* patternQ,int64_t *scalesQ, int64_t* ECQ){
+  int i,sb;
+  int _1DIdx;
+
+  //Find the pattern.
+  //First, find the extremum point and count the values above the error bound.
+  //extIdx starts at 0 rather than -1: when no element has a magnitude above 0 (for
+  //example an all-zero block) it is never updated, and -1 would make data[extIdx]
+  //below read in front of the buffer. Starting at 0, an all-zero block simply gets
+  //patternExt==0, an all-zero pattern and all-zero scales.
+  float absExt=0; //Absolute value of Extremum
+  int extIdx=0; //Index of Extremum
+  bp->nonZeros=0;
+  for(i=0;i<p->bSize;i++){
+    if(abs_FastD(data[i])>p->usedEb){
+      bp->nonZeros++;
+    }
+    if(abs_FastD(data[i])>absExt){
+      absExt=abs_FastD(data[i]);
+      extIdx=i;
+    }
+  }
+
+  //The pattern is the sub-block that contains the extremum.
+  int patternIdx=(extIdx/p->sbSize)*p->sbSize; //Starting Index of Pattern
+  float patternExt=data[extIdx];
+  bp->binSize=2*p->usedEb;
+
+  for(i=0;i<p->sbSize;i++){
+    patternQ[i]=pastri_float_quantize(data[patternIdx+i],bp->binSize);
+    if(D_W){printf("patternQ[%d]=%ld\n",i,patternQ[i]);}
+  }
+
+  //Pattern and scale values share one bit width, derived from the quantized extremum.
+  //NOTE(review): the trailing +1 presumably reserves a sign bit - confirm against bitsNeeded_float.
+  bp->patternBits=bitsNeeded_float((abs_FastD(patternExt)/bp->binSize)+1)+1;
+  bp->scaleBits=bp->patternBits;
+  bp->scalesBinSize=1/(float)(((uint64_t)1<<(bp->scaleBits-1))-1);
+  if(D_W){printf("scalesBinSize: %.6e\n",bp->scalesBinSize);} //DEBUG
+
+  //Calculate Scales.
+  //Each scale is the sub-block's value at the extremum's local offset divided by the extremum.
+  //A zero extremum yields zero scales instead of a division by zero.
+  int localExtIdx=extIdx%p->sbSize; //Local extremum index. This is not the actual extremum of the current sb, but rather the index that correspond to the global (block) extremum.
+  int patternExtZero=(patternExt==0);
+  for(sb=0;sb<p->sbNum;sb++){
+    scalesQ[sb]=pastri_float_quantize((patternExtZero ? 0 : data[sb*p->sbSize+localExtIdx]/patternExt),bp->scalesBinSize);
+    if(D_W){printf("scalesQ[%d]=%ld\n",sb,scalesQ[sb]);}
+  }
+
+  //Error corrections: quantized difference between the scaled-pattern prediction and
+  //the data. Also track the largest |ECQ| and how many entries are +-1 versus larger.
+  bp->ECQExt=0;
+  bp->ECQ1s=0;
+  bp->ECQOthers=0;
+  float PS_binSize=bp->scalesBinSize*bp->binSize;
+  for(sb=0;sb<p->sbNum;sb++){
+    for(i=0;i<p->sbSize;i++){
+      _1DIdx=sb*p->sbSize+i;
+      ECQ[_1DIdx]=pastri_float_quantize( (scalesQ[sb]*patternQ[i]*PS_binSize-data[_1DIdx]),bp->binSize );
+      //ECQExt ends up as the largest magnitude over all ECQ entries.
+      float absECQ=abs_FastD(ECQ[_1DIdx]);
+      if(absECQ > bp->ECQExt)
+        bp->ECQExt=absECQ;
+      //Classify the entry: zero, +-1, or anything larger in magnitude.
+      switch (ECQ[_1DIdx]){
+        case 0:
+          //Zeros are not counted separately.
+          break;
+        case 1:
+          bp->ECQ1s++;
+          break;
+        case -1:
+          bp->ECQ1s++;
+          break;
+        default:
+          bp->ECQOthers++;
+          break;
+      }
+    }
+  }
+}
+
+/*
+ * pastri_float_Encode - serialize one pattern-matched block of floats into outBuf.
+ *
+ * data              : original values; copied verbatim by the two uncompressed modes
+ * patternQ, scalesQ : quantized pattern (p->sbSize entries) and scales (p->sbNum entries)
+ * ECQ               : quantized error corrections (p->bSize entries)
+ * p                 : parameters; dataSize, bSize, sbSize, sbNum, idxRange and usedEb are read
+ * bp                : block statistics (ECQExt, ECQ1s, ECQOthers, nonZeros, patternBits,
+ *                     scaleBits are read); ECQBits, _1DIdxBits and numOutliers are set here
+ * outBuf            : destination buffer; outBuf[0] receives the mode (0..3)
+ * numOutBytes       : receives the size in bytes of the encoded block
+ *
+ * The size of each of the four encodings is computed up front. A mode is chosen only when
+ * it is strictly smaller than the other three; otherwise the compressed non-sparse mode is
+ * written. *numOutBytes is taken from these sizes, so they must equal what the writers
+ * below emit: header bytes count 8 bits each, and an uncompressed sparse entry is 8 index
+ * bytes plus p->dataSize value bytes (scaling by p->dataSize only works when it is 8).
+ */
+static inline void pastri_float_Encode(float *data,int64_t* patternQ,int64_t* scalesQ,int64_t* ECQ,pastri_params *p,pastri_blockParams* bp,unsigned char* outBuf,int *numOutBytes){
+  bp->ECQBits=bitsNeeded_UI64(bp->ECQExt)+1;
+  bp->_1DIdxBits=bitsNeeded_UI64(p->bSize);
+
+  int i;
+
+  //Encode: 4 options, identified by the mode byte outBuf[0]:
+  //0: Uncompressed, Sparse Data
+  //1: Uncompressed, Non-sparse Data
+  //2: Compressed, Sparse ECQ
+  //3: Compressed, Non-Sparse ECQ
+
+  unsigned int UCSparseBits;  //Uncompressed, Sparse bits. Just like the original GAMESS data. Includes: mode, nonZeros, {indexes, data}
+  unsigned int UCNonSparseBits;  //Uncompressed, NonSparse bits. Includes: mode, data
+  unsigned int CSparseBits;  //Includes: mode, compressedBytes, patternBits, ECQBits,numOutliers,P, S, {Indexes(Sparse), ECQ}
+  unsigned int CNonSparseBits;  //Includes: mode, compressedBytes, patternBits, ECQBits,P, S, {ECQ}
+  //All four sizes are in bits; each must match exactly what the corresponding writer emits.
+
+  //Consider: ECQ0s, ECQ1s, ECQOthers. Number of following values in ECQ: {0}, {1,-1}, { val<=-2, val>=2}
+  //ECQ0s is actually not needed, but others are needed.
+
+  UCSparseBits = 8*(1 + 2 + bp->nonZeros*(8 + p->dataSize));  //mode byte, 2-byte nonZeros, then per entry 4 uint16 indexes + 1 value.
+  UCNonSparseBits = 8*(1 + p->bSize*p->dataSize);  //mode byte + raw values.
+  bp->numOutliers=bp->ECQ1s+bp->ECQOthers;
+  if(bp->ECQBits==2){
+    CSparseBits = 8*(1+4+1+1+2) + bp->patternBits*p->sbSize + bp->scaleBits*p->sbNum + bp->ECQ1s*(1+bp->_1DIdxBits);
+    CNonSparseBits = 8*(1+4+1+1) + bp->patternBits*p->sbSize + bp->scaleBits*p->sbNum + p->bSize + bp->ECQ1s ;  //Or: ECQ0s+ECQ1s*2;
+  }else{ //ECQBits>2
+    CSparseBits = 8*(1+4+1+1+2) + bp->patternBits*p->sbSize + bp->scaleBits*p->sbNum + bp->ECQ1s*(2+bp->_1DIdxBits) + bp->ECQOthers*(1+bp->_1DIdxBits+bp->ECQBits);
+    //Non-sparse: 1 bit per zero, 3 bits per +-1, 2+ECQBits bits per other value.
+    CNonSparseBits = 8*(1+4+1+1)+ bp->patternBits*p->sbSize + bp->scaleBits*p->sbNum + p->bSize + bp->ECQ1s*2 + bp->ECQOthers*(1+bp->ECQBits);
+  }
+
+  int UCSparseBytes=(UCSparseBits+7)/8;
+  int UCNonSparseBytes=(UCNonSparseBits+7)/8;
+  int CSparseBytes=(CSparseBits+7)/8;
+  int CNonSparseBytes=(CNonSparseBits+7)/8;
+  uint64_t bitPos=0;
+  uint64_t bytePos=0;
+  int i0,i1,i2,i3;
+  int _1DIdx;
+
+
+  if(D_W){printf("ECQ0s:%d ECQ1s:%d ECQOthers:%d Total:%d\n",p->bSize-bp->ECQ1s-bp->ECQOthers,bp->ECQ1s,bp->ECQOthers,p->bSize);} //DEBUG
+  if(D_W){printf("numOutliers:%d\n",bp->numOutliers);} //DEBUG
+
+  //****************************************************************************************
+  //W:UCSparse
+  if((UCSparseBytes<UCNonSparseBytes) && (UCSparseBytes<CSparseBytes) && (UCSparseBytes<CNonSparseBytes) ){
+    //Uncompressed, Sparse bits. Just like the original GAMESS data. Includes: mode, nonZeros, {indexes, data}
+    *numOutBytes=UCSparseBytes;
+    if(D_G){printf("UCSparse\n");} //DEBUG
+    if(D_G)printf("ECQBits:%d\n",bp->ECQBits); //DEBUG
+    outBuf[0]=0; //mode
+
+    *(uint16_t*)(&outBuf[1])=bp->nonZeros;
+    bytePos=3;//0:mode, 1-2:NonZeros. So start from 3.
+
+    for(i0=0;i0<p->idxRange[0];i0++)
+      for(i1=0;i1<p->idxRange[1];i1++)
+        for(i2=0;i2<p->idxRange[2];i2++)
+          for(i3=0;i3<p->idxRange[3];i3++){
+            _1DIdx=p->idxRange[3]*(i2+p->idxRange[2]*(i1+i0*p->idxRange[1]))+i3;
+            if(abs_FastD(data[_1DIdx])>p->usedEb){
+              //*(uint16_t*)(&outBuf[bytePos])=i0+1+p->idxOffset[0];
+              *(uint16_t*)(&outBuf[bytePos])=i0;
+              bytePos+=2;
+              //*(uint16_t*)(&outBuf[bytePos])=i1+1+p->idxOffset[1];
+              *(uint16_t*)(&outBuf[bytePos])=i1;
+              bytePos+=2;
+              //*(uint16_t*)(&outBuf[bytePos])=i2+1+p->idxOffset[2];
+              *(uint16_t*)(&outBuf[bytePos])=i2;
+              bytePos+=2;
+              //*(uint16_t*)(&outBuf[bytePos])=i3+1+p->idxOffset[3];
+              *(uint16_t*)(&outBuf[bytePos])=i3;
+              bytePos+=2;
+              //NOTE(review): pastri_float_Decode steps 8 bytes per value in this mode; confirm it agrees with p->dataSize.
+              *(float*)(&outBuf[bytePos])=data[_1DIdx];
+              bytePos+=p->dataSize;
+            }
+          }
+
+    if(D_G)printf("UCSparseBytes:%d \n",UCSparseBytes); //DEBUG
+
+  //****************************************************************************************
+  //}else if(0){ //DEBUG
+  //W:UCNonSparse
+  }else if((UCNonSparseBytes<UCSparseBytes) && (UCNonSparseBytes<CSparseBytes) && (UCNonSparseBytes<CNonSparseBytes) ){
+    //Uncompressed, NonSparse bits. Includes: mode, data
+    *numOutBytes=UCNonSparseBytes;
+    if(D_G){printf("UCNonSparse\n");} //DEBUG
+    if(D_G)printf("ECQBits:%d\n",bp->ECQBits); //DEBUG
+    outBuf[0]=1; //mode
+
+    //Raw copy of the whole block right after the mode byte.
+    memcpy(&outBuf[1], data, p->bSize*p->dataSize);
+
+    if(D_G)printf("UCNonSparseBytes:%d \n",UCNonSparseBytes); //DEBUG
+  //****************************************************************************************
+  //}else if(1){ //DEBUG
+  //W:CSparse
+  }else if((CSparseBytes<UCNonSparseBytes) && (CSparseBytes<UCSparseBytes) && (CSparseBytes<CNonSparseBytes) ){
+    //Includes: mode, compressedBytes, patternBits, ECQBits,numOutliers,P, S, {Indexes(Sparse), ECQ}
+    *numOutBytes=CSparseBytes;
+    if(D_G){printf("CSparse\n");} //DEBUG
+    if(D_G)printf("ECQBits:%d\n",bp->ECQBits); //DEBUG
+    //if(DEBUG){printf("patternBits:%d _1DIdxBits:%d\n",patternBits,_1DIdxBits);} //DEBUG
+    outBuf[0]=2; //mode
+
+    //Header layout (9 bytes):
+    //  [0]    mode
+    //  [1:4]  compressedBytes, filled in after the payload has been written
+    //  [5]    patternBits
+    //  [6]    ECQBits
+    //  [7:8]  numOutliers
+
+    //outBuf bytes [1:4] are reserved for compressedBytes.
+    outBuf[5]=bp->patternBits;
+    outBuf[6]=bp->ECQBits;
+    //Currently, we are at the end of 7th byte.
+
+    *(uint16_t*)(&outBuf[7])=bp->numOutliers;
+    //Now, we are at the end of 9th byte.
+    bitPos=9*8;
+
+    //if(DEBUG){printf("bitPos_B:%ld\n",bitPos);} //DEBUG
+
+    for(i=0;i<p->sbSize;i++){
+      writeBits_Fast(outBuf,&bitPos,bp->patternBits,patternQ[i]);//Pattern point
+    }
+    //if(DEBUG){printf("bitPos_P:%ld\n",bitPos);} //DEBUG
+    for(i=0;i<p->sbNum;i++){
+      writeBits_Fast(outBuf,&bitPos,bp->scaleBits,scalesQ[i]);//Scale
+    }
+    //if(DEBUG){printf("bitPos_S:%ld\n",bitPos);} //DEBUG
+    //if(DEBUG)printf("ECQBits:%d\n",ECQBits);
+    switch(bp->ECQBits){
+      case 2:
+        for(i=0;i<p->bSize;i++){
+          switch(ECQ[i]){
+            case 0:
+              break;
+            case 1:
+              //if(DEBUG)printf("Index:%d ECQ:%ld Written:0x0\n",i,ECQ[i]); //DEBUG
+              writeBits_Fast(outBuf,&bitPos,bp->_1DIdxBits,i);
+              //writeBits_Fast(outBuf,&bitPos,2,0x10);
+              //writeBits_Fast(outBuf,&bitPos,2,0);//0x00
+              //writeBits_Fast(outBuf,&bitPos,2,0);//0x00
+              writeBits_Fast(outBuf,&bitPos,1,0);//0x00
+              break;
+            case -1:
+              //if(DEBUG)printf("Index:%d ECQ:%ld Written:0x1\n",i,ECQ[i]); //DEBUG
+              writeBits_Fast(outBuf,&bitPos,bp->_1DIdxBits,i);
+              //writeBits_Fast(outBuf,&bitPos,2,0x11);
+              //writeBits_Fast(outBuf,&bitPos,2,1);//0x01
+              //writeBits_Fast(outBuf,&bitPos,1,0);
+              writeBits_Fast(outBuf,&bitPos,1,1);
+              break;
+            default:
+              assert(0);
+              break;
+          }
+        }
+        break;
+      default: //ECQBits>2
+      for(i=0;i<p->bSize;i++){
+        switch(ECQ[i]){
+          case 0:
+            break;
+          case 1:
+            //if(DEBUG)printf("Index:%d ECQ:%ld Written:0x00\n",i,ECQ[i]); //DEBUG
+            writeBits_Fast(outBuf,&bitPos,bp->_1DIdxBits,i);
+            //writeBits_Fast(outBuf,&bitPos,3,0);//0x000
+            //writeBits_Fast(outBuf,&bitPos,1,0);
+            writeBits_Fast(outBuf,&bitPos,1,0);
+            writeBits_Fast(outBuf,&bitPos,1,0);
+            break;
+          case -1:
+            //if(DEBUG)printf("Index:%d ECQ:%ld Written:0x01\n",i,ECQ[i]); //DEBUG
+            writeBits_Fast(outBuf,&bitPos,bp->_1DIdxBits,i);
+            //writeBits_Fast(outBuf,&bitPos,3,1);//0x001
+            //writeBits_Fast(outBuf,&bitPos,1,0);
+            writeBits_Fast(outBuf,&bitPos,1,0);
+            writeBits_Fast(outBuf,&bitPos,1,1);
+            break;
+          default:
+            //if(DEBUG)printf("Index:%d ECQ:%ld Written:0x1 0x%lx\n",i,ECQ[i],ECQ[i]); //DEBUG
+            writeBits_Fast(outBuf,&bitPos,bp->_1DIdxBits,i);
+            //writeBits_Fast(outBuf,&bitPos,2+ECQBits,((uint64_t)0x11<<ECQBits)|ECQ[i]);
+            //writeBits_Fast(outBuf,&bitPos,2+ECQBits,(ECQ[i]&((uint64_t)0x00<<ECQBits))|((uint64_t)0x01<<ECQBits));
+            //writeBits_Fast(outBuf,&bitPos,1,0);
+            writeBits_Fast(outBuf,&bitPos,1,1);
+            writeBits_Fast(outBuf,&bitPos,bp->ECQBits,ECQ[i]);
+            break;
+        }
+      }
+      break;
+    }
+
+    //if(DEBUG){printf("bitPos_E:%ld\n",bitPos);} //DEBUG
+    if(D_C){if(!((bp->ECQBits>=2)||((bp->ECQBits==1) && (bp->numOutliers==0)))){printf("ERROR: ECQBits:%d numOutliers:%d This should not have happened!\n",bp->ECQBits,bp->numOutliers);assert(0);}} //DEBUG
+
+
+    uint32_t compressedBytes=(bitPos+7)/8;
+    //Fill in the compressedBytes field reserved in header bytes [1:4].
+    *(uint32_t*)(&outBuf[1])=compressedBytes;
+
+    if(D_G)printf("bitPos:%ld CSparseBits:%d bytePos:%d CSparseBytes:%d\n",bitPos,CSparseBits,compressedBytes,CSparseBytes); //DEBUG
+    if(D_G){assert(bitPos==CSparseBits);}
+
+  //****************************************************************************************
+  //W:CNonSparse
+  }else {
+    //Includes: mode, compressedBytes, patternBits, ECQBits,P, S, {ECQ}
+    *numOutBytes=CNonSparseBytes;
+    if(D_G){printf("CNonSparse\n");} //DEBUG
+    if(D_G)printf("ECQBits:%d\n",bp->ECQBits); //DEBUG
+    //if(DEBUG){printf("patternBits:%d _1DIdxBits:%d\n",patternBits,_1DIdxBits);} //DEBUG
+    outBuf[0]=3; //mode
+
+    //Header layout (7 bytes):
+    //  [0]    mode
+    //  [1:4]  compressedBytes, filled in after the payload has been written
+    //  [5]    patternBits, [6] ECQBits
+
+    //outBuf bytes [1:4] are reserved for compressedBytes.
+    outBuf[5]=bp->patternBits;
+    outBuf[6]=bp->ECQBits;
+    bitPos=7*8; //Currently, we are at the end of 7th byte.
+
+    //if(DEBUG){printf("bitPos_B:%ld\n",bitPos);} //DEBUG
+
+    for(i=0;i<p->sbSize;i++){
+      writeBits_Fast(outBuf,&bitPos,bp->patternBits,patternQ[i]);//Pattern point
+    }
+    //if(DEBUG){printf("bitPos_P:%ld\n",bitPos);} //DEBUG
+    for(i=0;i<p->sbNum;i++){
+      writeBits_Fast(outBuf,&bitPos,bp->scaleBits,scalesQ[i]);//Scale
+    }
+    //if(DEBUG){printf("bitPos_S:%ld\n",bitPos);} //DEBUG
+    //if(DEBUG)printf("ECQBits:%d\n",ECQBits);
+    switch(bp->ECQBits){
+      case 2:
+        for(i=0;i<p->bSize;i++){
+          switch(ECQ[i]){
+            case 0:
+              //if(DEBUG)printf("Index:%d ECQ:%d Written:0x1\n",i,ECQ[i]); //DEBUG
+              writeBits_Fast(outBuf,&bitPos,1,1);//0x1
+              break;
+            case 1:
+              //if(DEBUG)printf("Index:%d ECQ:%d Written:0x00\n",i,ECQ[i]); //DEBUG
+              //writeBits_Fast(outBuf,&bitPos,2,0);//0x00
+              writeBits_Fast(outBuf,&bitPos,1,0);
+              writeBits_Fast(outBuf,&bitPos,1,0);
+              break;
+            case -1:
+              //if(DEBUG)printf("Index:%d ECQ:%d Written:0x01\n",i,ECQ[i]); //DEBUG
+              //writeBits_Fast(outBuf,&bitPos,2,2); //0x01
+              writeBits_Fast(outBuf,&bitPos,1,0);
+              writeBits_Fast(outBuf,&bitPos,1,1);
+              break;
+            default:
+              assert(0);
+              break;
+          }
+        }
+        break;
+      default: //ECQBits>2
+        //if(DEBUG) printf("AMG_W1:bitPos:%ld\n",bitPos); //DEBUG
+        for(i=0;i<p->bSize;i++){
+          //if(DEBUG){printf("AMG_W3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&outBuf[bitPos/8]));}; //DEBUG
+          //if(DEBUG) printf("AMG_W2:bitPos:%ld\n",bitPos); //DEBUG
+          //if(DEBUG) printf("ECQ[%d]:%ld\n",i,ECQ[i]); //DEBUG
+          switch(ECQ[i]){
+            case 0:
+              //if(DEBUG)printf("Index:%d ECQ:%ld Written:0x1\n",i,ECQ[i]); //DEBUG
+              //if(DEBUG){printf("AMG_WB3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&outBuf[bitPos/8]));}; //DEBUG
+              //temp1=bitPos;
+              writeBits_Fast(outBuf,&bitPos,1,1);  //0x1
+              //wVal=1; writeBits_Fast(outBuf,&bitPos,1,wVal); //0x1
+              //if(DEBUG){printf("AMG_WA3:bitPos:%ld buffer[%ld]=0x%lx\n",temp1,temp1/8,*(uint64_t*)(&outBuf[temp1/8]));}; //DEBUG
+              break;
+            case 1:
+              //if(DEBUG)printf("Index:%d ECQ:%ld Written:0x000\n",i,ECQ[i]); //DEBUG
+              //if(DEBUG){printf("AMG_WB3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&outBuf[bitPos/8]));}; //DEBUG
+              //temp1=bitPos;
+              //writeBits_Fast(outBuf,&bitPos,3,0); //0x000
+              writeBits_Fast(outBuf,&bitPos,1,0);
+              writeBits_Fast(outBuf,&bitPos,1,0);
+              writeBits_Fast(outBuf,&bitPos,1,0);
+              //wVal=0; writeBits_Fast(outBuf,&bitPos,3,wVal); //0x000
+              //if(DEBUG){printf("AMG_WA3:bitPos:%ld buffer[%ld]=0x%lx\n",temp1,temp1/8,*(uint64_t*)(&outBuf[temp1/8]));}; //DEBUG
+              break;
+            case -1:
+              //if(DEBUG)printf("Index:%d ECQ:%ld Written:0x001\n",i,ECQ[i]); //DEBUG
+              //if(DEBUG){printf("AMG_WB3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&outBuf[bitPos/8]));}; //DEBUG
+              //temp1=bitPos;
+              //writeBits_Fast(outBuf,&bitPos,3,8); //0x001
+              writeBits_Fast(outBuf,&bitPos,1,0);
+              writeBits_Fast(outBuf,&bitPos,1,0);
+              writeBits_Fast(outBuf,&bitPos,1,1);
+              //wVal=8; writeBits_Fast(outBuf,&bitPos,3,wVal); //0x001
+              //if(DEBUG){printf("AMG_WA3:bitPos:%ld buffer[%ld]=0x%lx\n",temp1,temp1/8,*(uint64_t*)(&outBuf[temp1/8]));}; //DEBUG
+              break;
+            default:
+              //if(DEBUG)printf("Index:%d ECQ:%ld Written:0x01 0x%lx\n",i,ECQ[i]); //DEBUG
+              //if(DEBUG){printf("AMG_WB3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&outBuf[bitPos/8]));}; //DEBUG
+              //temp1=bitPos;
+              //writeBits_Fast(outBuf,&bitPos,2,2); //0x01
+              writeBits_Fast(outBuf,&bitPos,1,0);
+              writeBits_Fast(outBuf,&bitPos,1,1);
+              //wVal=2; writeBits_Fast(outBuf,&bitPos,2,wVal); //0x01
+              writeBits_Fast(outBuf,&bitPos,bp->ECQBits,ECQ[i]);
+              //if(DEBUG){printf("AMG_WA3:bitPos:%ld buffer[%ld]=0x%lx\n",temp1,temp1/8,*(uint64_t*)(&outBuf[temp1/8]));}; //DEBUG
+              break;
+          }
+        }
+        break;
+    }
+
+    //if(DEBUG){printf("bitPos_E:%ld\n",bitPos);} //DEBUG
+    if(D_C){if(!((bp->ECQBits>=2)||((bp->ECQBits==1) && (bp->numOutliers==0)))){printf("ERROR: ECQBits:%d numOutliers:%d This should not have happened!\n",bp->ECQBits,bp->numOutliers);assert(0);}} //DEBUG
+
+
+
+    uint32_t compressedBytes=(bitPos+7)/8;
+    //Fill in the compressedBytes field reserved in header bytes [1:4].
+    *(uint32_t*)(&outBuf[1])=compressedBytes;
+
+    if(D_G)printf("bitPos:%ld CNonSparseBits:%d bytePos:%d CNonSparseBytes:%d\n",bitPos,CNonSparseBits,compressedBytes,CNonSparseBytes); //DEBUG
+    if(D_G){assert(bitPos==CNonSparseBits);}
+
+  }
+  //for(i=213;i<233;i++)if(DEBUG)printf("AMG_WE:bitPos:%d buffer[%d]=0x%lx\n",i*8,i,*(uint64_t*)(&outBuf[i])); //DEBUG
+
+}
+static inline int pastri_float_Compress(unsigned char*inBuf,pastri_params *p,unsigned char*outBuf,int *numOutBytes){
+  /* Compress one block of floats: inBuf is reinterpreted as a float array,
+   * pattern-matched, and then encoded into outBuf. *numOutBytes is set by
+   * pastri_float_Encode; the return value is always 0. The quantized
+   * pattern/scale/error-correction arrays are stack scratch shared by the
+   * two stages. */
+
+  float *blockData=(float*)inBuf;
+  pastri_blockParams blockParams;
+  int64_t quantPattern[MAX_PS_SIZE];
+  int64_t quantScales[MAX_PS_SIZE];
+  int64_t quantErrCorr[MAX_BLOCK_SIZE];
+
+  if(D_G2){ //DEBUG: dump the parameters in use
+    printf("Parameters: dataSize:%d\n",p->dataSize);
+    printf("Parameters: bfs:%d %d %d %d originalEb:%.3e\n",p->bf[0],p->bf[1],p->bf[2],p->bf[3],p->usedEb);
+    printf("Parameters: idxRanges:%d %d %d %d\n",p->idxRange[0],p->idxRange[1],p->idxRange[2],p->idxRange[3]);
+    printf("Parameters: sbSize:%d sbNum:%d bSize:%d\n",p->sbSize,p->sbNum,p->bSize);
+  }
+
+  //Stage 1: pattern match (fills blockParams and the quantized arrays).
+  pastri_float_PatternMatch(blockData,p,&blockParams,quantPattern,quantScales,quantErrCorr);
+
+  //Stage 2: encode the block into outBuf (quantization already happened in stage 1).
+  pastri_float_Encode(blockData,quantPattern,quantScales,quantErrCorr,p,&blockParams,outBuf,numOutBytes);
+
+  return 0;
+}
+
+static inline float pastri_float_InverseQuantization(int64_t q, float binSize){
+  return binSize*q; //Value represented by quantization index q: q bins of width binSize.
+}
+
+static inline void pastri_float_PredictData(pastri_params *p,pastri_blockParams *bp,float *data,int64_t* patternQ,int64_t* scalesQ,int64_t* ECQ){
+  //Rebuild every value as the de-quantized scale*pattern prediction minus the de-quantized error correction.
+  const float predBinSize=bp->scalesBinSize*bp->binSize;
+  for(int idx=0;idx<p->bSize;idx++){
+    const float predicted=pastri_float_InverseQuantization(scalesQ[idx/p->sbSize]*patternQ[idx%p->sbSize],predBinSize);
+    data[idx]=predicted - pastri_float_InverseQuantization(ECQ[idx],bp->binSize);
+  }
+}
+
+static inline void pastri_float_Decode(unsigned char*inBuf,pastri_params *p,pastri_blockParams *bp,unsigned char*outBuf,int *numReadBytes,int64_t* patternQ,int64_t* scalesQ,int64_t* ECQ){
+  int j;
+  bp->_1DIdxBits=bitsNeeded_UI64(p->bSize); //Width in bits of a flattened element index in the bit stream
+  //All decoded values are stored through this float view of outBuf.
+  float *data=(float*)(outBuf);
+  int i0,i1,i2,i3;
+  //i0..i3: the four 16-bit indices of one record in mode 0 (UCSparse).
+  int _1DIdx;
+
+  int64_t ECQTemp;
+  uint64_t bytePos=0;
+  uint64_t bitPos=0;
+  uint64_t temp,temp2;
+  //bytePos/bitPos: read position inside inBuf; temp/temp2: one-bit flags.
+
+  
+  /*
+   * Decodes one PaSTRI block from inBuf into outBuf and stores the number of
+   * input bytes consumed in *numReadBytes.
+   *
+   * inBuf        : encoded block; inBuf[0] selects the layout ("mode"):
+   *                  0 = UCSparse    (raw values, only listed entries stored)
+   *                  1 = UCNonSparse (raw values, copied with one memcpy)
+   *                  2 = CSparse     (pattern+scales, ECQ for outliers only)
+   *                  3 = CNonSparse  (pattern+scales, ECQ for every entry)
+   *                Any other mode trips assert(0).
+   * p            : block geometry (bSize, sbSize, sbNum, idxRange) and usedEb.
+   * bp           : per-block scratch; its fields are filled in while decoding.
+   * outBuf       : destination buffer, accessed through a float pointer.
+   * patternQ,
+   * scalesQ, ECQ : caller-provided work arrays; filled in modes 2 and 3 and
+   *                then handed to pastri_float_PredictData to produce the data.
+   *
+   * NOTE(review): reading the mode names as "U/C = uncompressed/compressed"
+   * is inferred from the code paths; it is not stated in this file.
+   * NOTE(review): in modes 2 and 3 the bytes inBuf[1..4] are not read here.
+   */
+  
+  //The number of consumed bytes is reported through *numReadBytes at the very end.
+  
+  //inBuf[0] is "mode"
+  switch(inBuf[0]){
+    //Mode 0 (UCSparse): output is zero except for the explicitly listed records
+    case 0:
+      if(D_G){printf("\nDC:UCSparse\n");} //DEBUG
+      //Header: inBuf[1..2] holds the record count (uint16); records start at byte 3.
+      //Each record is four uint16 indices (i0..i3) followed by the value.
+      bp->nonZeros=*(uint16_t*)(&inBuf[1]);
+      bytePos=3;
+      for(j=0;j<p->bSize;j++){
+          data[j]=0;
+      }
+      for(j=0;j<bp->nonZeros;j++){
+        //i0=*(uint16_t*)(&inBuf[bytePos])-1-p->idxOffset[0]; //i0
+        i0=*(uint16_t*)(&inBuf[bytePos]); //i0
+        bytePos+=2;
+        //i1=*(uint16_t*)(&inBuf[bytePos])-1-p->idxOffset[1]; //i1
+        i1=*(uint16_t*)(&inBuf[bytePos]); //i1
+        bytePos+=2;
+        //i2=*(uint16_t*)(&inBuf[bytePos])-1-p->idxOffset[2]; //i2
+        i2=*(uint16_t*)(&inBuf[bytePos]); //i2
+        bytePos+=2;
+        //i3=*(uint16_t*)(&inBuf[bytePos])-1-p->idxOffset[3]; //i3
+        i3=*(uint16_t*)(&inBuf[bytePos]); //i3
+        bytePos+=2;
+        _1DIdx=p->idxRange[3]*(i2+p->idxRange[2]*(i1+i0*p->idxRange[1]))+i3; //Flattened index, i3 varies fastest
+        data[_1DIdx]=*(float*)(&inBuf[bytePos]);
+        bytePos+=8; //NOTE(review): advances 8 bytes although a float is read above - confirm against the encoder
+      }
+      if(D_G){printf("\nDC:bytePos:%ld\n",bytePos);} //DEBUG
+      break;
+    //Mode 1 (UCNonSparse): the stored values are copied verbatim
+    case 1:
+      if(D_G){printf("\nDC:UCNonSparse\n");} //DEBUG
+      //NOTE(review): copies p->bSize*8 bytes although data is a float* - confirm element size and outBuf capacity.
+      memcpy(data, &inBuf[1], p->bSize*8);
+      bytePos=p->bSize*8; //NOTE(review): the mode byte at inBuf[0] is not counted here, unlike the other modes - confirm
+      if(D_G){printf("\nDC:bytePos:%ld\n",bytePos);} //DEBUG
+      break;
+    //Mode 2 (CSparse): pattern/scale prediction plus ECQ values for outliers only
+    case 2:
+      if(D_G){printf("\nDC:CSparse\n");} //DEBUG
+      //Header bytes read here: inBuf[5]=patternBits, inBuf[6]=ECQBits,
+      //inBuf[7..8]=numOutliers (uint16). The bit stream starts at byte 9.
+      //ECQ[] is zeroed below, so entries that are not listed stay 0.
+      
+      //bp->patternBits=inBuf[13];
+      //bp->ECQBits=inBuf[14];      
+      
+      bp->patternBits=inBuf[5];
+      bp->ECQBits=inBuf[6];
+      
+      if(D_R){printf("bp->patternBits:%d bp->ECQBits:%d bp->_1DIdxBits:%d\n",bp->patternBits,bp->ECQBits,bp->_1DIdxBits);} //DEBUG
+      
+      //bp->numOutliers=*(uint16_t*)(&inBuf[15]);
+      //bitPos=17*8;
+      bp->numOutliers=*(uint16_t*)(&inBuf[7]);
+      bitPos=9*8;
+      if(D_R){printf("bp->numOutliers:%d\n",bp->numOutliers);} //DEBUG
+
+      bp->scalesBinSize=1/(float)(((uint64_t)1<<(bp->patternBits-1))-1);
+  
+      bp->binSize=p->usedEb*2;
+      
+      if(D_R){printf("bp->scalesBinSize:%.6e bp->binSize:%.6e bp->scalesBinSize*bp->binSize:%.6e\n",bp->scalesBinSize,bp->binSize,bp->scalesBinSize*bp->binSize);} //DEBUG
+
+      for(j=0;j<p->sbSize;j++){
+        patternQ[j]=readBits_I64(inBuf,&bitPos,bp->patternBits);//Pattern point
+        if(D_R){printf("R:patternQ[%d]=%ld\n",j,patternQ[j]);}
+      }
+      for(j=0;j<p->sbNum;j++){
+        scalesQ[j]=readBits_I64(inBuf,&bitPos,bp->patternBits);//Scale
+        if(D_R){printf("R:scalesQ[%d]=%ld\n",j,scalesQ[j]);}
+      }
+      
+      /* //Splitting
+      for(j=0;j<p->bSize;j++){
+        data[j]=scalesQ[j/p->sbSize]*patternQ[j%p->sbSize]*bp->scalesBinSize*bp->binSize;
+      }
+      */
+      for(j=0;j<p->bSize;j++){
+        ECQ[j]=0;
+      }
+      switch(bp->ECQBits){
+        case 2: //Outliers can only be +1 or -1: element index followed by one sign bit
+          for(j=0;j<bp->numOutliers;j++){
+            //if(DEBUG){printf("readBits_UI64:%ld\n",readBits_UI64(inBuf,&bitPos,bp->_1DIdxBits));} //DEBUG
+            //if(DEBUG){printf("readBits_UI64:%ld\n",readBits_I64(inBuf,&bitPos,2));} //DEBUG
+            
+            _1DIdx=readBits_UI64(inBuf,&bitPos,bp->_1DIdxBits);
+            ECQTemp=readBits_I64(inBuf,&bitPos,1);
+            ECQTemp= ((ECQTemp<<63)>>63)|(uint64_t)0x1; //bit 0 -> +1, bit 1 (read back as -1) -> -1
+            //if(D_R)printf("R:ECQ[%d]: %ld \n",_1DIdx,ECQTemp);
+            //continue;
+            //sb=_1DIdx/p->sbSize; 
+            //localIdx=_1DIdx%p->sbSize;
+            
+            ////data[_1DIdx]-=ECQTemp*bp->binSize;//Splitting
+            ECQ[_1DIdx]=ECQTemp;
+            
+            //if(DEBUG){printf("decompressed[%d]:%.6e\n",_1DIdx,data[_1DIdx]);} //DEBUG
+          }
+          break;
+        default: //bp->ECQBits>2
+          if(D_C){if(!((bp->ECQBits>=2)||((bp->ECQBits==1) && (bp->numOutliers==0)))){printf("ERROR: bp->ECQBits:%d bp->numOutliers:%d This should not have happened!\n",bp->ECQBits,bp->numOutliers);assert(0);}} //DEBUG
+    
+          for(j=0;j<bp->numOutliers;j++){
+            _1DIdx=readBits_UI64(inBuf,&bitPos,bp->_1DIdxBits);
+            //sb=_1DIdx/p->sbSize; 
+            //localIdx=_1DIdx%p->sbSize;
+            temp=readBits_UI64(inBuf,&bitPos,1); //Flag bit: 0 = value is +-1, 1 = full ECQBits-wide value follows
+            //if(DEBUG){printf("temp:%ld\n",temp);} //DEBUG
+            switch(temp){
+              case 0:  //+-1
+                ECQTemp=readBits_I64(inBuf,&bitPos,1);
+                ECQTemp= ((ECQTemp<<63)>>63)|(uint64_t)0x1;
+                //if(DEBUG){printf("_1DIdx:%ld ECQTemp:0x%ld\n",_1DIdx,ECQTemp);} //DEBUG
+                //if(D_R)printf("R:ECQ[%d]: %ld \n",_1DIdx,ECQTemp);
+                break;
+              case 1: //Others
+                ECQTemp=readBits_I64(inBuf,&bitPos,bp->ECQBits);
+                //if(DEBUG){printf("_1DIdx:%ld ECQTemp:0x%ld\n",_1DIdx,ECQTemp);} //DEBUG
+                //if(D_R)printf("R:ECQ[%d]: %ld \n",_1DIdx,ECQTemp);
+                break;
+              //default:
+              //  printf("ERROR: Bad 2-bit value: 0x%lx",temp);
+              // assert(0); //AMG
+              //  break;
+            }
+            
+            //data[_1DIdx]-=ECQTemp*bp->binSize;//Splitting
+            ECQ[_1DIdx]=ECQTemp;
+            
+            //if(DEBUG){printf("decompressed[%d]:%.6e\n",_1DIdx,data[_1DIdx]);} //DEBUG
+          }
+          break;
+      }
+      //static inline uint64_t readBits_UI64(unsigned char* buffer,uint64_t *bitPosPtr,uint64_t numBits){ // numBits must be in range [0:56]
+      //patternQ=(int64_t*)(inBuf+15); 
+      //scalesQ=(int64_t*)(inBuf+15+p->sbSize*8);
+      
+      bytePos=(bitPos+7)/8; //Round the bit position up to whole bytes
+      if(D_G){printf("\nDC:bytePos:%ld\n",bytePos);} //DEBUG
+      
+      //STEP 2: PREDICT DATA(Includes INVERSE QUANTIZATION)
+      pastri_float_PredictData(p,bp,data,patternQ,scalesQ,ECQ);
+
+      break;
+    //Mode 3 (CNonSparse): like mode 2, but an ECQ code is stored for every element
+    case 3:
+      if(D_G){printf("\nDC:CNonSparse\n");} //DEBUG
+      
+      //Header bytes read here: inBuf[5]=patternBits, inBuf[6]=ECQBits.
+      //There is no outlier count; the bit stream starts at byte 7 and
+      //carries one code per element, in element order.
+      
+      //bp->patternBits=inBuf[13];
+      //bp->ECQBits=inBuf[14];
+      
+      bp->patternBits=inBuf[5];
+      bp->ECQBits=inBuf[6];
+      
+      if(D_R){printf("bp->patternBits:%d bp->ECQBits:%d bp->_1DIdxBits:%d\n",bp->patternBits,bp->ECQBits,bp->_1DIdxBits);} //DEBUG
+      
+      //bitPos=15*8;
+      bitPos=7*8;
+
+      bp->scalesBinSize=1/(float)(((uint64_t)1<<(bp->patternBits-1))-1);
+      bp->binSize=p->usedEb*2;
+      
+      if(D_R){printf("bp->scalesBinSize:%.6e bp->binSize:%.6e bp->scalesBinSize*bp->binSize:%.6e\n",bp->scalesBinSize,bp->binSize,bp->scalesBinSize*bp->binSize);} //DEBUG
+
+      for(j=0;j<p->sbSize;j++){
+        patternQ[j]=readBits_I64(inBuf,&bitPos,bp->patternBits);//Pattern point
+        if(D_R){printf("R:patternQ[%d]=%ld\n",j,patternQ[j]);}
+      }
+      for(j=0;j<p->sbNum;j++){
+        scalesQ[j]=readBits_I64(inBuf,&bitPos,bp->patternBits);//Scale
+        if(D_R){printf("R:scalesQ[%d]=%ld\n",j,scalesQ[j]);}
+      }
+      /* //Splitting
+      for(j=0;j<p->bSize;j++){
+        data[j]=scalesQ[j/p->sbSize]*patternQ[j%p->sbSize]*bp->scalesBinSize*bp->binSize;
+        //if(DEBUG){printf("DC:PS[%d]=%.6e\n",j,data[j]);}
+      }
+      */
+      switch(bp->ECQBits){
+        case 2: //Per element: flag bit 1 = zero; flag bit 0 followed by a sign bit = +-1
+          for(j=0;j<p->bSize;j++){
+            //if(DEBUG){printf("readBits_UI64:%ld\n",readBits_UI64(inBuf,&bitPos,bp->_1DIdxBits));} //DEBUG
+            //if(DEBUG){printf("readBits_UI64:%ld\n",readBits_I64(inBuf,&bitPos,2));} //DEBUG
+            //_1DIdx=readBits_UI64(inBuf,&bitPos,bp->_1DIdxBits);
+            temp=readBits_UI64(inBuf,&bitPos,1);
+            switch(temp){
+              case 0:
+                ECQTemp=readBits_I64(inBuf,&bitPos,1);
+                ECQTemp= ((ECQTemp<<63)>>63)|(uint64_t)0x1;
+                break;
+              case 1:
+                ECQTemp=0;
+                break;
+              default:
+                assert(0);
+                break;
+            }
+            
+            //if(DEBUG){printf("_1DIdx:%ld ECQTemp:0x%ld\n",_1DIdx,ECQTemp);} //DEBUG
+            //continue;
+            //sb=_1DIdx/p->sbSize; 
+            //localIdx=_1DIdx%p->sbSize;
+            
+            //data[j]-=ECQTemp*bp->binSize; //Splitting
+            ECQ[j]=ECQTemp;
+            
+            //if(DEBUG){printf("decompressed[%d]:%.6e\n",_1DIdx,data[_1DIdx]);} //DEBUG
+          }
+          break;
+        default: //bp->ECQBits>2
+          //Per element: bit 1 = zero; bits 0,0 + sign bit = +-1; bits 0,1 + ECQBits bits = signed value
+          
+          for(j=0;j<p->bSize;j++){
+            //if(DEBUG){printf("AMG_R3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&inBuf[bitPos/8]));}; //DEBUG
+            //if(DEBUG)printf("AMG_R2:bitPos: %ld\n",bitPos);
+
+            //if(DEBUG){printf("readBits_UI64:%ld\n",readBits_UI64(inBuf,&bitPos,bp->_1DIdxBits));} //DEBUG
+            //if(DEBUG){printf("readBits_UI64:%ld\n",readBits_I64(inBuf,&bitPos,2));} //DEBUG
+            //_1DIdx=readBits_UI64(inBuf,&bitPos,bp->_1DIdxBits);
+            temp=readBits_UI64(inBuf,&bitPos,1);
+            //if(DEBUG){printf("AMG_R3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&inBuf[bitPos/8]));}; //DEBUG
+            switch(temp){
+              case 0:
+                //if(DEBUG)printf("Read:0");
+                temp2=readBits_UI64(inBuf,&bitPos,1);
+                switch(temp2){
+                  case 0:
+                    //if(DEBUG)printf("0");
+                    ECQTemp=readBits_I64(inBuf,&bitPos,1);
+                    //if(DEBUG){printf("AMG_R3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&inBuf[bitPos/8]));}; //DEBUG
+                    //if(DEBUG)printf("R:ECQTemp:%ld\n",ECQTemp);
+                    ECQTemp= ((ECQTemp<<63)>>63)|(uint64_t)0x1;
+                    //if(DEBUG)printf("R:ECQ[%d]: %ld\n",j,ECQTemp);
+                    break;
+                  case 1:
+                    //if(DEBUG)printf("1\n");
+                    ECQTemp=readBits_I64(inBuf,&bitPos,bp->ECQBits);
+                    //if(DEBUG){printf("AMG_R3:bitPos:%ld buffer[%ld]=0x%lx\n",bitPos,bitPos/8,*(uint64_t*)(&inBuf[bitPos/8]));}; //DEBUG
+                    //if(DEBUG)printf("R:ECQ[%d]: %ld\n",j,ECQTemp);
+                    break;
+                  default:
+                    assert(0);
+                    break;
+                }
+                break;
+              case 1:
+                //if(DEBUG)printf("Read:1\n");
+                ECQTemp=0;
+                //if(DEBUG)printf("R:ECQ[%d]: %ld\n",j,ECQTemp);
+                break;
+              default:
+                assert(0);
+                break;
+            }
+            
+            //if(DEBUG){printf("_1DIdx:%ld ECQTemp:0x%ld\n",_1DIdx,ECQTemp);} //DEBUG
+            //continue;
+            //sb=_1DIdx/p->sbSize; 
+            //localIdx=_1DIdx%p->sbSize;
+            
+            //data[j]-=ECQTemp*bp->binSize; //Splitting
+            ECQ[j]=ECQTemp;
+            
+            //if(DEBUG){printf("DC:data[%d]:%.6e\n",j,data[j]);} //DEBUG
+          }
+          break;
+      }
+      //static inline uint64_t readBits_UI64(unsigned char* buffer,uint64_t *bitPosPtr,uint64_t numBits){ // numBits must be in range [0:56]
+      //patternQ=(int64_t*)(inBuf+15); 
+      //scalesQ=(int64_t*)(inBuf+15+p->sbSize*8);
+      bytePos=(bitPos+7)/8; //Round the bit position up to whole bytes
+      if(D_G){printf("\nDC:bytePos:%ld\n",bytePos);} //DEBUG
+      
+      //STEP 2: PREDICT DATA(Includes INVERSE QUANTIZATION)
+      pastri_float_PredictData(p,bp,data,patternQ,scalesQ,ECQ);
+      break;
+      
+    default:
+      assert(0);
+      break;
+  } 
+  (*numReadBytes)=bytePos;
+}
+
+static inline void pastri_float_Decompress(unsigned char*inBuf,int dataSize,pastri_params *p,unsigned char*outBuf,int *numReadBytes){
+  //Decompresses one block: the scratch state lives on the stack and all work
+  //(decoding, data prediction, inverse quantization) is delegated to
+  //pastri_float_Decode. dataSize is not used here.
+  pastri_blockParams blockState;
+  int64_t errorCodes[MAX_BLOCK_SIZE];
+  int64_t patternWork[MAX_PS_SIZE];
+  int64_t scaleWork[MAX_PS_SIZE];
+
+  (void)dataSize;
+  pastri_float_Decode(inBuf,p,&blockState,outBuf,numReadBytes,
+                      patternWork,scaleWork,errorCodes);
+}
+
+//inBuf vs Decompressed
+//Verification helper: compares the original block (inBuf) element by
+//element with its decompressed counterpart (DC).
+//
+//  inBuf    : original data, read as p->bSize float values
+//  dataSize : not used by this function
+//  DC       : decompressed data, read as p->bSize float values
+//  p        : supplies the element count (bSize) and the bound (usedEb)
+//
+//For every element whose absolute difference exceeds p->usedEb one
+//diagnostic line is printed and assert(0) fires (a no-op when NDEBUG is
+//defined). The return value is always 0.
+static inline int pastri_float_Check(unsigned char*inBuf,int dataSize,unsigned char*DC,pastri_params *p){
+  const float *original=(const float*)(inBuf);
+  const float *restored=(const float*)(DC);
+  int k=0;
+
+  while(k<p->bSize){
+    //abs_FastD takes a double, so the float difference is widened before
+    //the sign is removed.
+    const double deviation=abs_FastD(original[k]-restored[k]);
+
+    if(deviation>p->usedEb){
+      printf("|data[%d]-data_dc[%d]|>originalEb : %.3e - %.3e = %.3e > %.3e\n",
+             k,k,
+             original[k],
+             restored[k],
+             deviation,
+             p->usedEb);
+      assert(0);
+    }
+
+    k++;
+  }
+
+  (void)dataSize;
+  return 0;
+}
+
+
+#endif
diff --git a/thirdparty/SZ/sz/include/pastriGeneral.h b/thirdparty/SZ/sz/include/pastriGeneral.h
new file mode 100644
index 0000000000000000000000000000000000000000..81149256d65d05f8626418dc360644202550e44e
--- /dev/null
+++ b/thirdparty/SZ/sz/include/pastriGeneral.h
@@ -0,0 +1,205 @@
+#ifndef PASTRIGENERAL_H
+#define PASTRIGENERAL_H
+
+
+static inline double abs_FastD(double x){
+  //|x| for a double: clears the top bit (the IEEE-754 sign bit) of its 64-bit pattern via the union.
+  u_UI64I64D pun;
+  pun.d=x;
+  pun.ui64=pun.ui64&(int64_t)0x7FFFFFFFFFFFFFFF;
+  return pun.d;
+}
+
+//Absolute value of a 64-bit signed integer.
+//The negation is done in uint64_t, so neither a signed overflow nor a right
+//shift of a negative value is involved. INT64_MIN has no representable
+//magnitude; its bit pattern comes back unchanged on two's-complement targets.
+static inline int64_t abs_FastI64(int64_t x){
+  uint64_t magnitude=(uint64_t)x;
+  if(x<0){ magnitude=(uint64_t)0-magnitude; }
+  return (int64_t)magnitude;
+}
+
+
+
+
+//Returns the number of bits needed for the integer part of |x|:
+//floor(log2(|x|))+1 for |x|>=1 (1->1, 2->2, 3->2, 4->3) and 0 for x==0.
+//Taken from the exponent field of the value; assumes IEEE-754 binary64 doubles.
+//REMEMBER: To represent the whole range [-x:x], the number of bits required is bitsNeeded(x)+1
+static inline int bitsNeeded_double(double x){
+  u_UI64I64D u1;
+  u1.d=x;
+  //0-(uint64_t)(x!=0) is an all-zero/all-one mask built in unsigned arithmetic (a signed 1<<31 would overflow int).
+  return (((u1.ui64<<1)>>53)-1022) & (0-(uint64_t)(x!=0));
+}
+
+//Float variant: x is widened to double (exact for every float) and the exponent field of that
+//double is used. Returns floor(log2(|x|))+1 for |x|>=1 and 0 for x==0; assumes IEEE-754 binary64.
+//The zero mask is built in unsigned arithmetic (a signed 1<<31 would overflow int).
+static inline int bitsNeeded_float(float x){
+  u_UI64I64D u1;
+  u1.d=x; //Casting to Double!
+  return (((u1.ui64<<1)>>53)-1022) & (0-(uint64_t)(x!=0));
+}
+
+static inline int bitsNeeded_UI64(uint64_t x){
+  //Returns the bit length of x: the position of the highest set bit plus
+  //one, or 0 when x==0 (0->0, 1->1, 2->2, 3->2, 255->8, 256->9).
+  //
+  //Binary search over the bit positions: each step checks whether anything
+  //is set above the lower `width` bits of the remaining value; if so those
+  //`width` bits are dropped and counted.
+  int bits=0;
+
+  //Anything in the upper 32 bits?
+  if((x>>32)!=0){
+    x>>=32;
+    bits+=32;
+  }
+
+  //x now fits in 32 bits
+  if((x>>16)!=0){
+    x>>=16;
+    bits+=16;
+  }
+
+  //x now fits in 16 bits
+  if((x>>8)!=0){
+    x>>=8;
+    bits+=8;
+  }
+
+  //x now fits in 8 bits
+  if((x>>4)!=0){
+    x>>=4;
+    bits+=4;
+  }
+
+  //x now fits in 4 bits
+  if((x>>2)!=0){
+    x>>=2;
+    bits+=2;
+  }
+
+  //x now fits in 2 bits
+  if((x>>1)!=0){
+    x>>=1;
+    bits+=1;
+  }
+  return bits+(int)x; //x is 0 or 1 here
+}
+
+static inline int bitsNeeded_I64(int64_t x){
+  //Bits needed for the magnitude of x (no bit is counted for the sign):
+  //|x| is taken with abs_FastI64 and measured with bitsNeeded_UI64.
+  return bitsNeeded_UI64((uint64_t)abs_FastI64(x));
+}
+
+//Implementations(They are inline, so they should be in this header file)
+
+static inline int myEndianType(){ //Should work for most cases. May not work at mixed endian systems.
+  //Returns 0 on a little-endian host and 1 on a big-endian host, decided by
+  //looking at the first byte in memory of a 64-bit integer holding 1.
+  const uint64_t probe=1;
+  const unsigned char *firstByte=(const unsigned char*)&probe;
+
+  if(*firstByte!=1){
+    return 1; //low-order byte is not stored first: big endian
+  }
+  return 0;   //low-order byte is stored first: little endian
+}
+
+static inline void flipBytes_UI64(uint64_t *dataPtr){
+  //Reverses the byte order of the 64-bit value at dataPtr, in place.
+  //Bytes are swapped pairwise from the two ends towards the middle:
+  //0<->7, 1<->6, 2<->5, 3<->4.
+  unsigned char *bytes=(unsigned char*)dataPtr;
+  int low=0;
+  int high=7;
+
+  while(low<high){
+    const unsigned char saved=bytes[high];
+
+    bytes[high]=bytes[low];
+    bytes[low]=saved;
+    low++;
+    high--;
+  }
+  return;
+}
+
+//Reads numBits bits starting at bit *bitPosPtr of buffer (bits are numbered LSB-first, bytes in address order),
+//returns them right-aligned and advances *bitPosPtr. Always reads the 8 bytes at buffer+(*bitPosPtr>>3).
+static inline uint64_t readBits_UI64(unsigned char* buffer,uint64_t *bitPosPtr,char numBits){ // numBits must be in range [0:56]
+    const unsigned char *src = buffer + ( *bitPosPtr >> 3); //NOTE: bitPos>>3 is the same as bitPos/8
+    uint64_t mask = ((uint64_t)0x0000000000000001<<numBits)-1;
+    uint64_t temp64b = 0;
+    int i;
+    //Assemble the 64-bit little-endian window byte by byte: this is valid for
+    //any pointer alignment and gives the same value on every host byte order.
+    for(i=0;i<8;i++){
+      temp64b |= (uint64_t)src[i] << (8*i);
+    }
+    temp64b >>= (*bitPosPtr) & (uint64_t)0x0000000000000007;
+    (*bitPosPtr) += numBits;
+    return (temp64b & mask);
+}
+
+static inline int64_t readBits_I64(unsigned char* buffer,uint64_t *bitPosPtr,char numBits){ // numBits must be in range [0:56]
+  //Reads numBits bits with readBits_UI64 and sign-extends the result from bit numBits-1.
+  //Returns 0 for numBits==0. No signed value is shifted, so there is no undefined shift.
+  uint64_t raw=readBits_UI64(buffer,bitPosPtr,numBits);//Read value
+  uint64_t signBit=(numBits>0)?((uint64_t)1<<(numBits-1)):0;
+  return (int64_t)((raw^signBit)-signBit);//Sign correction: subtracts 2^numBits when the top bit is set
+}
+
+//Reads numBits bits at bit *bitPosPtr (LSB-first, bytes in address order) and advances *bitPosPtr; host byte order does not matter.
+static inline uint64_t readBits_EndianSafe(unsigned char* buffer,uint64_t *bitPosPtr,char numBits){ // numBits must be in range [0:56]
+    const unsigned char *src = buffer + ((*bitPosPtr)>>3); //NOTE: (*bitPosPtr)>>3 is the same as (*bitPosPtr)/8
+    uint64_t mask = ((uint64_t)0x0000000000000001<<numBits)-1;
+    uint64_t temp64b = 0;
+    int i;
+    for(i=0;i<8;i++){ temp64b |= (uint64_t)src[i] << (8*i); } //little-endian window built from single bytes: no unaligned load, no byte flip
+    temp64b >>= (*bitPosPtr) & (uint64_t)0x0000000000000007;
+    (*bitPosPtr) += numBits;
+    return temp64b & mask;
+}
+
+//Appends the low numBits bits of data to the bit stream in buffer at bit
+//position *bitPosPtr and advances *bitPosPtr by numBits.
+//WARNING: works properly only on Little Endian machines, because the bits
+//are OR-ed into an 8-byte word accessed through a uint64_t pointer.
+//The target bytes must already be 0 (bits are OR-ed in, never cleared).
+//numBits is not validated; bits of data above numBits are masked off.
+static inline void writeBits_Fast(unsigned char* buffer,uint64_t *bitPosPtr,char numBits,int64_t data){
+    const uint64_t bitOffset = (*bitPosPtr) & (uint64_t)0x0000000000000007;
+    uint64_t *window = (uint64_t*)(buffer + ((*bitPosPtr)>>3));
+    const uint64_t lowBits = ((uint64_t)0x0000000000000001<<numBits)-1;
+    const uint64_t payload = ((uint64_t)data) & lowBits;
+
+    //Merge the payload at the sub-byte offset of the current position.
+    *window |= payload << bitOffset;
+
+    (*bitPosPtr) += numBits;
+}
+
+//WARNING: writeBits_EndianSafe is not tested on Big-Endian machines
+static inline void writeBits_EndianSafe(unsigned char* buffer,uint64_t *bitPosPtr,char numBits,uint64_t data){
+    //ORs the low numBits bits of data into buffer at bit *bitPosPtr and advances
+    //*bitPosPtr. On big-endian hosts the 8-byte window is byte-flipped before
+    //merging and flipped back before it is stored.
+    uint64_t *window=(uint64_t*)(buffer + ((*bitPosPtr)>>3));
+    const int bigEndianHost=myEndianType();
+    const uint64_t lowBits=((uint64_t)0x0000000000000001<<numBits)-1;
+    uint64_t existing=*window;
+    uint64_t merged=(data&lowBits) << ((*bitPosPtr) & (uint64_t)0x0000000000000007);
+    if(bigEndianHost){ flipBytes_UI64(&existing); }
+    merged |= existing;
+    if(bigEndianHost){ flipBytes_UI64(&merged); }
+    *window=merged;  // "|=" may also work
+    (*bitPosPtr) += numBits;
+}
+
+
+#endif
diff --git a/thirdparty/SZ/sz/include/rw.h b/thirdparty/SZ/sz/include/rw.h
new file mode 100644
index 0000000000000000000000000000000000000000..846243de5e0fe58a266e8f62f487649b60cb2ebb
--- /dev/null
+++ b/thirdparty/SZ/sz/include/rw.h
@@ -0,0 +1,89 @@
+/**
+ *  @file rw.h
+ *  @author Sheng Di
+ *  @date April, 2015
+ *  @brief Header file for the whole io interface.
+ *  (C) 2015 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _IO_H
+#define _IO_H
+
+#include <stdio.h>
+#include <stdint.h>
+
+#ifdef _WIN32
+#define PATH_SEPARATOR ';'
+#else
+#define PATH_SEPARATOR ':'
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int checkFileExistance(char* filePath);
+
+float** create2DArray_float(size_t m, size_t n);
+void free2DArray_float(float** data, size_t m);
+float*** create3DArray_float(size_t p, size_t m, size_t n);
+void free3DArray_float(float*** data, size_t p, size_t m);
+double** create2DArray_double(size_t m, size_t n);
+void free2DArray_double(double** data, size_t m);
+double*** create3DArray_double(size_t p, size_t m, size_t n);
+void free3DArray_double(double*** data, size_t p, size_t m);
+size_t checkFileSize(char *srcFilePath, int *status);
+
+unsigned char *readByteData(char *srcFilePath, size_t *byteLength, int *status);
+double *readDoubleData(char *srcFilePath, size_t *nbEle, int *status);
+int8_t *readInt8Data(char *srcFilePath, size_t *nbEle, int *status);
+int16_t *readInt16Data(char *srcFilePath, size_t *nbEle, int *status);
+uint16_t *readUInt16Data(char *srcFilePath, size_t *nbEle, int *status);
+int32_t *readInt32Data(char *srcFilePath, size_t *nbEle, int *status);
+uint32_t *readUInt32Data(char *srcFilePath, size_t *nbEle, int *status);
+int64_t *readInt64Data(char *srcFilePath, size_t *nbEle, int *status);
+uint64_t *readUInt64Data(char *srcFilePath, size_t *nbEle, int *status);
+float *readFloatData(char *srcFilePath, size_t *nbEle, int *status);
+unsigned short* readShortData(char *srcFilePath, size_t *dataLength, int *status);
+
+double *readDoubleData_systemEndian(char *srcFilePath, size_t *nbEle, int *status);
+int8_t *readInt8Data_systemEndian(char *srcFilePath, size_t *nbEle, int *status);
+int16_t *readInt16Data_systemEndian(char *srcFilePath, size_t *nbEle, int *status);
+uint16_t *readUInt16Data_systemEndian(char *srcFilePath, size_t *nbEle, int *status);
+int32_t *readInt32Data_systemEndian(char *srcFilePath, size_t *nbEle, int *status);
+uint32_t *readUInt32Data_systemEndian(char *srcFilePath, size_t *nbEle, int *status);
+int64_t *readInt64Data_systemEndian(char *srcFilePath, size_t *nbEle, int *status);
+uint64_t *readUInt64Data_systemEndian(char *srcFilePath, size_t *nbEle, int *status);
+float *readFloatData_systemEndian(char *srcFilePath, size_t *nbEle, int *status);
+
+void writeByteData(unsigned char *bytes, size_t byteLength, char *tgtFilePath, int *status);
+void writeDoubleData(double *data, size_t nbEle, char *tgtFilePath, int *status);
+void writeFloatData(float *data, size_t nbEle, char *tgtFilePath, int *status);
+void writeData(void *data, int dataType, size_t nbEle, char *tgtFilePath, int *status);
+void writeFloatData_inBytes(float *data, size_t nbEle, char* tgtFilePath, int *status);
+void writeDoubleData_inBytes(double *data, size_t nbEle, char* tgtFilePath, int *status);
+void writeShortData_inBytes(short *states, size_t stateLength, char *tgtFilePath, int *status);
+void writeUShortData_inBytes(unsigned short *states, size_t stateLength, char *tgtFilePath, int *status);
+void writeIntData_inBytes(int *states, size_t stateLength, char *tgtFilePath, int *status);
+void writeUIntData_inBytes(unsigned int *states, size_t stateLength, char *tgtFilePath, int *status);
+void writeLongData_inBytes(int64_t *states, size_t stateLength, char *tgtFilePath, int *status);
+void writeULongData_inBytes(uint64_t *states, size_t stateLength, char *tgtFilePath, int *status);
+
+void writeStrings(int nbStr, char *str[], char *tgtFilePath, int *status);
+
+//void convertToPFM_float(float *data, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, int endianType, char *tgtFilePath, int *status);
+
+void checkfilesizec_(char *srcFilePath, int *len, size_t *filesize);
+void readbytefile_(char *srcFilePath, int *len, unsigned char *bytes, size_t *byteLength);
+void readdoublefile_(char *srcFilePath, int *len, double *data, size_t *nbEle);
+void readfloatfile_(char *srcFilePath, int *len, float *data, size_t *nbEle);
+void writebytefile_(unsigned char *bytes, size_t *byteLength, char *tgtFilePath, int *len);
+void writedoublefile_(double *data, size_t *nbEle, char *tgtFilePath, int *len);
+void writefloatfile_(float *data, size_t *nbEle, char *tgtFilePath, int *len);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _IO_H  ----- */
diff --git a/thirdparty/SZ/sz/include/sz.h b/thirdparty/SZ/sz/include/sz.h
new file mode 100644
index 0000000000000000000000000000000000000000..31c326091118500114a3299759b6387fb6f2702e
--- /dev/null
+++ b/thirdparty/SZ/sz/include/sz.h
@@ -0,0 +1,379 @@
+/**
+ *  @file sz.h
+ *  @author Sheng Di
+ *  @date April, 2015
+ *  @brief Header file for the whole SZ compressor.
+ *  (C) 2015 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZ_H
+#define _SZ_H
+
+#include <stdio.h>
+#include <stdint.h>
+#include <sys/time.h>      /* For gettimeofday(), in microseconds */
+#include <time.h>          /* For time(), in seconds */
+#include "iniparser.h"
+#include "CompressElement.h"
+#include "DynamicByteArray.h"
+#include "DynamicIntArray.h"
+#include "VarSet.h"
+#include "Huffman.h"
+#include "TightDataPointStorageD.h"
+#include "TightDataPointStorageF.h"
+#include "TightDataPointStorageI.h"
+#include "conf.h"
+#include "dataCompression.h"
+#include "ByteToolkit.h"
+#include "TypeManager.h"
+#include "sz_int8.h"
+#include "sz_int16.h"
+#include "sz_int32.h"
+#include "sz_int64.h"
+#include "sz_uint8.h"
+#include "sz_uint16.h"
+#include "sz_uint32.h"
+#include "sz_uint64.h"
+#include "sz_float.h"
+#include "sz_double.h"
+#include "szd_int8.h"
+#include "szd_int16.h"
+#include "szd_int32.h"
+#include "szd_int64.h"
+#include "szd_uint8.h"
+#include "szd_uint16.h"
+#include "szd_uint32.h"
+#include "szd_uint64.h"
+#include "szd_float.h"
+#include "szd_double.h"
+#include "sz_float_pwr.h"
+#include "sz_double_pwr.h"
+#include "callZlib.h"
+#include "rw.h"
+#include "pastri.h"
+#include "sz_float_ts.h"
+#include "szd_float_ts.h"
+
+#ifdef _WIN32
+#define PATH_SEPARATOR ';'
+#else
+#define PATH_SEPARATOR ':'
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//typedef char int8_t;
+//typedef unsigned char uint8_t;
+//typedef short int16_t;
+//typedef unsigned short uint16_t;
+//typedef int int32_t;
+//typedef unsigned int uint32_t;
+//typedef long int64_t;
+//typedef unsigned long uint64_t;
+
+#define SZ_VERNUM 0x0140
+#define SZ_VER_MAJOR 1
+#define SZ_VER_MINOR 4
+#define SZ_VER_BUILD 13
+#define SZ_VER_REVISION 5
+
+#define PASTRI 103
+#define HZ 102
+#define SZ 101
+
+//prediction mode of temporal dimension based compression
+#define SZ_PREVIOUS_VALUE_ESTIMATE 0
+
+#define MIN_NUM_OF_ELEMENTS 20 //if the # elements <= 20, skip the compression
+
+#define ABS 0
+#define REL 1
+#define ABS_AND_REL 2
+#define ABS_OR_REL 3
+#define PSNR 4
+
+#define PW_REL 10
+#define ABS_AND_PW_REL 11
+#define ABS_OR_PW_REL 12
+#define REL_AND_PW_REL 13
+#define REL_OR_PW_REL 14
+
+#define SZ_FLOAT 0
+#define SZ_DOUBLE 1
+#define SZ_UINT8 2
+#define SZ_INT8 3
+#define SZ_UINT16 4
+#define SZ_INT16 5
+#define SZ_UINT32 6
+#define SZ_INT32 7
+#define SZ_UINT64 8
+#define SZ_INT64 9
+
+#define LITTLE_ENDIAN_DATA 0 //refers to the endian type of the data read from the disk
+#define BIG_ENDIAN_DATA 1 //big_endian (ppc, max, etc.) ; little_endian (x86, x64, etc.)
+
+#define LITTLE_ENDIAN_SYSTEM 0 //refers to the endian type of the system
+#define BIG_ENDIAN_SYSTEM 1
+
+#define DynArrayInitLen 1024
+
+#define MIN_ZLIB_DEC_ALLOMEM_BYTES 1000000
+
+//#define maxRangeRadius 32768
+//#define maxRangeRadius 1048576//131072
+
+#define SZ_BEST_SPEED 0
+#define SZ_BEST_COMPRESSION 1
+#define SZ_DEFAULT_COMPRESSION 2
+#define SZ_TEMPORAL_COMPRESSION 3
+
+#define SZ_PWR_MIN_TYPE 0
+#define SZ_PWR_AVG_TYPE 1
+#define SZ_PWR_MAX_TYPE 2
+
+//SUCCESS returning status
+#define SZ_SCES 0  //successful
+#define SZ_NSCS -1 //Not successful
+#define SZ_FERR -2 //Failed to open input file
+#define SZ_TERR -3 //wrong data type (should be only float or double)
+#define SZ_DERR -4 //dimension error
+#define SZ_MERR -5 //sz_mode error
+#define SZ_BERR -6 //bound-mode error (should be only ABS, REL, ABS_AND_REL, ABS_OR_REL, or PW_REL)
+
+#define SZ_MAINTAIN_VAR_DATA 0
+#define SZ_DESTROY_WHOLE_VARSET 1
+
+#define GROUP_COUNT 16 //2^{16}=65536
+	
+#define MetaDataByteLength 20	
+	
+#define numOfBufferedSteps 1 //the number of time steps in the buffer	
+	
+//Note: the following setting should be consistent with stateNum in Huffman.h
+//#define intvCapacity 65536
+//#define intvRadius 32768
+//#define intvCapacity 131072
+//#define intvRadius 65536
+
+#define SZ_COMPUTE_1D_NUMBER_OF_BLOCKS( COUNT, NUM_BLOCKS, BLOCK_SIZE ) /* NUM_BLOCKS = 1 if COUNT <= BLOCK_SIZE, else COUNT / BLOCK_SIZE; arguments are evaluated more than once */ \
+    if ((COUNT) <= (BLOCK_SIZE)){              \
+        (NUM_BLOCKS) = 1;                      \
+    }                                          \
+    else{                                      \
+        (NUM_BLOCKS) = (COUNT) / (BLOCK_SIZE); \
+    }
+
+#define SZ_COMPUTE_2D_NUMBER_OF_BLOCKS( COUNT, NUM_BLOCKS, BLOCK_SIZE ) /* NUM_BLOCKS = 1 if COUNT <= BLOCK_SIZE, else COUNT / BLOCK_SIZE; arguments are evaluated more than once */ \
+    if ((COUNT) <= (BLOCK_SIZE)){              \
+        (NUM_BLOCKS) = 1;                      \
+    }                                          \
+    else{                                      \
+        (NUM_BLOCKS) = (COUNT) / (BLOCK_SIZE); \
+    }
+
+#define SZ_COMPUTE_3D_NUMBER_OF_BLOCKS( COUNT, NUM_BLOCKS, BLOCK_SIZE ) /* NUM_BLOCKS = 1 if COUNT <= BLOCK_SIZE, else COUNT / BLOCK_SIZE; arguments are evaluated more than once */ \
+    if ((COUNT) <= (BLOCK_SIZE)){              \
+        (NUM_BLOCKS) = 1;                      \
+    }                                          \
+    else{                                      \
+        (NUM_BLOCKS) = (COUNT) / (BLOCK_SIZE); \
+    }
+
+#define SZ_COMPUTE_BLOCKCOUNT( COUNT, NUM_BLOCKS, SPLIT_INDEX,       \
+                                       EARLY_BLOCK_COUNT, LATE_BLOCK_COUNT ) /* expands to several statements; arguments are evaluated more than once */ \
+    (EARLY_BLOCK_COUNT) = (LATE_BLOCK_COUNT) = (COUNT) / (NUM_BLOCKS);       \
+    (SPLIT_INDEX) = (COUNT) % (NUM_BLOCKS);                                  \
+    if (0 != (SPLIT_INDEX)) {                                                \
+        (EARLY_BLOCK_COUNT) = (EARLY_BLOCK_COUNT) + 1;                       \
+    }
+
+//typedef unsigned long unsigned long;
+//typedef unsigned int uint;
+
+typedef union lint16 //One 16-bit value viewable as unsigned short, short, or its raw bytes
+{
+	unsigned short usvalue; //unsigned view
+	short svalue; //signed view
+	unsigned char byte[2]; //raw bytes, in host memory order
+} lint16;
+
+typedef union lint32 //One 32-bit value viewable as int, unsigned int, or its raw bytes
+{
+	int ivalue; //signed view
+	unsigned int uivalue; //unsigned view
+	unsigned char byte[4]; //raw bytes, in host memory order
+} lint32;
+
+typedef union lint64 //One 8-byte value viewable as long, unsigned long, or its raw bytes
+{
+	long lvalue; //signed view; NOTE(review): long is only 4 bytes on some ABIs (e.g. 64-bit Windows), where it would not cover byte[8] - confirm the target platforms
+	unsigned long ulvalue; //unsigned view (same size caveat as lvalue)
+	unsigned char byte[8]; //raw bytes, in host memory order
+} lint64;
+
+typedef union ldouble //One double viewable as an unsigned long bit pattern or as its raw bytes
+{
+    double value; //floating-point view
+    unsigned long lvalue; //integer view; NOTE(review): covers all 8 bytes only where unsigned long is 64 bits wide - confirm the target platforms
+    unsigned char byte[8]; //raw bytes, in host memory order
+} ldouble;
+
+typedef union lfloat //One float viewable as an unsigned int bit pattern or as its raw bytes
+{
+    float value; //floating-point view
+    unsigned int ivalue; //integer view of the same storage
+    unsigned char byte[4]; //raw bytes, in host memory order
+} lfloat;
+
+/* array meta data and compression parameters for SZ_Init_Params() */
+typedef struct sz_params
+{
+	int dataType; //presumably one of the SZ_FLOAT ... SZ_INT64 codes defined above - confirm
+	unsigned int max_quant_intervals; //max number of quantization intervals for quantization
+	unsigned int quantization_intervals; 
+	unsigned int maxRangeRadius;
+	int sol_ID;// it's always SZ, unless the setting is PASTRI compression mode (./configure --enable-pastri)
+	int sampleDistance; //2 bytes (NOTE(review): the field is an int; the size presumably refers to the stored form - confirm)
+	float predThreshold;  // 2 bytes (NOTE(review): the field is a float; the size presumably refers to the stored form - confirm)
+	int szMode; //* 0 (best speed) or 1 (better compression with Gzip) or 3 temporal-dimension based compression; see the SZ_BEST_SPEED ... SZ_TEMPORAL_COMPRESSION macros
+	int gzipMode; //* four options: Z_NO_COMPRESSION, or Z_BEST_SPEED, Z_BEST_COMPRESSION, Z_DEFAULT_COMPRESSION
+	int  errorBoundMode; //4bits (0.5byte), //ABS, REL, ABS_AND_REL, ABS_OR_REL, PSNR, or PW_REL
+	double absErrBound; //absolute error bound
+	double relBoundRatio; //value range based relative error bound ratio
+	double psnr; //PSNR
+	double pw_relBoundRatio; //point-wise relative error bound
+	int segment_size; //only used for 2D/3D data compression with pw_relBoundRatio
+	int pwr_type; //only used for 2D/3D data compression with pw_relBoundRatio; presumably one of SZ_PWR_MIN_TYPE/SZ_PWR_AVG_TYPE/SZ_PWR_MAX_TYPE - confirm
+	
+	int snapshotCmprStep; //perform single-snapshot-based compression if time_step == snapshotCmprStep
+	int predictionMode; //presumably SZ_PREVIOUS_VALUE_ESTIMATE, the only prediction mode defined in this header - confirm
+} sz_params;
+
+typedef struct sz_metadata
+{
+	int versionNumber[3]; //only used for checking the version via SZ_getMetadata()
+	int isConstant; //only used for checking whether the data are constant values, via SZ_getMetadata()
+	int isLossless; //only used for checking whether the compression was lossless, via SZ_getMetadata()
+	int sizeType; //only used for checking whether the size type was "int" or "long" in the compression, via SZ_getMetadata()
+	size_t dataSeriesLength; //number of data points in the dataset
+	int defactoNBBins; //real number of quantization bins
+	struct sz_params* conf_params; //configuration parameters
+} sz_metadata;
+
+typedef struct sz_exedata
+{
+	char optQuantMode;	//quantization mode (0: fixed; 1: optimized)
+	int intvCapacity; //number of intervals for the linear-scaling quantization
+	int intvRadius;  //number of intervals for the radius of the quantization range (intvRadius=intvCapacity/2)
+	int SZ_SIZE_TYPE; //length in bytes of size_t on the system at runtime: 4 or 8, i.e. sizeof(size_t)
+} sz_exedata;
+
+/* Time-step meta info for time-step based compression. NOTE(review): the original comment said a linked list is used, but this struct has no link member -- confirm. */
+typedef struct sz_tsc_metainfo // note: the struct tag (sz_tsc_metainfo) differs from the typedef name (sz_tsc_metadata)
+{
+	int totalNumOfSteps;
+	int currentStep;
+	char metadata_filename[256]; // fixed 256-byte buffer; presumably the path that metadata_file refers to -- confirm
+	FILE *metadata_file; // stdio stream; FILE requires <stdio.h> to be included before this point
+} sz_tsc_metadata;
+
+extern int versionNumber[4]; // NOTE(review): four elements here, while sz_metadata.versionNumber has three -- confirm which is intended
+
+//-------------------key global variables--------------
+extern int dataEndianType; //endian type of the data read from disk
+extern int sysEndianType; //system endian type; set automatically
+
+extern sz_params *confparams_cpr; // presumably the parameters used for compression -- confirm
+extern sz_params *confparams_dec; // presumably the parameters used for decompression -- confirm
+extern sz_exedata *exe_params;
+//------------------------------------------------
+extern SZ_VarSet* sz_varset;
+extern sz_multisteps *multisteps; //compression based on multiple time steps (time-dimension based compression)
+extern sz_tsc_metadata *sz_tsc;
+
+//only declared when the PASTRI mode is compiled in
+#ifdef PASTRI
+extern pastri_params pastri_par; 
+#endif
+
+//sz.h
+HuffmanTree* SZ_Reset();
+
+int SZ_Init(const char *configFilePath);
+
+int SZ_Init_Params(sz_params *params);
+
+size_t computeDataLength(size_t r5, size_t r4, size_t r3, size_t r2, size_t r1);
+
+int computeDimension(size_t r5, size_t r4, size_t r3, size_t r2, size_t r1);
+
+int SZ_compress_args_float_subblock(unsigned char* compressedBytes, float *oriData,
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1,
+size_t s5, size_t s4, size_t s3, size_t s2, size_t s1,
+size_t e5, size_t e4, size_t e3, size_t e2, size_t e1,
+size_t *outSize, int errBoundMode, double absErr_Bound, double relBoundRatio);
+
+int SZ_compress_args_double_subblock(unsigned char* compressedBytes, double *oriData,
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1,
+size_t s5, size_t s4, size_t s3, size_t s2, size_t s1,
+size_t e5, size_t e4, size_t e3, size_t e2, size_t e1,
+size_t *outSize, int errBoundMode, double absErr_Bound, double relBoundRatio);
+
+unsigned char *SZ_compress(int dataType, void *data, size_t *outSize, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1);
+
+unsigned char* SZ_compress_args(int dataType, void *data, size_t *outSize, int errBoundMode, double absErrBound, 
+double relBoundRatio, double pwrBoundRatio, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1);
+
+int SZ_compress_args2(int dataType, void *data, unsigned char* compressed_bytes, size_t *outSize, 
+int errBoundMode, double absErrBound, double relBoundRatio, double pwrBoundRatio, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1);
+
+int SZ_compress_args3(int dataType, void *data, unsigned char* compressed_bytes, size_t *outSize, int errBoundMode, double absErrBound, double relBoundRatio, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1,
+size_t s5, size_t s4, size_t s3, size_t s2, size_t s1,
+size_t e5, size_t e4, size_t e3, size_t e2, size_t e1);
+
+unsigned char *SZ_compress_rev_args(int dataType, void *data, void *reservedValue, size_t *outSize, int errBoundMode, double absErrBound, double relBoundRatio, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1);
+
+int SZ_compress_rev_args2(int dataType, void *data, void *reservedValue, unsigned char* compressed_bytes, size_t *outSize, int errBoundMode, double absErrBound, double relBoundRatio, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1);
+unsigned char *SZ_compress_rev(int dataType, void *data, void *reservedValue, size_t *outSize, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1);
+
+void SZ_Create_ParamsExe(sz_params** conf_params, sz_exedata** exe_params);
+
+void *SZ_decompress(int dataType, unsigned char *bytes, size_t byteLength, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1);
+size_t SZ_decompress_args(int dataType, unsigned char *bytes, size_t byteLength, void* decompressed_array, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1);
+
+sz_metadata* SZ_getMetadata(unsigned char* bytes);
+void SZ_printMetadata(sz_metadata* metadata);
+
+
+void filloutDimArray(size_t* dim, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1);
+
+size_t compute_total_batch_size();
+
+int isZlibFormat(unsigned char magic1, unsigned char magic2);
+
+void SZ_registerVar(char* varName, int dataType, void* data, 
+			int errBoundMode, double absErrBound, double relBoundRatio, double pwRelBoundRatio, 
+			size_t r5, size_t r4, size_t r3, size_t r2, size_t r1);
+int SZ_deregisterVar(char* varName);
+int SZ_deregisterAllVars();
+
+int SZ_compress_ts(unsigned char** newByteData, size_t *outSize);
+void SZ_decompress_ts(unsigned char *bytes, size_t byteLength);
+
+void SZ_Finalize();
+
+void convertSZParamsToBytes(sz_params* params, unsigned char* result);
+sz_params* convertBytesToSZParams(unsigned char* bytes);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZ_H  ----- */
diff --git a/thirdparty/SZ/sz/include/sz_double.h b/thirdparty/SZ/sz/include/sz_double.h
new file mode 100644
index 0000000000000000000000000000000000000000..b186d12d0afee51724debbcbf5ad40e1183bc361
--- /dev/null
+++ b/thirdparty/SZ/sz/include/sz_double.h
@@ -0,0 +1,83 @@
+/**
+ *  @file sz_double.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the sz_double.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZ_Double_H
+#define _SZ_Double_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h>
+unsigned char* SZ_skip_compress_double(double* data, size_t dataLength, size_t* outSize);
+
+void computeReqLength_double(double realPrecision, short radExpo, int* reqLength, double* medianValue);
+unsigned int optimize_intervals_double_1D(double *oriData, size_t dataLength, double realPrecision);
+unsigned int optimize_intervals_double_2D(double *oriData, size_t r1, size_t r2, double realPrecision);
+unsigned int optimize_intervals_double_3D(double *oriData, size_t r1, size_t r2, size_t r3, double realPrecision);
+unsigned int optimize_intervals_double_4D(double *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision);
+
+unsigned int optimize_intervals_double_3D_opt(double *oriData, size_t r1, size_t r2, size_t r3, double realPrecision);
+unsigned int optimize_intervals_double_2D_opt(double *oriData, size_t r1, size_t r2, double realPrecision);
+unsigned int optimize_intervals_double_1D_opt(double *oriData, size_t dataLength, double realPrecision);
+
+TightDataPointStorageD* SZ_compress_double_1D_MDQ(double *oriData, 
+size_t dataLength, double realPrecision, double valueRangeSize, double medianValue_d);
+void SZ_compress_args_double_StoreOriData(double* oriData, size_t dataLength, TightDataPointStorageD* tdps, unsigned char** newByteData, size_t *outSize);
+
+char SZ_compress_args_double_NoCkRngeNoGzip_1D(unsigned char** newByteData, double *oriData, size_t dataLength, double realPrecision, size_t *outSize, double valueRangeSize, double medianValue_d);
+
+TightDataPointStorageD* SZ_compress_double_2D_MDQ(double *oriData, size_t r1, size_t r2, double realPrecision, double valueRangeSize, double medianValue_d);
+char SZ_compress_args_double_NoCkRngeNoGzip_2D(unsigned char** newByteData, double *oriData, size_t r1, size_t r2, double realPrecision, size_t *outSize, double valueRangeSize, double medianValue_d);
+
+TightDataPointStorageD* SZ_compress_double_3D_MDQ(double *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, double valueRangeSize, double medianValue_d);
+char SZ_compress_args_double_NoCkRngeNoGzip_3D(unsigned char** newByteData, double *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t *outSize, double valueRangeSize, double medianValue_d);
+
+TightDataPointStorageD* SZ_compress_double_4D_MDQ(double *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, double valueRangeSize, double medianValue_d);
+char SZ_compress_args_double_NoCkRngeNoGzip_4D(unsigned char** newByteData, double *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, size_t *outSize, double valueRangeSize, double medianValue_d);
+
+void SZ_compress_args_double_withinRange(unsigned char** newByteData, double *oriData, size_t dataLength, size_t *outSize);
+
+int SZ_compress_args_double_wRngeNoGzip(unsigned char** newByteData, double *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio, double pwrErrRatio);
+
+int SZ_compress_args_double(unsigned char** newByteData, double *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio, double pwRelBoundRatio);
+
+void SZ_compress_args_double_NoCkRnge_1D_subblock(unsigned char* compressedBytes, double *oriData, double realPrecision, size_t *outSize, double valueRangeSize, double medianValue_d,
+size_t r1, size_t s1, size_t e1);
+void SZ_compress_args_double_NoCkRnge_2D_subblock(unsigned char* compressedBytes, double *oriData, double realPrecision, size_t *outSize, double valueRangeSize, double medianValue_d,
+size_t r2, size_t r1, size_t s2, size_t s1, size_t e2, size_t e1);
+void SZ_compress_args_double_NoCkRnge_3D_subblock(unsigned char* compressedBytes, double *oriData, double realPrecision, size_t *outSize, double valueRangeSize, double medianValue_d,
+size_t r3, size_t r2, size_t r1, size_t s3, size_t s2, size_t s1, size_t e3, size_t e2, size_t e1);
+void SZ_compress_args_double_NoCkRnge_4D_subblock(unsigned char* compressedBytes, double *oriData, double realPrecision, size_t *outSize, double valueRangeSize, double medianValue_d,
+size_t r4, size_t r3, size_t r2, size_t r1, size_t s4, size_t s3, size_t s2, size_t s1, size_t e4, size_t e3, size_t e2, size_t e1);
+
+unsigned int optimize_intervals_double_1D_subblock(double *oriData, double realPrecision, size_t r1, size_t s1, size_t e1);
+unsigned int optimize_intervals_double_2D_subblock(double *oriData, double realPrecision, size_t r1, size_t r2, size_t s1, size_t s2, size_t e1, size_t e2);
+unsigned int optimize_intervals_double_3D_subblock(double *oriData, double realPrecision, size_t r1, size_t r2, size_t r3, size_t s1, size_t s2, size_t s3, size_t e1, size_t e2, size_t e3);
+unsigned int optimize_intervals_double_4D_subblock(double *oriData, double realPrecision, size_t r1, size_t r2, size_t r3, size_t r4, size_t s1, size_t s2, size_t s3, size_t s4, size_t e1, size_t e2, size_t e3, size_t e4);
+
+TightDataPointStorageD* SZ_compress_double_1D_MDQ_subblock(double *oriData, double realPrecision, double valueRangeSize, double medianValue_d,
+size_t r1, size_t s1, size_t e1);
+TightDataPointStorageD* SZ_compress_double_2D_MDQ_subblock(double *oriData, double realPrecision, double valueRangeSize, double medianValue_d,
+size_t r1, size_t r2, size_t s1, size_t s2, size_t e1, size_t e2);
+TightDataPointStorageD* SZ_compress_double_3D_MDQ_subblock(double *oriData, double realPrecision, double valueRangeSize, double medianValue_d,
+size_t r1, size_t r2, size_t r3, size_t s1, size_t s2, size_t s3, size_t e1, size_t e2, size_t e3);
+TightDataPointStorageD* SZ_compress_double_4D_MDQ_subblock(double *oriData, double realPrecision, double valueRangeSize, double medianValue_d,
+size_t r1, size_t r2, size_t r3, size_t r4, size_t s1, size_t s2, size_t s3, size_t s4, size_t e1, size_t e2, size_t e3, size_t e4);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZ_Double_H  ----- */
+
diff --git a/thirdparty/SZ/sz/include/sz_double_pwr.h b/thirdparty/SZ/sz/include/sz_double_pwr.h
new file mode 100644
index 0000000000000000000000000000000000000000..ce81629d089d14b567ba4a6ab3da561cb6ece624
--- /dev/null
+++ b/thirdparty/SZ/sz/include/sz_double_pwr.h
@@ -0,0 +1,45 @@
+/**
+ *  @file sz_double_pwr.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the sz_double_pwr.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZ_Double_PWR_H
+#define _SZ_Double_PWR_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h>
+
+void compute_segment_precisions_double_1D(double *oriData, size_t dataLength, double* pwrErrBound, unsigned char* pwrErrBoundBytes, double globalPrecision);
+unsigned int optimize_intervals_double_1D_pwr(double *oriData, size_t dataLength, double* pwrErrBound); 
+void compute_segment_precisions_double_2D(double *oriData, double* pwrErrBound, 
+size_t r1, size_t r2, size_t R2, size_t edgeSize, unsigned char* pwrErrBoundBytes, double Min, double Max, double globalPrecision);
+unsigned int optimize_intervals_double_2D_pwr(double *oriData, size_t r1, size_t r2, size_t R2, size_t edgeSize, double* pwrErrBound);
+void compute_segment_precisions_double_3D(double *oriData, double* pwrErrBound, 
+size_t r1, size_t r2, size_t r3, size_t R2, size_t R3, size_t edgeSize, unsigned char* pwrErrBoundBytes, double Min, double Max, double globalPrecision);
+unsigned int optimize_intervals_double_3D_pwr(double *oriData, size_t r1, size_t r2, size_t r3, size_t R2, size_t R3, size_t edgeSize, double* pwrErrBound);
+void SZ_compress_args_double_NoCkRngeNoGzip_1D_pwr(unsigned char** newByteData, double *oriData, double globalPrecision, size_t dataLength, size_t *outSize, double min, double max);
+void SZ_compress_args_double_NoCkRngeNoGzip_2D_pwr(unsigned char** newByteData, double *oriData, double globalPrecision, size_t r1, size_t r2,
+size_t *outSize, double min, double max);
+void SZ_compress_args_double_NoCkRngeNoGzip_3D_pwr(unsigned char** newByteData, double *oriData, double globalPrecision, 
+size_t r1, size_t r2, size_t r3, size_t *outSize, double min, double max);
+
+void createRangeGroups_double(double** posGroups, double** negGroups, int** posFlags, int** negFlags);
+void compressGroupIDArray_double(char* groupID, TightDataPointStorageD* tdps);
+TightDataPointStorageD* SZ_compress_double_1D_MDQ_pwrGroup(double* oriData, size_t dataLength, int errBoundMode, 
+double absErrBound, double relBoundRatio, double pwrErrRatio, double valueRangeSize, double medianValue_f);
+void SZ_compress_args_double_NoCkRngeNoGzip_1D_pwrgroup(unsigned char** newByteData, double *oriData,
+size_t dataLength, double absErrBound, double relBoundRatio, double pwrErrRatio, double valueRangeSize, double medianValue_f, size_t *outSize);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZ_Double_PWR_H  ----- */
+
diff --git a/thirdparty/SZ/sz/include/sz_double_ts.h b/thirdparty/SZ/sz/include/sz_double_ts.h
new file mode 100644
index 0000000000000000000000000000000000000000..581d20ddf58ba77f61b70bd1042a352f482919ef
--- /dev/null
+++ b/thirdparty/SZ/sz/include/sz_double_ts.h
@@ -0,0 +1,27 @@
+/**
+ *  @file sz_double_ts.h
+ *  @author Sheng Di
+ *  @date May, 2018
+ *  @brief Header file for the sz_double_ts.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include "TightDataPointStorageD.h"
+
+#ifndef _SZ_Double_TS_H
+#define _SZ_Double_TS_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+unsigned int optimize_intervals_double_1D_ts(double *oriData, size_t dataLength, double* preData, double realPrecision);
+
+TightDataPointStorageD* SZ_compress_double_1D_MDQ_ts(double *oriData, size_t dataLength, sz_multisteps* multisteps,
+double realPrecision, double valueRangeSize, double medianValue_d);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZ_Double_TS_H  ----- */
+
diff --git a/thirdparty/SZ/sz/include/sz_float.h b/thirdparty/SZ/sz/include/sz_float.h
new file mode 100644
index 0000000000000000000000000000000000000000..6ab92319f5c5aa0c21fdf1ba492611aeeeea1ea0
--- /dev/null
+++ b/thirdparty/SZ/sz/include/sz_float.h
@@ -0,0 +1,136 @@
+/**
+ *  @file sz_float.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the sz_float.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include "DynamicFloatArray.h"
+
+#ifndef _SZ_Float_H
+#define _SZ_Float_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+unsigned char* SZ_skip_compress_float(float* data, size_t dataLength, size_t* outSize);
+
+void computeReqLength_float(double realPrecision, short radExpo, int* reqLength, float* medianValue);
+
+unsigned int optimize_intervals_float_1D(float *oriData, size_t dataLength, double realPrecision);
+unsigned int optimize_intervals_float_2D(float *oriData, size_t r1, size_t r2, double realPrecision);
+unsigned int optimize_intervals_float_3D(float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision);
+unsigned int optimize_intervals_float_4D(float *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision);
+
+unsigned int optimize_intervals_and_compute_dense_position_float_1D(float *oriData, size_t dataLength, double realPrecision, float * dense_pos);
+unsigned int optimize_intervals_and_compute_dense_position_float_3D(float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, float * dense_pos);
+unsigned int optimize_intervals_float_3D_with_freq_and_dense_pos(float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, float * dense_pos, float * max_freq, float * mean_freq);
+unsigned int optimize_intervals_float_3D_opt(float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision);
+unsigned int optimize_intervals_float_2D_opt(float *oriData, size_t r1, size_t r2, double realPrecision);
+unsigned int optimize_intervals_float_1D_opt(float *oriData, size_t dataLength, double realPrecision);
+
+
+TightDataPointStorageF* SZ_compress_float_1D_MDQ(float *oriData, 
+size_t dataLength, double realPrecision, float valueRangeSize, float medianValue_f);
+
+void SZ_compress_args_float_StoreOriData(float* oriData, size_t dataLength, TightDataPointStorageF* tdps, 
+unsigned char** newByteData, size_t *outSize);
+
+char SZ_compress_args_float_NoCkRngeNoGzip_1D(unsigned char** newByteData, float *oriData, 
+size_t dataLength, double realPrecision, size_t *outSize, float valueRangeSize, float medianValue_f);
+
+TightDataPointStorageF* SZ_compress_float_2D_MDQ(float *oriData, size_t r1, size_t r2, double realPrecision, float valueRangeSize, float medianValue_f);
+
+char SZ_compress_args_float_NoCkRngeNoGzip_2D(unsigned char** newByteData, float *oriData, size_t r1, size_t r2, double realPrecision, size_t *outSize, float valueRangeSize, float medianValue_f);
+
+TightDataPointStorageF* SZ_compress_float_3D_MDQ(float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, float valueRangeSize, float medianValue_f);
+
+char SZ_compress_args_float_NoCkRngeNoGzip_3D(unsigned char** newByteData, float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t *outSize, float valueRangeSize, float medianValue_f);
+
+size_t SZ_compress_float_1D_MDQ_RA_block(float * block_ori_data, float * mean, size_t dim_0, size_t block_dim_0, double realPrecision, int * type, float * unpredictable_data);
+size_t SZ_compress_float_2D_MDQ_RA_block(float * block_ori_data, float * mean, size_t dim_0, size_t dim_1, size_t block_dim_0, size_t block_dim_1, double realPrecision, float * P0, float * P1, int * type, float * unpredictable_data);
+
+size_t SZ_compress_float_1D_MDQ_RA_block_1D_pred(float * block_ori_data, float * mean, float dense_pos, size_t dim_0, size_t block_dim_0, double realPrecision, int * type, DynamicFloatArray * unpredictable_data);
+size_t SZ_compress_float_2D_MDQ_RA_block_2D_pred(float * block_ori_data, float * mean, float dense_pos, size_t dim_0, size_t dim_1, size_t block_dim_0, size_t block_dim_1, double realPrecision, float * P0, float * P1, int * type, float * unpredictable_data);
+size_t SZ_compress_float_3D_MDQ_RA_block(float * block_ori_data, float * mean, size_t dim_0, size_t dim_1, size_t dim_2, size_t block_dim_0, size_t block_dim_1, size_t block_dim_2, double realPrecision, float * P0, float * P1, int * type, float * unpredictable_data);
+size_t SZ_compress_float_3D_MDQ_RA_block_3D_pred(float * block_ori_data, float * mean, float dense_pos, size_t dim_0, size_t dim_1, size_t dim_2, size_t block_dim_0, size_t block_dim_1, size_t block_dim_2, double realPrecision, float * P0, float * P1, int * type, float * unpredictable_data);
+size_t SZ_compress_float_3D_MDQ_RA_block_adaptive(float * block_ori_data, float * mean, size_t dim_0, size_t dim_1, size_t dim_2, size_t block_dim_0, size_t block_dim_1, size_t block_dim_2, double realPrecision, float * P0, float * P1, int * type, float * unpredictable_data);
+//unsigned short SZ_compress_float_3D_MDQ_RA_block_1D_pred(float * block_ori_data, float * mean, float dense_pos, size_t dim_0, size_t dim_1, size_t dim_2, int block_dim_0, int block_dim_1, int block_dim_2, double realPrecision, int * type, float * unpredictable_data);
+size_t SZ_compress_float_3D_MDQ_RA_block_3D_pred_flush_after_compare(float * block_ori_data, float * mean, float dense_pos, size_t dim_0, size_t dim_1, size_t dim_2, size_t block_dim_0, size_t block_dim_1, size_t block_dim_2, double realPrecision, float * P0, float * P1, int * type, float * unpredictable_data);
+size_t SZ_compress_float_3D_MDQ_RA_block_2_layers(float * block_ori_data, float * mean, size_t dim_0, size_t dim_1, size_t dim_2, size_t block_dim_0, size_t block_dim_1, size_t block_dim_2, double realPrecision, float * P0, float * P1, float * P_, int * type, float * unpredictable_data);
+size_t SZ_compress_float_3D_MDQ_pred_by_regression(float * block_ori_data, size_t dim_0, size_t dim_1, size_t dim_2, size_t block_dim_0, size_t block_dim_1, size_t block_dim_2, double realPrecision, float * reg_params, int * type, float * unpredictable_data);
+void SZ_blocked_regression(float * block_ori_data, size_t dim_0, size_t dim_1, size_t dim_2, size_t block_dim_0, size_t block_dim_1, size_t block_dim_2, float *params);
+unsigned char * SZ_compress_float_3D_MDQ_RA_all_by_regression(float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t * comp_size);
+float SZ_compress_float_3D_MDQ_RA_block_no_mean(float * block_ori_data, size_t dim_0, size_t dim_1, size_t dim_2, size_t block_dim_0, size_t block_dim_1, size_t block_dim_2, double realPrecision, float * P0, float * P1, int * type, unsigned short * unpred_count, float * unpredictable_data);
+float SZ_compress_float_3D_MDQ_pred_by_regression_with_err(float * block_ori_data, size_t dim_0, size_t dim_1, size_t dim_2, size_t block_dim_0, size_t block_dim_1, size_t block_dim_2, double realPrecision, float * reg_params, int * type, unsigned short * unpred_count, float * unpredictable_data);
+unsigned char * SZ_compress_float_3D_MDQ_RA_blocked_with_regression(float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t * comp_size);
+void decompressDataSeries_float_3D_RA_blocked_with_regression(float** data, size_t r1, size_t r2, size_t r3, unsigned char* comp_data);
+
+unsigned char * SZ_compress_float_1D_MDQ_RA(float *oriData, size_t r1, double realPrecision, size_t * comp_size);
+unsigned char * SZ_compress_float_2D_MDQ_RA(float *oriData, size_t r1, size_t r2, double realPrecision, size_t * comp_size);
+unsigned char * SZ_compress_float_2D_MDQ_nonblocked(float *oriData, size_t r1, size_t r2, double realPrecision, size_t * comp_size);
+unsigned char * SZ_compress_float_3D_MDQ_RA(float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t * comp_size);
+unsigned char * SZ_compress_float_3D_MDQ_nonblocked(float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t * comp_size);
+unsigned char * SZ_compress_float_3D_MDQ_nonblocked_ori(float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t * comp_size);
+unsigned char * SZ_compress_float_3D_MDQ_nonblocked_multi_means(float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t * comp_size);
+unsigned char * SZ_compress_float_3D_MDQ_RA_multi_means(float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t * comp_size);
+unsigned char * SZ_compress_float_3D_MDQ_nonblocked_adaptive(float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t * comp_size);
+unsigned char * SZ_compress_float_3D_MDQ_nonblocked_with_blocked_regression(float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t * comp_size);
+unsigned char * SZ_compress_float_2D_MDQ_nonblocked_with_blocked_regression(float *oriData, size_t r1, size_t r2, double realPrecision, size_t * comp_size);
+
+TightDataPointStorageF* SZ_compress_float_4D_MDQ(float *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, float valueRangeSize, float medianValue_f);
+
+char SZ_compress_args_float_NoCkRngeNoGzip_4D(unsigned char** newByteData, float *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, size_t *outSize, float valueRangeSize, float medianValue_f);
+
+void SZ_compress_args_float_withinRange(unsigned char** newByteData, float *oriData, size_t dataLength, size_t *outSize);
+
+int SZ_compress_args_float_wRngeNoGzip(unsigned char** newByteData, float *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio, double pwrErrRatio);
+
+int SZ_compress_args_float(unsigned char** newByteData, float *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio, double pwRelBoundRatio);
+
+int SZ_compress_args_float_subblock(unsigned char* compressedBytes, float *oriData,
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1,
+size_t s5, size_t s4, size_t s3, size_t s2, size_t s1,
+size_t e5, size_t e4, size_t e3, size_t e2, size_t e1,
+size_t *outSize, int errBoundMode, double absErr_Bound, double relBoundRatio);
+
+void SZ_compress_args_float_NoCkRnge_1D_subblock(unsigned char* compressedBytes, float *oriData, double realPrecision, size_t *outSize, float valueRangeSize, float medianValue_f,
+size_t r1, size_t s1, size_t e1); 
+
+void SZ_compress_args_float_NoCkRnge_2D_subblock(unsigned char* compressedBytes, float *oriData, double realPrecision, size_t *outSize, float valueRangeSize, float medianValue_f,
+size_t r2, size_t r1, size_t s2, size_t s1, size_t e2, size_t e1); 
+
+void SZ_compress_args_float_NoCkRnge_3D_subblock(unsigned char* compressedBytes, float *oriData, double realPrecision, size_t *outSize, float valueRangeSize, float medianValue_f,
+size_t r3, size_t r2, size_t r1, size_t s3, size_t s2, size_t s1, size_t e3, size_t e2, size_t e1); 
+
+void SZ_compress_args_float_NoCkRnge_4D_subblock(unsigned char* compressedBytes, float *oriData, double realPrecision, size_t *outSize, float valueRangeSize, float medianValue_f,
+size_t r4, size_t r3, size_t r2, size_t r1, size_t s4, size_t s3, size_t s2, size_t s1, size_t e4, size_t e3, size_t e2, size_t e1);
+
+unsigned int optimize_intervals_float_1D_subblock(float *oriData, double realPrecision, size_t r1, size_t s1, size_t e1); 
+unsigned int optimize_intervals_float_2D_subblock(float *oriData, double realPrecision, size_t r1, size_t r2, size_t s1, size_t s2, size_t e1, size_t e2); 
+unsigned int optimize_intervals_float_3D_subblock(float *oriData, double realPrecision, size_t r1, size_t r2, size_t r3, size_t s1, size_t s2, size_t s3, size_t e1, size_t e2, size_t e3); 
+unsigned int optimize_intervals_float_4D_subblock(float *oriData, double realPrecision, size_t r1, size_t r2, size_t r3, size_t r4, size_t s1, size_t s2, size_t s3, size_t s4, size_t e1, size_t e2, size_t e3, size_t e4);
+
+TightDataPointStorageF* SZ_compress_float_1D_MDQ_subblock(float *oriData, double realPrecision, float valueRangeSize, float medianValue_f,
+size_t r1, size_t s1, size_t e1); 
+
+TightDataPointStorageF* SZ_compress_float_2D_MDQ_subblock(float *oriData, double realPrecision, float valueRangeSize, float medianValue_f,
+size_t r1, size_t r2, size_t s1, size_t s2, size_t e1, size_t e2); 
+
+TightDataPointStorageF* SZ_compress_float_3D_MDQ_subblock(float *oriData, double realPrecision, float valueRangeSize, float medianValue_f,
+size_t r1, size_t r2, size_t r3, size_t s1, size_t s2, size_t s3, size_t e1, size_t e2, size_t e3); 
+
+TightDataPointStorageF* SZ_compress_float_4D_MDQ_subblock(float *oriData, double realPrecision, float valueRangeSize, float medianValue_f,
+size_t r1, size_t r2, size_t r3, size_t r4, size_t s1, size_t s2, size_t s3, size_t s4, size_t e1, size_t e2, size_t e3, size_t e4);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZ_Float_H  ----- */
+
diff --git a/thirdparty/SZ/sz/include/sz_float_pwr.h b/thirdparty/SZ/sz/include/sz_float_pwr.h
new file mode 100644
index 0000000000000000000000000000000000000000..3ef0e7cd858869a7328f4140d37a469c86a0757f
--- /dev/null
+++ b/thirdparty/SZ/sz/include/sz_float_pwr.h
@@ -0,0 +1,52 @@
+/**
+ *  @file sz_float_pwr.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the sz_float_pwr.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZ_Float_PWR_H
+#define _SZ_Float_PWR_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void compute_segment_precisions_float_1D(float *oriData, size_t dataLength, float* pwrErrBound, unsigned char* pwrErrBoundBytes, double globalPrecision);
+unsigned int optimize_intervals_float_1D_pwr(float *oriData, size_t dataLength, float* pwrErrBound);
+
+void compute_segment_precisions_float_2D(float *oriData, float* pwrErrBound, 
+size_t r1, size_t r2, size_t R2, size_t edgeSize, unsigned char* pwrErrBoundBytes, float Min, float Max, double globalPrecision);
+
+unsigned int optimize_intervals_float_2D_pwr(float *oriData, size_t r1, size_t r2, size_t R2, size_t edgeSize, float* pwrErrBound); 
+
+void compute_segment_precisions_float_3D(float *oriData, float* pwrErrBound, 
+size_t r1, size_t r2, size_t r3, size_t R2, size_t R3, size_t edgeSize, unsigned char* pwrErrBoundBytes, float Min, float Max, double globalPrecision);
+
+unsigned int optimize_intervals_float_3D_pwr(float *oriData, size_t r1, size_t r2, size_t r3, size_t R2, size_t R3, size_t edgeSize, float* pwrErrBound);
+
+void SZ_compress_args_float_NoCkRngeNoGzip_1D_pwr(unsigned char** newByteData, float *oriData, double globalPrecision, size_t dataLength, size_t *outSize, float min, float max);
+
+void SZ_compress_args_float_NoCkRngeNoGzip_2D_pwr(unsigned char** newByteData, float *oriData, double globalPrecision, size_t r1, size_t r2, 
+size_t *outSize, float min, float max);
+
+void SZ_compress_args_float_NoCkRngeNoGzip_3D_pwr(unsigned char** newByteData, float *oriData, double globalPrecision, size_t r1, size_t r2, 
+size_t r3, size_t *outSize, float min, float max);
+
+void createRangeGroups_float(float** posGroups, float** negGroups, int** posFlags, int** negFlags);
+void compressGroupIDArray_float(char* groupID, TightDataPointStorageF* tdps);
+int* generateGroupLowerBounds(void); /* (void) makes this a real prototype in C; empty () leaves call arguments unchecked */
+TightDataPointStorageF* SZ_compress_float_1D_MDQ_pwrGroup(float* oriData, size_t dataLength, int errBoundMode, 
+double absErrBound, double relBoundRatio, double pwrErrRatio, float valueRangeSize, float medianValue_f);
+
+void SZ_compress_args_float_NoCkRngeNoGzip_1D_pwrgroup(unsigned char** newByteData, float *oriData,
+size_t dataLength, double absErrBound, double relBoundRatio, double pwrErrRatio, float valueRangeSize, float medianValue_f, size_t *outSize);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZ_Float_PWR_H  ----- */
+
diff --git a/thirdparty/SZ/sz/include/sz_float_ts.h b/thirdparty/SZ/sz/include/sz_float_ts.h
new file mode 100644
index 0000000000000000000000000000000000000000..9f2301da99ebc426c695d7196df44f320f2dd87e
--- /dev/null
+++ b/thirdparty/SZ/sz/include/sz_float_ts.h
@@ -0,0 +1,27 @@
+/**
+ *  @file sz_float_ts.h
+ *  @author Sheng Di
+ *  @date May, 2018
+ *  @brief Header file for the sz_float_ts.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+#include "TightDataPointStorageF.h"
+
+#ifndef _SZ_Float_TS_H
+#define _SZ_Float_TS_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+unsigned int optimize_intervals_float_1D_ts(float *oriData, size_t dataLength, float* preData, double realPrecision);
+
+TightDataPointStorageF* SZ_compress_float_1D_MDQ_ts(float *oriData, size_t dataLength, sz_multisteps* multisteps,
+double realPrecision, float valueRangeSize, float medianValue_f);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZ_Float_TS_H  ----- */
+
diff --git a/thirdparty/SZ/sz/include/sz_int16.h b/thirdparty/SZ/sz/include/sz_int16.h
new file mode 100644
index 0000000000000000000000000000000000000000..0ad62c5b9038b621d940e6cf3926a206a648d5e3
--- /dev/null
+++ b/thirdparty/SZ/sz/include/sz_int16.h
@@ -0,0 +1,48 @@
+/**
+ *  @file sz_int16.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the sz_int16.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZ_Int16_H
+#define _SZ_Int16_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h>
+
+unsigned int optimize_intervals_int16_1D(int16_t *oriData, size_t dataLength, double realPrecision);
+unsigned int optimize_intervals_int16_2D(int16_t *oriData, size_t r1, size_t r2, double realPrecision);
+unsigned int optimize_intervals_int16_3D(int16_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision);
+unsigned int optimize_intervals_int16_4D(int16_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision);
+TightDataPointStorageI* SZ_compress_int16_1D_MDQ(int16_t *oriData, size_t dataLength, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_int16_StoreOriData(int16_t* oriData, size_t dataLength, TightDataPointStorageI* tdps, unsigned char** newByteData, size_t *outSize);
+void SZ_compress_args_int16_NoCkRngeNoGzip_1D(unsigned char** newByteData, int16_t *oriData, 
+size_t dataLength, double realPrecision, size_t *outSize, int64_t valueRangeSize, int16_t minValue);
+TightDataPointStorageI* SZ_compress_int16_2D_MDQ(int16_t *oriData, size_t r1, size_t r2, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+TightDataPointStorageI* SZ_compress_int16_3D_MDQ(int16_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_int16_NoCkRngeNoGzip_3D(unsigned char** newByteData, int16_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t *outSize, int64_t valueRangeSize, int64_t minValue);
+TightDataPointStorageI* SZ_compress_int16_4D_MDQ(int16_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_int16_NoCkRngeNoGzip_4D(unsigned char** newByteData, int16_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, 
+size_t *outSize, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_int16_withinRange(unsigned char** newByteData, int16_t *oriData, size_t dataLength, size_t *outSize);
+
+int SZ_compress_args_int16_wRngeNoGzip(unsigned char** newByteData, int16_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio);
+
+int SZ_compress_args_int16(unsigned char** newByteData, int16_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZ_Int16_H  ----- */
+
diff --git a/thirdparty/SZ/sz/include/sz_int32.h b/thirdparty/SZ/sz/include/sz_int32.h
new file mode 100644
index 0000000000000000000000000000000000000000..a87825d0fcfcaa9757581d8ff1f05ea0161ddc7e
--- /dev/null
+++ b/thirdparty/SZ/sz/include/sz_int32.h
@@ -0,0 +1,48 @@
+/**
+ *  @file sz_int32.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the sz_int32.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZ_Int32_H
+#define _SZ_Int32_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h>
+
+unsigned int optimize_intervals_int32_1D(int32_t *oriData, size_t dataLength, double realPrecision);
+unsigned int optimize_intervals_int32_2D(int32_t *oriData, size_t r1, size_t r2, double realPrecision);
+unsigned int optimize_intervals_int32_3D(int32_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision);
+unsigned int optimize_intervals_int32_4D(int32_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision);
+TightDataPointStorageI* SZ_compress_int32_1D_MDQ(int32_t *oriData, size_t dataLength, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_int32_StoreOriData(int32_t* oriData, size_t dataLength, TightDataPointStorageI* tdps, unsigned char** newByteData, size_t *outSize);
+void SZ_compress_args_int32_NoCkRngeNoGzip_1D(unsigned char** newByteData, int32_t *oriData, 
+size_t dataLength, double realPrecision, size_t *outSize, int64_t valueRangeSize, int32_t minValue);
+TightDataPointStorageI* SZ_compress_int32_2D_MDQ(int32_t *oriData, size_t r1, size_t r2, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+TightDataPointStorageI* SZ_compress_int32_3D_MDQ(int32_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_int32_NoCkRngeNoGzip_3D(unsigned char** newByteData, int32_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t *outSize, int64_t valueRangeSize, int64_t minValue);
+TightDataPointStorageI* SZ_compress_int32_4D_MDQ(int32_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_int32_NoCkRngeNoGzip_4D(unsigned char** newByteData, int32_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, 
+size_t *outSize, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_int32_withinRange(unsigned char** newByteData, int32_t *oriData, size_t dataLength, size_t *outSize);
+
+int SZ_compress_args_int32_wRngeNoGzip(unsigned char** newByteData, int32_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio);
+
+int SZ_compress_args_int32(unsigned char** newByteData, int32_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZ_Int32_H  ----- */
+
diff --git a/thirdparty/SZ/sz/include/sz_int64.h b/thirdparty/SZ/sz/include/sz_int64.h
new file mode 100644
index 0000000000000000000000000000000000000000..b7213b2315b551385c5e0c22d3dbd16a07291746
--- /dev/null
+++ b/thirdparty/SZ/sz/include/sz_int64.h
@@ -0,0 +1,48 @@
+/**
+ *  @file sz_int64.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the sz_int64.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZ_Int64_H
+#define _SZ_Int64_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h>
+
+unsigned int optimize_intervals_int64_1D(int64_t *oriData, size_t dataLength, double realPrecision);
+unsigned int optimize_intervals_int64_2D(int64_t *oriData, size_t r1, size_t r2, double realPrecision);
+unsigned int optimize_intervals_int64_3D(int64_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision);
+unsigned int optimize_intervals_int64_4D(int64_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision);
+TightDataPointStorageI* SZ_compress_int64_1D_MDQ(int64_t *oriData, size_t dataLength, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_int64_StoreOriData(int64_t* oriData, size_t dataLength, TightDataPointStorageI* tdps, unsigned char** newByteData, size_t *outSize);
+void SZ_compress_args_int64_NoCkRngeNoGzip_1D(unsigned char** newByteData, int64_t *oriData, 
+size_t dataLength, double realPrecision, size_t *outSize, int64_t valueRangeSize, int64_t minValue);
+TightDataPointStorageI* SZ_compress_int64_2D_MDQ(int64_t *oriData, size_t r1, size_t r2, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+TightDataPointStorageI* SZ_compress_int64_3D_MDQ(int64_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_int64_NoCkRngeNoGzip_3D(unsigned char** newByteData, int64_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t *outSize, int64_t valueRangeSize, int64_t minValue);
+TightDataPointStorageI* SZ_compress_int64_4D_MDQ(int64_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_int64_NoCkRngeNoGzip_4D(unsigned char** newByteData, int64_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, 
+size_t *outSize, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_int64_withinRange(unsigned char** newByteData, int64_t *oriData, size_t dataLength, size_t *outSize);
+
+int SZ_compress_args_int64_wRngeNoGzip(unsigned char** newByteData, int64_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio);
+
+int SZ_compress_args_int64(unsigned char** newByteData, int64_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZ_Int64_H  ----- */
+
diff --git a/thirdparty/SZ/sz/include/sz_int8.h b/thirdparty/SZ/sz/include/sz_int8.h
new file mode 100644
index 0000000000000000000000000000000000000000..c6ce758a79524ffc25e3803f4e03d31119a4f47d
--- /dev/null
+++ b/thirdparty/SZ/sz/include/sz_int8.h
@@ -0,0 +1,48 @@
+/**
+ *  @file sz_int8.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the sz_int8.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZ_Int8_H
+#define _SZ_Int8_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h>
+
+unsigned int optimize_intervals_int8_1D(int8_t *oriData, size_t dataLength, double realPrecision);
+unsigned int optimize_intervals_int8_2D(int8_t *oriData, size_t r1, size_t r2, double realPrecision);
+unsigned int optimize_intervals_int8_3D(int8_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision);
+unsigned int optimize_intervals_int8_4D(int8_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision);
+TightDataPointStorageI* SZ_compress_int8_1D_MDQ(int8_t *oriData, size_t dataLength, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_int8_StoreOriData(int8_t* oriData, size_t dataLength, TightDataPointStorageI* tdps, unsigned char** newByteData, size_t *outSize);
+void SZ_compress_args_int8_NoCkRngeNoGzip_1D(unsigned char** newByteData, int8_t *oriData, 
+size_t dataLength, double realPrecision, size_t *outSize, int64_t valueRangeSize, int8_t minValue);
+TightDataPointStorageI* SZ_compress_int8_2D_MDQ(int8_t *oriData, size_t r1, size_t r2, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+TightDataPointStorageI* SZ_compress_int8_3D_MDQ(int8_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_int8_NoCkRngeNoGzip_3D(unsigned char** newByteData, int8_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t *outSize, int64_t valueRangeSize, int64_t minValue);
+TightDataPointStorageI* SZ_compress_int8_4D_MDQ(int8_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_int8_NoCkRngeNoGzip_4D(unsigned char** newByteData, int8_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, 
+size_t *outSize, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_int8_withinRange(unsigned char** newByteData, int8_t *oriData, size_t dataLength, size_t *outSize);
+
+int SZ_compress_args_int8_wRngeNoGzip(unsigned char** newByteData, int8_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio);
+
+int SZ_compress_args_int8(unsigned char** newByteData, int8_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZ_Int8_H  ----- */
+
diff --git a/thirdparty/SZ/sz/include/sz_omp.h b/thirdparty/SZ/sz/include/sz_omp.h
new file mode 100644
index 0000000000000000000000000000000000000000..18d18707c41c5bc68d90a46c7b3342874db3a103
--- /dev/null
+++ b/thirdparty/SZ/sz/include/sz_omp.h
@@ -0,0 +1,41 @@
+/**
+ *  @file sz_omp.h
+ *  @author Xin Liang
+ *  @date July, 2017
+ *  @brief Header file for the sz_omp.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "omp.h"
+#include "sz.h"
+
+#ifndef _SZ_OMP_H
+#define _SZ_OMP_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+unsigned char * SZ_compress_float_1D_MDQ_openmp(float *oriData, size_t r1, double realPrecision, size_t * comp_size);
+
+unsigned char * SZ_compress_float_2D_MDQ_openmp(float *oriData, size_t r1, size_t r2, double realPrecision, size_t * comp_size);
+
+unsigned char * SZ_compress_float_3D_MDQ_openmp(float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t * comp_size);
+
+void decompressDataSeries_float_1D_openmp(float** data, size_t r1, unsigned char* comp_data);
+
+void decompressDataSeries_float_3D_openmp(float** data, size_t r1, size_t r2, size_t r3, unsigned char* comp_data);
+
+void decompressDataSeries_float_2D_openmp(float** data, size_t r1, size_t r2, unsigned char* comp_data);
+
+//void Huffman_init_openmp(HuffmanTree* huffmanTree, int *s, size_t length, int thread_num);
+void Huffman_init_openmp(HuffmanTree* huffmanTree, int *s, size_t length, int thread_num, size_t * freq);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZ_OMP_H  ----- */
diff --git a/thirdparty/SZ/sz/include/sz_uint16.h b/thirdparty/SZ/sz/include/sz_uint16.h
new file mode 100644
index 0000000000000000000000000000000000000000..eb2319772faa95eb2bef6d9db0130a3c52472229
--- /dev/null
+++ b/thirdparty/SZ/sz/include/sz_uint16.h
@@ -0,0 +1,48 @@
+/**
+ *  @file sz_uint16.h
+ *  @author Sheng Di
+ *  @date Nov, 2017
+ *  @brief Header file for the sz_uint16.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZ_UInt16_H
+#define _SZ_UInt16_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h>
+
+unsigned int optimize_intervals_uint16_1D(uint16_t *oriData, size_t dataLength, double realPrecision);
+unsigned int optimize_intervals_uint16_2D(uint16_t *oriData, size_t r1, size_t r2, double realPrecision);
+unsigned int optimize_intervals_uint16_3D(uint16_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision);
+unsigned int optimize_intervals_uint16_4D(uint16_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision);
+TightDataPointStorageI* SZ_compress_uint16_1D_MDQ(uint16_t *oriData, size_t dataLength, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_uint16_StoreOriData(uint16_t* oriData, size_t dataLength, TightDataPointStorageI* tdps, unsigned char** newByteData, size_t *outSize);
+void SZ_compress_args_uint16_NoCkRngeNoGzip_1D(unsigned char** newByteData, uint16_t *oriData, 
+size_t dataLength, double realPrecision, size_t *outSize, int64_t valueRangeSize, uint16_t minValue);
+TightDataPointStorageI* SZ_compress_uint16_2D_MDQ(uint16_t *oriData, size_t r1, size_t r2, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+TightDataPointStorageI* SZ_compress_uint16_3D_MDQ(uint16_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_uint16_NoCkRngeNoGzip_3D(unsigned char** newByteData, uint16_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t *outSize, int64_t valueRangeSize, int64_t minValue);
+TightDataPointStorageI* SZ_compress_uint16_4D_MDQ(uint16_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_uint16_NoCkRngeNoGzip_4D(unsigned char** newByteData, uint16_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, 
+size_t *outSize, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_uint16_withinRange(unsigned char** newByteData, uint16_t *oriData, size_t dataLength, size_t *outSize);
+
+int SZ_compress_args_uint16_wRngeNoGzip(unsigned char** newByteData, uint16_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio);
+
+int SZ_compress_args_uint16(unsigned char** newByteData, uint16_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZ_UInt16_H  ----- */
+
diff --git a/thirdparty/SZ/sz/include/sz_uint32.h b/thirdparty/SZ/sz/include/sz_uint32.h
new file mode 100644
index 0000000000000000000000000000000000000000..8adb31d3fc19446fa8b71dcfb6cdc2b2ea8c9556
--- /dev/null
+++ b/thirdparty/SZ/sz/include/sz_uint32.h
@@ -0,0 +1,48 @@
+/**
+ *  @file sz_uint32.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the sz_uint32.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZ_UInt32_H
+#define _SZ_UInt32_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h>
+
+unsigned int optimize_intervals_uint32_1D(uint32_t *oriData, size_t dataLength, double realPrecision);
+unsigned int optimize_intervals_uint32_2D(uint32_t *oriData, size_t r1, size_t r2, double realPrecision);
+unsigned int optimize_intervals_uint32_3D(uint32_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision);
+unsigned int optimize_intervals_uint32_4D(uint32_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision);
+TightDataPointStorageI* SZ_compress_uint32_1D_MDQ(uint32_t *oriData, size_t dataLength, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_uint32_StoreOriData(uint32_t* oriData, size_t dataLength, TightDataPointStorageI* tdps, unsigned char** newByteData, size_t *outSize);
+void SZ_compress_args_uint32_NoCkRngeNoGzip_1D(unsigned char** newByteData, uint32_t *oriData, 
+size_t dataLength, double realPrecision, size_t *outSize, int64_t valueRangeSize, uint32_t minValue);
+TightDataPointStorageI* SZ_compress_uint32_2D_MDQ(uint32_t *oriData, size_t r1, size_t r2, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+TightDataPointStorageI* SZ_compress_uint32_3D_MDQ(uint32_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_uint32_NoCkRngeNoGzip_3D(unsigned char** newByteData, uint32_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t *outSize, int64_t valueRangeSize, int64_t minValue);
+TightDataPointStorageI* SZ_compress_uint32_4D_MDQ(uint32_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_uint32_NoCkRngeNoGzip_4D(unsigned char** newByteData, uint32_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, 
+size_t *outSize, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_uint32_withinRange(unsigned char** newByteData, uint32_t *oriData, size_t dataLength, size_t *outSize);
+
+int SZ_compress_args_uint32_wRngeNoGzip(unsigned char** newByteData, uint32_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio);
+
+int SZ_compress_args_uint32(unsigned char** newByteData, uint32_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZ_UInt32_H  ----- */
+
diff --git a/thirdparty/SZ/sz/include/sz_uint64.h b/thirdparty/SZ/sz/include/sz_uint64.h
new file mode 100644
index 0000000000000000000000000000000000000000..7717aa2d5fa82d9f2415fb50af62d936b9d10bfb
--- /dev/null
+++ b/thirdparty/SZ/sz/include/sz_uint64.h
@@ -0,0 +1,48 @@
+/**
+ *  @file sz_uint64.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the sz_uint64.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZ_UInt64_H
+#define _SZ_UInt64_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h>
+
+unsigned int optimize_intervals_uint64_1D(uint64_t *oriData, size_t dataLength, double realPrecision);
+unsigned int optimize_intervals_uint64_2D(uint64_t *oriData, size_t r1, size_t r2, double realPrecision);
+unsigned int optimize_intervals_uint64_3D(uint64_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision);
+unsigned int optimize_intervals_uint64_4D(uint64_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision);
+TightDataPointStorageI* SZ_compress_uint64_1D_MDQ(uint64_t *oriData, size_t dataLength, double realPrecision, uint64_t valueRangeSize, uint64_t minValue);
+void SZ_compress_args_uint64_StoreOriData(uint64_t* oriData, size_t dataLength, TightDataPointStorageI* tdps, unsigned char** newByteData, size_t *outSize);
+void SZ_compress_args_uint64_NoCkRngeNoGzip_1D(unsigned char** newByteData, uint64_t *oriData, 
+size_t dataLength, double realPrecision, size_t *outSize, uint64_t valueRangeSize, uint64_t minValue);
+TightDataPointStorageI* SZ_compress_uint64_2D_MDQ(uint64_t *oriData, size_t r1, size_t r2, double realPrecision, uint64_t valueRangeSize, uint64_t minValue);
+TightDataPointStorageI* SZ_compress_uint64_3D_MDQ(uint64_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, uint64_t valueRangeSize, uint64_t minValue);
+void SZ_compress_args_uint64_NoCkRngeNoGzip_3D(unsigned char** newByteData, uint64_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t *outSize, uint64_t valueRangeSize, uint64_t minValue);
+TightDataPointStorageI* SZ_compress_uint64_4D_MDQ(uint64_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, uint64_t valueRangeSize, uint64_t minValue);
+void SZ_compress_args_uint64_NoCkRngeNoGzip_4D(unsigned char** newByteData, uint64_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, 
+size_t *outSize, uint64_t valueRangeSize, uint64_t minValue);
+void SZ_compress_args_uint64_withinRange(unsigned char** newByteData, uint64_t *oriData, size_t dataLength, size_t *outSize);
+
+int SZ_compress_args_uint64_wRngeNoGzip(unsigned char** newByteData, uint64_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio);
+
+int SZ_compress_args_uint64(unsigned char** newByteData, uint64_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZ_UInt64_H  ----- */
+
diff --git a/thirdparty/SZ/sz/include/sz_uint8.h b/thirdparty/SZ/sz/include/sz_uint8.h
new file mode 100644
index 0000000000000000000000000000000000000000..9de3a117b557715fed450978e4b54b36f094e239
--- /dev/null
+++ b/thirdparty/SZ/sz/include/sz_uint8.h
@@ -0,0 +1,48 @@
+/**
+ *  @file sz_uint8.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the sz_uint8.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZ_UInt8_H
+#define _SZ_UInt8_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h>
+
+unsigned int optimize_intervals_uint8_1D(uint8_t *oriData, size_t dataLength, double realPrecision);
+unsigned int optimize_intervals_uint8_2D(uint8_t *oriData, size_t r1, size_t r2, double realPrecision);
+unsigned int optimize_intervals_uint8_3D(uint8_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision);
+unsigned int optimize_intervals_uint8_4D(uint8_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision);
+TightDataPointStorageI* SZ_compress_uint8_1D_MDQ(uint8_t *oriData, size_t dataLength, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_uint8_StoreOriData(uint8_t* oriData, size_t dataLength, TightDataPointStorageI* tdps, unsigned char** newByteData, size_t *outSize);
+void SZ_compress_args_uint8_NoCkRngeNoGzip_1D(unsigned char** newByteData, uint8_t *oriData, 
+size_t dataLength, double realPrecision, size_t *outSize, int64_t valueRangeSize, uint8_t minValue);
+TightDataPointStorageI* SZ_compress_uint8_2D_MDQ(uint8_t *oriData, size_t r1, size_t r2, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+TightDataPointStorageI* SZ_compress_uint8_3D_MDQ(uint8_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_uint8_NoCkRngeNoGzip_3D(unsigned char** newByteData, uint8_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t *outSize, int64_t valueRangeSize, int64_t minValue);
+TightDataPointStorageI* SZ_compress_uint8_4D_MDQ(uint8_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_uint8_NoCkRngeNoGzip_4D(unsigned char** newByteData, uint8_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, 
+size_t *outSize, int64_t valueRangeSize, int64_t minValue);
+void SZ_compress_args_uint8_withinRange(unsigned char** newByteData, uint8_t *oriData, size_t dataLength, size_t *outSize);
+
+int SZ_compress_args_uint8_wRngeNoGzip(unsigned char** newByteData, uint8_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio);
+
+int SZ_compress_args_uint8(unsigned char** newByteData, uint8_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZ_UInt8_H  ----- */
+
diff --git a/thirdparty/SZ/sz/include/szd_double.h b/thirdparty/SZ/sz/include/szd_double.h
new file mode 100644
index 0000000000000000000000000000000000000000..daf3622de3f45e8a31c84b2d6191db6acf2a709b
--- /dev/null
+++ b/thirdparty/SZ/sz/include/szd_double.h
@@ -0,0 +1,34 @@
+/**
+ *  @file szd_double.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the szd_double.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZD_Double_H
+#define _SZD_Double_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "TightDataPointStorageD.h"
+
+void decompressDataSeries_double_1D(double** data, size_t dataSeriesLength, TightDataPointStorageD* tdps);
+void decompressDataSeries_double_2D(double** data, size_t r1, size_t r2, TightDataPointStorageD* tdps);
+void decompressDataSeries_double_3D(double** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageD* tdps);
+void decompressDataSeries_double_4D(double** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageD* tdps);
+void getSnapshotData_double_1D(double** data, size_t dataSeriesLength, TightDataPointStorageD* tdps, int errBoundMode);
+void getSnapshotData_double_2D(double** data, size_t r1, size_t r2, TightDataPointStorageD* tdps, int errBoundMode);
+void getSnapshotData_double_3D(double** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageD* tdps, int errBoundMode);
+void getSnapshotData_double_4D(double** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageD* tdps, int errBoundMode);
+
+int SZ_decompress_args_double(double** newData, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, unsigned char* cmpBytes, size_t cmpSize);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZD_Double_H  ----- */
diff --git a/thirdparty/SZ/sz/include/szd_double_pwr.h b/thirdparty/SZ/sz/include/szd_double_pwr.h
new file mode 100644
index 0000000000000000000000000000000000000000..5d3b257b93bd3e843d637d4f84fa9da9ee3a2a1b
--- /dev/null
+++ b/thirdparty/SZ/sz/include/szd_double_pwr.h
@@ -0,0 +1,28 @@
+/**
+ *  @file szd_double_pwr.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the szd_double_pwr.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZD_Double_PWR_H
+#define _SZD_Double_PWR_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/* NOTE(review): this header contains no #include, so size_t and TightDataPointStorageD must already be declared by whoever includes it (szd_double.h includes "TightDataPointStorageD.h" itself). */
+void decompressDataSeries_double_1D_pwr(double** data, size_t dataSeriesLength, TightDataPointStorageD* tdps);
+double* extractRealPrecision_2D_double(size_t R1, size_t R2, int blockSize, TightDataPointStorageD* tdps);
+void decompressDataSeries_double_2D_pwr(double** data, size_t r1, size_t r2, TightDataPointStorageD* tdps);
+double* extractRealPrecision_3D_double(size_t R1, size_t R2, size_t R3, int blockSize, TightDataPointStorageD* tdps);
+void decompressDataSeries_double_3D_pwr(double** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageD* tdps);
+/* "pwrgroup" variant with the same parameter list as decompressDataSeries_double_1D_pwr; how it differs is presumably defined in szd_double_pwr.c - not visible here. */
+void decompressDataSeries_double_1D_pwrgroup(double** data, size_t dataSeriesLength, TightDataPointStorageD* tdps);
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZD_Double_PWR_H  ----- */
diff --git a/thirdparty/SZ/sz/include/szd_double_ts.h b/thirdparty/SZ/sz/include/szd_double_ts.h
new file mode 100644
index 0000000000000000000000000000000000000000..df7c2c98e9738ef2ed020b1c5ab164e9a0be1073
--- /dev/null
+++ b/thirdparty/SZ/sz/include/szd_double_ts.h
@@ -0,0 +1,25 @@
+/**
+ *  @file szd_double_ts.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the szd_double_ts.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZD_Double_TS_H
+#define _SZD_Double_TS_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "TightDataPointStorageD.h"
+/* Takes a double** data, the series length, an sz_multisteps* and a TightDataPointStorageD*. NOTE(review): sz_multisteps is not declared in this header itself; presumably it comes from TightDataPointStorageD.h or an earlier include - confirm. */
+void decompressDataSeries_double_1D_ts(double** data, size_t dataSeriesLength, sz_multisteps* multisteps, TightDataPointStorageD* tdps);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZD_Double_TS_H  ----- */
diff --git a/thirdparty/SZ/sz/include/szd_float.h b/thirdparty/SZ/sz/include/szd_float.h
new file mode 100644
index 0000000000000000000000000000000000000000..8aaf42c95e5285f46d093fc66db09a3afb4afabf
--- /dev/null
+++ b/thirdparty/SZ/sz/include/szd_float.h
@@ -0,0 +1,38 @@
+/**
+ *  @file szd_float.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the szd_float.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZD_Float_H
+#define _SZD_Float_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "TightDataPointStorageF.h"
+/* 1D-4D prototypes for float data: each takes a float** data, the dimension sizes and a TightDataPointStorageF*; the getSnapshotData_ variants also take errBoundMode. Presumably *data receives the result - confirm in szd_float.c. */
+void decompressDataSeries_float_1D(float** data, size_t dataSeriesLength, TightDataPointStorageF* tdps);
+void decompressDataSeries_float_2D(float** data, size_t r1, size_t r2, TightDataPointStorageF* tdps);
+void decompressDataSeries_float_3D(float** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageF* tdps);
+void decompressDataSeries_float_4D(float** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageF* tdps);
+void getSnapshotData_float_1D(float** data, size_t dataSeriesLength, TightDataPointStorageF* tdps, int errBoundMode);
+void getSnapshotData_float_2D(float** data, size_t r1, size_t r2, TightDataPointStorageF* tdps, int errBoundMode);
+void getSnapshotData_float_3D(float** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageF* tdps, int errBoundMode);
+void getSnapshotData_float_4D(float** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageF* tdps, int errBoundMode);
+/* "_RA_block" prototypes: unlike the routines above they take a plain float* data together with mean, the full and block dimensions, realPrecision, an int* type array and float* unpredictable_data, and they return size_t. */
+size_t decompressDataSeries_float_1D_RA_block(float * data, float mean, size_t dim_0, size_t block_dim_0, double realPrecision, int * type, float * unpredictable_data);
+size_t decompressDataSeries_float_2D_RA_block(float * data, float mean, size_t dim_0, size_t dim_1, size_t block_dim_0, size_t block_dim_1, double realPrecision, int * type, float * unpredictable_data);
+/* Takes five dimension sizes (r5..r1) and cmpBytes/cmpSize, presumably the compressed buffer and its length; returns int, presumably a status code - confirm in szd_float.c. */
+int SZ_decompress_args_float(float** newData, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, unsigned char* cmpBytes, size_t cmpSize);
+/* 3D counterpart of the "_RA_block" prototypes above; it is declared after SZ_decompress_args_float in this header. */
+size_t decompressDataSeries_float_3D_RA_block(float * data, float mean, size_t dim_0, size_t dim_1, size_t dim_2, size_t block_dim_0, size_t block_dim_1, size_t block_dim_2, double realPrecision, int * type, float * unpredictable_data);
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZD_Float_H  ----- */
diff --git a/thirdparty/SZ/sz/include/szd_float_pwr.h b/thirdparty/SZ/sz/include/szd_float_pwr.h
new file mode 100644
index 0000000000000000000000000000000000000000..0907517f732a3a20dd180d69d6f9818a6e64bb07
--- /dev/null
+++ b/thirdparty/SZ/sz/include/szd_float_pwr.h
@@ -0,0 +1,31 @@
+/**
+ *  @file szd_float_pwr.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the szd_float_pwr.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZD_Float_PWR_H
+#define _SZD_Float_PWR_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/* NOTE(review): this header contains no #include, so size_t and TightDataPointStorageF must already be declared by whoever includes it (szd_float.h includes "TightDataPointStorageF.h" itself). */
+void decompressDataSeries_float_1D_pwr(float** data, size_t dataSeriesLength, TightDataPointStorageF* tdps);
+float* extractRealPrecision_2D_float(size_t R1, size_t R2, int blockSize, TightDataPointStorageF* tdps);
+void decompressDataSeries_float_2D_pwr(float** data, size_t r1, size_t r2, TightDataPointStorageF* tdps);
+float* extractRealPrecision_3D_float(size_t R1, size_t R2, size_t R3, int blockSize, TightDataPointStorageF* tdps);
+void decompressDataSeries_float_3D_pwr(float** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageF* tdps);
+/* decompressGroupIDArray takes a byte buffer plus dataLength and returns char*; who owns the returned pointer is not visible here - confirm in szd_float_pwr.c. The "pwrgroup" routine has the same parameter list as decompressDataSeries_float_1D_pwr. */
+char* decompressGroupIDArray(unsigned char* bytes, size_t dataLength);
+void decompressDataSeries_float_1D_pwrgroup(float** data, size_t dataSeriesLength, TightDataPointStorageF* tdps);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZD_Float_PWR_H  ----- */
+
diff --git a/thirdparty/SZ/sz/include/szd_float_ts.h b/thirdparty/SZ/sz/include/szd_float_ts.h
new file mode 100644
index 0000000000000000000000000000000000000000..52ed4b34ebd0a33c5f73d7936f92351099b3edaa
--- /dev/null
+++ b/thirdparty/SZ/sz/include/szd_float_ts.h
@@ -0,0 +1,25 @@
+/**
+ *  @file szd_float_ts.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the szd_float_ts.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZD_Float_TS_H
+#define _SZD_Float_TS_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "TightDataPointStorageF.h"
+/* Takes a float** data, the series length, an sz_multisteps* and a TightDataPointStorageF*. NOTE(review): sz_multisteps is not declared in this header itself; presumably it comes from TightDataPointStorageF.h or an earlier include - confirm. */
+void decompressDataSeries_float_1D_ts(float** data, size_t dataSeriesLength, sz_multisteps* multisteps, TightDataPointStorageF* tdps);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZD_Float_TS_H  ----- */
diff --git a/thirdparty/SZ/sz/include/szd_int16.h b/thirdparty/SZ/sz/include/szd_int16.h
new file mode 100644
index 0000000000000000000000000000000000000000..a55a3d0487bfb45e6efa553e2dacbcf0a89d9ae2
--- /dev/null
+++ b/thirdparty/SZ/sz/include/szd_int16.h
@@ -0,0 +1,38 @@
+/**
+ *  @file szd_int16.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the szd_int16.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZD_Int16_H
+#define _SZD_Int16_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "TightDataPointStorageI.h"
+/* Lower and upper limits of the int16_t value range. */
+#define SZ_INT16_MIN -32768
+#define SZ_INT16_MAX 32767
+/* 1D-4D prototypes for int16_t data: each takes an int16_t** data, the dimension sizes and a TightDataPointStorageI*. Presumably *data receives the result - confirm in szd_int16.c. */
+void decompressDataSeries_int16_1D(int16_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps);
+void decompressDataSeries_int16_2D(int16_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps);
+void decompressDataSeries_int16_3D(int16_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps);
+void decompressDataSeries_int16_4D(int16_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps);
+/* getSnapshotData_ prototypes: the same parameter lists as above followed by an int errBoundMode. */
+void getSnapshotData_int16_1D(int16_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps, int errBoundMode);
+void getSnapshotData_int16_2D(int16_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps, int errBoundMode);
+void getSnapshotData_int16_3D(int16_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps, int errBoundMode);
+void getSnapshotData_int16_4D(int16_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps, int errBoundMode);
+/* Takes five dimension sizes (r5..r1) and cmpBytes/cmpSize, presumably the compressed buffer and its length; returns int, presumably a status code - confirm in szd_int16.c. */
+int SZ_decompress_args_int16(int16_t** newData, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, unsigned char* cmpBytes, size_t cmpSize);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZD_Int16_H  ----- */
diff --git a/thirdparty/SZ/sz/include/szd_int32.h b/thirdparty/SZ/sz/include/szd_int32.h
new file mode 100644
index 0000000000000000000000000000000000000000..233901f54e88b1d13586d2533fc16775f9d9f17a
--- /dev/null
+++ b/thirdparty/SZ/sz/include/szd_int32.h
@@ -0,0 +1,38 @@
+/**
+ *  @file szd_int32.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the szd_int32.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZD_Int32_H
+#define _SZD_Int32_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "TightDataPointStorageI.h"
+/* Limits of the int32_t value range. NOTE(review): -2147483648 is unary minus applied to 2147483648, which does not fit in a 32-bit int, so SZ_INT32_MIN has type long or long long rather than int. */
+#define SZ_INT32_MIN -2147483648
+#define SZ_INT32_MAX 2147483647
+/* 1D-4D prototypes for int32_t data: each takes an int32_t** data, the dimension sizes and a TightDataPointStorageI*. Presumably *data receives the result - confirm in szd_int32.c. */
+void decompressDataSeries_int32_1D(int32_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps);
+void decompressDataSeries_int32_2D(int32_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps);
+void decompressDataSeries_int32_3D(int32_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps);
+void decompressDataSeries_int32_4D(int32_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps);
+/* getSnapshotData_ prototypes: the same parameter lists as above followed by an int errBoundMode. */
+void getSnapshotData_int32_1D(int32_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps, int errBoundMode);
+void getSnapshotData_int32_2D(int32_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps, int errBoundMode);
+void getSnapshotData_int32_3D(int32_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps, int errBoundMode);
+void getSnapshotData_int32_4D(int32_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps, int errBoundMode);
+/* Takes five dimension sizes (r5..r1) and cmpBytes/cmpSize, presumably the compressed buffer and its length; returns int, presumably a status code - confirm in szd_int32.c. */
+int SZ_decompress_args_int32(int32_t** newData, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, unsigned char* cmpBytes, size_t cmpSize);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZD_Int32_H  ----- */
diff --git a/thirdparty/SZ/sz/include/szd_int64.h b/thirdparty/SZ/sz/include/szd_int64.h
new file mode 100644
index 0000000000000000000000000000000000000000..5dcb97ac9be5bf6f544f29455189cd50ca878c25
--- /dev/null
+++ b/thirdparty/SZ/sz/include/szd_int64.h
@@ -0,0 +1,35 @@
+/**
+ *  @file szd_int64.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the szd_int64.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZD_Int64_H
+#define _SZD_Int64_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "TightDataPointStorageI.h"
+/* 1D-4D prototypes for int64_t data: each takes an int64_t** data, the dimension sizes and a TightDataPointStorageI*. Presumably *data receives the result - confirm in szd_int64.c. */
+void decompressDataSeries_int64_1D(int64_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps);
+void decompressDataSeries_int64_2D(int64_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps);
+void decompressDataSeries_int64_3D(int64_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps);
+void decompressDataSeries_int64_4D(int64_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps);
+/* getSnapshotData_ prototypes: the same parameter lists as above followed by an int errBoundMode. */
+void getSnapshotData_int64_1D(int64_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps, int errBoundMode);
+void getSnapshotData_int64_2D(int64_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps, int errBoundMode);
+void getSnapshotData_int64_3D(int64_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps, int errBoundMode);
+void getSnapshotData_int64_4D(int64_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps, int errBoundMode);
+/* Takes five dimension sizes (r5..r1) and cmpBytes/cmpSize, presumably the compressed buffer and its length; returns int, presumably a status code - confirm in szd_int64.c. */
+int SZ_decompress_args_int64(int64_t** newData, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, unsigned char* cmpBytes, size_t cmpSize);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZD_Int64_H  ----- */
diff --git a/thirdparty/SZ/sz/include/szd_int8.h b/thirdparty/SZ/sz/include/szd_int8.h
new file mode 100644
index 0000000000000000000000000000000000000000..c6186f866d008fa27e2978c89b4d207cd7426a2a
--- /dev/null
+++ b/thirdparty/SZ/sz/include/szd_int8.h
@@ -0,0 +1,38 @@
+/**
+ *  @file szd_int8.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the szd_int8.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZD_Int8_H
+#define _SZD_Int8_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "TightDataPointStorageI.h"
+/* Lower and upper limits of the int8_t value range. */
+#define SZ_INT8_MIN -128
+#define SZ_INT8_MAX 127
+/* 1D-4D prototypes for int8_t data: each takes an int8_t** data, the dimension sizes and a TightDataPointStorageI*. Presumably *data receives the result - confirm in szd_int8.c. */
+void decompressDataSeries_int8_1D(int8_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps);
+void decompressDataSeries_int8_2D(int8_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps);
+void decompressDataSeries_int8_3D(int8_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps);
+void decompressDataSeries_int8_4D(int8_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps);
+/* getSnapshotData_ prototypes: the same parameter lists as above followed by an int errBoundMode. */
+void getSnapshotData_int8_1D(int8_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps, int errBoundMode);
+void getSnapshotData_int8_2D(int8_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps, int errBoundMode);
+void getSnapshotData_int8_3D(int8_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps, int errBoundMode);
+void getSnapshotData_int8_4D(int8_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps, int errBoundMode);
+/* Takes five dimension sizes (r5..r1) and cmpBytes/cmpSize, presumably the compressed buffer and its length; returns int, presumably a status code - confirm in szd_int8.c. */
+int SZ_decompress_args_int8(int8_t** newData, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, unsigned char* cmpBytes, size_t cmpSize);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZD_Int8_H  ----- */
diff --git a/thirdparty/SZ/sz/include/szd_uint16.h b/thirdparty/SZ/sz/include/szd_uint16.h
new file mode 100644
index 0000000000000000000000000000000000000000..dcd3ed830703818dba7fa5b8b71c84ac448b205a
--- /dev/null
+++ b/thirdparty/SZ/sz/include/szd_uint16.h
@@ -0,0 +1,38 @@
+/**
+ *  @file szd_uint16.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the szd_uint16.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZD_UInt16_H
+#define _SZD_UInt16_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "TightDataPointStorageI.h"
+/* Lower and upper limits of the uint16_t value range. */
+#define SZ_UINT16_MIN 0
+#define SZ_UINT16_MAX 65535
+/* 1D-4D prototypes for uint16_t data: each takes a uint16_t** data, the dimension sizes and a TightDataPointStorageI*. Presumably *data receives the result - confirm in szd_uint16.c. */
+void decompressDataSeries_uint16_1D(uint16_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps);
+void decompressDataSeries_uint16_2D(uint16_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps);
+void decompressDataSeries_uint16_3D(uint16_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps);
+void decompressDataSeries_uint16_4D(uint16_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps);
+/* getSnapshotData_ prototypes: the same parameter lists as above followed by an int errBoundMode. */
+void getSnapshotData_uint16_1D(uint16_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps, int errBoundMode);
+void getSnapshotData_uint16_2D(uint16_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps, int errBoundMode);
+void getSnapshotData_uint16_3D(uint16_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps, int errBoundMode);
+void getSnapshotData_uint16_4D(uint16_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps, int errBoundMode);
+/* Takes five dimension sizes (r5..r1) and cmpBytes/cmpSize, presumably the compressed buffer and its length; returns int, presumably a status code - confirm in szd_uint16.c. */
+int SZ_decompress_args_uint16(uint16_t** newData, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, unsigned char* cmpBytes, size_t cmpSize);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZD_UInt16_H  ----- */
diff --git a/thirdparty/SZ/sz/include/szd_uint32.h b/thirdparty/SZ/sz/include/szd_uint32.h
new file mode 100644
index 0000000000000000000000000000000000000000..88ff5708a80aea4f28bfd4819b40a39c3f42e36e
--- /dev/null
+++ b/thirdparty/SZ/sz/include/szd_uint32.h
@@ -0,0 +1,38 @@
+/**
+ *  @file szd_uint32.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the szd_uint32.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZD_UInt32_H
+#define _SZD_UInt32_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "TightDataPointStorageI.h"
+/* Limits of the uint32_t value range. NOTE(review): 4294967295 carries no suffix, so where int is 32 bits C99 gives it a signed type wider than int (long or long long), not unsigned int. */
+#define SZ_UINT32_MIN 0
+#define SZ_UINT32_MAX 4294967295
+/* 1D-4D prototypes for uint32_t data: each takes a uint32_t** data, the dimension sizes and a TightDataPointStorageI*. Presumably *data receives the result - confirm in szd_uint32.c. */
+void decompressDataSeries_uint32_1D(uint32_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps);
+void decompressDataSeries_uint32_2D(uint32_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps);
+void decompressDataSeries_uint32_3D(uint32_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps);
+void decompressDataSeries_uint32_4D(uint32_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps);
+/* getSnapshotData_ prototypes: the same parameter lists as above followed by an int errBoundMode. */
+void getSnapshotData_uint32_1D(uint32_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps, int errBoundMode);
+void getSnapshotData_uint32_2D(uint32_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps, int errBoundMode);
+void getSnapshotData_uint32_3D(uint32_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps, int errBoundMode);
+void getSnapshotData_uint32_4D(uint32_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps, int errBoundMode);
+/* Takes five dimension sizes (r5..r1) and cmpBytes/cmpSize, presumably the compressed buffer and its length; returns int, presumably a status code - confirm in szd_uint32.c. */
+int SZ_decompress_args_uint32(uint32_t** newData, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, unsigned char* cmpBytes, size_t cmpSize);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZD_UInt32_H  ----- */
diff --git a/thirdparty/SZ/sz/include/szd_uint64.h b/thirdparty/SZ/sz/include/szd_uint64.h
new file mode 100644
index 0000000000000000000000000000000000000000..6992c68ebceaaeae5be46e6a2228457e1ee85cd2
--- /dev/null
+++ b/thirdparty/SZ/sz/include/szd_uint64.h
@@ -0,0 +1,35 @@
+/**
+ *  @file szd_uint64.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the szd_uint64.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZD_UInt64_H
+#define _SZD_UInt64_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "TightDataPointStorageI.h"
+/* 1D-4D prototypes for uint64_t data: each takes a uint64_t** data, the dimension sizes and a TightDataPointStorageI*. Presumably *data receives the result - confirm in szd_uint64.c. */
+void decompressDataSeries_uint64_1D(uint64_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps);
+void decompressDataSeries_uint64_2D(uint64_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps);
+void decompressDataSeries_uint64_3D(uint64_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps);
+void decompressDataSeries_uint64_4D(uint64_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps);
+/* getSnapshotData_ prototypes: the same parameter lists as above followed by an int errBoundMode. */
+void getSnapshotData_uint64_1D(uint64_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps, int errBoundMode);
+void getSnapshotData_uint64_2D(uint64_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps, int errBoundMode);
+void getSnapshotData_uint64_3D(uint64_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps, int errBoundMode);
+void getSnapshotData_uint64_4D(uint64_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps, int errBoundMode);
+/* Takes five dimension sizes (r5..r1) and cmpBytes/cmpSize, presumably the compressed buffer and its length; returns int, presumably a status code - confirm in szd_uint64.c. */
+int SZ_decompress_args_uint64(uint64_t** newData, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, unsigned char* cmpBytes, size_t cmpSize);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZD_UInt64_H  ----- */
diff --git a/thirdparty/SZ/sz/include/szd_uint8.h b/thirdparty/SZ/sz/include/szd_uint8.h
new file mode 100644
index 0000000000000000000000000000000000000000..2366c7e08e0f048c4634f7689e65d3b48fcc7bf2
--- /dev/null
+++ b/thirdparty/SZ/sz/include/szd_uint8.h
@@ -0,0 +1,38 @@
+/**
+ *  @file szd_uint8.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the szd_uint8.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZD_UInt8_H
+#define _SZD_UInt8_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "TightDataPointStorageI.h"
+/* Lower and upper limits of the uint8_t value range. */
+#define SZ_UINT8_MIN 0
+#define SZ_UINT8_MAX 255
+/* 1D-4D prototypes for uint8_t data: each takes a uint8_t** data, the dimension sizes and a TightDataPointStorageI*. Presumably *data receives the result - confirm in szd_uint8.c. */
+void decompressDataSeries_uint8_1D(uint8_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps);
+void decompressDataSeries_uint8_2D(uint8_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps);
+void decompressDataSeries_uint8_3D(uint8_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps);
+void decompressDataSeries_uint8_4D(uint8_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps);
+/* getSnapshotData_ prototypes: the same parameter lists as above followed by an int errBoundMode. */
+void getSnapshotData_uint8_1D(uint8_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps, int errBoundMode);
+void getSnapshotData_uint8_2D(uint8_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps, int errBoundMode);
+void getSnapshotData_uint8_3D(uint8_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps, int errBoundMode);
+void getSnapshotData_uint8_4D(uint8_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps, int errBoundMode);
+/* Takes five dimension sizes (r5..r1) and cmpBytes/cmpSize, presumably the compressed buffer and its length; returns int, presumably a status code - confirm in szd_uint8.c. */
+int SZ_decompress_args_uint8(uint8_t** newData, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, unsigned char* cmpBytes, size_t cmpSize);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZD_UInt8_H  ----- */
diff --git a/thirdparty/SZ/sz/include/szf.h b/thirdparty/SZ/sz/include/szf.h
new file mode 100644
index 0000000000000000000000000000000000000000..fcef4eb8f2ca11a650575f4542507ad8762c47bf
--- /dev/null
+++ b/thirdparty/SZ/sz/include/szf.h
@@ -0,0 +1,102 @@
+/**
+ *  @file szf.h
+ *  @author Sheng Di
+ *  @date July, 2017
+ *  @brief Header file for the szf.c.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#ifndef _SZF_H
+#define _SZF_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h>
+
+// Prototypes for szf.c. Every parameter here is a pointer and every name ends in '_' (presumably so Fortran code can call them - confirm). NOTE(review): sz_finalize_c_() has empty parentheses, which in C declares no parameter prototype.
+void sz_init_c_(char *configFile,int *len,int *ierr);
+void sz_finalize_c_();
+void SZ_writeData_inBinary_d1_Float_(float* data, char *fileName, int *len);
+void sz_compress_d1_float_(float* data, unsigned char *bytes, size_t *outSize, size_t *r1);
+void sz_compress_d1_float_rev_(float* data, float *reservedValue, unsigned char *bytes, size_t *outSize, size_t *r1);
+void sz_compress_d2_float_(float* data, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2);
+void sz_compress_d2_float_rev_(float* data, float *reservedValue, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2);
+void sz_compress_d3_float_(float* data, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2, size_t *r3);
+void sz_compress_d3_float_rev_(float* data, float *reservedValue, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2, size_t *r3);
+void sz_compress_d4_float_(float* data, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2, size_t *r3, size_t *r4);
+void sz_compress_d4_float_rev_(float* data, float *reservedValue, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2, size_t *r3, size_t *r4);
+void sz_compress_d5_float_(float* data, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2, size_t *r3, size_t *r4, size_t *r5);
+void sz_compress_d5_float_rev_(float* data, float *reservedValue, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2, size_t *r3, size_t *r4, size_t *r5);
+/* Double-precision compress prototypes for 1 to 5 dimensions: data is double*, and the _rev_ variants take an additional double* reservedValue right after data. */
+void sz_compress_d1_double_(double* data, unsigned char *bytes, size_t *outSize, size_t *r1);
+void sz_compress_d1_double_rev_(double* data, double *reservedValue, unsigned char *bytes, size_t *outSize, size_t *r1);
+void sz_compress_d2_double_(double* data, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2);
+void sz_compress_d2_double_rev_(double* data, double *reservedValue, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2);
+void sz_compress_d3_double_(double* data, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2, size_t *r3);
+void sz_compress_d3_double_rev_(double* data, double *reservedValue, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2, size_t *r3);
+void sz_compress_d4_double_(double* data, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2, size_t *r3, size_t *r4);
+void sz_compress_d4_double_rev_(double* data, double *reservedValue, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2, size_t *r3, size_t *r4);
+void sz_compress_d5_double_(double* data, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2, size_t *r3, size_t *r4, size_t *r5);
+void sz_compress_d5_double_rev_(double* data, double *reservedValue, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2, size_t *r3, size_t *r4, size_t *r5);
+/* "_args_" compress prototypes: data, bytes and outSize are followed by int* errBoundMode, absErrBound and relBoundRatio (float* for float data, double* for double data) and then the dimension pointers. */
+void sz_compress_d1_float_args_(float* data, unsigned char *bytes, size_t *outSize, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1);
+void sz_compress_d2_float_args_(float* data, unsigned char *bytes, size_t *outSize, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1, size_t *r2);
+void sz_compress_d3_float_args_(float* data, unsigned char *bytes, size_t *outSize, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1, size_t *r2, size_t *r3);
+void sz_compress_d4_float_args_(float* data, unsigned char *bytes, size_t *outSize, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1, size_t *r2, size_t *r3, size_t *r4);
+void sz_compress_d5_float_args_(float* data, unsigned char *bytes, size_t *outSize, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1, size_t *r2, size_t *r3, size_t *r4, size_t *r5);
+void sz_compress_d1_double_args_(double* data, unsigned char *bytes, size_t *outSize, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1);
+void sz_compress_d2_double_args_(double* data, unsigned char *bytes, size_t *outSize, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1, size_t *r2);
+void sz_compress_d3_double_args_(double* data, unsigned char *bytes, size_t *outSize, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1, size_t *r2, size_t *r3);
+void sz_compress_d4_double_args_(double* data, unsigned char *bytes, size_t *outSize, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1, size_t *r2, size_t *r3, size_t *r4);
+void sz_compress_d5_double_args_(double* data, unsigned char *bytes, size_t *outSize, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1, size_t *r2, size_t *r3, size_t *r4, size_t *r5);
+/* "_rev_args_" compress prototypes: reservedValue follows data, then the error-bound pointers and dimensions. NOTE(review): the d1-d3 double prototypes declare reservedValue as float*, while d4/d5 and the sz_compress_dN_double_rev_ prototypes use double* - check against szf.c before changing. */
+void sz_compress_d1_float_rev_args_(float* data, float *reservedValue, unsigned char *bytes, size_t *outSize, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1);
+void sz_compress_d2_float_rev_args_(float* data, float *reservedValue, unsigned char *bytes, size_t *outSize, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1, size_t *r2);
+void sz_compress_d3_float_rev_args_(float* data, float *reservedValue, unsigned char *bytes, size_t *outSize, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1, size_t *r2, size_t *r3);
+void sz_compress_d4_float_rev_args_(float* data, float *reservedValue, unsigned char *bytes, size_t *outSize, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1, size_t *r2, size_t *r3, size_t *r4);
+void sz_compress_d5_float_rev_args_(float* data, float *reservedValue, unsigned char *bytes, size_t *outSize, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1, size_t *r2, size_t *r3, size_t *r4, size_t *r5);
+void sz_compress_d1_double_rev_args_(double* data, float *reservedValue, unsigned char *bytes, size_t *outSize, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1);
+void sz_compress_d2_double_rev_args_(double* data, float *reservedValue, unsigned char *bytes, size_t *outSize, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1, size_t *r2);
+void sz_compress_d3_double_rev_args_(double* data, float *reservedValue, unsigned char *bytes, size_t *outSize, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1, size_t *r2, size_t *r3);
+void sz_compress_d4_double_rev_args_(double* data, double *reservedValue, unsigned char *bytes, size_t *outSize, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1, size_t *r2, size_t *r3, size_t *r4);
+void sz_compress_d5_double_rev_args_(double* data, double *reservedValue, unsigned char *bytes, size_t *outSize, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1, size_t *r2, size_t *r3, size_t *r4, size_t *r5);
+/* Decompress prototypes for 1 to 5 dimensions: bytes and byteLength come first, then the float* or double* data and the dimension pointers. Presumably data is a caller-provided output buffer - confirm in szf.c. */
+void sz_decompress_d1_float_(unsigned char *bytes, size_t *byteLength, float *data, size_t *r1);
+void sz_decompress_d2_float_(unsigned char *bytes, size_t *byteLength, float *data, size_t *r1, size_t *r2);
+void sz_decompress_d3_float_(unsigned char *bytes, size_t *byteLength, float *data, size_t *r1, size_t *r2, size_t *r3);
+void sz_decompress_d4_float_(unsigned char *bytes, size_t *byteLength, float *data, size_t *r1, size_t *r2, size_t *r3, size_t *r4);
+void sz_decompress_d5_float_(unsigned char *bytes, size_t *byteLength, float *data, size_t *r1, size_t *r2, size_t *r3, size_t *r4, size_t *r5);
+void sz_decompress_d1_double_(unsigned char *bytes, size_t *byteLength, double *data, size_t *r1);
+void sz_decompress_d2_double_(unsigned char *bytes, size_t *byteLength, double *data, size_t *r1, size_t *r2);
+void sz_decompress_d3_double_(unsigned char *bytes, size_t *byteLength, double *data, size_t *r1, size_t *r2, size_t *r3);
+void sz_decompress_d4_double_(unsigned char *bytes, size_t *byteLength, double *data, size_t *r1, size_t *r2, size_t *r3, size_t *r4);
+void sz_decompress_d5_double_(unsigned char *bytes, size_t *byteLength, double *data, size_t *r1, size_t *r2, size_t *r3, size_t *r4, size_t *r5);
+/* Batch and variable-set prototypes. NOTE(review): sz_batchaddVar_d1_float_ is spelled with a capital 'V' while every other sz_batchaddvar_ name is lower case - check against szf.c and any Fortran callers. */
+void sz_batchaddVar_d1_float_(char* varName, int *len, float* data, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1);
+void sz_batchaddvar_d2_float_(char* varName, int *len, float* data, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1, size_t *r2);
+void sz_batchaddvar_d3_float_(char* varName, int *len, float* data, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1, size_t *r2, size_t *r3);
+void sz_batchaddvar_d4_float_(char* varName, int *len, float* data, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1, size_t *r2, size_t *r3, size_t *r4);
+void sz_batchaddvar_d5_float_(char* varName, int *len, float* data, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1, size_t *r2, size_t *r3, size_t *r4, size_t *r5);
+void sz_batchaddvar_d1_double_(char* varName, int *len, double* data, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1);
+void sz_batchaddvar_d2_double_(char* varName, int *len, double* data, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1, size_t *r2);
+void sz_batchaddvar_d3_double_(char* varName, int *len, double* data, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1, size_t *r2, size_t *r3);
+void sz_batchaddvar_d4_double_(char* varName, int *len, double* data, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1, size_t *r2, size_t *r3, size_t *r4);
+void sz_batchaddvar_d5_double_(char* varName, int *len, double* data, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1, size_t *r2, size_t *r3, size_t *r4, size_t *r5);
+void sz_batchdelvar_c_(char* varName, int *len, int *errState);
+void sz_batch_compress_c_(unsigned char* bytes, size_t *outSize);
+void sz_batch_decompress_c_(unsigned char* bytes, size_t *byteLength, int *ierr);
+void sz_getvardim_c_(char* varName, int *len, int *dim, size_t *r1, size_t *r2, size_t *r3, size_t *r4, size_t *r5);
+void compute_total_batch_size_c_(size_t *totalSize);
+void sz_getvardata_float_(char* varName, int *len, float* data);
+void sz_getvardata_double_(char* varName, int *len, double* data);
+void sz_freevarset_c_(int *mode);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ----- #ifndef _SZF_H  ----- */
+
diff --git a/thirdparty/SZ/sz/src/ByteToolkit.c b/thirdparty/SZ/sz/src/ByteToolkit.c
new file mode 100644
index 0000000000000000000000000000000000000000..31dbf66f9b1fba6ff623adccf0f47ad4401e5a5c
--- /dev/null
+++ b/thirdparty/SZ/sz/src/ByteToolkit.c
@@ -0,0 +1,999 @@
+/**
+ *  @file ByteToolkit.c
+ *  @author Sheng Di
+ *  @date April, 2016
+ *  @brief Byte Toolkit
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+ 
+#include <stdlib.h>
+#include "sz.h" 	
+#include "zlib.h"
+
+/**
+ * Decode a 16-bit unsigned integer stored most-significant byte first.
+ *
+ * @param bytes buffer holding at least 2 readable bytes
+ * @return bytes[0] as the high byte and bytes[1] as the low byte
+ */
+inline unsigned short bytesToUInt16_bigEndian(unsigned char* bytes)
+{
+	unsigned short value = 0;
+	int pos;
+
+	for(pos = 0; pos < 2; pos++)
+		value = (unsigned short)((value << 8) | (bytes[pos] & 0xff));
+
+	return value;
+}
+	
+/**
+ * Decode a 32-bit unsigned integer stored most-significant byte first.
+ *
+ * @param bytes buffer holding at least 4 readable bytes
+ * @return the value whose highest-order byte is bytes[0] and whose
+ *         lowest-order byte is bytes[3]
+ */
+inline unsigned int bytesToUInt32_bigEndian(unsigned char* bytes)
+{
+	unsigned int value = 0;
+	int pos;
+
+	for(pos = 0; pos < 4; pos++)
+	{
+		value <<= 8;
+		value |= (unsigned int)(bytes[pos] & 0xff);
+	}
+
+	return value;
+}
+
+/**
+ * Decode an unsigned integer from 8 bytes stored most-significant byte first.
+ *
+ * @param b buffer holding at least 8 readable bytes
+ * @return the accumulated value; b[7] ends up in the lowest-order byte
+ */
+inline unsigned long bytesToUInt64_bigEndian(unsigned char* b) {
+	unsigned long value = 0;
+	int pos;
+
+	for(pos = 0; pos < 8; pos++)
+		value = (value << 8) | (unsigned long)(b[pos] & 0xff);
+
+	return value;
+}
+	
+/**
+ * Decode a 16-bit signed integer stored most-significant byte first.
+ *
+ * @param bytes buffer holding at least 2 readable bytes
+ * @return the decoded value
+ */
+inline short bytesToInt16_bigEndian(unsigned char* bytes)
+{
+	short value = (short)(bytes[0] & 0xff);
+
+	/* the operand is promoted to int for the shift, then narrowed back to short */
+	value = (short)(value << 8);
+	value = (short)(value | (bytes[1] & 0xff));
+
+	return value;
+}
+	
+/**
+ * Decode a 32-bit signed integer stored most-significant byte first.
+ *
+ * The bytes are accumulated in an unsigned variable: left-shifting a signed
+ * int so that a bit reaches the sign position is undefined behavior, so the
+ * conversion to int is done once, at the end.
+ *
+ * @param bytes buffer holding at least 4 readable bytes
+ * @return the decoded value
+ */
+inline int bytesToInt32_bigEndian(unsigned char* bytes)
+{
+	unsigned int res = 0;
+	int i;
+
+	for(i = 0; i < 4; i++)
+		res = (res << 8) | (unsigned int)(bytes[i] & 0xff);
+
+	return (int)res;
+}
+
+/**
+ * Decode a signed integer from 8 bytes stored most-significant byte first.
+ *
+ * The bytes are accumulated in a uint64_t and converted to long at the end,
+ * which avoids left-shifting a signed value into its sign bit (undefined
+ * behavior).
+ *
+ * @param b buffer holding at least 8 readable bytes
+ * @return the decoded value converted to long
+ */
+inline long bytesToInt64_bigEndian(unsigned char* b) {
+	uint64_t res = 0;
+	int i;
+
+	for(i = 0; i < 8; i++)
+		res = (res << 8) | (uint64_t)(b[i] & 0xff);
+
+	return (long)res;
+}
+
+/**
+ * Decode a signed int from 4 bytes stored most-significant byte first.
+ *
+ * Accumulation happens in an unsigned variable so that no signed value is
+ * ever shifted into its sign bit (undefined behavior); the result is
+ * converted to int once, at the end.
+ *
+ * @param bytes buffer holding at least 4 readable bytes
+ * @return the decoded value
+ */
+inline int bytesToInt_bigEndian(unsigned char* bytes)
+{
+	unsigned int res = 0;
+	int i;
+
+	for(i = 0; i < 4; i++)
+		res = (res << 8) | (unsigned int)(bytes[i] & 0xff);
+
+	return (int)res;
+}
+
+/**
+ * Store a 32-bit value most-significant byte first.
+ *
+ * The shifts operate on the numeric value, so no byte swapping is needed
+ * whatever the byte order of the host.
+ *
+ * @param b   output buffer (at least 4 bytes)
+ * @param num value to serialize
+ */
+inline void intToBytes_bigEndian(unsigned char *b, unsigned int num)
+{
+	int shift = 24;
+	int pos;
+
+	for(pos = 0; pos < 4; pos++, shift -= 8)
+		b[pos] = (unsigned char)(num >> shift);
+}
+
+/**
+ * Store a 64-bit value most-significant byte first.
+ *
+ * @param b   output buffer (at least 8 bytes)
+ * @param num value to serialize
+ */
+inline void int64ToBytes_bigEndian(unsigned char *b, uint64_t num)
+{
+	int pos;
+
+	/* peel bytes off the low end and fill the buffer back to front */
+	for(pos = 7; pos >= 0; pos--)
+	{
+		b[pos] = (unsigned char)num;
+		num >>= 8;
+	}
+}
+
+/**
+ * Store a 32-bit value most-significant byte first.
+ *
+ * @param b   output buffer (at least 4 bytes)
+ * @param num value to serialize
+ */
+inline void int32ToBytes_bigEndian(unsigned char *b, uint32_t num)
+{
+	int pos;
+
+	/* peel bytes off the low end and fill the buffer back to front */
+	for(pos = 3; pos >= 0; pos--)
+	{
+		b[pos] = (unsigned char)num;
+		num >>= 8;
+	}
+}
+
+/**
+ * Store a 16-bit value most-significant byte first.
+ *
+ * @param b   output buffer (at least 2 bytes)
+ * @param num value to serialize
+ */
+inline void int16ToBytes_bigEndian(unsigned char *b, uint16_t num)
+{
+	unsigned char low = (unsigned char)(num & 0xff);
+	unsigned char high = (unsigned char)((num >> 8) & 0xff);
+
+	b[0] = high;
+	b[1] = low;
+}
+
+/**
+ * Decode a long from 8 bytes stored most-significant byte first.
+ *
+ * The bytes are accumulated in a uint64_t and converted to long at the end,
+ * which avoids left-shifting a signed value into its sign bit (undefined
+ * behavior).
+ *
+ * @param b buffer holding at least 8 readable bytes
+ * @return the decoded value converted to long
+ */
+inline long bytesToLong_bigEndian(unsigned char* b) {
+	uint64_t res = 0;
+	int i;
+
+	for(i = 0; i < 8; i++)
+		res = (res << 8) | (uint64_t)(b[i] & 0xff);
+
+	return (long)res;
+}
+
+/**
+ * Store an unsigned long as 8 bytes, most-significant byte first.
+ *
+ * The value is widened to 64 bits before shifting: shifting an unsigned long
+ * by 32 or more is undefined behavior on platforms where long has 32 bits.
+ * On such platforms the four leading bytes are written as zero.
+ *
+ * @param b   output buffer (at least 8 bytes)
+ * @param num value to serialize
+ */
+inline void longToBytes_bigEndian(unsigned char *b, unsigned long num) 
+{
+	uint64_t wide = num;
+	int i;
+
+	for(i = 7; i >= 0; i--)
+	{
+		b[i] = (unsigned char)wide;
+		wide >>= 8;
+	}
+}
+
+
+/**
+ * Return the integer member of the ldouble union after storing a double in it,
+ * i.e. the bit pattern of the double in host byte order.
+ */
+inline long doubleToOSEndianLong(double value)
+{
+	ldouble bits = { .value = value };
+
+	return bits.lvalue;
+}
+
+/**
+ * Return the integer member of the lfloat union after storing a float in it,
+ * i.e. the bit pattern of the float in host byte order.
+ */
+inline int floatToOSEndianInt(float value)
+{
+	lfloat bits = { .value = value };
+
+	return bits.ivalue;
+}
+
+/**
+ * Exponent field of a float with the bias of 127 removed.
+ *
+ * TODO (kept from the original author): check how the integer view of the
+ * union behaves with respect to byte order.
+ *
+ * @param value input value (assumes the IEEE-754 single-precision layout)
+ * @return ((bits & 0x7F800000) >> 23) - 127
+ */
+inline short getExponent_float(float value)
+{
+	lfloat bits;
+	int raw;
+
+	bits.value = value;
+	raw = bits.ivalue;
+
+	return (short)(((raw & 0x7F800000) >> 23) - 127);
+}
+
+/**
+ * Exponent field of the requested precision with the bias of 127 removed.
+ * The computation is the same as in getExponent_float.
+ *
+ * @param precision error bound (assumes the IEEE-754 single-precision layout)
+ * @return ((bits & 0x7F800000) >> 23) - 127
+ */
+inline short getPrecisionReqLength_float(float precision)
+{
+	lfloat bits;
+	int raw;
+
+	bits.value = precision;
+	raw = bits.ivalue;
+
+	return (short)(((raw & 0x7F800000) >> 23) - 127);
+}
+
+/**
+ * Exponent field of a double with the bias of 1023 removed.
+ *
+ * @param value input value (assumes the IEEE-754 double-precision layout)
+ * @return ((bits & 0x7FF0000000000000) >> 52) - 1023
+ */
+inline short getExponent_double(double value)
+{
+	ldouble bits;
+	long raw;
+
+	bits.value = value;
+	raw = bits.lvalue;
+
+	return (short)((int)((raw & 0x7FF0000000000000) >> 52) - 1023);
+}
+
+/**
+ * Exponent field of the requested precision with the bias of 1023 removed.
+ * The computation is the same as in getExponent_double.
+ *
+ * @param precision error bound (assumes the IEEE-754 double-precision layout)
+ * @return ((bits & 0x7FF0000000000000) >> 52) - 1023
+ */
+short getPrecisionReqLength_double(double precision)
+{
+	ldouble bits;
+	long raw;
+
+	bits.value = precision;
+	raw = bits.lvalue;
+
+	return (short)((int)((raw & 0x7FF0000000000000) >> 52) - 1023);
+}
+
+/**
+ * Count the zero bits above the highest set bit of a 32-bit value.
+ *
+ * The search works on an unsigned copy: the original shifted the signed
+ * argument left, which is undefined behavior as soon as a set bit reaches
+ * the sign position.
+ *
+ * @param i value to inspect (its bit pattern is what matters)
+ * @return number of leading zero bits, 32 when i is 0
+ */
+unsigned char numberOfLeadingZeros_Int(int i) {
+	unsigned int u = (unsigned int)i;
+	unsigned char n = 1;
+
+	if (u == 0)
+		return 32;
+	/* binary search: halve the window that may contain the top set bit */
+	if (u >> 16 == 0) { n += 16; u <<= 16; }
+	if (u >> 24 == 0) { n +=  8; u <<=  8; }
+	if (u >> 28 == 0) { n +=  4; u <<=  4; }
+	if (u >> 30 == 0) { n +=  2; u <<=  2; }
+	n -= u >> 31;
+	return n;
+}
+
+/**
+ * Count the zero bits above the highest set bit of a value viewed as 64 bits.
+ *
+ * The search works on fixed-width unsigned copies: the original shifted a
+ * signed int left (undefined behavior when a set bit reaches the sign
+ * position) and shifted an unsigned long by 32, which is undefined where
+ * long has only 32 bits.
+ *
+ * @param i value to inspect (converted to uint64_t)
+ * @return number of leading zero bits, 64 when i is 0
+ */
+unsigned char numberOfLeadingZeros_Long(long i) {
+	uint64_t wide = (uint64_t)i;
+	unsigned char n = 1;
+	uint32_t x;
+
+	if (wide == 0)
+		return 64;
+	x = (uint32_t)(wide >> 32);
+	/* upper half empty: account for its 32 zeros and search the lower half */
+	if (x == 0) { n += 32; x = (uint32_t)wide; }
+	if (x >> 16 == 0) { n += 16; x <<= 16; }
+	if (x >> 24 == 0) { n +=  8; x <<=  8; }
+	if (x >> 28 == 0) { n +=  4; x <<=  4; }
+	if (x >> 30 == 0) { n +=  2; x <<=  2; }
+	n -= x >> 31;
+	return n;
+}
+
+/**
+ * Number of leading bits on which v1 and v2 agree, obtained as the
+ * leading-zero count of their XOR.
+ */
+unsigned char getLeadingNumbers_Int(int v1, int v2)
+{
+	return (unsigned char)numberOfLeadingZeros_Int(v1 ^ v2);
+}
+
+/**
+ * Number of leading bits on which v1 and v2 agree, obtained as the
+ * leading-zero count of their XOR.
+ */
+unsigned char getLeadingNumbers_Long(long v1, long v2)
+{
+	return (unsigned char)numberOfLeadingZeros_Long(v1 ^ v2);
+}
+
+/**
+ * Reassemble a short from 2 bytes laid out in host byte order
+ * (no swapping is performed).
+ *
+ * @param bytes buffer holding at least 2 readable bytes
+ */
+short bytesToShort(unsigned char* bytes)
+{
+	lint16 pair;
+
+	pair.byte[0] = bytes[0];
+	pair.byte[1] = bytes[1];
+
+	return pair.svalue;
+}
+
+/**
+ * Copy the 2 bytes of a short into b in host byte order
+ * (no swapping is performed).
+ *
+ * @param b     output buffer (at least 2 bytes)
+ * @param value value to store
+ */
+void shortToBytes(unsigned char* b, short value)
+{
+	lint16 pair;
+
+	pair.svalue = value;
+	b[0] = pair.byte[0];
+	b[1] = pair.byte[1];
+}
+
+/**
+ * Reassemble an int from 4 bytes laid out in host byte order
+ * (no swapping is performed).
+ *
+ * @param bytes buffer holding at least 4 readable bytes
+ */
+int bytesToInt(unsigned char* bytes)
+{
+	lfloat word;
+	int k;
+
+	for(k = 0; k < 4; k++)
+		word.byte[k] = bytes[k];
+
+	return word.ivalue;
+}
+
+/**
+ * Reassemble a long from 8 bytes laid out in host byte order
+ * (no swapping is performed).
+ *
+ * @param bytes buffer holding at least 8 readable bytes
+ */
+long bytesToLong(unsigned char* bytes)
+{
+	ldouble word;
+	int k;
+
+	for(k = 0; k < 8; k++)
+		word.byte[k] = bytes[k];
+
+	return word.lvalue;
+}
+
+/**
+ * Decode a float from 4 bytes given in big-endian order.
+ * The bytes are reversed with symTransform_4bytes when the host is
+ * little-endian.
+ *
+ * @param bytes buffer holding at least 4 readable bytes
+ */
+float bytesToFloat(unsigned char* bytes)
+{
+	lfloat word;
+
+	memcpy(word.byte, bytes, 4);
+	if(sysEndianType!=LITTLE_ENDIAN_SYSTEM)
+		return word.value;
+
+	symTransform_4bytes(word.byte);
+	return word.value;
+}
+
+/**
+ * Store a float into b as 4 bytes in big-endian order.
+ * The bytes are reversed with symTransform_4bytes when the host is
+ * little-endian.
+ *
+ * @param b   output buffer (at least 4 bytes)
+ * @param num value to store
+ */
+void floatToBytes(unsigned char *b, float num)
+{
+	lfloat word;
+
+	word.value = num;
+	memcpy(b, word.byte, 4);
+	if(sysEndianType!=LITTLE_ENDIAN_SYSTEM)
+		return;
+
+	symTransform_4bytes(b);
+}
+
+/**
+ * Decode a double from 8 bytes given in big-endian order.
+ * The bytes are reversed with symTransform_8bytes when the host is
+ * little-endian.
+ *
+ * @param bytes buffer holding at least 8 readable bytes
+ */
+double bytesToDouble(unsigned char* bytes)
+{
+	ldouble word;
+
+	memcpy(word.byte, bytes, 8);
+	if(sysEndianType!=LITTLE_ENDIAN_SYSTEM)
+		return word.value;
+
+	symTransform_8bytes(word.byte);
+	return word.value;
+}
+
+/**
+ * Store a double into b as 8 bytes in big-endian order.
+ * The bytes are reversed with symTransform_8bytes when the host is
+ * little-endian.
+ *
+ * @param b   output buffer (at least 8 bytes)
+ * @param num value to store
+ */
+void doubleToBytes(unsigned char *b, double num)
+{
+	ldouble word;
+
+	word.value = num;
+	memcpy(b, word.byte, 8);
+	if(sysEndianType!=LITTLE_ENDIAN_SYSTEM)
+		return;
+
+	symTransform_8bytes(b);
+}
+
+/**
+ * Extract validLength bits starting at bit offset k of byteArray, where bit 0
+ * is the most significant bit of byteArray[0].
+ *
+ * Fixes relative to the previous version:
+ *  - the touched bytes were copied into a 4-byte scratch array at index
+ *    SZ_SIZE_TYPE-byteNum+i, which is out of bounds whenever SZ_SIZE_TYPE is 8;
+ *  - the range check on byteNum ran only after that copy;
+ *  - the bits are now assembled with unsigned arithmetic, so nothing is
+ *    shifted into or out of a sign bit.
+ *
+ * @param byteArray   packed bit stream
+ * @param k           bit offset of the first bit to extract
+ * @param validLength number of bits to extract; (k%8 + validLength) must span
+ *                    1 to 4 bytes, otherwise an error is printed and the
+ *                    process exits
+ * @return the extracted bits, right-aligned
+ */
+int extractBytes(unsigned char* byteArray, size_t k, int validLength)
+{
+	size_t outIndex = k/8;
+	int innerIndex = k%8;
+	int length = innerIndex + validLength;
+	int byteNum = length/8 + (length%8!=0 ? 1 : 0);
+
+	if(byteNum < 1 || byteNum > 4)
+	{
+		printf("Error: other cases are impossible...\n");
+		exit(0);
+	}
+
+	/* gather the touched bytes, right-aligned, most significant first */
+	unsigned int window = 0;
+	int i;
+	for(i = 0;i<byteNum;i++)
+		window = (window << 8) | byteArray[outIndex+i];
+
+	int rightMovSteps = innerIndex +(8 - (innerIndex+validLength)%8)%8;
+
+	/* drop the innerIndex bits that precede the requested field ... */
+	window <<= innerIndex;
+	if(byteNum < 4)
+		window &= (1u << (8*byteNum)) - 1u;
+	else
+		window &= 0xffffffffu;
+	/* ... then the padding bits that follow it */
+	window >>= rightMovSteps;
+
+	return (int)window;
+}
+
+/**
+ * Mask with the m low-order bits set.
+ *
+ * @param m number of bits, 1..8
+ * @return (1 << m) - 1 for m in 1..8, and 0 for any other m
+ */
+int getMaskRightCode(int m) {
+	if(m < 1 || m > 8)
+		return 0;
+	return (1 << m) - 1;
+}
+
+/**
+ * Mask selecting the (8 - kMod8) low-order bits of a byte.
+ */
+int getLeftMovingCode(int kMod8)
+{
+	int keptBits = 8 - kMod8;
+
+	return getMaskRightCode(keptBits);
+}
+
+/**
+ * Number of bits left in a byte after kMod8 leading bits and
+ * resiBitLength residual bits; negative when the residual bits spill
+ * into the next byte.
+ */
+int getRightMovingSteps(int kMod8, int resiBitLength) {
+	int steps = 8 - kMod8;
+
+	steps -= resiBitLength;
+	return steps;
+}
+
+/**
+ * Byte mask associated with a residual field of resiBitLength bits that
+ * starts kMod8 bits into a byte.
+ *
+ * When the field spills over by n bits (1..7) into the next byte, the mask
+ * has the n high-order bits set; a spill of 8 or more yields 0. Otherwise the
+ * result is getMaskRightCode(8 - kMod8) minus
+ * getMaskRightCode(8 - kMod8 - resiBitLength).
+ */
+int getRightMovingCode(int kMod8, int resiBitLength)
+{
+	/* indexed by the number of spilled bits; entry 0 is unused */
+	static const int spillMask[8] = {0, 0x80, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC, 0xFE};
+	int rightMovingSteps = 8 - kMod8 - resiBitLength;
+
+	if(rightMovingSteps >= 0)
+	{
+		int whole = getMaskRightCode(8 - kMod8);
+		int tail = getMaskRightCode(8 - kMod8 - resiBitLength);
+		return whole - tail;
+	}
+
+	int spilled = -rightMovingSteps;
+	if(spilled <= 7)
+		return spillMask[spilled];
+	return 0;
+}
+
+/**
+ * Decode a byte stream into a newly allocated array of short values.
+ *
+ * Each element is built from two consecutive bytes; the two bytes are
+ * swapped when sysEndianType differs from dataEndianType.
+ *
+ * @param bytes      input bytes
+ * @param byteLength number of input bytes; byteLength/2 elements are produced
+ * @return array obtained from malloc (it is not freed here)
+ */
+short* convertByteDataToShortArray(unsigned char* bytes, size_t byteLength)
+{
+	size_t count = byteLength/2;
+	short* values = (short*)malloc(count*sizeof(short));
+	/* position, within each pair, of the byte that goes to byte[0] */
+	size_t lead = (sysEndianType==dataEndianType) ? 0 : 1;
+	size_t n;
+	lint16 pair;
+
+	for(n=0;n<count;n++)
+	{
+		pair.byte[0] = bytes[n*2+lead];
+		pair.byte[1] = bytes[n*2+1-lead];
+		values[n] = pair.svalue;
+	}
+	return values;
+} 
+
+/**
+ * Decode a byte stream into a newly allocated array of unsigned short values.
+ *
+ * Each element is built from two consecutive bytes; the two bytes are
+ * swapped when sysEndianType differs from dataEndianType.
+ *
+ * @param bytes      input bytes
+ * @param byteLength number of input bytes; byteLength/2 elements are produced
+ * @return array obtained from malloc (it is not freed here)
+ */
+unsigned short* convertByteDataToUShortArray(unsigned char* bytes, size_t byteLength)
+{
+	size_t count = byteLength/2;
+	unsigned short* values = (unsigned short*)malloc(count*sizeof(unsigned short));
+	/* position, within each pair, of the byte that goes to byte[0] */
+	size_t lead = (sysEndianType==dataEndianType) ? 0 : 1;
+	size_t n;
+	lint16 pair;
+
+	for(n=0;n<count;n++)
+	{
+		pair.byte[0] = bytes[n*2+lead];
+		pair.byte[1] = bytes[n*2+1-lead];
+		values[n] = pair.usvalue;
+	}
+	return values;
+} 
+
+/**
+ * Serialize an array of short values, two bytes per element.
+ * The two bytes of every element are swapped when sysEndianType differs
+ * from dataEndianType.
+ *
+ * @param states      values to serialize
+ * @param stateLength number of values
+ * @param bytes       output buffer of at least 2*stateLength bytes
+ */
+void convertShortArrayToBytes(short* states, size_t stateLength, unsigned char* bytes)
+{
+	/* 1 when the byte order has to be reversed, 0 otherwise */
+	int swap = (sysEndianType!=dataEndianType);
+	lint16 pair;
+	size_t n;
+
+	for(n=0;n<stateLength;n++)
+	{
+		pair.svalue = states[n];
+		bytes[n*2] = pair.byte[swap];
+		bytes[n*2+1] = pair.byte[1-swap];
+	}
+}
+
+/**
+ * Serialize an array of unsigned short values, two bytes per element.
+ * The two bytes of every element are swapped when sysEndianType differs
+ * from dataEndianType.
+ *
+ * @param states      values to serialize
+ * @param stateLength number of values
+ * @param bytes       output buffer of at least 2*stateLength bytes
+ */
+void convertUShortArrayToBytes(unsigned short* states, size_t stateLength, unsigned char* bytes)
+{
+	/* 1 when the byte order has to be reversed, 0 otherwise */
+	int swap = (sysEndianType!=dataEndianType);
+	lint16 pair;
+	size_t n;
+
+	for(n=0;n<stateLength;n++)
+	{
+		pair.usvalue = states[n];
+		bytes[n*2] = pair.byte[swap];
+		bytes[n*2+1] = pair.byte[1-swap];
+	}
+}
+
+/**
+ * Serialize an array of int values, four bytes per element.
+ * The bytes of every element are written in reverse order when
+ * sysEndianType differs from dataEndianType.
+ *
+ * @param states      values to serialize
+ * @param stateLength number of values
+ * @param bytes       output buffer of at least 4*stateLength bytes
+ */
+void convertIntArrayToBytes(int* states, size_t stateLength, unsigned char* bytes)
+{
+	int reversed = (sysEndianType!=dataEndianType);
+	lint32 word;
+	size_t n;
+	int k;
+
+	for(n=0;n<stateLength;n++)
+	{
+		unsigned char* out = bytes + (n << 2); /* element n starts at byte 4*n */
+		word.ivalue = states[n];
+		for(k=0;k<4;k++)
+			out[k] = reversed ? word.byte[3-k] : word.byte[k];
+	}
+}
+
+/**
+ * Serialize an array of unsigned int values, four bytes per element.
+ * The bytes of every element are written in reverse order when
+ * sysEndianType differs from dataEndianType.
+ *
+ * @param states      values to serialize
+ * @param stateLength number of values
+ * @param bytes       output buffer of at least 4*stateLength bytes
+ */
+void convertUIntArrayToBytes(unsigned int* states, size_t stateLength, unsigned char* bytes)
+{
+	int reversed = (sysEndianType!=dataEndianType);
+	lint32 word;
+	size_t n;
+	int k;
+
+	for(n=0;n<stateLength;n++)
+	{
+		unsigned char* out = bytes + (n << 2); /* element n starts at byte 4*n */
+		word.uivalue = states[n];
+		for(k=0;k<4;k++)
+			out[k] = reversed ? word.byte[3-k] : word.byte[k];
+	}
+}
+
+/**
+ * Serialize an array of int64_t values, eight bytes per element.
+ * The bytes of every element are written in reverse order when
+ * sysEndianType differs from dataEndianType.
+ *
+ * @param states      values to serialize
+ * @param stateLength number of values
+ * @param bytes       output buffer of at least 8*stateLength bytes
+ */
+void convertLongArrayToBytes(int64_t* states, size_t stateLength, unsigned char* bytes)
+{
+	int reversed = (sysEndianType!=dataEndianType);
+	lint64 word;
+	size_t n;
+	int k;
+
+	for(n=0;n<stateLength;n++)
+	{
+		unsigned char* out = bytes + (n << 3); /* element n starts at byte 8*n */
+		word.lvalue = states[n];
+		for(k=0;k<8;k++)
+			out[k] = reversed ? word.byte[7-k] : word.byte[k];
+	}
+}
+
+/**
+ * Serialize an array of uint64_t values, eight bytes per element.
+ * The bytes of every element are written in reverse order when
+ * sysEndianType differs from dataEndianType.
+ *
+ * @param states      values to serialize
+ * @param stateLength number of values
+ * @param bytes       output buffer of at least 8*stateLength bytes
+ */
+void convertULongArrayToBytes(uint64_t* states, size_t stateLength, unsigned char* bytes)
+{
+	int reversed = (sysEndianType!=dataEndianType);
+	lint64 word;
+	size_t n;
+	int k;
+
+	for(n=0;n<stateLength;n++)
+	{
+		unsigned char* out = bytes + (n << 3); /* element n starts at byte 8*n */
+		word.ulvalue = states[n];
+		for(k=0;k<8;k++)
+			out[k] = reversed ? word.byte[7-k] : word.byte[k];
+	}
+}
+
+
+/**
+ * Decode a size stored big-endian in exe_params->SZ_SIZE_TYPE bytes.
+ *
+ * A 4-byte size is decoded as an unsigned value. It used to go through a
+ * signed int, so a stored size of 2^31 or more was sign-extended when
+ * size_t is wider than 32 bits.
+ *
+ * @param bytes buffer holding SZ_SIZE_TYPE (4, otherwise 8) readable bytes
+ * @return the decoded size
+ */
+size_t bytesToSize(unsigned char* bytes)
+{
+	if(exe_params->SZ_SIZE_TYPE==4)
+		return bytesToUInt32_bigEndian(bytes);//4
+	return bytesToLong_bigEndian(bytes);//8
+}
+
+/**
+ * Store a size big-endian using exe_params->SZ_SIZE_TYPE bytes:
+ * 4 bytes when SZ_SIZE_TYPE is 4, 8 bytes otherwise.
+ *
+ * @param outBytes output buffer
+ * @param size     value to store
+ */
+void sizeToBytes(unsigned char* outBytes, size_t size)
+{
+	if(exe_params->SZ_SIZE_TYPE!=4)
+	{
+		longToBytes_bigEndian(outBytes, size);//8
+		return;
+	}
+	intToBytes_bigEndian(outBytes, size);//4
+}
+
+/**
+ * Serialize the compression parameters into a fixed-layout byte record.
+ *
+ * Layout written by this function:
+ *   result[0]      flags, from the top bit down: optQuantMode (1 bit),
+ *                  dataEndianType (1), sysEndianType (1), szMode (1),
+ *                  gzip mode code (2), pwr_type (2)
+ *   result[1..2]   sampleDistance
+ *   result[3..4]   predThreshold * 10000
+ *   result[5]      errorBoundMode (high nibble), dataType (low nibble)
+ *   result[6..9]   first bound (absolute bound, psnr or relative ratio)
+ *   result[10..13] second bound (relative or point-wise relative ratio)
+ *   result[14..15] segment_size
+ *   result[16..19] max_quant_intervals when optQuantMode is 1,
+ *                  quantization_intervals otherwise
+ * For an errorBoundMode not listed in the switch, result[6..13] are left
+ * untouched.
+ *
+ * Fixes relative to the previous version:
+ *  - dataType was masked with 0x17, which is not a 4-bit mask: it dropped
+ *    bit 3 and let bit 4 leak into the errorBoundMode nibble;
+ *  - the PSNR case cleared result[9..12], destroying the last byte of the
+ *    psnr value and leaving result[13] unset.
+ *
+ * @param params parameters to serialize
+ * @param result output buffer; indices up to 19 are written
+ */
+void convertSZParamsToBytes(sz_params* params, unsigned char* result)
+{
+	unsigned char buf;
+
+	buf = exe_params->optQuantMode;
+	buf = (buf << 1) | dataEndianType;
+	buf = (buf << 1) | sysEndianType;
+	buf = (buf << 1) | params->szMode;
+
+	/* NOTE(review): Z_DEFAULT_STRATEGY is a zlib strategy constant, not a
+	 * compression level; Z_DEFAULT_COMPRESSION may have been intended.
+	 * Left unchanged - confirm against the code that sets gzipMode. */
+	int tmp = 0;
+	switch(params->gzipMode)
+	{
+	case Z_BEST_SPEED:
+		tmp = 0;
+		break;
+	case Z_DEFAULT_STRATEGY:
+		tmp = 1;
+		break;
+	case Z_BEST_COMPRESSION:
+		tmp = 2;
+		break;
+	}
+	buf = (buf << 2) | tmp;
+	buf = (buf << 2) |  params->pwr_type;
+	result[0] = buf;
+
+	//sampleDistance; //2 bytes
+	int16ToBytes_bigEndian(&result[1], params->sampleDistance);
+
+	//predThreshold;  // 2 bytes
+	short tmp2 = params->predThreshold * 10000;
+	int16ToBytes_bigEndian(&result[3], tmp2);
+
+	//errorBoundMode; //4bits(0.5 byte)
+	result[5] = params->errorBoundMode;
+
+	//data type (float, double, int8, int16, ....) //10 choices, so 4 bits
+	result[5] = (result[5] << 4) | (params->dataType & 0x0F);
+
+	//result[6]: abs_err_bound or psnr //4 bytes
+	//result[10]: rel_bound_ratio or pwr_err_bound //4 bytes
+	switch(params->errorBoundMode)
+	{
+	case ABS:
+		floatToBytes(&result[6], (float)(params->absErrBound)); //big_endian
+		memset(&result[10], 0, 4);
+		break;
+	case REL:
+		memset(&result[6], 0, 4);
+		floatToBytes(&result[10], (float)(params->relBoundRatio)); //big_endian
+		break;
+	case ABS_AND_REL:
+	case ABS_OR_REL:
+		floatToBytes(&result[6], (float)(params->absErrBound));
+		floatToBytes(&result[10], (float)(params->relBoundRatio)); //big_endian
+		break;
+	case PSNR:
+		floatToBytes(&result[6], (float)(params->psnr));
+		memset(&result[10], 0, 4);
+		break;
+	case ABS_AND_PW_REL:
+	case ABS_OR_PW_REL:
+		floatToBytes(&result[6], (float)(params->absErrBound));
+		floatToBytes(&result[10], (float)(params->pw_relBoundRatio)); //big_endian	
+		break;
+	case REL_AND_PW_REL:
+	case REL_OR_PW_REL:
+		floatToBytes(&result[6], (float)(params->relBoundRatio));
+		floatToBytes(&result[10], (float)(params->pw_relBoundRatio)); //big_endian	
+		break;
+	case PW_REL:
+		memset(&result[6], 0, 4);
+		floatToBytes(&result[10], (float)(params->pw_relBoundRatio)); //big_endian
+		break;		
+	}
+
+	//segment_size  // 2 bytes
+	int16ToBytes_bigEndian(&result[14], (short)(params->segment_size));
+
+	if(exe_params->optQuantMode==1)
+		int32ToBytes_bigEndian(&result[16], params->max_quant_intervals);
+	else
+		int32ToBytes_bigEndian(&result[16], params->quantization_intervals);
+}
+
+/**
+ * Rebuild an sz_params structure from the byte record produced by
+ * convertSZParamsToBytes.
+ *
+ * Besides filling the returned structure, this function overwrites
+ * exe_params->optQuantMode, dataEndianType and sysEndianType with the values
+ * stored in the flag byte.
+ *
+ * Fixes relative to the previous version:
+ *  - every flag field below the top bit was shifted right by the wrong
+ *    amount (7 or 6) after masking, so dataEndianType, sysEndianType, szMode,
+ *    the gzip mode code and pwr_type always decoded to 0; each field is now
+ *    extracted at the position the encoder puts it;
+ *  - dataType is read from the whole low nibble of bytes[5] (it was masked
+ *    with 0x07, which cannot represent data types 8 and 9).
+ *
+ * Fields that the stored errorBoundMode does not use are not assigned.
+ *
+ * @param bytes record of at least 20 bytes
+ * @return structure obtained from malloc (it is not freed here)
+ */
+sz_params* convertBytesToSZParams(unsigned char* bytes)
+{
+	sz_params* params = (sz_params*)malloc(sizeof(struct sz_params));
+	unsigned char flag1 = bytes[0];
+
+	/* flag byte, from the top bit down: 1+1+1+1+2+2 bits */
+	exe_params->optQuantMode = flag1 >> 7;
+	dataEndianType = (flag1 & 0x40) >> 6;
+	sysEndianType = (flag1 & 0x20) >> 5;
+
+	params->szMode = (flag1 & 0x10) >> 4;
+
+	int tmp = (flag1 & 0x0c) >> 2;
+	switch(tmp)
+	{
+	case 0:
+		params->gzipMode = Z_BEST_SPEED;
+		break;
+	case 1:
+		params->gzipMode = Z_DEFAULT_STRATEGY;
+		break;
+	case 2:
+		params->gzipMode = Z_BEST_COMPRESSION;
+		break;
+	}
+
+	params->pwr_type = flag1 & 0x03;
+
+	params->sampleDistance = bytesToInt16_bigEndian(&bytes[1]);
+
+	params->predThreshold = 1.0*bytesToInt16_bigEndian(&bytes[3])/10000.0;
+
+	params->dataType = bytes[5] & 0x0f;
+
+	params->errorBoundMode = (bytes[5] & 0xf0) >> 4;
+
+	switch(params->errorBoundMode)
+	{
+	case ABS:
+		params->absErrBound = bytesToFloat(&bytes[6]);
+		break;
+	case REL:
+		params->relBoundRatio = bytesToFloat(&bytes[10]);
+		break;
+	case ABS_AND_REL:
+	case ABS_OR_REL:
+		params->absErrBound = bytesToFloat(&bytes[6]);
+		params->relBoundRatio = bytesToFloat(&bytes[10]);
+		break;
+	case PSNR:
+		params->psnr = bytesToFloat(&bytes[6]);
+		break;
+	case ABS_AND_PW_REL:
+	case ABS_OR_PW_REL:
+		params->absErrBound = bytesToFloat(&bytes[6]);
+		params->pw_relBoundRatio = bytesToFloat(&bytes[10]);	
+		break;
+	case REL_AND_PW_REL:
+	case REL_OR_PW_REL:
+		params->relBoundRatio = bytesToFloat(&bytes[6]);
+		params->pw_relBoundRatio = bytesToFloat(&bytes[10]);	
+		break;
+	case PW_REL:
+		params->pw_relBoundRatio = bytesToFloat(&bytes[10]);		
+	}
+
+	//segment_size  // 2 bytes
+	params->segment_size = bytesToInt16_bigEndian(&bytes[14]);	
+
+	if(exe_params->optQuantMode==1)
+	{
+		params->max_quant_intervals = bytesToInt32_bigEndian(&bytes[16]);
+		params->quantization_intervals = 0;
+	}
+	else
+	{
+		params->max_quant_intervals = 0;
+		params->quantization_intervals = bytesToInt32_bigEndian(&bytes[16]);  
+	}
+	return params;
+}
diff --git a/thirdparty/SZ/sz/src/CompressElement.c b/thirdparty/SZ/sz/src/CompressElement.c
new file mode 100644
index 0000000000000000000000000000000000000000..bc11ec5252f7b697a030d5c28263e2806e6eeb4e
--- /dev/null
+++ b/thirdparty/SZ/sz/src/CompressElement.c
@@ -0,0 +1,248 @@
+/**
+ *  @file CompressElement.c
+ *  @author Sheng Di
+ *  @date May, 2016
+ *  @brief Functions of CompressElement
+ *  (C) 2015 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wchar-subscripts"
+
+#include <stdlib.h> 
+#include <stdio.h>
+#include <math.h>
+#include <sz.h>
+#include <CompressElement.h>
+
+/**
+ * Decode the Huffman-coded group-ID stream and undo its delta encoding.
+ *
+ * The first decoded symbol, minus GROUP_COUNT, is the first group ID. Every
+ * later ID is the previous ID plus the decoded symbol minus
+ * 2*(GROUP_COUNT + 2).
+ *
+ * @param bytes      encoded stream, handed to decode_withTree
+ * @param dataLength number of IDs to decode; element 0 is accessed
+ *                   unconditionally
+ * @return array of dataLength chars obtained from malloc (not freed here)
+ */
+char* decompressGroupIDArray(unsigned char* bytes, size_t dataLength)
+{
+	HuffmanTree* tree = SZ_Reset(); //create a default huffman tree	
+	int* symbols = (int*)malloc(dataLength*sizeof(int));
+	decode_withTree(tree, bytes, dataLength, symbols);
+	SZ_ReleaseHuffman(tree);
+
+	char* groupID = (char*)malloc(dataLength*sizeof(char));
+	const int offset = 2*(GROUP_COUNT + 2);
+	int running;
+	size_t n;
+
+	groupID[0] = symbols[0] - GROUP_COUNT;
+	running = groupID[0]; /* continue from the value as stored in a char */
+	for(n=1;n<dataLength;n++)
+	{
+		running = symbols[n] + running - offset;
+		groupID[n] = running;
+	}
+	free(symbols);
+
+	return groupID;
+}
+
+/**
+ * Group number of a float: the value returned by getExponent_float, with
+ * every negative exponent collapsed to -1.
+ */
+inline short computeGroupNum_float(float value)
+{
+	short expo = getExponent_float(value);
+
+	return (expo < 0) ? -1 : expo;
+}
+
+/**
+ * Group number of a double: the value returned by getExponent_double, with
+ * every negative exponent collapsed to -1.
+ */
+inline short computeGroupNum_double(double value)
+{
+	short expo = getExponent_double(value);
+
+	return (expo < 0) ? -1 : expo;
+}
+
+/**
+ * Push a value onto the front of a 3-element history buffer.
+ * The two most recent entries move back one slot and the oldest is dropped.
+ *
+ * @param last3CmprsData history buffer, most recent value first
+ * @param value          value to insert at index 0
+ */
+inline void listAdd_double(double last3CmprsData[3], double value)
+{
+	int slot;
+
+	for(slot = 2; slot > 0; slot--)
+		last3CmprsData[slot] = last3CmprsData[slot-1];
+	last3CmprsData[0] = value;
+}
+
+/**
+ * Push a value onto the front of a 3-element history buffer.
+ * The two most recent entries move back one slot and the oldest is dropped.
+ *
+ * @param last3CmprsData history buffer, most recent value first
+ * @param value          value to insert at index 0
+ */
+inline void listAdd_float(float last3CmprsData[3], float value)
+{
+	int slot;
+
+	for(slot = 2; slot > 0; slot--)
+		last3CmprsData[slot] = last3CmprsData[slot-1];
+	last3CmprsData[0] = value;
+}
+
+/**
+ * Push a value onto the front of a 3-element history buffer.
+ * The two most recent entries move back one slot and the oldest is dropped.
+ *
+ * @param last3CmprsData history buffer, most recent value first
+ * @param value          value to insert at index 0
+ */
+inline void listAdd_int(int64_t last3CmprsData[3], int64_t value)
+{
+	int slot;
+
+	for(slot = 2; slot > 0; slot--)
+		last3CmprsData[slot] = last3CmprsData[slot-1];
+	last3CmprsData[0] = value;
+}
+
+/**
+ * Record the decompressed value of a group and report the signed group ID.
+ *
+ * A non-negative groupNum stores decValue in groups[groupNum] and sets
+ * flags[groupNum] to 1 if it was 0. A negative groupNum stores decValue in
+ * groups[0] and sets flags[0] to 1.
+ *
+ * @param groups     latest decompressed value per group
+ * @param flags      per-group markers
+ * @param groupNum   group number (-1 and up)
+ * @param oriValue   original value; only its sign is used
+ * @param decValue   decompressed value to record
+ * @param curGroupID receives groupNum+2 when oriValue>=0, -(groupNum+2) otherwise
+ */
+inline void listAdd_float_group(float *groups, int *flags, char groupNum, float oriValue, float decValue, char* curGroupID)
+{
+	int id = groupNum + 2; //[-1,0,1,2,3,....,16] is mapped to [1,2,....,18]
+
+	if(groupNum < 0)
+	{
+		flags[0] = 1;
+		groups[0] = decValue;
+	}
+	else
+	{
+		groups[groupNum] = decValue;
+		if(flags[groupNum]==0)
+			flags[groupNum] = 1;
+	}
+
+	if(!(oriValue>=0))
+		id = -id; //negative originals get the mirrored IDs [-1,-2,....,-18]
+	*curGroupID = id;
+}
+
+/**
+ * Record the decompressed value of a group and report the signed group ID.
+ *
+ * A non-negative groupNum stores decValue in groups[groupNum] and sets
+ * flags[groupNum] to 1 if it was 0. A negative groupNum stores decValue in
+ * groups[0] and sets flags[0] to 1.
+ *
+ * @param groups     latest decompressed value per group
+ * @param flags      per-group markers
+ * @param groupNum   group number (-1 and up)
+ * @param oriValue   original value; only its sign is used
+ * @param decValue   decompressed value to record
+ * @param curGroupID receives groupNum+2 when oriValue>=0, -(groupNum+2) otherwise
+ */
+inline void listAdd_double_group(double *groups, int *flags, char groupNum, double oriValue, double decValue, char* curGroupID)
+{
+	int id = groupNum + 2; //[-1,0,1,2,3,....,16] is mapped to [1,2,....,18]
+
+	if(groupNum < 0)
+	{
+		flags[0] = 1;
+		groups[0] = decValue;
+	}
+	else
+	{
+		groups[groupNum] = decValue;
+		if(flags[groupNum]==0)
+			flags[groupNum] = 1;
+	}
+
+	if(!(oriValue>=0))
+		id = -id; //negative originals get the mirrored IDs [-1,-2,....,-18]
+	*curGroupID = id;
+}
+
+/**
+ * Tell whether a prediction error is within the allowed precision.
+ *
+ * @param minErr    prediction error
+ * @param precision error bound
+ * @return 1 when minErr <= precision, 0 otherwise
+ */
+inline int validPrediction_double(double minErr, double precision)
+{
+	return (minErr<=precision) ? 1 : 0;
+}
+
+/**
+ * Tell whether a prediction error is within the allowed precision.
+ *
+ * @param minErr    prediction error
+ * @param precision error bound
+ * @return 1 when minErr <= precision, 0 otherwise
+ */
+inline int validPrediction_float(float minErr, float precision)
+{
+	return (minErr<=precision) ? 1 : 0;
+}
+
+/**
+ * Build the per-group error bounds.
+ *
+ * For group g the point-wise bound is 2^g * pwrErrBound. It is combined with
+ * realPrecision according to errorBoundMode: the smaller of the two for the
+ * *_AND_PW_REL modes, the larger for the *_OR_PW_REL modes, and the
+ * point-wise bound alone for PW_REL. For any other mode the entries are
+ * left unset.
+ *
+ * @return array of GROUP_COUNT doubles obtained from malloc (not freed here)
+ */
+double* generateGroupErrBounds(int errorBoundMode, double realPrecision, double pwrErrBound)
+{
+	double* bounds = (double*)malloc(GROUP_COUNT*sizeof(double));
+	int g;
+
+	for(g=0;g<GROUP_COUNT;g++)
+	{
+		double pwrError = ((double)pow(2, g))*pwrErrBound;
+
+		if(errorBoundMode==ABS_AND_PW_REL || errorBoundMode==REL_AND_PW_REL)
+			bounds[g] = pwrError<realPrecision?pwrError:realPrecision;
+		else if(errorBoundMode==ABS_OR_PW_REL || errorBoundMode==REL_OR_PW_REL)
+			bounds[g] = pwrError<realPrecision?realPrecision:pwrError;
+		else if(errorBoundMode==PW_REL)
+			bounds[g] = pwrError;
+	}
+	return bounds;
+}
+
+/**
+ * Largest value of 2^g / groupErrBounds[g], rounded to the nearest integer,
+ * over all GROUP_COUNT groups.
+ *
+ * @param groupErrBounds per-group error bounds (GROUP_COUNT entries)
+ * @return the maximum, never less than 0
+ */
+int generateGroupMaxIntervalCount(double* groupErrBounds)
+{
+	int best = 0;
+	int g;
+
+	for(g=0;g<GROUP_COUNT;g++)
+	{
+		int candidate = (int)(pow(2, g)/groupErrBounds[g] + 0.5);
+		if(best<candidate)
+			best = candidate;
+	}
+
+	return best;
+}
+
+/**
+ * Fill a LossyCompressionElement from its components.
+ *
+ * @param lce                element to fill
+ * @param leadingNum         stored in leadingZeroBytes (0, 1, 2 or 3)
+ * @param intMidBytes        bytes copied into integerMidBytes
+ * @param intMidBytes_Length number of bytes to copy
+ * @param resiMidBitsLength  stored in resMidBitsLength
+ * @param resiBits           stored in residualMidBits
+ */
+void new_LossyCompressionElement(LossyCompressionElement *lce, int leadingNum, unsigned char* intMidBytes, 
+int intMidBytes_Length, int resiMidBitsLength, int resiBits)
+{
+	memcpy(lce->integerMidBytes,intMidBytes,intMidBytes_Length);
+	lce->integerMidBytes_Length = intMidBytes_Length; //they are mid_bits actually
+
+	lce->leadingZeroBytes = leadingNum;
+	lce->residualMidBits = resiBits;
+	lce->resMidBitsLength = resiMidBitsLength;
+}
+
+/**
+ * Describe the current 8-byte value relative to the previous one.
+ *
+ * The leading bytes shared with preBytes are counted, the remaining bytes up
+ * to reqBytesLength are copied into lce->integerMidBytes, and the top
+ * resiBitsLength bits of the following byte become the residual bits.
+ *
+ * @param curBytes       bytes of the current value
+ * @param preBytes       bytes of the previous value
+ * @param reqBytesLength number of whole bytes required
+ * @param resiBitsLength number of additional bits required (0 for none)
+ * @param lce            element to update
+ */
+void updateLossyCompElement_Double(unsigned char* curBytes, unsigned char* preBytes, 
+		int reqBytesLength, int resiBitsLength,  LossyCompressionElement *lce)
+{
+	int sharedLead = compIdenticalLeadingBytesCount_double(preBytes, curBytes);
+	int midLength = 0;
+	int residual = 0;
+
+	if(sharedLead < reqBytesLength)
+	{
+		midLength = reqBytesLength - sharedLead;
+		memcpy(lce->integerMidBytes, &(curBytes[sharedLead]), midLength);
+	}
+
+	/* the residual bits live in the byte right after the required ones */
+	if(resiBitsLength!=0 && reqBytesLength < 8)
+		residual = (curBytes[reqBytesLength] & 0xFF) >> (8-resiBitsLength);
+
+	lce->leadingZeroBytes = sharedLead;
+	lce->integerMidBytes_Length = midLength;
+	lce->resMidBitsLength = resiBitsLength;
+	lce->residualMidBits = residual;
+}
+
+/**
+ * Describe the current 4-byte value relative to the previous one.
+ *
+ * The leading bytes shared with preBytes are counted, the remaining bytes up
+ * to reqBytesLength are copied into lce->integerMidBytes, and the top
+ * resiBitsLength bits of the following byte become the residual bits.
+ *
+ * Fix: the residual byte was read whenever its index was below 8, a bound
+ * copied from the double variant. A single-precision value has only 4 bytes,
+ * so indices 4..7 read past its end; the bound is now 4.
+ * NOTE(review): this assumes curBytes points at exactly the 4 bytes of one
+ * float (e.g. the byte member of an lfloat) - confirm against the callers.
+ *
+ * @param curBytes       bytes of the current value
+ * @param preBytes       bytes of the previous value
+ * @param reqBytesLength number of whole bytes required
+ * @param resiBitsLength number of additional bits required (0 for none)
+ * @param lce            element to update
+ */
+void updateLossyCompElement_Float(unsigned char* curBytes, unsigned char* preBytes, 
+		int reqBytesLength, int resiBitsLength,  LossyCompressionElement *lce)
+{
+	int resiIndex, intMidBytes_Length = 0;
+	int leadingNum = compIdenticalLeadingBytesCount_float(preBytes, curBytes);
+	int fromByteIndex = leadingNum;
+	int toByteIndex = reqBytesLength; //exclusive upper bound
+	if(fromByteIndex < toByteIndex)
+	{
+		intMidBytes_Length = reqBytesLength - leadingNum;
+		memcpy(lce->integerMidBytes, &(curBytes[fromByteIndex]), intMidBytes_Length);
+	}
+	int resiBits = 0;
+	if(resiBitsLength!=0)
+	{
+		resiIndex = reqBytesLength;
+		if(resiIndex < 4)
+			resiBits = (curBytes[resiIndex] & 0xFF) >> (8-resiBitsLength);
+	}
+	lce->leadingZeroBytes = leadingNum;
+	lce->integerMidBytes_Length = intMidBytes_Length;
+	lce->resMidBitsLength = resiBitsLength;
+	lce->residualMidBits = resiBits;
+}
+
+#pragma GCC diagnostic pop
diff --git a/thirdparty/SZ/sz/src/DynamicByteArray.c b/thirdparty/SZ/sz/src/DynamicByteArray.c
new file mode 100644
index 0000000000000000000000000000000000000000..a5743486574f2f30ae521a59acdb895e5f4d6d0e
--- /dev/null
+++ b/thirdparty/SZ/sz/src/DynamicByteArray.c
@@ -0,0 +1,68 @@
+/**
+ *  @file DynamicByteArray.c
+ *  @author Sheng Di
+ *  @date May, 2016
+ *  @brief Dynamic Byte Array
+ *  (C) 2015 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdlib.h> 
+#include <stdio.h>
+#include <string.h>
+#include "DynamicByteArray.h"
+
+/**
+ * Allocate an empty dynamic byte array with room for cap bytes.
+ *
+ * @param dba receives the new array
+ * @param cap initial capacity in bytes
+ */
+void new_DBA(DynamicByteArray **dba, size_t cap) {
+	DynamicByteArray *fresh = (DynamicByteArray *)malloc(sizeof(DynamicByteArray));
+
+	fresh->size = 0;
+	fresh->capacity = cap;
+	fresh->array = (unsigned char*)malloc(sizeof(unsigned char)*cap);
+	*dba = fresh;
+}
+
+/**
+ * Copy the contents of a dynamic byte array into a newly allocated buffer.
+ *
+ * Fix: the previous version called memcpy with a NULL destination when the
+ * array was empty (or when malloc failed); passing a null pointer to memcpy
+ * is undefined behavior even for a length of 0.
+ *
+ * @param dba   source array
+ * @param bytes receives a malloc'ed copy of dba->size bytes, or NULL when
+ *              the array is empty or the allocation fails
+ */
+void convertDBAtoBytes(DynamicByteArray *dba, unsigned char** bytes)
+{
+	size_t size = dba->size;
+
+	*bytes = NULL;
+	if(size==0)
+		return;
+
+	*bytes = (unsigned char*)malloc(size * sizeof(unsigned char));
+	if(*bytes!=NULL)
+		memcpy(*bytes, dba->array, size*sizeof(unsigned char));
+}
+
+/**
+ * Release the byte buffer of a dynamic byte array, then the array itself.
+ *
+ * @param dba array to destroy; it is dereferenced, so it must not be NULL
+ */
+void free_DBA(DynamicByteArray *dba) {
+	unsigned char *storage = dba->array;
+
+	free(storage);
+	free(dba);
+}
+
+/**
+ * Read the byte stored at position pos.
+ * An out-of-range position prints an error message and terminates the
+ * process with exit(0).
+ *
+ * @param dba array to read
+ * @param pos index, must be smaller than dba->size
+ */
+unsigned char getDBA_Data(DynamicByteArray *dba, size_t pos)
+{
+	if(pos<dba->size)
+		return dba->array[pos];
+
+	printf("Error: wrong position of DBA (impossible case unless bugs elsewhere in the code?).\n");
+	exit(0);
+}
+
+/**
+ * Append one byte, doubling the capacity when the array is full.
+ *
+ * Fixes relative to the previous version:
+ *  - doubling a capacity of 0 left it at 0, so the write that followed went
+ *    past the end of the buffer; an empty array now grows to 1;
+ *  - the result of realloc is checked before it replaces the old pointer.
+ *    On failure an error is printed and the process exits.
+ *
+ * @param dba   array to extend
+ * @param value byte to append
+ */
+void addDBA_Data(DynamicByteArray *dba, unsigned char value)
+{
+	if(dba->size==dba->capacity)
+	{
+		size_t newCapacity = dba->capacity > 0 ? dba->capacity << 1 : 1;
+		unsigned char *grown = (unsigned char *)realloc(dba->array, newCapacity*sizeof(unsigned char));
+		if(grown==NULL)
+		{
+			printf("Error: failed to grow the dynamic byte array.\n");
+			exit(1);
+		}
+		dba->array = grown;
+		dba->capacity = newCapacity;
+	}
+	dba->array[dba->size] = value;
+	dba->size ++;
+}
+
+/**
+ * Append length bytes from data, growing the buffer to exactly the size
+ * needed when the current capacity is too small.
+ *
+ * @param dba    array to extend
+ * @param data   bytes to append
+ * @param length number of bytes to append
+ */
+void memcpyDBA_Data(DynamicByteArray *dba, unsigned char* data, size_t length)
+{
+	size_t needed = dba->size + length;
+
+	if(needed > dba->capacity)
+	{
+		dba->capacity = needed;
+		dba->array = (unsigned char *)realloc(dba->array, needed*sizeof(unsigned char));
+	}
+	memcpy(dba->array + dba->size, data, length);
+	dba->size = needed;
+}
diff --git a/thirdparty/SZ/sz/src/DynamicDoubleArray.c b/thirdparty/SZ/sz/src/DynamicDoubleArray.c
new file mode 100644
index 0000000000000000000000000000000000000000..20eb579d65d5462ebf9e80b8f8ee21ecca083883
--- /dev/null
+++ b/thirdparty/SZ/sz/src/DynamicDoubleArray.c
@@ -0,0 +1,57 @@
+/**
+ *  @file DynamicDoubleArray.c
+ *  @author Sheng Di
+ *  @date May, 2016
+ *  @brief Dynamic Double Array
+ *  (C) 2015 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdlib.h> 
+#include <stdio.h>
+#include <string.h>
+#include "DynamicDoubleArray.h"
+
+/**
+ * Allocate a new, empty dynamic array of doubles.
+ *
+ * @param dda  receives the new array, or NULL if memory could not be allocated
+ * @param cap  initial capacity, in elements
+ */
+void new_DDA(DynamicDoubleArray **dda, size_t cap)
+{
+	DynamicDoubleArray *arr = (DynamicDoubleArray *)malloc(sizeof(DynamicDoubleArray));
+	if(arr == NULL)
+	{
+		*dda = NULL;
+		return;
+	}
+	arr->size = 0;
+	arr->capacity = cap;
+	arr->array = (double*)malloc(sizeof(double)*cap);
+	/* malloc(0) may legitimately return NULL, so only cap > 0 counts as a failure */
+	if(arr->array == NULL && cap > 0)
+	{
+		free(arr);
+		arr = NULL;
+	}
+	*dda = arr;
+}
+
+/**
+ * Copy the stored doubles into a freshly allocated buffer.
+ *
+ * @param dba   source array
+ * @param data  receives a malloc'ed copy of the dba->size stored values (the
+ *              caller frees it), or NULL when the array is empty or the
+ *              allocation fails
+ */
+void convertDDAtoDoubles(DynamicDoubleArray *dba, double **data)
+{
+	/* size_t, not int: the element count must not be truncated for large arrays */
+	size_t size = dba->size;
+
+	*data = NULL;
+	if(size == 0)
+		return; /* nothing to copy; never hand a NULL destination to memcpy */
+
+	*data = (double*)malloc(size * sizeof(double));
+	if(*data != NULL)
+		memcpy(*data, dba->array, size*sizeof(double));
+}
+
+/**
+ * Release a dynamic double array and its storage.
+ * Passing NULL is a no-op, mirroring free().
+ */
+void free_DDA(DynamicDoubleArray *dda)
+{
+	if(dda == NULL)
+		return;
+	free(dda->array);
+	free(dda);
+}
+
+/**
+ * Return the double stored at index pos.
+ *
+ * An index at or beyond dda->size is treated as a fatal internal error: the
+ * message is written to stderr and the process exits with a failure status.
+ */
+double getDDA_Data(DynamicDoubleArray *dda, size_t pos)
+{
+	if(pos>=dda->size)
+	{
+		fprintf(stderr, "Error: wrong position of DDA.\n");
+		exit(EXIT_FAILURE);
+	}
+	return dda->array[pos];
+}
+
+/**
+ * Append one double, doubling the capacity when the array is full.
+ *
+ * A zero capacity grows to 1 (doubling 0 would never make room). If the
+ * storage cannot be grown the process exits with a failure status, because
+ * this void interface has no way to report the error.
+ */
+void addDDA_Data(DynamicDoubleArray *dda, double value)
+{
+	if(dda->size==dda->capacity)
+	{
+		size_t newCap = dda->capacity > 0 ? dda->capacity * 2 : 1;
+		/* keep the old pointer until realloc is known to have succeeded */
+		double *grown = (double *)realloc(dda->array, newCap*sizeof(double));
+		if(grown == NULL)
+		{
+			fprintf(stderr, "Error: out of memory while growing DDA.\n");
+			exit(EXIT_FAILURE);
+		}
+		dda->array = grown;
+		dda->capacity = newCap;
+	}
+	dda->array[dda->size] = value;
+	dda->size ++;
+}
diff --git a/thirdparty/SZ/sz/src/DynamicFloatArray.c b/thirdparty/SZ/sz/src/DynamicFloatArray.c
new file mode 100644
index 0000000000000000000000000000000000000000..f775827a83610246c841cc0a284b35818ef7b525
--- /dev/null
+++ b/thirdparty/SZ/sz/src/DynamicFloatArray.c
@@ -0,0 +1,57 @@
+/**
+ *  @file DynamicFloatArray.c
+ *  @author Sheng Di
+ *  @date May, 2016
+ *  @brief Dynamic Float Array
+ *  (C) 2015 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdlib.h> 
+#include <stdio.h>
+#include <string.h>
+#include "DynamicFloatArray.h"
+
+/**
+ * Allocate a new, empty dynamic array of floats.
+ *
+ * @param dfa  receives the new array, or NULL if memory could not be allocated
+ * @param cap  initial capacity, in elements
+ */
+void new_DFA(DynamicFloatArray **dfa, size_t cap)
+{
+	DynamicFloatArray *arr = (DynamicFloatArray *)malloc(sizeof(DynamicFloatArray));
+	if(arr == NULL)
+	{
+		*dfa = NULL;
+		return;
+	}
+	arr->size = 0;
+	arr->capacity = cap;
+	arr->array = (float*)malloc(sizeof(float)*cap);
+	/* malloc(0) may legitimately return NULL, so only cap > 0 counts as a failure */
+	if(arr->array == NULL && cap > 0)
+	{
+		free(arr);
+		arr = NULL;
+	}
+	*dfa = arr;
+}
+
+/**
+ * Copy the stored floats into a freshly allocated buffer.
+ *
+ * @param dfa   source array
+ * @param data  receives a malloc'ed copy of the dfa->size stored values (the
+ *              caller frees it), or NULL when the array is empty or the
+ *              allocation fails
+ */
+void convertDFAtoFloats(DynamicFloatArray *dfa, float **data)
+{
+	/* size_t, not int: the element count must not be truncated for large arrays */
+	size_t size = dfa->size;
+
+	*data = NULL;
+	if(size == 0)
+		return; /* nothing to copy; never hand a NULL destination to memcpy */
+
+	*data = (float*)malloc(size * sizeof(float));
+	if(*data != NULL)
+		memcpy(*data, dfa->array, size*sizeof(float));
+}
+
+/**
+ * Release a dynamic float array and its storage.
+ * Passing NULL is a no-op, mirroring free().
+ */
+void free_DFA(DynamicFloatArray *dfa)
+{
+	if(dfa == NULL)
+		return;
+	free(dfa->array);
+	free(dfa);
+}
+
+/**
+ * Return the float stored at index pos.
+ *
+ * An index at or beyond dfa->size is treated as a fatal internal error: the
+ * message is written to stderr and the process exits with a failure status.
+ */
+float getDFA_Data(DynamicFloatArray *dfa, size_t pos)
+{
+	if(pos>=dfa->size)
+	{
+		fprintf(stderr, "Error: wrong position of DFA.\n");
+		exit(EXIT_FAILURE);
+	}
+	return dfa->array[pos];
+}
+
+/**
+ * Append one float, doubling the capacity when the array is full.
+ *
+ * A zero capacity grows to 1 (doubling 0 would never make room). If the
+ * storage cannot be grown the process exits with a failure status, because
+ * this void interface has no way to report the error.
+ */
+void addDFA_Data(DynamicFloatArray *dfa, float value)
+{
+	if(dfa->size==dfa->capacity)
+	{
+		size_t newCap = dfa->capacity > 0 ? dfa->capacity * 2 : 1;
+		/* keep the old pointer until realloc is known to have succeeded */
+		float *grown = (float *)realloc(dfa->array, newCap*sizeof(float));
+		if(grown == NULL)
+		{
+			fprintf(stderr, "Error: out of memory while growing DFA.\n");
+			exit(EXIT_FAILURE);
+		}
+		dfa->array = grown;
+		dfa->capacity = newCap;
+	}
+	dfa->array[dfa->size] = value;
+	dfa->size++;
+}
diff --git a/thirdparty/SZ/sz/src/DynamicIntArray.c b/thirdparty/SZ/sz/src/DynamicIntArray.c
new file mode 100644
index 0000000000000000000000000000000000000000..3196ab99134e632a855183cd982e31d4004785b8
--- /dev/null
+++ b/thirdparty/SZ/sz/src/DynamicIntArray.c
@@ -0,0 +1,57 @@
+/**
+ *  @file DynamicIntArray.c
+ *  @author Sheng Di
+ *  @date May, 2016
+ *  @brief Dynamic Int Array
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdlib.h> 
+#include <stdio.h>
+#include <string.h>
+#include "DynamicIntArray.h"
+
+/**
+ * Allocate a new, empty dynamic "int" array. Elements are stored as unsigned
+ * char, one byte each.
+ *
+ * @param dia  receives the new array, or NULL if memory could not be allocated
+ * @param cap  initial capacity, in elements
+ */
+void new_DIA(DynamicIntArray **dia, size_t cap)
+{
+	DynamicIntArray *arr = (DynamicIntArray *)malloc(sizeof(DynamicIntArray));
+	if(arr == NULL)
+	{
+		*dia = NULL;
+		return;
+	}
+	arr->size = 0;
+	arr->capacity = cap;
+	arr->array = (unsigned char*)malloc(sizeof(unsigned char)*cap);
+	/* malloc(0) may legitimately return NULL, so only cap > 0 counts as a failure */
+	if(arr->array == NULL && cap > 0)
+	{
+		free(arr);
+		arr = NULL;
+	}
+	*dia = arr;
+}
+
+/**
+ * Copy the stored elements (one unsigned char each) into a freshly allocated
+ * buffer.
+ *
+ * @param dia   source array
+ * @param data  receives a malloc'ed copy of the dia->size stored elements (the
+ *              caller frees it), or NULL when the array is empty or the
+ *              allocation fails
+ */
+void convertDIAtoInts(DynamicIntArray *dia, unsigned char **data)
+{
+	/* size_t, not int: the element count must not be truncated for large arrays */
+	size_t size = dia->size;
+
+	*data = NULL;
+	if(size == 0)
+		return; /* nothing to copy; never hand a NULL destination to memcpy */
+
+	*data = (unsigned char*)malloc(size * sizeof(unsigned char));
+	if(*data != NULL)
+		memcpy(*data, dia->array, size*sizeof(unsigned char));
+}
+
+/**
+ * Release a dynamic int array and its storage.
+ * Passing NULL is a no-op, mirroring free().
+ */
+void free_DIA(DynamicIntArray *dia)
+{
+	if(dia == NULL)
+		return;
+	free(dia->array);
+	free(dia);
+}
+
+/**
+ * Return the element stored at index pos, widened from unsigned char to int.
+ *
+ * An index at or beyond dia->size is treated as a fatal internal error: the
+ * message is written to stderr and the process exits with a failure status.
+ */
+int getDIA_Data(DynamicIntArray *dia, size_t pos)
+{
+	if(pos>=dia->size)
+	{
+		fprintf(stderr, "Error: wrong position of DIA.\n");
+		exit(EXIT_FAILURE);
+	}
+	return dia->array[pos];
+}
+
+/**
+ * Append one element, doubling the capacity when the array is full.
+ * The value is narrowed to unsigned char before it is stored.
+ *
+ * A zero capacity grows to 1 (doubling 0 would never make room). If the
+ * storage cannot be grown the process exits with a failure status, because
+ * this void interface has no way to report the error.
+ */
+void addDIA_Data(DynamicIntArray *dia, int value)
+{
+	if(dia->size==dia->capacity)
+	{
+		size_t newCap = dia->capacity > 0 ? dia->capacity << 1 : 1;
+		/* keep the old pointer until realloc is known to have succeeded */
+		unsigned char *grown = (unsigned char *)realloc(dia->array, newCap*sizeof(unsigned char));
+		if(grown == NULL)
+		{
+			fprintf(stderr, "Error: out of memory while growing DIA.\n");
+			exit(EXIT_FAILURE);
+		}
+		dia->array = grown;
+		dia->capacity = newCap;
+	}
+	dia->array[dia->size] = (unsigned char)value;
+	dia->size ++;
+}
diff --git a/thirdparty/SZ/sz/src/Huffman.c b/thirdparty/SZ/sz/src/Huffman.c
new file mode 100644
index 0000000000000000000000000000000000000000..d067609063c54bb297fbfac0df04280ccb43f4b5
--- /dev/null
+++ b/thirdparty/SZ/sz/src/Huffman.c
@@ -0,0 +1,725 @@
+/**
+ *  @file Huffman.c
+ *  @author Sheng Di
+ *  @date Aug., 2016
+ *  @brief Customized Huffman Encoding, Compression and Decompression functions
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "Huffman.h"
+#include "sz.h"
+
+
+/**
+ * Allocate a zero-initialised Huffman tree workspace for stateNum symbols.
+ *
+ * The node pool and the priority-queue array each get 2*allNodes
+ * (= 4*stateNum) slots; the code and code-length tables get one entry per
+ * symbol.
+ *
+ * @param stateNum  number of distinct symbol values
+ * @return the new tree, or NULL if any allocation fails
+ */
+HuffmanTree* createHuffmanTree(int stateNum)
+{
+	HuffmanTree *tree = (HuffmanTree*)calloc(1, sizeof(HuffmanTree));
+	if(tree == NULL)
+		return NULL;
+	tree->stateNum = stateNum;
+	tree->allNodes = 2*stateNum;
+
+	size_t slots = (size_t)tree->allNodes*2;
+	tree->pool = (struct node_t*)calloc(slots, sizeof(struct node_t));
+	tree->qqq = (node*)calloc(slots, sizeof(node));
+	tree->code = (unsigned long**)calloc(tree->stateNum, sizeof(unsigned long*));
+	tree->cout = (unsigned char *)calloc(tree->stateNum, sizeof(unsigned char));
+	if(tree->pool == NULL || tree->qqq == NULL || tree->code == NULL || tree->cout == NULL)
+	{
+		free(tree->pool);
+		free(tree->qqq);
+		free(tree->code);
+		free(tree->cout);
+		free(tree);
+		return NULL;
+	}
+
+	/* the queue is addressed 1-based: qq[1] is qqq[0] */
+	tree->qq = tree->qqq - 1;
+	tree->n_nodes = 0;
+	tree->n_inode = 0;
+	tree->qend = 1;
+
+	return tree;
+}
+
+/**
+ * Create a Huffman tree sized for the default symbol range: twice a radius
+ * of 32768, i.e. 65536 states.
+ */
+HuffmanTree* createDefaultHuffmanTree()
+{
+	const int defaultRadius = 32768;
+
+	return createHuffmanTree(defaultRadius * 2);
+}
+ 
+/**
+ * Take the next unused node from the tree's pool and initialise it.
+ *
+ * A non-zero freq creates a leaf (t = 1) for symbol c. A zero freq creates an
+ * internal node (t = 0) with children a and b, whose frequency is the sum of
+ * the children's frequencies; c is left untouched in that case.
+ */
+node new_node(HuffmanTree* huffmanTree, size_t freq, unsigned int c, node a, node b)
+{
+	node fresh = &huffmanTree->pool[huffmanTree->n_nodes];
+	huffmanTree->n_nodes++;
+
+	if (freq == 0)
+	{
+		fresh->left = a;
+		fresh->right = b;
+		fresh->freq = a->freq + b->freq;
+		fresh->t = 0;
+		return fresh;
+	}
+
+	fresh->c = c;
+	fresh->freq = freq;
+	fresh->t = 1;
+	return fresh;
+}
+ 
+/**
+ * Take the next unused node from the pool and set only its symbol c and
+ * leaf flag t; every other field keeps whatever the pool slot already held.
+ */
+node new_node2(HuffmanTree *huffmanTree, unsigned int c, unsigned char t)
+{
+	node fresh = huffmanTree->pool + huffmanTree->n_nodes;
+
+	fresh->c = c;
+	fresh->t = t;
+	huffmanTree->n_nodes++;
+	return fresh;
+}
+ 
+/* priority queue */
+/**
+ * Insert node n into the 1-based min-heap qq, ordered by frequency.
+ * The new slot starts at the end of the heap; parents with a larger
+ * frequency are moved down until the right place for n is found.
+ */
+void qinsert(HuffmanTree *huffmanTree, node n)
+{
+	node *heap = huffmanTree->qq;
+	int slot = huffmanTree->qend;
+	int parent;
+
+	huffmanTree->qend = slot + 1;
+	for (parent = slot >> 1; parent != 0; parent = slot >> 1)
+	{
+		if (heap[parent]->freq <= n->freq)
+			break;
+		heap[slot] = heap[parent];
+		slot = parent;
+	}
+	heap[slot] = n;
+}
+ 
+/**
+ * Remove and return the node at the top of the heap (qq[1]), or 0 when the
+ * heap is empty (qend < 2).
+ *
+ * The hole left at the top is walked down along the smaller-frequency child
+ * until it has no child inside the shrunken heap, and is then filled with the
+ * former last element.
+ */
+node qremove(HuffmanTree* huffmanTree)
+{
+	node *heap = huffmanTree->qq;
+	node top = heap[1];
+	int hole = 1;
+	int child;
+
+	if (huffmanTree->qend < 2)
+		return 0;
+
+	huffmanTree->qend--;
+	for (child = hole << 1; child < huffmanTree->qend; child = hole << 1)
+	{
+		if (child + 1 < huffmanTree->qend && heap[child + 1]->freq < heap[child]->freq)
+			child++;
+		heap[hole] = heap[child];
+		hole = child;
+	}
+	heap[hole] = heap[huffmanTree->qend];
+	return top;
+}
+ 
+/**
+ * Walk the tree below n and record the code of every leaf: left edges add a
+ * 0 bit, right edges add a 1 bit.
+ *
+ * For each leaf a two-word buffer is allocated in huffmanTree->code[symbol]
+ * and the code length is stored in huffmanTree->cout[symbol]. The bits are
+ * left-aligned: codes of up to 64 bits live in word 0, longer codes continue
+ * in word 1. The shifts assume unsigned long is 64 bits wide.
+ *
+ * @param n     subtree to walk; pass the root for the initial call
+ * @param len   number of bits accumulated so far; pass 0 initially
+ * @param out1  first 64 accumulated bits; pass 0 initially
+ * @param out2  accumulated bits beyond the first 64; pass 0 initially
+ */
+void build_code(HuffmanTree *huffmanTree, node n, int len, unsigned long out1, unsigned long out2)
+{
+	if (n->t) {
+		unsigned long *words = (unsigned long*)malloc(2*sizeof(unsigned long));
+		huffmanTree->code[n->c] = words;
+		if(len<=64)
+		{
+			/* len == 0 happens when the root itself is a leaf; shifting by
+			 * the full 64 bits would be undefined, so leave out1 as is */
+			words[0] = (len == 0) ? out1 : out1 << (64 - len);
+			words[1] = out2;
+		}
+		else
+		{
+			words[0] = out1;
+			words[1] = out2 << (128 - len);
+		}
+		huffmanTree->cout[n->c] = (unsigned char)len;
+		return;
+	}
+	int index = len >> 6; //=len/64
+	if(index == 0)
+	{
+		/* still filling the first word */
+		out1 = out1 << 1;
+		out1 = out1 | 0;
+		build_code(huffmanTree, n->left, len + 1, out1, 0);
+		out1 = out1 | 1;
+		build_code(huffmanTree, n->right, len + 1, out1, 0);		
+	}
+	else
+	{
+		/* first word is full; the very first bit of word 2 needs no shift */
+		if(len%64!=0)
+			out2 = out2 << 1;
+		out2 = out2 | 0;
+		build_code(huffmanTree, n->left, len + 1, out1, out2);
+		out2 = out2 | 1;
+		build_code(huffmanTree, n->right, len + 1, out1, out2);	
+	}
+}
+
+/**
+ * Build the Huffman tree and code table for the symbol sequence s.
+ *
+ * Counts how often each value occurs, inserts one leaf per occurring symbol
+ * into the priority queue, merges the two least frequent nodes until a single
+ * root remains in qq[1], then derives the codes with build_code().
+ *
+ * @param s       symbols; each value is used directly as an index into a
+ *                table of huffmanTree->allNodes counters, so it must be in
+ *                [0, allNodes)
+ * @param length  number of symbols in s; with 0 symbols no tree is built
+ */
+void init(HuffmanTree* huffmanTree, int *s, size_t length)
+{
+	size_t i;
+	size_t *freq = (size_t *)calloc(huffmanTree->allNodes, sizeof(size_t));
+	if(freq == NULL)
+		return;
+
+	for(i = 0;i < length;i++) 
+		freq[s[i]]++;
+ 
+	for (i = 0; i < huffmanTree->allNodes; i++)
+		if (freq[i]) 
+			qinsert(huffmanTree, new_node(huffmanTree, freq[i], i, 0, 0));
+	free(freq);
+ 
+	while (huffmanTree->qend > 2) 
+	{
+		/* Pop in two separate statements: as function arguments the two
+		 * calls have no defined order, which made the tree shape (and so
+		 * the compressed bytes) depend on the compiler. */
+		node first = qremove(huffmanTree);
+		node second = qremove(huffmanTree);
+		qinsert(huffmanTree, new_node(huffmanTree, 0, 0, first, second));
+	}
+ 
+	/* qend == 1 means no symbol was inserted, so there is no root to walk */
+	if (huffmanTree->qend > 1)
+		build_code(huffmanTree, huffmanTree->qq[1], 0, 0, 0);
+}
+ 
+void encode(HuffmanTree *huffmanTree, int *s, size_t length, unsigned char *out, size_t *outSize)
+{	/* Pack the Huffman codes of s[0..length-1] into out as one continuous bit stream.
+	 *
+	 * For each symbol the code length is read from huffmanTree->cout[symbol] and
+	 * the code bits from the two words in huffmanTree->code[symbol].
+	 * *outSize is only ever incremented here, so the caller must set it to 0
+	 * before the call.
+	 * Code words are stored a whole long at a time through longToBytes_bigEndian
+	 * (defined elsewhere; presumably it writes 8 bytes - TODO confirm), so out
+	 * needs room beyond the last byte that is finally counted in *outSize.
+	 */
+	size_t i = 0;
+	unsigned char bitSize = 0, byteSize, byteSizep;
+	int state;
+	unsigned char *p = out; /* byte currently being filled */
+	int lackBits = 0; /* free low-order bits left in *p; 0 means p is byte-aligned */
+	//long totalBitSize = 0, maxBitSize = 0, bitSize21 = 0, bitSize32 = 0;
+	for (i = 0;i<length;i++) 
+	{
+		//state = 0;
+		//state = (state | s[i])<<8;
+		//state = state | s[i+1];
+		
+		state = s[i];
+		bitSize = huffmanTree->cout[state];	
+		
+		//printf("%d %d : %d %u\n",i, state, bitSize, (code[state])[0] >> (64-cout[state])); 
+		//debug: compute the average bitSize and the count that is over 32... 	
+		/*if(bitSize>=21)
+			bitSize21++;
+		if(bitSize>=32)
+			bitSize32++;
+		if(maxBitSize<bitSize)
+			maxBitSize = bitSize;
+		totalBitSize+=bitSize;*/
+
+		if(lackBits==0) /* aligned: the code can be stored directly at p */
+		{
+			byteSize = bitSize%8==0 ? bitSize/8 : bitSize/8+1; //it's equal to the number of bytes involved (for *outSize)
+			byteSizep = bitSize/8; //it's used to move the pointer p for next data
+			if(byteSize<=8)				
+			{
+				longToBytes_bigEndian(p, (huffmanTree->code[state])[0]);
+				p += byteSizep; /* p stays on the partially filled byte, if any */
+			}
+			else //byteSize>8
+			{
+				longToBytes_bigEndian(p, (huffmanTree->code[state])[0]);
+				p += 8;			
+				longToBytes_bigEndian(p, (huffmanTree->code[state])[1]);
+				p += (byteSizep - 8);		
+			}
+			*outSize += byteSize;
+			lackBits = bitSize%8==0 ? 0 : 8 - bitSize%8;
+		}
+		else /* unaligned: first top up the free bits of *p */
+		{
+			*p = (*p) | (unsigned char)((huffmanTree->code[state])[0] >> (64 - lackBits));			
+			if(lackBits < bitSize) /* the code spills past the current byte */
+			{
+				p++;
+				//(*outSize)++;
+				long newCode = (huffmanTree->code[state])[0] << lackBits; /* drop the bits already merged into the previous byte */
+				longToBytes_bigEndian(p, newCode);				
+
+				if(bitSize<=64)
+				{
+					bitSize -= lackBits;
+					byteSize = bitSize%8==0 ? bitSize/8 : bitSize/8+1;
+					byteSizep = bitSize/8;
+					p += byteSizep;
+					(*outSize)+=byteSize;
+					lackBits = bitSize%8==0 ? 0 : 8 - bitSize%8;
+				}
+				else //bitSize > 64
+				{
+					byteSizep = 7; //must be 7 bytes, because lackBits!=0
+					p+=byteSizep;
+					(*outSize)+=byteSize; /* NOTE(review): byteSize is not assigned on this path, so this adds the value left over from an earlier symbol - confirm intent */
+					
+					bitSize -= 64;
+					if(lackBits < bitSize)
+					{
+						*p = (*p) | (unsigned char)((huffmanTree->code[state])[0] >> (64 - lackBits)); /* NOTE(review): merges bits of word 0 again although word 1 is stored next - confirm */
+						p++;
+						//(*outSize)++;						
+						newCode = (huffmanTree->code[state])[1] << lackBits;
+						longToBytes_bigEndian(p, newCode);
+						bitSize -= lackBits;
+						byteSize = bitSize%8==0 ? bitSize/8 : bitSize/8+1;
+						byteSizep = bitSize/8;
+						p += byteSizep;
+						(*outSize)+=byteSize;
+						lackBits = bitSize%8==0 ? 0 : 8 - bitSize%8;						
+					}
+					else //lackBits >= bitSize
+					{
+						*p = (*p) | (unsigned char)((huffmanTree->code[state])[0] >> (64 - bitSize));
+						lackBits -= bitSize;
+					}		
+				}
+			}
+			else //lackBits >= bitSize
+			{
+				lackBits -= bitSize; /* the whole code fitted into the current byte */
+				if(lackBits==0)
+					p++; /* byte is now full; move on to the next one */
+			}
+		}
+	}
+//	for(i=0;i<stateNum;i++)
+//		if(code[i]!=NULL) free(code[i]);
+	/*printf("max bitsize = %d\n", maxBitSize);
+	printf("bitSize21 ratio = %f\n", ((float)bitSize21)/length);
+	printf("bitSize32 ratio = %f\n", ((float)bitSize32)/length);
+	printf("avg bit size = %f\n", ((float)totalBitSize)/length);*/
+}
+ 
+/**
+ * Decode targetLength symbols from the bit stream s by walking the tree
+ * rooted at t: a 0 bit follows the left child, a 1 bit the right child, and
+ * every leaf reached emits its symbol and restarts at the root. Bits are
+ * consumed most-significant first within each byte.
+ *
+ * If the root itself is a leaf, every output element is that one symbol and
+ * s is not read. The number of bytes read from s is not bounded here.
+ *
+ * @param out  must have room for targetLength ints
+ */
+void decode(unsigned char *s, size_t targetLength, node t, int *out)
+{
+	size_t bitPos = 0;
+	size_t produced = 0;
+	node cur = t;
+
+	if(t->t)
+	{
+		/* single-symbol tree: the data is constant */
+		while(produced < targetLength)
+		{
+			out[produced] = t->c;
+			produced++;
+		}
+		return;
+	}
+
+	while(produced < targetLength)
+	{
+		unsigned char byte = s[bitPos >> 3];
+		int bitInByte = bitPos % 8;
+
+		if(((byte >> (7 - bitInByte)) & 0x01) == 0)
+			cur = cur->left;
+		else
+			cur = cur->right;
+
+		if(cur->t)
+		{
+			out[produced] = cur->c;
+			produced++;
+			cur = t;
+		}
+		bitPos++;
+	}
+
+	if(t != cur)
+		printf("garbage input\n");
+	return;
+}
+	 
+/**
+ * Flatten the subtree under root into parallel arrays, using 8-bit child
+ * indices. Slot i receives root's symbol (C) and leaf flag (t); each existing
+ * child is given the next index from huffmanTree->n_inode, which is stored in
+ * L[i] or R[i] before the child is flattened recursively (left before right).
+ * An entry of 0 in L or R therefore means "no child".
+ */
+void pad_tree_uchar(HuffmanTree* huffmanTree, unsigned char* L, unsigned char* R, unsigned int* C, unsigned char* t, unsigned int i, node root)
+{
+	node child;
+
+	C[i] = root->c;
+	t[i] = root->t;
+
+	child = root->left;
+	if(child != 0)
+	{
+		unsigned int slot = ++huffmanTree->n_inode;
+		L[i] = slot;
+		pad_tree_uchar(huffmanTree, L, R, C, t, slot, child);
+	}
+
+	child = root->right;
+	if(child != 0)
+	{
+		unsigned int slot = ++huffmanTree->n_inode;
+		R[i] = slot;
+		pad_tree_uchar(huffmanTree, L, R, C, t, slot, child);
+	}
+}  
+
+/**
+ * Flatten the subtree under root into parallel arrays, using 16-bit child
+ * indices. Slot i receives root's symbol (C) and leaf flag (t); each existing
+ * child is given the next index from huffmanTree->n_inode, which is stored in
+ * L[i] or R[i] before the child is flattened recursively (left before right).
+ * An entry of 0 in L or R therefore means "no child".
+ */
+void pad_tree_ushort(HuffmanTree* huffmanTree, unsigned short* L, unsigned short* R, unsigned int* C, unsigned char* t, unsigned int i, node root)
+{
+	node child;
+
+	C[i] = root->c;
+	t[i] = root->t;
+
+	child = root->left;
+	if(child != 0)
+	{
+		unsigned int slot = ++huffmanTree->n_inode;
+		L[i] = slot;
+		pad_tree_ushort(huffmanTree, L, R, C, t, slot, child);
+	}
+
+	child = root->right;
+	if(child != 0)
+	{
+		unsigned int slot = ++huffmanTree->n_inode;
+		R[i] = slot;
+		pad_tree_ushort(huffmanTree, L, R, C, t, slot, child);
+	}
+}
+
+/**
+ * Flatten the subtree under root into parallel arrays, using 32-bit child
+ * indices. Slot i receives root's symbol (C) and leaf flag (t); each existing
+ * child is given the next index from huffmanTree->n_inode, which is stored in
+ * L[i] or R[i] before the child is flattened recursively (left before right).
+ * An entry of 0 in L or R therefore means "no child".
+ */
+void pad_tree_uint(HuffmanTree* huffmanTree, unsigned int* L, unsigned int* R, unsigned int* C, unsigned char* t, unsigned int i, node root)
+{
+	node child;
+
+	C[i] = root->c;
+	t[i] = root->t;
+
+	child = root->left;
+	if(child != 0)
+	{
+		unsigned int slot = ++huffmanTree->n_inode;
+		L[i] = slot;
+		pad_tree_uint(huffmanTree, L, R, C, t, slot, child);
+	}
+
+	child = root->right;
+	if(child != 0)
+	{
+		unsigned int slot = ++huffmanTree->n_inode;
+		R[i] = slot;
+		pad_tree_uint(huffmanTree, L, R, C, t, slot, child);
+	}
+}
+ 
+/**
+ * Serialise the tree rooted at huffmanTree->qq[1] into a newly allocated
+ * byte buffer.
+ *
+ * Layout: 1 byte holding sysEndianType, then the left-child indices, the
+ * right-child indices, the symbols (unsigned int each) and the leaf flags
+ * (1 byte each), nodeCount entries per array. Child indices are stored as
+ * unsigned char when nodeCount <= 256, unsigned short when nodeCount <= 65536
+ * and unsigned int otherwise.
+ *
+ * @param nodeCount  number of nodes in the tree
+ * @param out        receives the malloc'ed buffer; the caller frees it
+ * @return size of the buffer in bytes
+ */
+unsigned int convert_HuffTree_to_bytes_anyStates(HuffmanTree* huffmanTree, int nodeCount, unsigned char** out) 
+{
+	size_t count = nodeCount;
+	/* 0: 8-bit indices, 1: 16-bit indices, 2: 32-bit indices */
+	int indexKind = nodeCount<=256 ? 0 : (nodeCount<=65536 ? 1 : 2);
+	size_t indexWidth = indexKind==0 ? sizeof(unsigned char)
+	                  : (indexKind==1 ? sizeof(unsigned short) : sizeof(unsigned int));
+
+	void* L = calloc(count, indexWidth);
+	void* R = calloc(count, indexWidth);
+	unsigned int* C = (unsigned int*)calloc(count, sizeof(unsigned int));
+	unsigned char* t = (unsigned char*)calloc(count, sizeof(unsigned char));
+
+	switch(indexKind)
+	{
+	case 0:
+		pad_tree_uchar(huffmanTree, (unsigned char*)L, (unsigned char*)R, C, t, 0, huffmanTree->qq[1]);
+		break;
+	case 1:
+		pad_tree_ushort(huffmanTree, (unsigned short*)L, (unsigned short*)R, C, t, 0, huffmanTree->qq[1]);
+		break;
+	default:
+		pad_tree_uint(huffmanTree, (unsigned int*)L, (unsigned int*)R, C, t, 0, huffmanTree->qq[1]);
+		break;
+	}
+
+	unsigned int totalSize = 1 + 2*count*indexWidth + count*sizeof(unsigned int) + count*sizeof(unsigned char);
+	*out = (unsigned char*)malloc(totalSize);
+
+	unsigned char* cursor = *out;
+	*cursor = (unsigned char)sysEndianType;
+	cursor += 1;
+	memcpy(cursor, L, count*indexWidth);
+	cursor += count*indexWidth;
+	memcpy(cursor, R, count*indexWidth);
+	cursor += count*indexWidth;
+	memcpy(cursor, C, count*sizeof(unsigned int));
+	cursor += count*sizeof(unsigned int);
+	memcpy(cursor, t, count*sizeof(unsigned char));
+
+	free(L);
+	free(R);
+	free(C);
+	free(t);
+	return totalSize;
+}
+
+/**
+ * Rebuild the subtree of root from the flattened arrays (8-bit child
+ * indices). Slot i describes root. Leaves (t != 0) have nothing to rebuild;
+ * for internal nodes each non-zero child index gets a new pool node carrying
+ * that slot's symbol and leaf flag, which is attached and expanded
+ * recursively (left before right).
+ */
+void unpad_tree_uchar(HuffmanTree* huffmanTree, unsigned char* L, unsigned char* R, unsigned int* C, unsigned char *t, unsigned int i, node root)
+{
+	unsigned char childIdx;
+
+	if(root->t != 0)
+		return;
+
+	childIdx = L[i];
+	if(childIdx != 0)
+	{
+		node child = new_node2(huffmanTree, C[childIdx], t[childIdx]);
+		root->left = child;
+		unpad_tree_uchar(huffmanTree, L, R, C, t, childIdx, child);
+	}
+
+	childIdx = R[i];
+	if(childIdx != 0)
+	{
+		node child = new_node2(huffmanTree, C[childIdx], t[childIdx]);
+		root->right = child;
+		unpad_tree_uchar(huffmanTree, L, R, C, t, childIdx, child);
+	}
+}
+
+/**
+ * Rebuild the subtree of root from the flattened arrays (16-bit child
+ * indices). Slot i describes root. Leaves (t != 0) have nothing to rebuild;
+ * for internal nodes each non-zero child index gets a new pool node carrying
+ * that slot's symbol and leaf flag, which is attached and expanded
+ * recursively (left before right).
+ */
+void unpad_tree_ushort(HuffmanTree* huffmanTree, unsigned short* L, unsigned short* R, unsigned int* C, unsigned char* t, unsigned int i, node root)
+{
+	unsigned short childIdx;
+
+	if(root->t != 0)
+		return;
+
+	childIdx = L[i];
+	if(childIdx != 0)
+	{
+		node child = new_node2(huffmanTree, C[childIdx], t[childIdx]);
+		root->left = child;
+		unpad_tree_ushort(huffmanTree, L, R, C, t, childIdx, child);
+	}
+
+	childIdx = R[i];
+	if(childIdx != 0)
+	{
+		node child = new_node2(huffmanTree, C[childIdx], t[childIdx]);
+		root->right = child;
+		unpad_tree_ushort(huffmanTree, L, R, C, t, childIdx, child);
+	}
+}
+
+/**
+ * Rebuild the subtree of root from the flattened arrays (32-bit child
+ * indices). Slot i describes root. Leaves (t != 0) have nothing to rebuild;
+ * for internal nodes each non-zero child index gets a new pool node carrying
+ * that slot's symbol and leaf flag, which is attached and expanded
+ * recursively (left before right).
+ */
+void unpad_tree_uint(HuffmanTree* huffmanTree, unsigned int* L, unsigned int* R, unsigned int* C, unsigned char* t, unsigned int i, node root)
+{
+	unsigned int childIdx;
+
+	if(root->t != 0)
+		return;
+
+	childIdx = L[i];
+	if(childIdx != 0)
+	{
+		node child = new_node2(huffmanTree, C[childIdx], t[childIdx]);
+		root->left = child;
+		unpad_tree_uint(huffmanTree, L, R, C, t, childIdx, child);
+	}
+
+	childIdx = R[i];
+	if(childIdx != 0)
+	{
+		node child = new_node2(huffmanTree, C[childIdx], t[childIdx]);
+		root->right = child;
+		unpad_tree_uint(huffmanTree, L, R, C, t, childIdx, child);
+	}
+}
+
+/* Swap the two bytes of a 16-bit field in place. */
+static void huff_swap_2bytes(unsigned char *p)
+{
+	unsigned char tmp = p[0];
+	p[0] = p[1];
+	p[1] = tmp;
+}
+
+/**
+ * Rebuild a Huffman tree from the byte layout written by
+ * convert_HuffTree_to_bytes_anyStates().
+ *
+ * bytes[0] is the endianness tag of the writer; it is followed by the
+ * left-child indices, the right-child indices, the symbols (unsigned int)
+ * and the leaf flags (1 byte), nodeCount entries each. The index width is
+ * 1, 2 or 4 bytes depending on nodeCount (<= 256, <= 65536, larger).
+ *
+ * When the tag differs from sysEndianType every multi-byte field is
+ * byte-swapped at its own width. The swap is applied to private copies, so
+ * the caller's buffer is left untouched and can be decoded again.
+ *
+ * @param bytes      serialised tree, starting at the endianness tag
+ * @param nodeCount  number of nodes that were serialised
+ * @return root of the rebuilt tree; nodes come from huffmanTree's pool
+ */
+node reconstruct_HuffTree_from_bytes_anyStates(HuffmanTree *huffmanTree, unsigned char* bytes, int nodeCount)
+{
+	//printf("nodeCount=%d\n", nodeCount);
+	size_t n = (size_t)nodeCount, k;
+	int foreignEndian = (bytes[0] != (unsigned char)sysEndianType);
+	unsigned char* src = bytes+1;
+	unsigned int* C = (unsigned int*)malloc(n*sizeof(unsigned int));
+	unsigned char* t = (unsigned char*)malloc(n*sizeof(unsigned char));
+	node root;
+
+	if(nodeCount<=256)
+	{
+		unsigned char* L = (unsigned char*)malloc(n*sizeof(unsigned char));
+		unsigned char* R = (unsigned char*)malloc(n*sizeof(unsigned char));
+
+		memcpy(L, src, n*sizeof(unsigned char));
+		src += n*sizeof(unsigned char);
+		memcpy(R, src, n*sizeof(unsigned char));
+		src += n*sizeof(unsigned char);
+		memcpy(C, src, n*sizeof(unsigned int));
+		src += n*sizeof(unsigned int);
+		memcpy(t, src, n*sizeof(unsigned char));
+
+		/* single-byte indices need no swap; only the symbols do */
+		if(foreignEndian)
+			for(k=0;k<n;k++)
+				symTransform_4bytes((unsigned char*)&C[k]);
+
+		/* with this few nodes the root may itself be a leaf */
+		root = new_node2(huffmanTree, C[0],t[0]);
+		unpad_tree_uchar(huffmanTree,L,R,C,t,0,root);
+		free(L);
+		free(R);
+	}
+	else if(nodeCount<=65536)
+	{
+		unsigned short* L = (unsigned short*)malloc(n*sizeof(unsigned short));
+		unsigned short* R = (unsigned short*)malloc(n*sizeof(unsigned short));
+
+		memcpy(L, src, n*sizeof(unsigned short));
+		src += n*sizeof(unsigned short);
+		memcpy(R, src, n*sizeof(unsigned short));
+		src += n*sizeof(unsigned short);
+		memcpy(C, src, n*sizeof(unsigned int));
+		src += n*sizeof(unsigned int);
+		memcpy(t, src, n*sizeof(unsigned char));
+
+		/* 16-bit indices are swapped as 2-byte fields, symbols as 4-byte
+		 * fields; treating the whole region as 4-byte words would scramble
+		 * the indices and run past the end of the symbol array */
+		if(foreignEndian)
+		{
+			for(k=0;k<n;k++)
+			{
+				huff_swap_2bytes((unsigned char*)&L[k]);
+				huff_swap_2bytes((unsigned char*)&R[k]);
+				symTransform_4bytes((unsigned char*)&C[k]);
+			}
+		}
+
+		root = new_node2(huffmanTree,0,0);
+		unpad_tree_ushort(huffmanTree,L,R,C,t,0,root);
+		free(L);
+		free(R);
+	}
+	else //nodeCount>65536
+	{
+		unsigned int* L = (unsigned int*)malloc(n*sizeof(unsigned int));
+		unsigned int* R = (unsigned int*)malloc(n*sizeof(unsigned int));
+
+		memcpy(L, src, n*sizeof(unsigned int));
+		src += n*sizeof(unsigned int);
+		memcpy(R, src, n*sizeof(unsigned int));
+		src += n*sizeof(unsigned int);
+		memcpy(C, src, n*sizeof(unsigned int));
+		src += n*sizeof(unsigned int);
+		memcpy(t, src, n*sizeof(unsigned char));
+
+		if(foreignEndian)
+		{
+			for(k=0;k<n;k++)
+			{
+				symTransform_4bytes((unsigned char*)&L[k]);
+				symTransform_4bytes((unsigned char*)&R[k]);
+				symTransform_4bytes((unsigned char*)&C[k]);
+			}
+		}
+
+		root = new_node2(huffmanTree,0,0);
+		unpad_tree_uint(huffmanTree,L,R,C,t,0,root);
+		free(L);
+		free(R);
+	}
+
+	free(C);
+	free(t);
+	return root;
+}
+
+/**
+ * Huffman-encode s and return the tree together with the encoded data.
+ *
+ * Output layout: 4 bytes holding the node count (big-endian), the serialised
+ * tree from convert_HuffTree_to_bytes_anyStates(), then the encoded bit
+ * stream.
+ *
+ * @param s        symbols to encode
+ * @param length   number of symbols; with 0 symbols *out is set to NULL and
+ *                 *outSize to 0
+ * @param out      receives a malloc'ed buffer; the caller frees it
+ * @param outSize  receives the number of meaningful bytes in *out
+ */
+void encode_withTree(HuffmanTree* huffmanTree, int *s, size_t length, unsigned char **out, size_t *outSize)
+{
+	size_t i, nodeCount = 0;
+	unsigned char *treeBytes, buffer[4];
+	/* encode() stores whole code words at the write position (assumed to be
+	 * 8 bytes per word, at most two words), so it can touch bytes past the
+	 * final encoded size; reserve room for that */
+	const size_t wordSlack = 2*sizeof(unsigned long);
+	
+	init(huffmanTree, s, length);
+	for (i = 0; i < huffmanTree->stateNum; i++)
+		if (huffmanTree->code[i]) nodeCount++;
+	if (nodeCount == 0)
+	{
+		/* no symbols: 2*0-1 would wrap around, and there is nothing to encode */
+		*out = NULL;
+		*outSize = 0;
+		return;
+	}
+	/* a full binary tree with k leaves has 2k-1 nodes */
+	nodeCount = nodeCount*2-1;
+	unsigned int treeByteSize = convert_HuffTree_to_bytes_anyStates(huffmanTree,nodeCount, &treeBytes);
+	//printf("treeByteSize=%d\n", treeByteSize);
+	/* 4-byte header + tree + payload estimate + slack for the word stores */
+	*out = (unsigned char*)malloc(4+treeByteSize+length*sizeof(int)+wordSlack);
+	intToBytes_bigEndian(buffer, nodeCount);
+	memcpy(*out, buffer, 4);
+	memcpy(*out+4, treeBytes, treeByteSize);
+	free(treeBytes);
+	size_t enCodeSize = 0;
+	encode(huffmanTree, s, length, *out+4+treeByteSize, &enCodeSize);
+	*outSize = 4+treeByteSize+enCodeSize;
+}
+
+/**
+ * Decode a buffer laid out as: 4-byte big-endian node count, serialised
+ * Huffman tree, encoded bit stream.
+ *
+ * @param s             start of the buffer (the node-count header)
+ * @param targetLength  number of symbols to decode
+ * @param out           must already have room for targetLength ints
+ */
+void decode_withTree(HuffmanTree* huffmanTree, unsigned char *s, size_t targetLength, int *out)
+{
+	size_t nodeCount = bytesToInt_bigEndian(s);
+	size_t indexWidth;
+	node root = reconstruct_HuffTree_from_bytes_anyStates(huffmanTree,s+4, nodeCount);
+
+	/* width of one stored child index depends on how many nodes there are */
+	if(nodeCount<=256)
+		indexWidth = sizeof(unsigned char);
+	else if(nodeCount<=65536)
+		indexWidth = sizeof(unsigned short);
+	else
+		indexWidth = sizeof(unsigned int);
+
+	/* endianness tag + per node: two child indices, one symbol, one leaf flag */
+	size_t treeByteSize = 1 + nodeCount*(2*indexWidth + sizeof(unsigned int) + sizeof(unsigned char));
+
+	decode(s+4+treeByteSize, targetLength, root, out);
+}
+
+/**
+ * Free a Huffman tree: the node pool, the queue array, every per-symbol code
+ * buffer, the code and code-length tables, and finally the tree itself.
+ * The caller's pointer is dangling afterwards.
+ */
+void SZ_ReleaseHuffman(HuffmanTree* huffmanTree)
+{
+	size_t sym = 0;
+
+	free(huffmanTree->pool);
+	free(huffmanTree->qqq);
+
+	/* unused symbols hold NULL, which free() accepts */
+	while(sym < huffmanTree->stateNum)
+	{
+		free(huffmanTree->code[sym]);
+		sym++;
+	}
+	free(huffmanTree->code);
+	free(huffmanTree->cout);
+
+	free(huffmanTree);
+}
diff --git a/thirdparty/SZ/sz/src/TightDataPointStorageD.c b/thirdparty/SZ/sz/src/TightDataPointStorageD.c
new file mode 100644
index 0000000000000000000000000000000000000000..6ece9dbb56dfaa538d9796b9ab16f998d72bc474
--- /dev/null
+++ b/thirdparty/SZ/sz/src/TightDataPointStorageD.c
@@ -0,0 +1,743 @@
+/**
+ *  @file TightDataPointStorageD.c
+ *  @author Sheng Di and Dingwen Tao
+ *  @date Aug, 2016
+ *  @brief The functions used to construct the tightPointDataStorage element for storing compressed bytes.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdlib.h> 
+#include <stdio.h>
+#include <string.h>
+#include "TightDataPointStorageD.h"
+#include "sz.h"
+#include "Huffman.h"
+//#include "rw.h"
+
+/* Allocate *this and reset the counters, buffer pointers and buffer sizes listed below. */
+void new_TightDataPointStorageD_Empty(TightDataPointStorageD **this)
+{
+	TightDataPointStorageD *tdps = (TightDataPointStorageD*)malloc(sizeof(TightDataPointStorageD));
+	*this = tdps;
+	//scalar bookkeeping
+	tdps->dataSeriesLength = 0;
+	tdps->allSameData = 0;
+	tdps->exactDataNum = 0;
+	tdps->reservedValue = 0;
+	tdps->reqLength = 0;
+	tdps->radExpo = 0;
+	tdps->intervals = 0;
+	tdps->isLossless = 0;
+	tdps->segment_size = 0;
+
+	//byte buffers: nothing attached yet
+	tdps->rtypeArray = NULL;
+	tdps->typeArray = NULL;
+	tdps->leadNumArray = NULL;
+	tdps->exactMidBytes = NULL;
+	tdps->residualMidBits = NULL;
+	tdps->pwrErrBoundBytes = NULL;
+
+	//...and their lengths
+	tdps->rtypeArray_size = 0;
+	tdps->typeArray_size = 0;
+	tdps->leadNumArray_size = 0;
+	tdps->exactMidBytes_size = 0;
+	tdps->residualMidBits_size = 0;
+	tdps->pwrErrBoundBytes_size = 0;
+}
+
+/**
+ * Parse a serialized stream (flatBytes) into a newly allocated TightDataPointStorageD.
+ * In the general path typeArray, pwrErrBoundBytes, leadNumArray, exactMidBytes and
+ * residualMidBits are not copied: they point into flatBytes. rtypeArray and the
+ * all-same-data exactMidBytes are malloc'ed copies. Replaces the global confparams_dec.
+ * @return the error-bound mode derived from the flag byte (ABS or PW_REL)
+ */
+int new_TightDataPointStorageD_fromFlatBytes(TightDataPointStorageD **this, unsigned char* flatBytes, size_t flatBytesLength)
+{
+	new_TightDataPointStorageD_Empty(this);
+	size_t i, index = 0;
+	size_t pwrErrBoundBytes_size = 0, segmentL = 0, radExpoL = 0, pwrErrBoundBytesL = 0;
+	char version[3];
+	for (i = 0; i < 3; i++)
+		version[i] = flatBytes[index++]; //3
+	unsigned char sameRByte = flatBytes[index++]; //1
+	if(checkVersion(version)!=1)
+	{
+		//wrong version
+		printf("Wrong version: \nCompressed-data version (%d.%d.%d)\n",version[0], version[1], version[2]);
+		printf("Current sz version: (%d.%d.%d)\n", versionNumber[0], versionNumber[1], versionNumber[2]);
+		printf("Please double-check if the compressed data (or file) is correct.\n");
+		exit(0); //NOTE(review): exits with status 0 although this is an error path
+	}
+
+	int same = sameRByte & 0x01;
+	//flag byte as read here: 0x01 all-same data, 0x08 reserved value, 0x10 lossless, 0x20 PW_REL, 0x40 8-byte sizes
+	(*this)->isLossless = (sameRByte & 0x10)>>4;
+	int isPW_REL = (sameRByte & 0x20)>>5;
+	exe_params->SZ_SIZE_TYPE = ((sameRByte & 0x40)>>6)==1?8:4;
+
+	int errorBoundMode = ABS;
+	if(isPW_REL)
+	{
+		errorBoundMode = PW_REL;
+		segmentL = exe_params->SZ_SIZE_TYPE;
+		pwrErrBoundBytesL = 4;
+	}
+
+	sz_params* params = convertBytesToSZParams(&(flatBytes[index]));
+	//keep the previous szMode/predictionMode; if there are no previous params, fall back to the parsed ones
+	int mode = confparams_dec!=NULL ? confparams_dec->szMode : params->szMode;
+	int predictionMode = confparams_dec!=NULL ? confparams_dec->predictionMode : params->predictionMode;
+	free(confparams_dec); //free(NULL) is a no-op
+	confparams_dec = params;
+	confparams_dec->szMode = mode;
+	if(mode==SZ_TEMPORAL_COMPRESSION)
+	{
+		confparams_dec->szMode = SZ_TEMPORAL_COMPRESSION;
+		confparams_dec->predictionMode = predictionMode;
+	}
+	index += MetaDataByteLength;
+
+	unsigned char dsLengthBytes[8];
+	for (i = 0; i < exe_params->SZ_SIZE_TYPE; i++)
+		dsLengthBytes[i] = flatBytes[index++];
+	(*this)->dataSeriesLength = bytesToSize(dsLengthBytes);
+
+	//three layouts follow: lossless (header only), all-same data (one double), or the general stream
+
+	if((*this)->isLossless==1)
+	{
+		//nothing after the data-series length is parsed for lossless streams
+		return errorBoundMode;
+	}
+	else if(same==1)
+	{
+		(*this)->allSameData = 1;
+		size_t exactMidBytesLength = sizeof(double);//flatBytesLength - 3 - 1 - MetaDataByteLength -exe_params->SZ_SIZE_TYPE;
+		if(exactMidBytesLength>0)
+			(*this)->exactMidBytes = (unsigned char*)malloc(sizeof(unsigned char)*exactMidBytesLength);
+		else
+			(*this)->exactMidBytes = NULL;
+		for(i = 0;i<exactMidBytesLength;i++)
+			(*this)->exactMidBytes[i] = flatBytes[index++];
+		return errorBoundMode;
+	}
+	else
+		(*this)->allSameData = 0;
+
+	int rtype_ = sameRByte & 0x08; //1000
+
+	unsigned char byteBuf[8];
+
+	for (i = 0; i < 4; i++)
+		byteBuf[i] = flatBytes[index++];
+	int max_quant_intervals = bytesToInt_bigEndian(byteBuf);// 4
+
+	confparams_dec->maxRangeRadius = max_quant_intervals/2;
+
+	if(errorBoundMode>=PW_REL)
+	{
+		(*this)->radExpo = flatBytes[index++];//1
+		radExpoL = 1;
+		for (i = 0; i < exe_params->SZ_SIZE_TYPE; i++)
+			byteBuf[i] = flatBytes[index++];
+		params->segment_size = (*this)->segment_size = bytesToSize(byteBuf);// exe_params->SZ_SIZE_TYPE
+
+		for (i = 0; i < 4; i++)
+			byteBuf[i] = flatBytes[index++];
+		pwrErrBoundBytes_size = (*this)->pwrErrBoundBytes_size = bytesToInt_bigEndian(byteBuf);// 4
+	}
+	else
+	{
+		pwrErrBoundBytes_size = 0;
+		(*this)->pwrErrBoundBytes = NULL;
+	}
+
+	for (i = 0; i < 4; i++)
+		byteBuf[i] = flatBytes[index++];
+	(*this)->intervals = bytesToInt_bigEndian(byteBuf);// 4
+
+	for (i = 0; i < 8; i++)
+		byteBuf[i] = flatBytes[index++];
+	(*this)->medianValue = bytesToDouble(byteBuf);//8
+
+	(*this)->reqLength = flatBytes[index++]; //1
+
+	for (i = 0; i < 8; i++)
+		byteBuf[i] = flatBytes[index++];
+	(*this)->realPrecision = bytesToDouble(byteBuf);//8
+
+	for (i = 0; i < exe_params->SZ_SIZE_TYPE; i++)
+		byteBuf[i] = flatBytes[index++];
+	(*this)->typeArray_size = bytesToSize(byteBuf);// exe_params->SZ_SIZE_TYPE
+
+	if(rtype_!=0)
+	{
+		for(i = 0;i<exe_params->SZ_SIZE_TYPE;i++) 
+			byteBuf[i] = flatBytes[index++];
+		(*this)->rtypeArray_size = bytesToSize(byteBuf);//ST
+	}
+	else
+		(*this)->rtypeArray_size = 0;
+
+	for (i = 0; i < exe_params->SZ_SIZE_TYPE; i++)
+		byteBuf[i] = flatBytes[index++];
+	(*this)->exactDataNum = bytesToSize(byteBuf);// ST
+
+	for (i = 0; i < exe_params->SZ_SIZE_TYPE; i++)
+		byteBuf[i] = flatBytes[index++];
+	(*this)->exactMidBytes_size = bytesToSize(byteBuf);// ST
+
+	if (rtype_ != 0) {
+		if((*this)->rtypeArray_size>0)
+			(*this)->rtypeArray = (unsigned char*)malloc(sizeof(unsigned char)*(*this)->rtypeArray_size);
+		else
+			(*this)->rtypeArray = NULL;
+
+		for (i = 0; i < 8; i++)
+			byteBuf[i] = flatBytes[index++];
+		(*this)->reservedValue = bytesToDouble(byteBuf);//8
+	}
+
+	size_t logicLeadNumBitsNum = (*this)->exactDataNum * 2; //2 bits per exact value, rounded up to whole bytes below
+	(*this)->leadNumArray_size = (logicLeadNumBitsNum >> 3) + (logicLeadNumBitsNum % 8 == 0 ? 0 : 1);
+
+	if ((*this)->rtypeArray != NULL) 
+	{
+		(*this)->residualMidBits_size = flatBytesLength - 3 - 1 - MetaDataByteLength - exe_params->SZ_SIZE_TYPE - 4 - radExpoL - segmentL - pwrErrBoundBytesL - 4 - 8 - 1 - 8 
+				- exe_params->SZ_SIZE_TYPE - exe_params->SZ_SIZE_TYPE - exe_params->SZ_SIZE_TYPE - exe_params->SZ_SIZE_TYPE - 8 - (*this)->rtypeArray_size 
+				- (*this)->typeArray_size - (*this)->leadNumArray_size
+				- (*this)->exactMidBytes_size - pwrErrBoundBytes_size;
+		for (i = 0; i < (*this)->rtypeArray_size; i++)
+			(*this)->rtypeArray[i] = flatBytes[index++];
+	}
+	else
+	{
+		(*this)->residualMidBits_size = flatBytesLength - 3 - 1 - MetaDataByteLength - exe_params->SZ_SIZE_TYPE - 4 - radExpoL - segmentL - pwrErrBoundBytesL - 4 - 8 - 1 - 8
+				- exe_params->SZ_SIZE_TYPE - exe_params->SZ_SIZE_TYPE - exe_params->SZ_SIZE_TYPE - (*this)->typeArray_size
+				- (*this)->leadNumArray_size - (*this)->exactMidBytes_size - pwrErrBoundBytes_size;
+	}
+
+	(*this)->typeArray = &flatBytes[index];
+	//retrieve the number of states (i.e., stateNum)
+	(*this)->allNodes = bytesToInt_bigEndian((*this)->typeArray); //the first 4 bytes store the stateNum
+	(*this)->stateNum = ((*this)->allNodes+1)/2;
+
+	index+=(*this)->typeArray_size;
+
+	(*this)->pwrErrBoundBytes = &flatBytes[index];
+
+	index+=pwrErrBoundBytes_size;
+
+	(*this)->leadNumArray = &flatBytes[index];
+
+	index+=(*this)->leadNumArray_size;
+
+	(*this)->exactMidBytes = &flatBytes[index];
+
+	index+=(*this)->exactMidBytes_size;
+
+	(*this)->residualMidBits = &flatBytes[index];
+
+	//index+=(*this)->residualMidBits_size;
+
+	return errorBoundMode;
+}
+
+/**
+ * Build a TightDataPointStorageD from the compressor's intermediate arrays.
+ *
+ * type's length == dataSeriesLength
+ * exactMidBytes's length == exactMidBytes_size
+ * leadNumIntArray's length == exactDataNum
+ * exactMidBytes and pwrErrBoundBytes are kept by pointer, not copied.
+ * resiMidBits_size is accepted but not used.
+ * */
+void new_TightDataPointStorageD(TightDataPointStorageD **this, 
+		size_t dataSeriesLength, size_t exactDataNum, 
+		int* type, unsigned char* exactMidBytes, size_t exactMidBytes_size,
+		unsigned char* leadNumIntArray,  //leadNumIntArray contains readable numbers....
+		unsigned char* resiMidBits, size_t resiMidBits_size,
+		unsigned char resiBitLength, 
+		double realPrecision, double medianValue, char reqLength, unsigned int intervals,
+		unsigned char* pwrErrBoundBytes, size_t pwrErrBoundBytes_size, unsigned char radExpo) {
+	TightDataPointStorageD *tdps = (TightDataPointStorageD *)malloc(sizeof(TightDataPointStorageD));
+	*this = tdps;
+
+	//scalar fields copied straight from the arguments
+	tdps->dataSeriesLength = dataSeriesLength;
+	tdps->exactDataNum = exactDataNum;
+	tdps->realPrecision = realPrecision;
+	tdps->medianValue = medianValue;
+	tdps->reqLength = reqLength;
+	tdps->intervals = intervals;
+	tdps->radExpo = radExpo;
+
+	//fixed values: not all-same data, not lossless, no rtype array
+	tdps->allSameData = 0;
+	tdps->isLossless = 0;
+	tdps->rtypeArray = NULL;
+	tdps->rtypeArray_size = 0;
+
+	//encode the type array with a temporary Huffman tree of 2*intervals states
+	int huffStates = 2*intervals;
+	HuffmanTree* tree = createHuffmanTree(huffStates);
+	encode_withTree(tree, type, dataSeriesLength, &tdps->typeArray, &tdps->typeArray_size);
+	SZ_ReleaseHuffman(tree);
+
+	//exact-data buffers: mid bytes by reference, lead numbers and residual bits via the convert helpers
+	tdps->exactMidBytes = exactMidBytes;
+	tdps->exactMidBytes_size = exactMidBytes_size;
+	tdps->leadNumArray_size = convertIntArray2ByteArray_fast_2b(leadNumIntArray, exactDataNum, &tdps->leadNumArray);
+	tdps->residualMidBits_size = convertIntArray2ByteArray_fast_dynamic(resiMidBits, resiBitLength, exactDataNum, &tdps->residualMidBits);
+
+	//the pwrErrBoundBytes pointer is only attached when errorBoundMode>=PW_REL
+	if(confparams_cpr->errorBoundMode<PW_REL)
+		tdps->pwrErrBoundBytes = NULL;
+	else
+		tdps->pwrErrBoundBytes = pwrErrBoundBytes;
+	tdps->pwrErrBoundBytes_size = pwrErrBoundBytes_size;
+}
+
+/**
+ * Variant of new_TightDataPointStorageD in which resiBitLength is an array of
+ * resiBitLengthSize entries instead of a single value.
+ * */
+void new_TightDataPointStorageD2(TightDataPointStorageD **this, 
+		size_t dataSeriesLength, size_t exactDataNum, 
+		int* type, unsigned char* exactMidBytes, size_t exactMidBytes_size,
+		unsigned char* leadNumIntArray,  //leadNumIntArray contains readable numbers....
+		unsigned char* resiMidBits, size_t resiMidBits_size,
+		unsigned char* resiBitLength, size_t resiBitLengthSize,
+		double realPrecision, double medianValue, char reqLength, unsigned int intervals,
+		unsigned char* pwrErrBoundBytes, size_t pwrErrBoundBytes_size, unsigned char radExpo) {
+	TightDataPointStorageD *tdps = (TightDataPointStorageD *)malloc(sizeof(TightDataPointStorageD));
+	*this = tdps;
+
+	//scalar fields copied straight from the arguments
+	tdps->dataSeriesLength = dataSeriesLength;
+	tdps->exactDataNum = exactDataNum;
+	tdps->realPrecision = realPrecision;
+	tdps->medianValue = medianValue;
+	tdps->reqLength = reqLength;
+	tdps->intervals = intervals;
+	tdps->radExpo = radExpo;
+
+	//fixed values: not all-same data, not lossless, no rtype array
+	tdps->allSameData = 0;
+	tdps->isLossless = 0;
+	tdps->rtypeArray = NULL;
+	tdps->rtypeArray_size = 0;
+
+	//encode the type array with a temporary Huffman tree of 2*intervals states
+	int huffStates = 2*intervals;
+	HuffmanTree* tree = createHuffmanTree(huffStates);
+	encode_withTree(tree, type, dataSeriesLength, &tdps->typeArray, &tdps->typeArray_size);
+	SZ_ReleaseHuffman(tree);
+
+	//exact-data buffers (resiMidBits_size is accepted but not used)
+	tdps->exactMidBytes = exactMidBytes;
+	tdps->exactMidBytes_size = exactMidBytes_size;
+	tdps->leadNumArray_size = convertIntArray2ByteArray_fast_2b(leadNumIntArray, exactDataNum, &tdps->leadNumArray);
+	tdps->residualMidBits_size = convertIntArray2ByteArray_fast_dynamic2(resiMidBits, resiBitLength, resiBitLengthSize, &tdps->residualMidBits);
+
+	//the pwrErrBoundBytes pointer is only attached when errorBoundMode>=PW_REL
+	if(confparams_cpr->errorBoundMode<PW_REL)
+		tdps->pwrErrBoundBytes = NULL;
+	else
+		tdps->pwrErrBoundBytes = pwrErrBoundBytes;
+	tdps->pwrErrBoundBytes_size = pwrErrBoundBytes_size;
+}
+
+/**
+ * Serialize tdps into bytes for the case without a reserved-value section.
+ * The caller provides bytes already sized for the whole stream. Fields are
+ * written in this order:
+ *   version(3) | sameByte(1) | SZ params(MetaDataByteLength) | dataSeriesLength(ST)
+ *   | max_quant_intervals(4) | [radExpo(1) | segment_size(ST) | pwrErrBoundBytes_size(4)]
+ *   | intervals(4) | medianValue(8) | reqLength(1) | realPrecision(8)
+ *   | typeArray_size(ST) | exactDataNum(ST) | exactMidBytes_size(ST)
+ *   | typeArray | [pwrErrBoundBytes] | leadNumArray | exactMidBytes | residualMidBits
+ * ST is exe_params->SZ_SIZE_TYPE; bracketed parts only exist when errorBoundMode>=PW_REL.
+ * Nothing is allocated or freed here.
+ */
+void convertTDPStoBytes_double(TightDataPointStorageD* tdps, unsigned char* bytes, unsigned char* dsLengthBytes, unsigned char sameByte)
+{
+	unsigned char *dst = bytes; //write cursor
+	unsigned char field[8]; //scratch for one fixed-width field at a time
+	size_t v;
+
+	//header: version and flag byte
+	for(v = 0;v<3;v++)//3 bytes
+		*dst++ = versionNumber[v];
+	*dst++ = sameByte; //1 byte
+
+	convertSZParamsToBytes(confparams_cpr, dst);
+	dst += MetaDataByteLength;
+
+	memcpy(dst, dsLengthBytes, exe_params->SZ_SIZE_TYPE);//ST: 4 or 8 bytes
+	dst += exe_params->SZ_SIZE_TYPE;
+
+	intToBytes_bigEndian(field, confparams_cpr->max_quant_intervals);
+	memcpy(dst, field, 4);//4
+	dst += 4;
+
+	if(confparams_cpr->errorBoundMode>=PW_REL)
+	{
+		*dst++ = tdps->radExpo; //1 byte
+
+		sizeToBytes(field, confparams_cpr->segment_size);
+		memcpy(dst, field, exe_params->SZ_SIZE_TYPE);//ST
+		dst += exe_params->SZ_SIZE_TYPE;
+
+		intToBytes_bigEndian(field, tdps->pwrErrBoundBytes_size);
+		memcpy(dst, field, 4);//4
+		dst += 4;
+	}
+
+	intToBytes_bigEndian(field, tdps->intervals);
+	memcpy(dst, field, 4);//4
+	dst += 4;
+
+	doubleToBytes(field, tdps->medianValue);
+	memcpy(dst, field, 8);//8
+	dst += 8;
+
+	*dst++ = tdps->reqLength; //1 byte
+
+	doubleToBytes(field, tdps->realPrecision);
+	memcpy(dst, field, 8);//8
+	dst += 8;
+
+	sizeToBytes(field, tdps->typeArray_size);
+	memcpy(dst, field, exe_params->SZ_SIZE_TYPE);//ST
+	dst += exe_params->SZ_SIZE_TYPE;
+
+	sizeToBytes(field, tdps->exactDataNum);
+	memcpy(dst, field, exe_params->SZ_SIZE_TYPE);//ST
+	dst += exe_params->SZ_SIZE_TYPE;
+
+	sizeToBytes(field, tdps->exactMidBytes_size);
+	memcpy(dst, field, exe_params->SZ_SIZE_TYPE);//ST
+	dst += exe_params->SZ_SIZE_TYPE;
+
+	//variable-length payloads
+	memcpy(dst, tdps->typeArray, tdps->typeArray_size);
+	dst += tdps->typeArray_size;
+	if(confparams_cpr->errorBoundMode>=PW_REL)
+	{
+		memcpy(dst, tdps->pwrErrBoundBytes, tdps->pwrErrBoundBytes_size);
+		dst += tdps->pwrErrBoundBytes_size;
+	}
+	memcpy(dst, tdps->leadNumArray, tdps->leadNumArray_size);
+	dst += tdps->leadNumArray_size;
+	memcpy(dst, tdps->exactMidBytes, tdps->exactMidBytes_size);
+	dst += tdps->exactMidBytes_size;
+
+	//residual bits are written last and only when present
+	if(tdps->residualMidBits!=NULL)
+		memcpy(dst, tdps->residualMidBits, tdps->residualMidBits_size);
+}
+
+/**
+ * Serialize tdps into bytes for the case with a reserved-value section.
+ * The caller provides bytes already sized for the whole stream. Fields are
+ * written in this order:
+ *   version(3) | sameByte(1) | SZ params(MetaDataByteLength) | dataSeriesLength(ST)
+ *   | max_quant_intervals(4) | [radExpo(1) | segment_size(ST) | pwrErrBoundBytes_size(4)]
+ *   | intervals(4) | medianValue(8) | reqLength(1) | realPrecision(8)
+ *   | typeArray_size(ST) | rtypeArray_size(ST) | exactDataNum(ST) | exactMidBytes_size(ST)
+ *   | reservedValue(8) | rtypeArray | typeArray | [pwrErrBoundBytes] | leadNumArray
+ *   | exactMidBytes | residualMidBits
+ * ST is exe_params->SZ_SIZE_TYPE; bracketed parts only exist when errorBoundMode>=PW_REL.
+ */
+void convertTDPStoBytes_double_reserve(TightDataPointStorageD* tdps, unsigned char* bytes, unsigned char* dsLengthBytes, unsigned char sameByte)
+{
+	unsigned char *dst = bytes; //write cursor
+	unsigned char field[8]; //scratch for one fixed-width field at a time
+	size_t v;
+
+	for(v = 0;v<3;v++)//3
+		*dst++ = versionNumber[v];
+	*dst++ = sameByte; //1
+
+	convertSZParamsToBytes(confparams_cpr, dst);
+	dst += MetaDataByteLength;
+
+	memcpy(dst, dsLengthBytes, exe_params->SZ_SIZE_TYPE);//ST
+	dst += exe_params->SZ_SIZE_TYPE;
+
+	intToBytes_bigEndian(field, confparams_cpr->max_quant_intervals);
+	memcpy(dst, field, 4);//4
+	dst += 4;
+
+	if(confparams_cpr->errorBoundMode>=PW_REL)
+	{
+		*dst++ = tdps->radExpo; //1 byte
+
+		sizeToBytes(field, confparams_cpr->segment_size);
+		memcpy(dst, field, exe_params->SZ_SIZE_TYPE);//ST
+		dst += exe_params->SZ_SIZE_TYPE;
+
+		intToBytes_bigEndian(field, tdps->pwrErrBoundBytes_size);
+		memcpy(dst, field, 4);//4
+		dst += 4;
+	}
+
+	intToBytes_bigEndian(field, tdps->intervals);
+	memcpy(dst, field, 4);//4
+	dst += 4;
+
+	doubleToBytes(field, tdps->medianValue);
+	memcpy(dst, field, 8);//8
+	dst += 8;
+
+	*dst++ = tdps->reqLength; //1 byte
+
+	doubleToBytes(field, tdps->realPrecision);
+	memcpy(dst, field, 8);//8
+	dst += 8;
+
+	sizeToBytes(field, tdps->typeArray_size);
+	memcpy(dst, field, exe_params->SZ_SIZE_TYPE);//ST
+	dst += exe_params->SZ_SIZE_TYPE;
+
+	sizeToBytes(field, tdps->rtypeArray_size);
+	memcpy(dst, field, exe_params->SZ_SIZE_TYPE);//ST
+	dst += exe_params->SZ_SIZE_TYPE;
+
+	sizeToBytes(field, tdps->exactDataNum);
+	memcpy(dst, field, exe_params->SZ_SIZE_TYPE);//ST
+	dst += exe_params->SZ_SIZE_TYPE;
+
+	sizeToBytes(field, tdps->exactMidBytes_size);
+	memcpy(dst, field, exe_params->SZ_SIZE_TYPE);//ST
+	dst += exe_params->SZ_SIZE_TYPE;
+
+	doubleToBytes(field, tdps->reservedValue);
+	memcpy(dst, field, 8);//8
+	dst += 8;
+
+	//variable-length payloads
+	memcpy(dst, tdps->rtypeArray, tdps->rtypeArray_size);
+	dst += tdps->rtypeArray_size;
+	memcpy(dst, tdps->typeArray, tdps->typeArray_size);
+	dst += tdps->typeArray_size;
+	if(confparams_cpr->errorBoundMode>=PW_REL)
+	{
+		memcpy(dst, tdps->pwrErrBoundBytes, tdps->pwrErrBoundBytes_size);
+		dst += tdps->pwrErrBoundBytes_size;
+	}
+	memcpy(dst, tdps->leadNumArray, tdps->leadNumArray_size);
+	dst += tdps->leadNumArray_size;
+	memcpy(dst, tdps->exactMidBytes, tdps->exactMidBytes_size);
+	dst += tdps->exactMidBytes_size;
+	if(tdps->residualMidBits!=NULL)
+		memcpy(dst, tdps->residualMidBits, tdps->residualMidBits_size);
+}
+
+/**
+ * Convert TightDataPointStorageD to a newly malloc'ed byte stream.
+ * @param bytes out: the allocated buffer; this function does not free it
+ * @param size out: number of bytes in *bytes
+ */
+void convertTDPStoFlatBytes_double(TightDataPointStorageD *tdps, unsigned char** bytes, size_t *size) 
+{
+	size_t i, k = 0; 
+	unsigned char dsLengthBytes[8];
+
+	if(exe_params->SZ_SIZE_TYPE==4)
+		intToBytes_bigEndian(dsLengthBytes, tdps->dataSeriesLength);//4
+	else
+		longToBytes_bigEndian(dsLengthBytes, tdps->dataSeriesLength);//8
+
+	unsigned char sameByte = tdps->allSameData==1?(unsigned char)1:(unsigned char)0;
+	sameByte = sameByte | (confparams_cpr->szMode << 1);
+	if(tdps->isLossless)
+		sameByte = (unsigned char) (sameByte | 0x10);
+	if(confparams_cpr->errorBoundMode>=PW_REL)
+		sameByte = (unsigned char) (sameByte | 0x20); // 00100000, the 5th bit
+	if(exe_params->SZ_SIZE_TYPE==8)
+		sameByte = (unsigned char) (sameByte | 0x40); // 01000000, the 6th bit
+
+	if(tdps->allSameData==1)
+	{
+		size_t totalByteLength = 3 + 1 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + tdps->exactMidBytes_size;
+		*bytes = (unsigned char *)malloc(sizeof(unsigned char)*totalByteLength);
+		for (i = 0; i < 3; i++)//3
+			(*bytes)[k++] = versionNumber[i];
+		(*bytes)[k++] = sameByte;
+		convertSZParamsToBytes(confparams_cpr, &((*bytes)[k]));
+		k = k + MetaDataByteLength;
+		for (i = 0; i < exe_params->SZ_SIZE_TYPE; i++)
+			(*bytes)[k++] = dsLengthBytes[i];
+		for (i = 0; i < tdps->exactMidBytes_size; i++)
+			(*bytes)[k++] = tdps->exactMidBytes[i];
+
+		*size = totalByteLength;
+	}
+	else if (tdps->rtypeArray == NULL) 
+	{
+		size_t residualMidBitsLength = tdps->residualMidBits == NULL ? 0 : tdps->residualMidBits_size;
+		size_t segmentL = 0, radExpoL = 0, pwrBoundArrayL = 0;
+		if(confparams_cpr->errorBoundMode>=PW_REL)
+		{
+			segmentL = exe_params->SZ_SIZE_TYPE;
+			radExpoL = 1;
+			pwrBoundArrayL = 4;
+		}
+
+		size_t totalByteLength = 3 + 1 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + 4 + radExpoL + segmentL + pwrBoundArrayL + 4 + 8 + 1 + 8 
+				+ exe_params->SZ_SIZE_TYPE + exe_params->SZ_SIZE_TYPE + exe_params->SZ_SIZE_TYPE 
+				+ tdps->typeArray_size + tdps->leadNumArray_size
+				+ tdps->exactMidBytes_size + residualMidBitsLength + tdps->pwrErrBoundBytes_size;
+
+		*bytes = (unsigned char *)malloc(sizeof(unsigned char)*totalByteLength);
+
+		convertTDPStoBytes_double(tdps, *bytes, dsLengthBytes, sameByte);
+
+		*size = totalByteLength;
+	}
+	else //the case with reserved value
+	{
+		size_t residualMidBitsLength = tdps->residualMidBits == NULL ? 0 : tdps->residualMidBits_size;
+		size_t segmentL = 0, radExpoL = 0, pwrBoundArrayL = 0;
+		if(confparams_cpr->errorBoundMode>=PW_REL)
+		{
+			segmentL = exe_params->SZ_SIZE_TYPE;
+			radExpoL = 1;
+			pwrBoundArrayL = 4;
+		}
+
+		size_t totalByteLength = 3 + 1 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + 4 + radExpoL + segmentL + pwrBoundArrayL + 4 + 8 + 1 + 8 
+				+ exe_params->SZ_SIZE_TYPE + exe_params->SZ_SIZE_TYPE + exe_params->SZ_SIZE_TYPE + exe_params->SZ_SIZE_TYPE + 8 + tdps->rtypeArray_size
+				+ tdps->typeArray_size + tdps->leadNumArray_size 
+				+ tdps->exactMidBytes_size + residualMidBitsLength + tdps->pwrErrBoundBytes_size;
+
+		sameByte = (unsigned char) (sameByte | 0x08); // 00001000, the 4th bit
+												// denotes whether it is
+												// with "reserved value"
+		//PW_REL is already flagged by 0x20 above; 0x10 is the isLossless bit and
+		//must not be set here, otherwise the reader treats the stream as lossless.
+
+		*bytes = (unsigned char*)malloc(sizeof(unsigned char)*totalByteLength);
+
+		convertTDPStoBytes_double_reserve(tdps, *bytes, dsLengthBytes, sameByte);
+
+		*size = totalByteLength;
+	}
+}
+
+/**
+ * Same conversion as convertTDPStoFlatBytes_double, but into a caller-provided buffer.
+ * @param bytes destination; must already be large enough. @param size out: length of the stream
+ */
+void convertTDPStoFlatBytes_double_args(TightDataPointStorageD *tdps, unsigned char* bytes, size_t *size) 
+{
+	size_t i, k = 0; 
+	unsigned char dsLengthBytes[8];
+
+	if(exe_params->SZ_SIZE_TYPE==4)
+		intToBytes_bigEndian(dsLengthBytes, tdps->dataSeriesLength);//4
+	else
+		longToBytes_bigEndian(dsLengthBytes, tdps->dataSeriesLength);//8
+
+	unsigned char sameByte = tdps->allSameData==1?(unsigned char)1:(unsigned char)0;
+	sameByte = sameByte | (confparams_cpr->szMode << 1);
+	if(tdps->isLossless)
+		sameByte = (unsigned char) (sameByte | 0x10);
+	if(confparams_cpr->errorBoundMode>=PW_REL)
+		sameByte = (unsigned char) (sameByte | 0x20); // 00100000, the 5th bit
+	if(exe_params->SZ_SIZE_TYPE==8)
+		sameByte = (unsigned char) (sameByte | 0x40); //01000000, the 6th bit
+
+	if(tdps->allSameData==1)
+	{
+		size_t totalByteLength = 3 + 1 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + tdps->exactMidBytes_size;
+		for (i = 0; i < 3; i++)//3
+			bytes[k++] = versionNumber[i];
+		bytes[k++] = sameByte;
+		convertSZParamsToBytes(confparams_cpr, &(bytes[k]));
+		k = k + MetaDataByteLength;
+		for (i = 0; i < exe_params->SZ_SIZE_TYPE; i++)
+			bytes[k++] = dsLengthBytes[i];
+		for (i = 0; i < tdps->exactMidBytes_size; i++)
+			bytes[k++] = tdps->exactMidBytes[i];
+		*size = totalByteLength;
+	}
+	else if (tdps->rtypeArray == NULL) 
+	{
+		size_t residualMidBitsLength = tdps->residualMidBits == NULL ? 0 : tdps->residualMidBits_size;
+		size_t segmentL = 0, radExpoL = 0, pwrBoundArrayL = 0;
+		if(confparams_cpr->errorBoundMode>=PW_REL)
+		{
+			segmentL = exe_params->SZ_SIZE_TYPE;
+			radExpoL = 1;
+			pwrBoundArrayL = 4;
+		}
+
+		size_t totalByteLength = 3 + 1 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE+ 4 + radExpoL + segmentL + pwrBoundArrayL + 4 + 8 + 1 + 8 
+				+ exe_params->SZ_SIZE_TYPE + exe_params->SZ_SIZE_TYPE + exe_params->SZ_SIZE_TYPE 
+				+ tdps->typeArray_size + tdps->leadNumArray_size
+				+ tdps->exactMidBytes_size + residualMidBitsLength + tdps->pwrErrBoundBytes_size;
+
+		convertTDPStoBytes_double(tdps, bytes, dsLengthBytes, sameByte);
+
+		*size = totalByteLength;
+	}
+	else //the case with reserved value
+	{
+		size_t residualMidBitsLength = tdps->residualMidBits == NULL ? 0 : tdps->residualMidBits_size;
+		size_t segmentL = 0, radExpoL = 0, pwrBoundArrayL = 0;
+		if(confparams_cpr->errorBoundMode>=PW_REL)
+		{
+			segmentL = exe_params->SZ_SIZE_TYPE;
+			radExpoL = 1;
+			pwrBoundArrayL = 4;
+		}
+
+		size_t totalByteLength = 3 + 1 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + 4 + radExpoL + segmentL + pwrBoundArrayL + 4 + 8 + 1 + 8 
+				+ exe_params->SZ_SIZE_TYPE + exe_params->SZ_SIZE_TYPE + exe_params->SZ_SIZE_TYPE + exe_params->SZ_SIZE_TYPE + 8 + tdps->rtypeArray_size
+				+ tdps->typeArray_size + tdps->leadNumArray_size 
+				+ tdps->exactMidBytes_size + residualMidBitsLength + tdps->pwrErrBoundBytes_size;
+
+		sameByte = (unsigned char) (sameByte | 0x08); // 00001000, the 4th bit
+												// denotes whether it is
+												// with "reserved value"
+		//PW_REL is already flagged by 0x20 above; 0x10 is the isLossless bit and
+		//must not be set here, otherwise the reader treats the stream as lossless.
+
+		convertTDPStoBytes_double_reserve(tdps, bytes, dsLengthBytes, sameByte);
+
+		*size = totalByteLength;
+	}
+}
+
+
+/*
+ * Release a storage object together with every buffer it references:
+ * rtypeArray, typeArray, leadNumArray, exactMidBytes, residualMidBits and
+ * pwrErrBoundBytes. NULL members are fine because free(NULL) does nothing.
+ * tdps itself must not be NULL.
+ */
+void free_TightDataPointStorageD(TightDataPointStorageD *tdps)
+{
+	free(tdps->rtypeArray);
+	free(tdps->typeArray);
+	free(tdps->leadNumArray);
+	free(tdps->exactMidBytes);
+	free(tdps->residualMidBits);
+	free(tdps->pwrErrBoundBytes);
+	free(tdps);
+}
+
+/**
+ * Free the storage object used in the decompression: only the struct itself is released,
+ * not the buffers it points to. NOTE(review): confirm who frees the malloc'ed rtypeArray / all-same exactMidBytes. */
+void free_TightDataPointStorageD2(TightDataPointStorageD *tdps)
+{			
+	free(tdps);
+}
diff --git a/thirdparty/SZ/sz/src/TightDataPointStorageF.c b/thirdparty/SZ/sz/src/TightDataPointStorageF.c
new file mode 100644
index 0000000000000000000000000000000000000000..23a69b8a81c67e52717d1e834461edbe916df7d2
--- /dev/null
+++ b/thirdparty/SZ/sz/src/TightDataPointStorageF.c
@@ -0,0 +1,746 @@
+/**
+ *  @file TightPointDataStorageF.c
+ *  @author Sheng Di and Dingwen Tao
+ *  @date Aug, 2016
+ *  @brief The functions used to construct the tightPointDataStorage element for storing compressed bytes.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdlib.h> 
+#include <stdio.h>
+#include <string.h>
+#include "TightDataPointStorageF.h"
+#include "sz.h"
+#include "Huffman.h"
+//#include "rw.h"
+
+void new_TightDataPointStorageF_Empty(TightDataPointStorageF **this)
+{
+	/* Allocate a storage object and reset its scalar fields and buffers. */
+	/* NOTE: fields not assigned below keep whatever malloc returned. */
+	TightDataPointStorageF *tdps = malloc(sizeof *tdps);
+	*this = tdps;
+
+	/* scalar header fields */
+	tdps->dataSeriesLength = 0;
+	tdps->allSameData = 0;
+	tdps->exactDataNum = 0;
+	tdps->reservedValue = 0;
+	tdps->reqLength = 0;
+	tdps->radExpo = 0;
+	tdps->intervals = 0;
+	tdps->isLossless = 0;
+	tdps->segment_size = 0;
+
+	/* buffers start out unset with zero length */
+	tdps->rtypeArray = NULL;
+	tdps->rtypeArray_size = 0;
+	tdps->typeArray = NULL;
+	tdps->typeArray_size = 0;
+	tdps->leadNumArray = NULL;
+	tdps->leadNumArray_size = 0;
+	tdps->exactMidBytes = NULL;
+	tdps->exactMidBytes_size = 0;
+	tdps->residualMidBits = NULL;
+	tdps->residualMidBits_size = 0;
+	tdps->pwrErrBoundBytes = NULL;
+	tdps->pwrErrBoundBytes_size = 0;
+}
+/* Parse a serialized float stream (flatBytes) into a new TightDataPointStorageF and return the error-bound mode (ABS or PW_REL). */
+int new_TightDataPointStorageF_fromFlatBytes(TightDataPointStorageF **this, unsigned char* flatBytes, size_t flatBytesLength)
+{
+	new_TightDataPointStorageF_Empty(this);
+	size_t i, index = 0;
+	size_t pwrErrBoundBytes_size = 0, segmentL = 0, radExpoL = 0, pwrErrBoundBytesL = 0;
+	char version[3];
+	for (i = 0; i < 3; i++)
+		version[i] = flatBytes[index++]; //3
+	unsigned char sameRByte = flatBytes[index++]; //1
+	if(checkVersion(version)!=1)
+	{
+		//wrong version
+		printf("Wrong version: \nCompressed-data version (%d.%d.%d)\n",version[0], version[1], version[2]);
+		printf("Current sz version: (%d.%d.%d)\n", versionNumber[0], versionNumber[1], versionNumber[2]);
+		printf("Please double-check if the compressed data (or file) is correct.\n");
+		exit(0);
+	}
+	int same = sameRByte & 0x01;
+	//confparams_dec->szMode = (sameRByte & 0x06)>>1;
+	(*this)->isLossless = (sameRByte & 0x10)>>4;
+	int isPW_REL = (sameRByte & 0x20)>>5;
+	exe_params->SZ_SIZE_TYPE = ((sameRByte & 0x40)>>6)==1?8:4;
+	int errorBoundMode = ABS;
+	if(isPW_REL)
+	{
+		errorBoundMode = PW_REL;
+		segmentL = exe_params->SZ_SIZE_TYPE;
+		pwrErrBoundBytesL = 4;
+	}
+	
+	sz_params* params = convertBytesToSZParams(&(flatBytes[index]));
+	//remember the caller's mode settings; fall back to the decoded ones when no previous config exists
+	int mode = confparams_dec!=NULL ? confparams_dec->szMode : params->szMode;
+	int predictionMode = confparams_dec!=NULL ? confparams_dec->predictionMode : params->predictionMode;
+	if(confparams_dec!=NULL)
+		free(confparams_dec);
+	confparams_dec = params;
+	confparams_dec->szMode = mode;
+	if(mode==SZ_TEMPORAL_COMPRESSION)
+	{
+		confparams_dec->predictionMode = predictionMode;
+	}
+	
+	index += MetaDataByteLength;
+	
+	unsigned char dsLengthBytes[8];
+	for (i = 0; i < exe_params->SZ_SIZE_TYPE; i++)
+		dsLengthBytes[i] = flatBytes[index++];
+	(*this)->dataSeriesLength = bytesToSize(dsLengthBytes);// 4 or 8	
+	
+	if((*this)->isLossless==1)
+	{
+		//(*this)->exactMidBytes = flatBytes+8;
+		return errorBoundMode;
+	}
+	else if(same==1)
+	{
+		(*this)->allSameData = 1;
+		size_t exactMidBytesLength = sizeof(float); //flatBytesLength - 3 - 1 - MetaDataByteLength - exe_params->SZ_SIZE_TYPE;
+		if(exactMidBytesLength>0)
+			(*this)->exactMidBytes = (unsigned char*)malloc(sizeof(unsigned char)*exactMidBytesLength);
+		else
+			(*this)->exactMidBytes = NULL;
+		for(i = 0;i<exactMidBytesLength;i++)
+			(*this)->exactMidBytes[i] = flatBytes[index++];
+		return errorBoundMode;
+	}
+	else
+		(*this)->allSameData = 0;
+
+	int rtype_ = sameRByte & 0x08;		//=00001000
+	unsigned char byteBuf[8];
+
+	for (i = 0; i < 4; i++)
+		byteBuf[i] = flatBytes[index++];
+	int max_quant_intervals = bytesToInt_bigEndian(byteBuf);// 4	
+
+	confparams_dec->maxRangeRadius = max_quant_intervals/2;
+
+	if(errorBoundMode>=PW_REL)
+	{
+		(*this)->radExpo = flatBytes[index++];//1
+		radExpoL = 1;
+		for (i = 0; i < exe_params->SZ_SIZE_TYPE; i++)
+			byteBuf[i] = flatBytes[index++];
+		params->segment_size = (*this)->segment_size = bytesToSize(byteBuf);// exe_params->SZ_SIZE_TYPE	
+
+		for (i = 0; i < 4; i++)
+			byteBuf[i] = flatBytes[index++];
+		pwrErrBoundBytes_size = (*this)->pwrErrBoundBytes_size = bytesToInt_bigEndian(byteBuf);// 4		
+	}
+	else
+	{
+		pwrErrBoundBytes_size = 0;
+		(*this)->pwrErrBoundBytes = NULL;
+	}
+	for (i = 0; i < 4; i++)
+		byteBuf[i] = flatBytes[index++];
+	(*this)->intervals = bytesToInt_bigEndian(byteBuf);// 4	
+
+	for (i = 0; i < 4; i++)
+		byteBuf[i] = flatBytes[index++];
+	(*this)->medianValue = bytesToFloat(byteBuf); //4
+	
+	(*this)->reqLength = flatBytes[index++]; //1
+	
+	for (i = 0; i < 8; i++)
+		byteBuf[i] = flatBytes[index++];
+	(*this)->realPrecision = bytesToDouble(byteBuf);//8
+
+	for (i = 0; i < exe_params->SZ_SIZE_TYPE; i++)
+		byteBuf[i] = flatBytes[index++];
+	(*this)->typeArray_size = bytesToSize(byteBuf);// 4		
+	if(rtype_!=0)
+	{
+		for(i = 0;i<exe_params->SZ_SIZE_TYPE;i++) 
+			byteBuf[i] = flatBytes[index++];
+		(*this)->rtypeArray_size = bytesToSize(byteBuf);//(ST)		
+	}
+	else
+		(*this)->rtypeArray_size = 0;
+
+	for (i = 0; i < exe_params->SZ_SIZE_TYPE; i++)
+		byteBuf[i] = flatBytes[index++];
+	(*this)->exactDataNum = bytesToSize(byteBuf);// ST
+
+	for (i = 0; i < exe_params->SZ_SIZE_TYPE; i++)
+		byteBuf[i] = flatBytes[index++];
+	(*this)->exactMidBytes_size = bytesToSize(byteBuf);// ST
+
+	if (rtype_ != 0) {
+		if((*this)->rtypeArray_size>0)
+			(*this)->rtypeArray = (unsigned char*)malloc(sizeof(unsigned char)*(*this)->rtypeArray_size);
+		else
+			(*this)->rtypeArray = NULL;
+
+		for (i = 0; i < 4; i++)
+			byteBuf[i] = flatBytes[index++];
+		(*this)->reservedValue = bytesToFloat(byteBuf);//4
+	}
+
+	size_t logicLeadNumBitsNum = (*this)->exactDataNum * 2; //2 bits per exact value, rounded up to whole bytes below
+	if (logicLeadNumBitsNum % 8 == 0)
+	{
+		(*this)->leadNumArray_size = logicLeadNumBitsNum >> 3;
+	}
+	else
+	{
+		(*this)->leadNumArray_size = (logicLeadNumBitsNum >> 3) + 1;
+	}
+
+	if ((*this)->rtypeArray != NULL) 
+	{
+		(*this)->residualMidBits_size = flatBytesLength - 3 - 1 - MetaDataByteLength - exe_params->SZ_SIZE_TYPE - 4 - radExpoL - segmentL - pwrErrBoundBytesL - 4 - 4 - 1 - 8 
+				- exe_params->SZ_SIZE_TYPE - exe_params->SZ_SIZE_TYPE - exe_params->SZ_SIZE_TYPE - exe_params->SZ_SIZE_TYPE - 4 - (*this)->rtypeArray_size
+				- (*this)->typeArray_size - (*this)->leadNumArray_size
+				- (*this)->exactMidBytes_size - pwrErrBoundBytes_size;
+		for (i = 0; i < (*this)->rtypeArray_size; i++)
+			(*this)->rtypeArray[i] = flatBytes[index++];
+	}
+	else
+	{
+		(*this)->residualMidBits_size = flatBytesLength - 3 - 1 - MetaDataByteLength - exe_params->SZ_SIZE_TYPE - 4 - radExpoL - segmentL - pwrErrBoundBytesL - 4 - 4 - 1 - 8 
+				- exe_params->SZ_SIZE_TYPE - exe_params->SZ_SIZE_TYPE - exe_params->SZ_SIZE_TYPE - (*this)->typeArray_size
+				- (*this)->leadNumArray_size - (*this)->exactMidBytes_size - pwrErrBoundBytes_size;
+	}	
+
+	//from here on the members are pointers into flatBytes (no copies are made)
+	(*this)->typeArray = &flatBytes[index]; 
+	(*this)->allNodes = bytesToInt_bigEndian((*this)->typeArray); //the first 4 bytes store the stateNum
+	(*this)->stateNum = ((*this)->allNodes+1)/2;	
+
+	index+=(*this)->typeArray_size;
+	
+	(*this)->pwrErrBoundBytes = &flatBytes[index];
+	
+	index+=pwrErrBoundBytes_size;
+	
+	(*this)->leadNumArray = &flatBytes[index];
+	
+	index+=(*this)->leadNumArray_size;
+	
+	(*this)->exactMidBytes = &flatBytes[index];
+	
+	index+=(*this)->exactMidBytes_size;
+	
+	(*this)->residualMidBits = &flatBytes[index];
+	
+	//index+=(*this)->residualMidBits_size;
+	
+	return errorBoundMode;
+}
+
+/**
+ * Build a TightDataPointStorageF from the float compressor's intermediate arrays.
+ *
+ * type holds dataSeriesLength entries and is Huffman-encoded into typeArray;
+ * leadNumIntArray holds exactDataNum entries; exactMidBytes holds
+ * exactMidBytes_size bytes and is stored by pointer, not copied.
+ * pwrErrBoundBytes is stored only when the error-bound mode is >= PW_REL.
+ * */
+void new_TightDataPointStorageF(TightDataPointStorageF **this,
+		size_t dataSeriesLength, size_t exactDataNum, 
+		int* type, unsigned char* exactMidBytes, size_t exactMidBytes_size,
+		unsigned char* leadNumIntArray,  //leadNumIntArray contains readable numbers....
+		unsigned char* resiMidBits, size_t resiMidBits_size,
+		unsigned char resiBitLength, 
+		double realPrecision, float medianValue, char reqLength, unsigned int intervals, 
+		unsigned char* pwrErrBoundBytes, size_t pwrErrBoundBytes_size, unsigned char radExpo) {
+	/* Allocate the result and publish it to the caller right away. */
+	TightDataPointStorageF *tdps = malloc(sizeof *tdps);
+	*this = tdps;
+
+	/* Plain copies of the scalar arguments. */
+	tdps->allSameData = 0;
+	tdps->isLossless = 0;
+	tdps->dataSeriesLength = dataSeriesLength;
+	tdps->exactDataNum = exactDataNum;
+	tdps->realPrecision = realPrecision;
+	tdps->medianValue = medianValue;
+	tdps->reqLength = reqLength;
+	tdps->intervals = intervals;
+	tdps->radExpo = radExpo;
+
+	/* No reserved-value type array is produced by this constructor. */
+	tdps->rtypeArray = NULL;
+	tdps->rtypeArray_size = 0;
+
+	/* Encode the type codes with a Huffman tree created for 2*intervals states. */
+	int numStates = 2*intervals;
+	HuffmanTree* tree = createHuffmanTree(numStates);
+	encode_withTree(tree, type, dataSeriesLength, &tdps->typeArray, &tdps->typeArray_size);
+	SZ_ReleaseHuffman(tree);
+
+	/* The exact-mid byte buffer is adopted as-is (pointer stored, no copy). */
+	tdps->exactMidBytes = exactMidBytes;
+	tdps->exactMidBytes_size = exactMidBytes_size;
+
+	/* Convert the lead-number and residual arrays; each helper returns the output size. */
+	/* resiMidBits_size is accepted for interface compatibility but is not used here. */
+	tdps->leadNumArray_size = convertIntArray2ByteArray_fast_2b(leadNumIntArray, exactDataNum, &tdps->leadNumArray);
+	tdps->residualMidBits_size = convertIntArray2ByteArray_fast_dynamic(resiMidBits, resiBitLength, exactDataNum, &tdps->residualMidBits);
+
+	/* The power-bound bytes are only kept for PW_REL (or higher) error-bound modes. */
+	tdps->pwrErrBoundBytes = confparams_cpr->errorBoundMode>=PW_REL ? pwrErrBoundBytes : NULL;
+	tdps->pwrErrBoundBytes_size = pwrErrBoundBytes_size;
+}
+
+void new_TightDataPointStorageF2(TightDataPointStorageF **this,
+		size_t dataSeriesLength, size_t exactDataNum, 
+		int* type, unsigned char* exactMidBytes, size_t exactMidBytes_size,
+		unsigned char* leadNumIntArray,  //leadNumIntArray contains readable numbers....
+		unsigned char* resiMidBits, size_t resiMidBits_size,
+		unsigned char* resiBitLength, size_t resiBitLengthSize, 
+		double realPrecision, float medianValue, char reqLength, unsigned int intervals, 
+		unsigned char* pwrErrBoundBytes, size_t pwrErrBoundBytes_size, unsigned char radExpo) {
+	/*
+	 * Variant constructor: the residual bits are converted by
+	 * convertIntArray2ByteArray_fast_dynamic2, which receives the resiBitLength
+	 * array and its length resiBitLengthSize. resiMidBits_size is not used.
+	 */
+	TightDataPointStorageF *tdps = malloc(sizeof *tdps);
+	*this = tdps;
+
+	/* Plain copies of the scalar arguments. */
+	tdps->allSameData = 0;
+	tdps->isLossless = 0;
+	tdps->dataSeriesLength = dataSeriesLength;
+	tdps->exactDataNum = exactDataNum;
+	tdps->realPrecision = realPrecision;
+	tdps->medianValue = medianValue;
+	tdps->reqLength = reqLength;
+	tdps->intervals = intervals;
+	tdps->radExpo = radExpo;
+
+	/* No reserved-value type array is produced by this constructor. */
+	tdps->rtypeArray = NULL;
+	tdps->rtypeArray_size = 0;
+
+	/* Encode the type codes with a Huffman tree created for 2*intervals states. */
+	int numStates = 2*intervals;
+	HuffmanTree* tree = createHuffmanTree(numStates);
+	encode_withTree(tree, type, dataSeriesLength, &tdps->typeArray, &tdps->typeArray_size);
+	SZ_ReleaseHuffman(tree);
+
+	/* The exact-mid byte buffer is adopted as-is (pointer stored, no copy). */
+	tdps->exactMidBytes = exactMidBytes;
+	tdps->exactMidBytes_size = exactMidBytes_size;
+
+	/* Convert the lead-number and residual arrays; each helper returns the output size. */
+	tdps->leadNumArray_size = convertIntArray2ByteArray_fast_2b(leadNumIntArray, exactDataNum, &tdps->leadNumArray);
+	tdps->residualMidBits_size = convertIntArray2ByteArray_fast_dynamic2(resiMidBits, resiBitLength, resiBitLengthSize, &tdps->residualMidBits);
+
+	/* The power-bound bytes are only kept for PW_REL (or higher) error-bound modes. */
+	tdps->pwrErrBoundBytes = confparams_cpr->errorBoundMode>=PW_REL ? pwrErrBoundBytes : NULL;
+	tdps->pwrErrBoundBytes_size = pwrErrBoundBytes_size;
+}
+/* Serialize tdps (layout without the reserved-value array) into the caller-provided buffer bytes; dsLengthBytes is the already-encoded series length. */
+void convertTDPStoBytes_float(TightDataPointStorageF* tdps, unsigned char* bytes, unsigned char* dsLengthBytes, unsigned char sameByte)
+{
+	size_t i, k = 0;
+	unsigned char intervalsBytes[4];
+	unsigned char typeArrayLengthBytes[8];
+	unsigned char exactLengthBytes[8];
+	unsigned char exactMidBytesLength[8];
+	unsigned char realPrecisionBytes[8];
+	
+	unsigned char medianValueBytes[4];
+	
+	unsigned char segment_sizeBytes[8];
+	unsigned char pwrErrBoundBytes_sizeBytes[4];
+	unsigned char max_quant_intervals_Bytes[4];
+	
+	//header: version(3) | sameByte(1) | SZ params(MetaDataByteLength) | series length(ST) | max_quant_intervals(4)
+	for(i = 0;i<3;i++)//3 bytes
+		bytes[k++] = versionNumber[i];
+	bytes[k++] = sameByte;	//1	byte	
+	
+	convertSZParamsToBytes(confparams_cpr, &(bytes[k]));
+	k = k + MetaDataByteLength;
+	
+	for(i = 0;i<exe_params->SZ_SIZE_TYPE;i++)//ST: 4 or 8 bytes
+		bytes[k++] = dsLengthBytes[i];	
+	intToBytes_bigEndian(max_quant_intervals_Bytes, confparams_cpr->max_quant_intervals);
+	for(i = 0;i<4;i++)//4
+		bytes[k++] = max_quant_intervals_Bytes[i];		
+	
+	if(confparams_cpr->errorBoundMode>=PW_REL)
+	{
+		bytes[k++] = tdps->radExpo; //1 byte			
+		
+		sizeToBytes(segment_sizeBytes, confparams_cpr->segment_size);
+		for(i = 0;i<exe_params->SZ_SIZE_TYPE;i++)//ST
+			bytes[k++] = segment_sizeBytes[i];				
+			
+		intToBytes_bigEndian(pwrErrBoundBytes_sizeBytes, tdps->pwrErrBoundBytes_size);
+		for(i = 0;i<4;i++)//4
+			bytes[k++] = pwrErrBoundBytes_sizeBytes[i];					
+	}
+	
+	intToBytes_bigEndian(intervalsBytes, tdps->intervals);
+	for(i = 0;i<4;i++)//4
+		bytes[k++] = intervalsBytes[i];			
+	
+	floatToBytes(medianValueBytes, tdps->medianValue);
+	for (i = 0; i < 4; i++)// 4
+		bytes[k++] = medianValueBytes[i];		
+
+	bytes[k++] = tdps->reqLength; //1 byte
+
+/*	if(errorBoundMode>=PW_REL)
+		doubleToBytes(realPrecisionBytes, pw_relBoundRatio);
+	else*/
+	doubleToBytes(realPrecisionBytes, tdps->realPrecision);
+
+	for (i = 0; i < 8; i++)// 8
+		bytes[k++] = realPrecisionBytes[i];			
+
+	sizeToBytes(typeArrayLengthBytes, tdps->typeArray_size);
+	for(i = 0;i<exe_params->SZ_SIZE_TYPE;i++)//ST
+		bytes[k++] = typeArrayLengthBytes[i];
+
+	sizeToBytes(exactLengthBytes, tdps->exactDataNum);
+	for(i = 0;i<exe_params->SZ_SIZE_TYPE;i++)//ST
+		bytes[k++] = exactLengthBytes[i];
+
+	sizeToBytes(exactMidBytesLength, tdps->exactMidBytes_size);
+	for(i = 0;i<exe_params->SZ_SIZE_TYPE;i++)//ST
+		bytes[k++] = exactMidBytesLength[i];
+
+	memcpy(&(bytes[k]), tdps->typeArray, tdps->typeArray_size);
+	k += tdps->typeArray_size;
+	if(confparams_cpr->errorBoundMode>=PW_REL)
+	{
+		memcpy(&(bytes[k]), tdps->pwrErrBoundBytes, tdps->pwrErrBoundBytes_size);
+		k += tdps->pwrErrBoundBytes_size;
+	}
+	//payload order continues: leadNumArray, exactMidBytes, then residualMidBits (if present)
+	memcpy(&(bytes[k]), tdps->leadNumArray, tdps->leadNumArray_size);
+	k += tdps->leadNumArray_size;
+	memcpy(&(bytes[k]), tdps->exactMidBytes, tdps->exactMidBytes_size);
+	k += tdps->exactMidBytes_size;
+
+	if(tdps->residualMidBits!=NULL)
+	{
+		memcpy(&(bytes[k]), tdps->residualMidBits, tdps->residualMidBits_size);
+		k += tdps->residualMidBits_size;
+	}	
+}
+/* Serialize tdps (layout with the reserved-value array and reserved value) into the caller-provided buffer bytes; dsLengthBytes is the already-encoded series length. */
+void convertTDPStoBytes_float_reserve(TightDataPointStorageF* tdps, unsigned char* bytes, unsigned char* dsLengthBytes, unsigned char sameByte)
+{
+	size_t i, k = 0;
+	unsigned char intervalsBytes[4];
+	unsigned char typeArrayLengthBytes[8];
+	unsigned char rTypeLengthBytes[8];
+	unsigned char exactLengthBytes[8];
+	unsigned char exactMidBytesLength[8];
+	unsigned char realPrecisionBytes[8];
+	unsigned char reservedValueBytes[4];
+	
+	unsigned char medianValueBytes[4];
+	
+	unsigned char segment_sizeBytes[8];
+	unsigned char pwrErrBoundBytes_sizeBytes[4];
+	unsigned char max_quant_intervals_Bytes[4];	
+	
+	for(i = 0;i<3;i++)//3
+		bytes[k++] = versionNumber[i];		
+	bytes[k++] = sameByte;			//1
+
+	convertSZParamsToBytes(confparams_cpr, &(bytes[k]));
+	k = k + MetaDataByteLength;
+	
+	for(i = 0;i<exe_params->SZ_SIZE_TYPE;i++)//ST
+		bytes[k++] = dsLengthBytes[i];		
+
+
+	intToBytes_bigEndian(max_quant_intervals_Bytes, confparams_cpr->max_quant_intervals);
+	for(i = 0;i<4;i++)//4
+		bytes[k++] = max_quant_intervals_Bytes[i];
+
+	if(confparams_cpr->errorBoundMode>=PW_REL)
+	{
+		bytes[k++] = tdps->radExpo; //1 byte			
+		
+		sizeToBytes(segment_sizeBytes, confparams_cpr->segment_size);
+		for(i = 0;i<exe_params->SZ_SIZE_TYPE;i++)//ST
+			bytes[k++] = segment_sizeBytes[i];				
+			
+		intToBytes_bigEndian(pwrErrBoundBytes_sizeBytes, tdps->pwrErrBoundBytes_size);
+		for(i = 0;i<4;i++)//4
+			bytes[k++] = pwrErrBoundBytes_sizeBytes[i];					
+	}
+	
+	intToBytes_bigEndian(intervalsBytes, tdps->intervals);
+	for(i = 0;i<4;i++)//4
+		bytes[k++] = intervalsBytes[i];	
+
+	floatToBytes(medianValueBytes, tdps->medianValue);
+	for (i = 0; i < 4; i++)// 4
+		bytes[k++] = medianValueBytes[i];		
+
+	bytes[k++] = tdps->reqLength; //1 byte
+
+	doubleToBytes(realPrecisionBytes, tdps->realPrecision); //realPrecision is a double: all 8 bytes are written and later read back with bytesToDouble
+	for (i = 0; i < 8; i++)// 8
+		bytes[k++] = realPrecisionBytes[i];
+
+	sizeToBytes(typeArrayLengthBytes, tdps->typeArray_size);
+	for(i = 0;i<exe_params->SZ_SIZE_TYPE;i++)//ST
+		bytes[k++] = typeArrayLengthBytes[i];
+
+	sizeToBytes(rTypeLengthBytes, tdps->rtypeArray_size);
+	for(i = 0;i<exe_params->SZ_SIZE_TYPE;i++)//ST
+		bytes[k++] = rTypeLengthBytes[i];
+
+	sizeToBytes(exactLengthBytes, tdps->exactDataNum);
+	for(i = 0;i<exe_params->SZ_SIZE_TYPE;i++)//ST
+		bytes[k++] = exactLengthBytes[i];
+
+	sizeToBytes(exactMidBytesLength, tdps->exactMidBytes_size);
+	for(i = 0;i<exe_params->SZ_SIZE_TYPE;i++)//ST
+		bytes[k++] = exactMidBytesLength[i];
+
+	floatToBytes(reservedValueBytes, tdps->reservedValue);
+	for (i = 0; i < 4; i++)// 4
+		bytes[k++] = reservedValueBytes[i];
+
+	//payload order: rtypeArray, typeArray, pwrErrBoundBytes (PW_REL only), leadNumArray, exactMidBytes, residualMidBits
+	memcpy(&(bytes[k]), tdps->rtypeArray, tdps->rtypeArray_size);
+	k += tdps->rtypeArray_size;
+	memcpy(&(bytes[k]), tdps->typeArray, tdps->typeArray_size);
+	k += tdps->typeArray_size;
+	if(confparams_cpr->errorBoundMode>=PW_REL)
+	{
+		memcpy(&(bytes[k]), tdps->pwrErrBoundBytes, tdps->pwrErrBoundBytes_size);
+		k += tdps->pwrErrBoundBytes_size;
+	}
+	memcpy(&(bytes[k]), tdps->leadNumArray, tdps->leadNumArray_size);
+	k += tdps->leadNumArray_size;
+	memcpy(&(bytes[k]), tdps->exactMidBytes, tdps->exactMidBytes_size);
+	k += tdps->exactMidBytes_size;
+	if(tdps->residualMidBits!=NULL)
+	{
+		memcpy(&(bytes[k]), tdps->residualMidBits, tdps->residualMidBits_size);
+		k += tdps->residualMidBits_size;
+	}	
+}
+
+//convert a TightDataPointStorageF to a newly malloc'ed byte stream (*bytes, length in *size); the caller owns the buffer
+void convertTDPStoFlatBytes_float(TightDataPointStorageF *tdps, unsigned char** bytes, size_t *size)
+{
+	size_t i, k = 0; 
+	unsigned char dsLengthBytes[8];
+	
+	if(exe_params->SZ_SIZE_TYPE==4)
+		intToBytes_bigEndian(dsLengthBytes, tdps->dataSeriesLength);//4
+	else
+		longToBytes_bigEndian(dsLengthBytes, tdps->dataSeriesLength);//8
+		
+	//flag byte: 0x01 all-same | 0x06 szMode | 0x08 reserved value | 0x10 lossless | 0x20 PW_REL | 0x40 8-byte sizes
+	unsigned char sameByte = tdps->allSameData==1?(unsigned char)1:(unsigned char)0;
+	sameByte = sameByte | (confparams_cpr->szMode << 1);
+	if(tdps->isLossless)
+		sameByte = (unsigned char) (sameByte | 0x10);
+	if(confparams_cpr->errorBoundMode>=PW_REL)
+		sameByte = (unsigned char) (sameByte | 0x20); // 00100000
+	if(exe_params->SZ_SIZE_TYPE==8)
+		sameByte = (unsigned char) (sameByte | 0x40); // 01000000
+
+	if(tdps->allSameData==1)
+	{
+		size_t totalByteLength = 3 + 1 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + tdps->exactMidBytes_size;
+		*bytes = (unsigned char *)malloc(sizeof(unsigned char)*totalByteLength);
+
+		for (i = 0; i < 3; i++)//3
+			(*bytes)[k++] = versionNumber[i];
+		(*bytes)[k++] = sameByte;
+		
+		convertSZParamsToBytes(confparams_cpr, &((*bytes)[k]));
+		k = k + MetaDataByteLength;
+				
+		for (i = 0; i < exe_params->SZ_SIZE_TYPE; i++)
+			(*bytes)[k++] = dsLengthBytes[i];
+		
+		for (i = 0; i < tdps->exactMidBytes_size; i++)
+			(*bytes)[k++] = tdps->exactMidBytes[i];
+
+		*size = totalByteLength;
+	}
+	else if (tdps->rtypeArray == NULL)
+	{
+		size_t residualMidBitsLength = tdps->residualMidBits == NULL ? 0 : tdps->residualMidBits_size;
+		size_t segmentL = 0, radExpoL = 0, pwrBoundArrayL = 0;
+		if(confparams_cpr->errorBoundMode>=PW_REL)
+		{			
+			segmentL = exe_params->SZ_SIZE_TYPE;
+			radExpoL = 1;
+			pwrBoundArrayL = 4;
+		}
+
+		size_t totalByteLength = 3 + 1 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + 4 + radExpoL + segmentL + pwrBoundArrayL + 4 + 4 + 1 + 8 
+				+ exe_params->SZ_SIZE_TYPE + exe_params->SZ_SIZE_TYPE + exe_params->SZ_SIZE_TYPE  
+				+ tdps->typeArray_size + tdps->leadNumArray_size 
+				+ tdps->exactMidBytes_size + residualMidBitsLength + tdps->pwrErrBoundBytes_size;
+
+		*bytes = (unsigned char *)malloc(sizeof(unsigned char)*totalByteLength);
+
+		convertTDPStoBytes_float(tdps, *bytes, dsLengthBytes, sameByte);
+		
+		*size = totalByteLength;
+	}
+	else //the case with reserved value
+	{
+		size_t residualMidBitsLength = tdps->residualMidBits == NULL ? 0 : tdps->residualMidBits_size;		
+		size_t segmentL = 0, radExpoL = 0, pwrBoundArrayL = 0;
+		if(confparams_cpr->errorBoundMode>=PW_REL)
+		{
+			segmentL = exe_params->SZ_SIZE_TYPE;
+			radExpoL = 1;
+			pwrBoundArrayL = 4;
+		}
+
+		size_t totalByteLength = 3 + 1 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + 4 + radExpoL + segmentL + pwrBoundArrayL + 4 + 4 + 1 + 8 
+				+ exe_params->SZ_SIZE_TYPE + exe_params->SZ_SIZE_TYPE + exe_params->SZ_SIZE_TYPE + exe_params->SZ_SIZE_TYPE + 4 + tdps->rtypeArray_size
+				+ tdps->typeArray_size + tdps->leadNumArray_size
+				+ tdps->exactMidBytes_size + residualMidBitsLength + tdps->pwrErrBoundBytes_size;
+
+		sameByte = (unsigned char) (sameByte | 0x08); // 00001000, the 4th bit
+		// denotes whether it is
+		// with "reserved value"
+		
+		// The PW_REL flag (0x20) is already set above. 0x10 is the isLossless bit read by
+		// new_TightDataPointStorageF_fromFlatBytes, so it must not be set for PW_REL here.
+
+		*bytes = (unsigned char*)malloc(sizeof(unsigned char)*totalByteLength);
+
+		convertTDPStoBytes_float_reserve(tdps, *bytes, dsLengthBytes, sameByte);
+		
+		*size = totalByteLength;
+	}
+}
+/* Same serialization as convertTDPStoFlatBytes_float, but written into the caller-provided buffer bytes; *size receives the number of bytes produced. */
+void convertTDPStoFlatBytes_float_args(TightDataPointStorageF *tdps, unsigned char* bytes, size_t *size)
+{
+	size_t i, k = 0; 
+	unsigned char dsLengthBytes[8];
+	
+	if(exe_params->SZ_SIZE_TYPE==4)
+		intToBytes_bigEndian(dsLengthBytes, tdps->dataSeriesLength);//4
+	else
+		longToBytes_bigEndian(dsLengthBytes, tdps->dataSeriesLength);//8
+		
+	//flag byte: 0x01 all-same | 0x06 szMode | 0x08 reserved value | 0x10 lossless | 0x20 PW_REL | 0x40 8-byte sizes
+	unsigned char sameByte = tdps->allSameData==1?(unsigned char)1:(unsigned char)0;
+	sameByte = sameByte | (confparams_cpr->szMode << 1);
+	if(tdps->isLossless)
+		sameByte = (unsigned char) (sameByte | 0x10);
+	if(confparams_cpr->errorBoundMode>=PW_REL)
+		sameByte = (unsigned char) (sameByte | 0x20); // 00100000
+	if(exe_params->SZ_SIZE_TYPE==8)
+		sameByte = (unsigned char) (sameByte | 0x40); // 01000000
+		
+	if(tdps->allSameData==1)
+	{
+		size_t totalByteLength = 3 + 1 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + tdps->exactMidBytes_size;
+		//bytes is supplied by the caller, so nothing is allocated here
+
+		for (i = 0; i < 3; i++)//3
+			bytes[k++] = versionNumber[i];
+		bytes[k++] = sameByte;
+
+		convertSZParamsToBytes(confparams_cpr, &(bytes[k]));
+		k = k + MetaDataByteLength;
+
+		for (i = 0; i < exe_params->SZ_SIZE_TYPE; i++)
+			bytes[k++] = dsLengthBytes[i];		
+		for (i = 0; i < tdps->exactMidBytes_size; i++)
+			bytes[k++] = tdps->exactMidBytes[i];
+
+		*size = totalByteLength;
+	}
+	else if (tdps->rtypeArray == NULL)
+	{
+		size_t residualMidBitsLength = tdps->residualMidBits == NULL ? 0 : tdps->residualMidBits_size;
+		size_t segmentL = 0, radExpoL = 0, pwrBoundArrayL = 0;
+		if(confparams_cpr->errorBoundMode>=PW_REL)
+		{			
+			segmentL = exe_params->SZ_SIZE_TYPE;
+			radExpoL = 1;
+			pwrBoundArrayL = 4;
+		}
+
+		size_t totalByteLength = 3 + 1 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + 4 + radExpoL + segmentL + pwrBoundArrayL + 4 + 4 + 1 + 8 
+				+ exe_params->SZ_SIZE_TYPE + exe_params->SZ_SIZE_TYPE + exe_params->SZ_SIZE_TYPE  
+				+ tdps->typeArray_size + tdps->leadNumArray_size 
+				+ tdps->exactMidBytes_size + residualMidBitsLength + tdps->pwrErrBoundBytes_size;
+
+		convertTDPStoBytes_float(tdps, bytes, dsLengthBytes, sameByte);
+		
+		*size = totalByteLength;
+	}
+	else //the case with reserved value
+	{
+		size_t residualMidBitsLength = tdps->residualMidBits == NULL ? 0 : tdps->residualMidBits_size;
+		size_t segmentL = 0, radExpoL = 0, pwrBoundArrayL = 0;
+		if(confparams_cpr->errorBoundMode>=PW_REL)
+		{
+			segmentL = exe_params->SZ_SIZE_TYPE;
+			radExpoL = 1;
+			pwrBoundArrayL = 4;
+		}
+
+		size_t totalByteLength = 3 + 1 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + 4 + radExpoL + segmentL + pwrBoundArrayL + 4 + 4 + 1 + 8 
+				+ exe_params->SZ_SIZE_TYPE + exe_params->SZ_SIZE_TYPE + exe_params->SZ_SIZE_TYPE + exe_params->SZ_SIZE_TYPE + 4 + tdps->rtypeArray_size
+				+ tdps->typeArray_size + tdps->leadNumArray_size
+				+ tdps->exactMidBytes_size + residualMidBitsLength + tdps->pwrErrBoundBytes_size;
+
+		sameByte = (unsigned char) (sameByte | 0x08); // 00001000, the 4th bit
+		// denotes whether it is
+		// with "reserved value"
+		
+		// The PW_REL flag (0x20) is already set above. 0x10 is the isLossless bit read by
+		// new_TightDataPointStorageF_fromFlatBytes, so it must not be set for PW_REL here.
+
+		convertTDPStoBytes_float_reserve(tdps, bytes, dsLengthBytes, sameByte);
+		
+		*size = totalByteLength;
+	}
+}
+
+/**
+ * Compression-side free: releases every member buffer, then the struct itself.
+ * */
+void free_TightDataPointStorageF(TightDataPointStorageF *tdps)
+{
+	/*
+	 * free() ignores NULL pointers, so members that were never set need
+	 * no explicit guard. The struct is released last because the member
+	 * pointers are read from it.
+	 */
+	free(tdps->rtypeArray);
+	free(tdps->typeArray);
+	free(tdps->leadNumArray);
+	free(tdps->exactMidBytes);
+	free(tdps->residualMidBits);
+	free(tdps->pwrErrBoundBytes);
+
+	free(tdps);
+}
+
+/**
+ * Decompression-side free: releases only the struct, not its member buffers.
+ * */
+void free_TightDataPointStorageF2(TightDataPointStorageF *tdps)
+{			
+	free(tdps);
+}
diff --git a/thirdparty/SZ/sz/src/TightDataPointStorageI.c b/thirdparty/SZ/sz/src/TightDataPointStorageI.c
new file mode 100644
index 0000000000000000000000000000000000000000..86db1f43157607d8a2971794dc60e40f33c5cdfb
--- /dev/null
+++ b/thirdparty/SZ/sz/src/TightDataPointStorageI.c
@@ -0,0 +1,459 @@
+/**
+ *  @file TightPointDataStorageI.c
+ *  @author Sheng Di and Dingwen Tao
+ *  @date Aug, 2016
+ *  @brief The functions used to construct the tightPointDataStorage element for storing compressed bytes.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdlib.h> 
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include "TightDataPointStorageI.h"
+#include "sz.h"
+#include "Huffman.h"
+//#include "rw.h"
+
+int computeRightShiftBits(int exactByteSize, int dataType)
+{
+	/* Returns (bit width of dataType) - exactByteSize*8, or 0 when dataType
+	 * is not one of the eight integer type codes listed below. */
+	int typeBits;
+	switch(dataType)
+	{
+	case SZ_INT8:  case SZ_UINT8:
+		typeBits = 8;
+		break;
+	case SZ_INT16: case SZ_UINT16:
+		typeBits = 16;
+		break;
+	case SZ_INT32: case SZ_UINT32:
+		typeBits = 32;
+		break;
+	case SZ_INT64: case SZ_UINT64:
+		typeBits = 64;
+		break;
+	default:
+		return 0;
+	}
+	return typeBits - exactByteSize*8;
+}
+
+int convertDataTypeSizeCode(int dataTypeSizeCode)
+{
+	/*
+	 * Map a size code to a byte count:
+	 *   0 -> 1, 1 -> 2, 2 -> 4, 3 -> 8.
+	 * Inverse of convertDataTypeSize once its result is shifted right by 2.
+	 */
+
+	if(dataTypeSizeCode == 0)
+		return 1;
+	if(dataTypeSizeCode == 1)
+		return 2;
+	if(dataTypeSizeCode == 2)
+		return 4;
+	if(dataTypeSizeCode == 3)
+		return 8;
+
+	/* any other code */
+	return 0;
+}
+
+int convertDataTypeSize(int dataTypeSize)
+{
+	/*
+	 * Map a byte count to its size code placed in bits 2-3:
+	 *   1 -> 0 (0000), 2 -> 4 (0100), 4 -> 8 (1000), 8 -> 12 (1100).
+	 * Sizes other than 1, 2, 4 and 8 also yield 0.
+	 */
+
+	if(dataTypeSize == 2)
+		return 4;
+	if(dataTypeSize == 4)
+		return 8;
+	if(dataTypeSize == 8)
+		return 12;
+
+	/* size 1 and every unrecognised size */
+	if(dataTypeSize == 1)
+		return 0;
+	return 0;
+}
+
+void new_TightDataPointStorageI_Empty(TightDataPointStorageI **this)
+{
+	/* Allocate an integer storage object and clear the fields listed below. */
+	TightDataPointStorageI *tdps = malloc(sizeof *tdps);
+	*this = tdps;
+
+	tdps->allSameData = 0;
+	tdps->isLossless = 0;
+	tdps->dataSeriesLength = 0;
+	tdps->exactDataNum = 0;
+	tdps->exactByteSize = 0;
+	tdps->intervals = 0;
+	tdps->minValue = 0;
+	tdps->realPrecision = 0;
+
+	tdps->typeArray = NULL;
+	tdps->typeArray_size = 0;
+	tdps->exactDataBytes = NULL;
+	tdps->exactDataBytes_size = 0;
+}
+/* Parse a serialized integer stream (flatBytes) into a new TightDataPointStorageI; returns errorBoundMode, which this function only ever sets to ABS. */
+int new_TightDataPointStorageI_fromFlatBytes(TightDataPointStorageI **this, unsigned char* flatBytes, size_t flatBytesLength)
+{
+	new_TightDataPointStorageI_Empty(this);
+	size_t i, index = 0;
+	char version[3];
+	for (i = 0; i < 3; i++)
+		version[i] = flatBytes[index++]; //3
+	unsigned char sameRByte = flatBytes[index++]; //1
+	if(checkVersion(version)!=1)
+	{
+		//wrong version
+		printf("Wrong version: \nCompressed-data version (%d.%d.%d)\n",version[0], version[1], version[2]);
+		printf("Current sz version: (%d.%d.%d)\n", versionNumber[0], versionNumber[1], versionNumber[2]);
+		printf("Please double-check if the compressed data (or file) is correct.\n");
+		exit(0);
+	}
+	int same = sameRByte & 0x01;
+	//conf_params->szMode = (sameRByte & 0x06)>>1;
+	int dataByteSizeCode = (sameRByte & 0x0C)>>2;
+	convertDataTypeSizeCode(dataByteSizeCode); //in bytes; NOTE(review): the return value is discarded here - presumably it was meant to be stored, confirm
+	(*this)->isLossless = (sameRByte & 0x10)>>4;
+
+	exe_params->SZ_SIZE_TYPE = ((sameRByte & 0x40)>>6)==1?8:4;
+	int errorBoundMode = ABS;
+	
+	sz_params* params = convertBytesToSZParams(&(flatBytes[index]));
+	if(confparams_dec!=NULL)
+		free(confparams_dec);
+	confparams_dec = params; //the decoded parameters replace the previous global decompression config
+	index += MetaDataByteLength; //20	
+	
+	if(same==0)
+		(*this)->exactByteSize = flatBytes[index++]; //1
+	
+	unsigned char dsLengthBytes[8];
+	for (i = 0; i < exe_params->SZ_SIZE_TYPE; i++)
+		dsLengthBytes[i] = flatBytes[index++];
+	(*this)->dataSeriesLength = bytesToSize(dsLengthBytes);// ST
+	if((*this)->isLossless==1)
+	{
+		//(*this)->exactMidBytes = flatBytes+8;
+		return errorBoundMode;
+	}
+	else if(same==1)
+	{
+		(*this)->allSameData = 1;
+		//NOTE(review): the constant 32 is hard-coded; flatBytesLength < 32 would wrap around because size_t is unsigned - confirm callers
+		size_t exactDataBytesLength = flatBytesLength - 32;//32=3 + 1 + MetaDataByteLength - 8 (initialized SZ_TYPE_LENGTH);
+		if(exactDataBytesLength>0)
+			(*this)->exactDataBytes = (unsigned char*)malloc(sizeof(unsigned char)*exactDataBytesLength);
+		else
+			(*this)->exactDataBytes = NULL;
+			
+		for(i = 0;i<exactDataBytesLength;i++)
+			(*this)->exactDataBytes[i] = flatBytes[index++];
+		return errorBoundMode;
+	}
+	else
+		(*this)->allSameData = 0;
+
+	unsigned char byteBuf[8];
+
+	for (i = 0; i < 4; i++)
+		byteBuf[i] = flatBytes[index++];
+	int max_quant_intervals = bytesToInt_bigEndian(byteBuf);// 4	
+
+	confparams_dec->maxRangeRadius = max_quant_intervals/2;
+
+	//errorBoundMode is never reassigned after being set to ABS above, so this is a sanity check only
+	if(errorBoundMode>=PW_REL)
+	{
+		printf("Error: errorBoundMode>=PW_REL in new_TightDataPointStorageI_fromFlatBytes!! Wrong...\n");
+		exit(0);
+	}
+
+	for (i = 0; i < 4; i++)
+		byteBuf[i] = flatBytes[index++];
+	(*this)->intervals = bytesToInt_bigEndian(byteBuf);// 4	
+
+	for (i = 0; i < 8; i++)
+		byteBuf[i] = flatBytes[index++];
+	(*this)->minValue = bytesToLong_bigEndian(byteBuf); //8
+		
+	for (i = 0; i < 8; i++)
+		byteBuf[i] = flatBytes[index++];
+	(*this)->realPrecision = bytesToDouble(byteBuf);//8
+	
+	for (i = 0; i < exe_params->SZ_SIZE_TYPE; i++)
+		byteBuf[i] = flatBytes[index++];
+	(*this)->typeArray_size = bytesToSize(byteBuf);// ST		
+
+	for (i = 0; i < exe_params->SZ_SIZE_TYPE; i++)
+		byteBuf[i] = flatBytes[index++];
+	(*this)->exactDataNum = bytesToSize(byteBuf);// ST
+	
+	for (i = 0; i < exe_params->SZ_SIZE_TYPE; i++)
+		byteBuf[i] = flatBytes[index++];
+	(*this)->exactDataBytes_size = bytesToSize(byteBuf);// ST		
+
+	//typeArray and exactDataBytes below are pointers into flatBytes (no copies are made)
+	(*this)->typeArray = &flatBytes[index];
+	//retrieve the number of states (i.e., stateNum)
+	(*this)->allNodes = bytesToInt_bigEndian((*this)->typeArray); //the first 4 bytes store the stateNum
+	(*this)->stateNum = ((*this)->allNodes+1)/2;		
+
+	index+=(*this)->typeArray_size;
+	
+	if((*this)->exactDataBytes_size > 0)
+	{	
+		(*this)->exactDataBytes = &flatBytes[index];
+		index+=(*this)->exactDataBytes_size*sizeof(char);	
+	}
+	else
+		(*this)->exactDataBytes = NULL;	
+	return errorBoundMode;
+}
+
+/**
+ * Build a TightDataPointStorageI from the integer compressor's outputs.
+ * type holds dataSeriesLength entries and is Huffman-encoded into typeArray;
+ * exactDataBytes (exactDataBytes_size bytes) is stored by pointer, not copied.
+ * */
+void new_TightDataPointStorageI(TightDataPointStorageI **this,
+		size_t dataSeriesLength, size_t exactDataNum, int byteSize, 
+		int* type, unsigned char* exactDataBytes, size_t exactDataBytes_size,
+		double realPrecision, long minValue, int intervals, int dataType) 
+{
+	/* Allocate the result and hand it to the caller immediately. */
+	TightDataPointStorageI *tdps = malloc(sizeof *tdps);
+	*this = tdps;
+	/* Scalar fields copied straight from the arguments. */
+	tdps->allSameData = 0;
+	tdps->isLossless = 0;
+	tdps->realPrecision = realPrecision;
+	tdps->minValue = minValue;
+	tdps->intervals = intervals;
+	tdps->dataSeriesLength = dataSeriesLength;
+	tdps->exactDataNum = exactDataNum;
+	tdps->exactByteSize = byteSize;
+
+	/* Element width in bytes; left unassigned for any other dataType. */
+	switch(dataType)
+	{
+	case SZ_INT8:  case SZ_UINT8:
+		tdps->dataTypeSize = 1;
+		break;
+	case SZ_INT16: case SZ_UINT16:
+		tdps->dataTypeSize = 2;
+		break;
+	case SZ_INT32: case SZ_UINT32:
+		tdps->dataTypeSize = 4;
+		break;
+	case SZ_INT64: case SZ_UINT64:
+		tdps->dataTypeSize = 8;
+		break;
+	default:
+		break;
+	}
+
+	/* Encode the type codes with a Huffman tree created for 2*intervals states. */
+	int numStates = 2*intervals;
+	HuffmanTree* tree = createHuffmanTree(numStates);
+	encode_withTree(tree, type, dataSeriesLength, &tdps->typeArray, &tdps->typeArray_size);
+	SZ_ReleaseHuffman(tree);
+
+	/* The exact-data byte buffer is adopted as-is (pointer stored, no copy). */
+	tdps->exactDataBytes = exactDataBytes;
+	tdps->exactDataBytes_size = exactDataBytes_size;
+}
+/* Serialize tdps into bytes in a fixed field order (noted inline); bytes must already be large enough. dsLengthBytes is not read in this function. */
+void convertTDPStoBytes_int(TightDataPointStorageI* tdps, unsigned char* bytes, unsigned char* dsLengthBytes, unsigned char sameByte)
+{
+	size_t i, k = 0;
+	
+	unsigned char byteBuffer[8] = {0,0,0,0,0,0,0,0};
+	
+	for(i = 0;i<3;i++)//3 bytes
+		bytes[k++] = versionNumber[i];
+	bytes[k++] = sameByte;	//1	byte
+	
+	convertSZParamsToBytes(confparams_cpr, &(bytes[k])); // MetaDataByteLength bytes of configuration
+	k = k + MetaDataByteLength;	
+		
+	bytes[k++] = tdps->exactByteSize; //1 byte
+
+	sizeToBytes(byteBuffer, tdps->dataSeriesLength);
+	for(i = 0;i<exe_params->SZ_SIZE_TYPE;i++)//ST: 4 or 8 bytes
+		bytes[k++] = byteBuffer[i];	
+	
+	intToBytes_bigEndian(byteBuffer, confparams_cpr->max_quant_intervals);
+	for(i = 0;i<4;i++)//4
+		bytes[k++] = byteBuffer[i];
+	
+	intToBytes_bigEndian(byteBuffer, tdps->intervals);
+	for(i = 0;i<4;i++)//4
+		bytes[k++] = byteBuffer[i];			
+	
+	longToBytes_bigEndian(byteBuffer, tdps->minValue);
+	for (i = 0; i < 8; i++)// 8
+		bytes[k++] = byteBuffer[i];
+
+	doubleToBytes(byteBuffer, tdps->realPrecision);
+	for (i = 0; i < 8; i++)// 8
+		bytes[k++] = byteBuffer[i];			
+
+	sizeToBytes(byteBuffer, tdps->typeArray_size);
+	for(i = 0;i<exe_params->SZ_SIZE_TYPE;i++)//ST
+		bytes[k++] = byteBuffer[i];
+
+	sizeToBytes(byteBuffer, tdps->exactDataNum);
+	for(i = 0;i<exe_params->SZ_SIZE_TYPE;i++)//ST
+		bytes[k++] = byteBuffer[i];
+
+	sizeToBytes(byteBuffer, tdps->exactDataBytes_size);
+	for(i = 0;i<exe_params->SZ_SIZE_TYPE;i++)//ST
+		bytes[k++] = byteBuffer[i];
+
+	memcpy(&(bytes[k]), tdps->typeArray, tdps->typeArray_size); // variable-length payload 1: encoded type array
+	k += tdps->typeArray_size;
+
+	memcpy(&(bytes[k]), tdps->exactDataBytes, tdps->exactDataBytes_size); // variable-length payload 2: exact data bytes
+	k += tdps->exactDataBytes_size; // k is not used after this point
+}
+/* Build the flat byte stream for tdps in a freshly malloc'ed buffer: *bytes receives the buffer, *size its length. */
+//convert TightDataPointStorageI to bytes...
+void convertTDPStoFlatBytes_int(TightDataPointStorageI *tdps, unsigned char** bytes, size_t *size)
+{
+	size_t i, k = 0; 
+	unsigned char dsLengthBytes[8];
+	
+	if(exe_params->SZ_SIZE_TYPE==4)
+		intToBytes_bigEndian(dsLengthBytes, tdps->dataSeriesLength);//4
+	else
+		longToBytes_bigEndian(dsLengthBytes, tdps->dataSeriesLength);//8
+
+	// sameByte is a flag byte: bit 0 = allSameData, szMode shifted left by one, 0x10 = isLossless
+	unsigned char sameByte = tdps->allSameData==1?(unsigned char)1:(unsigned char)0;
+	sameByte = sameByte | (confparams_cpr->szMode << 1);
+	if(tdps->isLossless)
+		sameByte = (unsigned char) (sameByte | 0x10);
+	
+	int dataTypeSizeCode = convertDataTypeSize(tdps->dataTypeSize);
+	sameByte = (unsigned char) (sameByte | dataTypeSizeCode); // element-size code is OR-ed into the same byte
+	
+	if(exe_params->SZ_SIZE_TYPE==8)
+		sameByte = (unsigned char) (sameByte | 0x40); // 01000000, the 6th bit
+	
+	if(tdps->allSameData==1)
+	{
+		// short form: version, sameByte, parameters, length, then the exact data bytes
+		size_t totalByteLength = 3 + 1 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + tdps->exactDataBytes_size;
+		*bytes = (unsigned char *)malloc(sizeof(unsigned char)*totalByteLength);
+
+		for (i = 0; i < 3; i++)//3
+			(*bytes)[k++] = versionNumber[i];
+		(*bytes)[k++] = sameByte;//1
+		
+		convertSZParamsToBytes(confparams_cpr, &((*bytes)[k]));
+		k = k + MetaDataByteLength;			
+		
+		for (i = 0; i < exe_params->SZ_SIZE_TYPE; i++)
+			(*bytes)[k++] = dsLengthBytes[i];
+		
+		for (i = 0; i < tdps->exactDataBytes_size; i++)
+			(*bytes)[k++] = tdps->exactDataBytes[i];
+
+		*size = totalByteLength;
+	}
+	else 
+	{
+		if(confparams_cpr->errorBoundMode>=PW_REL)
+		{			
+			printf("Error: errorBoundMode >= PW_REL!! can't be...\n");
+			exit(0);
+		}
+
+		// full form: the sum below mirrors, field by field, what convertTDPStoBytes_int writes
+		size_t totalByteLength = 3 + 1 + MetaDataByteLength + 1 + exe_params->SZ_SIZE_TYPE + 4 + 4 + 8 + 8
+				+ exe_params->SZ_SIZE_TYPE + exe_params->SZ_SIZE_TYPE + exe_params->SZ_SIZE_TYPE
+				+ tdps->typeArray_size + tdps->exactDataBytes_size;
+
+		*bytes = (unsigned char *)malloc(sizeof(unsigned char)*totalByteLength);
+
+		convertTDPStoBytes_int(tdps, *bytes, dsLengthBytes, sameByte);
+		
+		*size = totalByteLength;
+	}
+}
+/* Same encoding as convertTDPStoFlatBytes_int, but written into the caller-supplied buffer bytes (nothing is allocated here); *size receives the byte count. */
+void convertTDPStoFlatBytes_int_args(TightDataPointStorageI *tdps, unsigned char* bytes, size_t *size)
+{
+	size_t i, k = 0; 
+	unsigned char dsLengthBytes[8];
+	
+	if(exe_params->SZ_SIZE_TYPE==4)
+		intToBytes_bigEndian(dsLengthBytes, tdps->dataSeriesLength);//4
+	else
+		longToBytes_bigEndian(dsLengthBytes, tdps->dataSeriesLength);//8
+		
+	unsigned char sameByte = tdps->allSameData==1?(unsigned char)1:(unsigned char)0;
+	sameByte = sameByte | (confparams_cpr->szMode << 1);
+	if(tdps->isLossless)
+		sameByte = (unsigned char) (sameByte | 0x10);
+	// Fix: also record the element-size code, exactly as convertTDPStoFlatBytes_int does, so both writers emit the same header byte.
+	sameByte = (unsigned char) (sameByte | convertDataTypeSize(tdps->dataTypeSize));
+	if(exe_params->SZ_SIZE_TYPE==8)
+		sameByte = (unsigned char) (sameByte | 0x40); // 01000000, the 6th bit
+		
+	if(tdps->allSameData==1)
+	{
+		size_t totalByteLength = 3 + 1 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + tdps->exactDataBytes_size;
+
+		for (i = 0; i < 3; i++)//3
+			bytes[k++] = versionNumber[i];
+		bytes[k++] = sameByte;//1
+		
+		convertSZParamsToBytes(confparams_cpr, &(bytes[k]));
+		k = k + MetaDataByteLength;	
+		for (i = 0; i < exe_params->SZ_SIZE_TYPE; i++)//ST
+			bytes[k++] = dsLengthBytes[i];		
+		for (i = 0; i < tdps->exactDataBytes_size; i++)
+			bytes[k++] = tdps->exactDataBytes[i];
+
+		*size = totalByteLength;
+	}
+	else
+	{
+		if(confparams_cpr->errorBoundMode>=PW_REL)
+		{			
+			printf("Error: errorBoundMode>=PW_REL!! can't be....\n");
+			exit(0);
+		}
+
+		size_t totalByteLength = 3 + 1 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + 1 + 4 + 4 + 8 + 8
+				+ exe_params->SZ_SIZE_TYPE + exe_params->SZ_SIZE_TYPE + exe_params->SZ_SIZE_TYPE  
+				+ tdps->typeArray_size + tdps->exactDataBytes_size;
+
+		convertTDPStoBytes_int(tdps, bytes, dsLengthBytes, sameByte);
+		
+		*size = totalByteLength;
+	}
+}
+/* Release tdps together with the typeArray and exactDataBytes buffers it points to. */
+void free_TightDataPointStorageI(TightDataPointStorageI *tdps)
+{
+	/* free(NULL) is a no-op, so neither buffer needs a NULL test */
+	free(tdps->typeArray);
+	free(tdps->exactDataBytes);
+
+	free(tdps);
+}
+/* Free only the struct itself; typeArray and exactDataBytes are left untouched. */
+void free_TightDataPointStorageI2(TightDataPointStorageI *tdps)
+{
+	free(tdps);
+}
+
+
diff --git a/thirdparty/SZ/sz/src/TypeManager.c b/thirdparty/SZ/sz/src/TypeManager.c
new file mode 100644
index 0000000000000000000000000000000000000000..42474fb00e6f67de9a55a769aea565ebe73e0682
--- /dev/null
+++ b/thirdparty/SZ/sz/src/TypeManager.c
@@ -0,0 +1,431 @@
+/**
+ *  @file TypeManager.c
+ *  @author Sheng Di
+ *  @date May, 2016
+ *  @brief TypeManager is used to manage the type array: parsing of the bytes and other types in between.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "DynamicByteArray.h"
+#include "sz.h"
+
+//int convertIntArray2ByteArray_fast_8b()
+/* Pack intArray (one flag per byte) into bits, most significant bit first; only the value 1 sets a bit. Returns the byte count; *result is malloc'ed, or NULL when the input is empty. */
+size_t convertIntArray2ByteArray_fast_1b(unsigned char* intArray, size_t intArrayLength, unsigned char **result)
+{
+	/* one output byte per started group of eight flags */
+	size_t packedLen = intArrayLength/8;
+	if(intArrayLength%8!=0)
+		packedLen++;
+
+	unsigned char* packed = NULL;
+	if(packedLen>0)
+		packed = (unsigned char*)malloc(packedLen*sizeof(unsigned char));
+	*result = packed;
+
+	size_t src = 0;
+	for(size_t dst = 0;dst<packedLen;dst++)
+	{
+		int acc = 0;
+		int bit = 7;
+		while(bit>=0 && src<intArrayLength)
+		{
+			if(intArray[src] == 1)
+				acc = acc | (1 << bit);
+			src++;
+			bit--;
+		}
+		packed[dst] = (unsigned char)acc;
+	}
+
+	return packedLen;
+}
+/**
+ * Unpack the first intArrayLength bits of byteArray, most significant bit
+ * first, into one 0/1 value per output byte.
+ *
+ * @param intArrayLength   number of bits to extract
+ * @param byteArray        packed input bits
+ * @param byteArrayLength  number of bytes available in byteArray
+ * @param intArray         out: malloc'ed array of intArrayLength bytes, or
+ *                         NULL when intArrayLength is 0; the caller frees it
+ *
+ * Prints an error and exits when intArrayLength > byteArrayLength*8.
+ *
+ * Fix: loop bounds are derived from intArrayLength. The previous version
+ * iterated up to byteArrayLength-1, which
+ *  - wrapped around for byteArrayLength == 0 (size_t underflow), and
+ *  - wrote past the end of *intArray whenever byteArray carried more bits
+ *    than were requested.
+ * The byte holding the trailing bits is now read only when such bits exist.
+ */
+void convertByteArray2IntArray_fast_1b(size_t intArrayLength, unsigned char* byteArray, size_t byteArrayLength, unsigned char **intArray)	
+{
+    if(intArrayLength > byteArrayLength*8)
+    {
+    	printf("Error: intArrayLength > byteArrayLength*8\n");
+    	printf("intArrayLength=%zu, byteArrayLength = %zu", intArrayLength, byteArrayLength);
+    	exit(0);
+    }
+	if(intArrayLength>0)
+		*intArray = (unsigned char*)malloc(intArrayLength*sizeof(unsigned char));
+	else
+		*intArray = NULL;    
+    
+	size_t n = 0, i;
+	size_t fullBytes = intArrayLength/8; /* bytes whose eight bits are all wanted */
+	int tmp;
+	for (i = 0; i < fullBytes; i++) 
+	{
+		tmp = byteArray[i];
+		(*intArray)[n++] = (tmp & 0x80) >> 7;
+		(*intArray)[n++] = (tmp & 0x40) >> 6;
+		(*intArray)[n++] = (tmp & 0x20) >> 5;
+		(*intArray)[n++] = (tmp & 0x10) >> 4;
+		(*intArray)[n++] = (tmp & 0x08) >> 3;
+		(*intArray)[n++] = (tmp & 0x04) >> 2;
+		(*intArray)[n++] = (tmp & 0x02) >> 1;
+		(*intArray)[n++] = (tmp & 0x01) >> 0;		
+	}
+
+	/* Remaining 0..7 bits live in byteArray[fullBytes]. */
+	for (; n < intArrayLength; n++)
+	{
+		tmp = byteArray[i];
+		(*intArray)[n] = (tmp >> (7 - n%8)) & 0x01;
+	}
+}
+
+/**
+ * Pack 2-bit codes four to a byte. Within a byte the first code occupies the
+ * two most significant bits:
+ * [01|10|11|00|....]-->[01|10|11|00][....]
+ * @param timeStepType codes to pack, each 0..3 (any other value prints an error and exits)
+ * @param timeStepTypeLength number of codes in timeStepType
+ * @param result out: malloc'ed packed bytes, or NULL when nothing is packed
+ * @return number of bytes stored in *result
+ */
+size_t convertIntArray2ByteArray_fast_2b(unsigned char* timeStepType, size_t timeStepTypeLength, unsigned char **result)
+{
+	/* one output byte per started group of four codes */
+	size_t outLen = timeStepTypeLength*2/8;
+	if(timeStepTypeLength%4!=0)
+		outLen++;
+
+	if(outLen==0)
+		*result = NULL;
+	else
+		*result = (unsigned char*)malloc(outLen*sizeof(unsigned char));
+
+	size_t pos = 0; /* next code to consume */
+	for(size_t b = 0;b<outLen;b++)
+	{
+		int packed = 0;
+		int shift;
+
+		/* slots at bit offsets 6, 4, 2, 0; a short final group leaves zeros */
+		for(shift = 6;shift>=0 && pos<timeStepTypeLength;shift -= 2)
+		{
+			int code = timeStepType[pos];
+
+			/* the input is unsigned, so only the upper bound can be violated */
+			if(code>3)
+			{
+				printf("Error: wrong timestep type...: type[%zu]=%d\n", pos, code);
+				exit(0);
+			}
+
+			packed = packed | (code << shift);
+			pos++;
+		}
+
+		(*result)[b] = (unsigned char)packed;
+	}
+
+	return outLen;
+}
+/* Unpack stepLength 2-bit codes (most significant pair first) from byteArray into a malloc'ed *intArray (NULL when stepLength is 0); exits when byteArray is too short. */
+void convertByteArray2IntArray_fast_2b(size_t stepLength, unsigned char* byteArray, size_t byteArrayLength, unsigned char **intArray)
+{
+	if(stepLength > byteArrayLength*4)
+	{
+		printf("Error: stepLength > byteArray.length*4\n");
+		printf("stepLength=%zu, byteArray.length=%zu\n", stepLength, byteArrayLength);
+		exit(0);
+	}
+	if(stepLength>0)
+		*intArray = (unsigned char*)malloc(stepLength*sizeof(unsigned char));
+	else
+		*intArray = NULL;
+	size_t i, n = 0;
+	// Fix: also stop on n<stepLength; with stepLength==0 the old loop wrote through the NULL *intArray and never reached a break.
+	for (i = 0; i < byteArrayLength && n < stepLength; i++) {
+		unsigned char tmp = byteArray[i];
+		(*intArray)[n++] = (tmp & 0xC0) >> 6;
+		if(n==stepLength)
+			break;
+		(*intArray)[n++] = (tmp & 0x30) >> 4;
+		if(n==stepLength)
+			break;
+		(*intArray)[n++] = (tmp & 0x0C) >> 2;
+		if(n==stepLength)
+			break;
+		(*intArray)[n++] = tmp & 0x03;
+		if(n==stepLength)
+			break;
+	}
+}
+/* Pack 3-bit codes back to back, most significant bit first. Returns the byte count; *result is malloc'ed, or NULL when timeStepTypeLength is 0. */
+size_t convertIntArray2ByteArray_fast_3b(unsigned char* timeStepType, size_t timeStepTypeLength, unsigned char **result)
+{	
+	size_t i = 0, k = 0, byteLength = 0, n = 0;
+	if(timeStepTypeLength%8==0)
+		byteLength = timeStepTypeLength*3/8;
+	else
+		byteLength = timeStepTypeLength*3/8+1;
+
+	if(byteLength>0)
+		*result = (unsigned char*)malloc(byteLength*sizeof(unsigned char));
+	else
+		*result = NULL;
+	int tmp = 0;
+	for(n = 0;n<timeStepTypeLength;n++)
+	{
+		k = n%8; // eight 3-bit codes span exactly three bytes, so the layout repeats every 8 codes
+		switch(k)
+		{
+		case 0:
+			tmp = tmp | (timeStepType[n] << 5);
+			break;
+		case 1:
+			tmp = tmp | (timeStepType[n] << 2);
+			break;
+		case 2: 
+			tmp = tmp | (timeStepType[n] >> 1); // code straddles a byte boundary: high 2 bits here...
+			(*result)[i++] = (unsigned char)tmp;
+			tmp = 0 | (timeStepType[n] << 7); // ...low bit starts the next byte
+			break;
+		case 3:
+			tmp = tmp | (timeStepType[n] << 4);
+			break;
+		case 4:
+			tmp = tmp | (timeStepType[n] << 1);
+			break;
+		case 5:
+			tmp = tmp | (timeStepType[n] >> 2); // straddles again: high bit here, low 2 bits next
+			(*result)[i++] = (unsigned char)tmp;
+			tmp = 0 | (timeStepType[n] << 6);
+			break;
+		case 6:
+			tmp = tmp | (timeStepType[n] << 3);
+			break;
+		case 7:
+			tmp = tmp | (timeStepType[n] << 0);
+			(*result)[i++] = (unsigned char)tmp;
+			tmp = 0;
+			break;
+		}
+	}
+	if(timeStepTypeLength>0 && k!=7) //load the last one; Fix: skipped for empty input, where *result is NULL
+		(*result)[i] = (unsigned char)tmp;
+	
+	return byteLength;
+}
+/* Unpack stepLength 3-bit codes (most significant bit first) from byteArray into a malloc'ed *intArray (NULL when stepLength is 0); exits when byteArray is too short. */
+void convertByteArray2IntArray_fast_3b(size_t stepLength, unsigned char* byteArray, size_t byteArrayLength, unsigned char **intArray)
+{	
+	if(stepLength > byteArrayLength*8/3)
+	{
+		printf("Error: stepLength > byteArray.length*8/3, impossible case unless bugs elsewhere.\n");
+		printf("stepLength=%zu, byteArray.length=%zu\n", stepLength, byteArrayLength);
+		exit(0);		
+	}
+	if(stepLength>0)
+		*intArray = (unsigned char*)malloc(stepLength*sizeof(unsigned char));
+	else
+		*intArray = NULL;
+	size_t i = 0, ii = 0, n = 0;
+	unsigned char tmp = stepLength>0 ? byteArray[0] : 0; // Fix: an empty input is no longer read
+	for(n=0;n<stepLength;)
+	{
+		switch(n%8)
+		{
+		case 0:
+			(*intArray)[n++] = (tmp & 0xE0) >> 5;
+			break;
+		case 1: 
+			(*intArray)[n++] = (tmp & 0x1C) >> 2;
+			break;
+		case 2:
+			ii = (tmp & 0x03) << 1; // code straddles two bytes: high 2 bits from this byte
+			i++;
+			tmp = byteArray[i];
+			ii |= (tmp & 0x80) >> 7;
+			(*intArray)[n++] = ii;
+			break;
+		case 3:
+			(*intArray)[n++] = (tmp & 0x70) >> 4;
+			break;
+		case 4:
+			(*intArray)[n++] = (tmp & 0x0E) >> 1;
+			break;
+		case 5:
+			ii = (tmp & 0x01) << 2; // straddles again: high bit from this byte
+			i++;
+			tmp = byteArray[i];
+			ii |= (tmp & 0xC0) >> 6;
+			(*intArray)[n++] = ii;
+			break;
+		case 6: 
+			(*intArray)[n++] = (tmp & 0x38) >> 3;
+			break;
+		case 7:
+			(*intArray)[n++] = (tmp & 0x07);
+			i++;
+			if(n<stepLength) tmp = byteArray[i]; // Fix: fetch the next byte only if another code follows (was a read past the end)
+			break;
+		}
+	}
+}
+/* Left shift that places a resiBitLength-bit value at bit offset k%8 of the current byte; negative when the value spills into the next byte. */
+int getLeftMovingSteps(size_t k, unsigned char resiBitLength)
+{
+	return 8 - (int)(k%8) - (int)resiBitLength; /* Fix: signed arithmetic, so a negative result no longer depends on unsigned wrap-around */
+}
+
+/**
+ * Pack the nbEle values of timeStepType back to back, resiBitLength bits each, filling every output byte from its most significant bit.
+ * @param timeStepType is the resiMidBits (one value per element)
+ * @param resiBitLength is the length in bits of resiMidBits for each element; when it is 0 nothing is packed
+ * @return the size reported by the DynamicByteArray; *bytes is filled by convertDBAtoBytes()
+ */
+size_t convertIntArray2ByteArray_fast_dynamic(unsigned char* timeStepType, unsigned char resiBitLength, size_t nbEle, unsigned char **bytes)
+{
+	size_t i = 0, j = 0, k = 0; // i: next input value, j: element counter, k: bits emitted so far
+	int value;
+	DynamicByteArray* dba;
+	new_DBA(&dba, 1024);
+	int tmp = 0, leftMovSteps = 0; // tmp accumulates the byte currently being filled
+	for(j = 0;j<nbEle;j++)
+	{
+		if(resiBitLength==0)
+			continue;
+		value = timeStepType[i];
+		leftMovSteps = getLeftMovingSteps(k, resiBitLength);
+		if(leftMovSteps < 0)
+		{
+			// value straddles a byte boundary: finish the current byte, carry the low bits over
+			tmp = tmp | (value >> (-leftMovSteps));
+			addDBA_Data(dba, (unsigned char)tmp);
+			tmp = 0 | (value << (8+leftMovSteps));
+		}
+		else if(leftMovSteps > 0)
+		{
+			tmp = tmp | (value << leftMovSteps);
+		}
+		else //==0
+		{
+			tmp = tmp | value;
+			addDBA_Data(dba, (unsigned char)tmp);
+			tmp = 0;
+		}
+		i++;
+		k += resiBitLength;
+	}
+	if(leftMovSteps != 0) // the last value did not end on a byte boundary: flush the partial byte
+		addDBA_Data(dba, (unsigned char)tmp);
+	convertDBAtoBytes(dba, bytes);
+	size_t size = dba->size;
+	free_DBA(dba);
+	return size;
+}
+
+/**
+ * Variant of convertIntArray2ByteArray_fast_dynamic with a per-element bit length: entries whose length is 0 consume no input value.
+ * @param timeStepType is the resiMidBits (one value per element with a non-zero length)
+ * @param resiBitLength is the array of resiMidBits lengths in bits, resiBitLengthLength entries long
+ * @return the size reported by the DynamicByteArray; *bytes is filled by convertDBAtoBytes()
+ */
+size_t convertIntArray2ByteArray_fast_dynamic2(unsigned char* timeStepType, unsigned char* resiBitLength, size_t resiBitLengthLength, unsigned char **bytes)
+{
+	size_t i = 0, j = 0, k = 0; // i: next input value, j: index into resiBitLength, k: bits emitted so far
+	int value;
+	DynamicByteArray* dba;
+	new_DBA(&dba, 1024);
+	int tmp = 0, leftMovSteps = 0; // tmp accumulates the byte currently being filled
+	for(j = 0;j<resiBitLengthLength;j++)
+	{
+		unsigned char rbl = resiBitLength[j];
+		if(rbl==0)
+			continue;
+		value = timeStepType[i];
+		leftMovSteps = getLeftMovingSteps(k, rbl);
+		if(leftMovSteps < 0)
+		{
+			// value straddles a byte boundary: finish the current byte, carry the low bits over
+			tmp = tmp | (value >> (-leftMovSteps));
+			addDBA_Data(dba, (unsigned char)tmp);
+			tmp = 0 | (value << (8+leftMovSteps));
+		}
+		else if(leftMovSteps > 0)
+		{
+			tmp = tmp | (value << leftMovSteps);
+		}
+		else //==0
+		{
+			tmp = tmp | value;
+			addDBA_Data(dba, (unsigned char)tmp);
+			tmp = 0;
+		}
+		i++;
+		k += rbl;
+	}
+	if(leftMovSteps != 0) // the last packed value did not end on a byte boundary: flush the partial byte
+		addDBA_Data(dba, (unsigned char)tmp);
+	convertDBAtoBytes(dba, bytes);
+	size_t size = dba->size;
+	free_DBA(dba);
+	return size;
+}
+/* Bits needed to represent dataLength: the word width (32 or 64, chosen by SZ_SIZE_TYPE) minus the leading zeros of dataLength. */
+int computeBitNumRequired(size_t dataLength)
+{
+	/* a 4-byte size type counts within 32 bits, anything else within 64 */
+	int wide = exe_params->SZ_SIZE_TYPE!=4;
+	if(wide)
+		return 64 - numberOfLeadingZeros_Long(dataLength);
+	return 32 - numberOfLeadingZeros_Int(dataLength);
+}
+/**
+ * Expand a run-length coded bit stream into *result.
+ *
+ * bytes holds (bytesLength*8)/(validLength+1) pairs; each pair is one state
+ * bit followed by a validLength-bit run length read with extractBytes().
+ * *result is a malloc'ed array of totalLength ints (caller frees).
+ *
+ * Fix: pairs are expanded as they are decoded, replacing a stack VLA whose
+ * size depended on the input, and writes now stop at totalLength.
+ */
+void decompressBitArraybySimpleLZ77(int** result, unsigned char* bytes, size_t bytesLength, size_t totalLength, int validLength)
+{
+	size_t pairLength = (bytesLength*8)/(validLength+1);
+	size_t p, bitPos = 0, outPos = 0;
+
+	*result = (int*)malloc(sizeof(int)*totalLength);
+
+	for(p = 0;p<pairLength;p++)
+	{
+		/* state bit: bit (bitPos%8) of the current byte, counted from the MSB */
+		unsigned char curByte = bytes[bitPos/8];
+		int state = (curByte >> (8-1-(int)(bitPos%8))) & 0x01;
+		bitPos++;
+
+		int num = extractBytes(bytes, bitPos, validLength);
+		bitPos = bitPos + validLength;
+
+		/* a corrupt or negative run length can no longer overrun *result */
+		for(int j = 0;j<num && outPos<totalLength;j++)
+			(*result)[outPos++] = state;
+	}
+}
diff --git a/thirdparty/SZ/sz/src/VarSet.c b/thirdparty/SZ/sz/src/VarSet.c
new file mode 100644
index 0000000000000000000000000000000000000000..ea0db462c51735e0017fb61667905f305c364512
--- /dev/null
+++ b/thirdparty/SZ/sz/src/VarSet.c
@@ -0,0 +1,197 @@
+/**
+ *  @file VarSet.c
+ *  @author Sheng Di
+ *  @date July, 2016
+ *  @brief VarSet manages the linked list of variables registered for batch compression: adding, deleting, searching and freeing them.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "VarSet.h"
+#include "sz.h"
+/* Free v but leave v->data to the caller. Fix: compressedBytes used to be guarded by a test on v->data, and multisteps was never released. */
+void free_Variable_keepOriginalData(SZ_Variable* v)
+{
+	free(v->varName); /* free(NULL) is a no-op */
+	if(v->compressedBytes!=NULL)
+		free(v->compressedBytes);
+	if(v->multisteps!=NULL)
+		free_multisteps(v->multisteps);
+	free(v);
+}
+
+/**
+ * Free v with its name, data buffer and multisteps; compressedBytes is not freed.
+ * @deprecated
+ * */
+void free_Variable_keepCompressedBytes(SZ_Variable* v)
+{
+	/* free(NULL) is a no-op, so plain buffers need no NULL test */
+	free(v->varName);
+	free(v->data);
+	sz_multisteps* steps = v->multisteps;
+	if(steps!=NULL)
+		free_multisteps(steps);
+	free(v);
+}
+/* Free v and everything it points to: name, data, compressedBytes and multisteps. */
+void free_Variable_all(SZ_Variable* v)
+{
+	/* free(NULL) is a no-op, so plain buffers need no NULL test */
+	free(v->varName);
+	free(v->data);
+	free(v->compressedBytes);
+	/* free_multisteps dereferences its argument, so this guard stays */
+	sz_multisteps* steps = v->multisteps;
+	if(steps!=NULL)
+		free_multisteps(steps);
+	free(v);
+}
+/* Append a new variable record to the global sz_varset, creating the set (with a header node) on first use. varName is copied; data is stored by pointer. */
+void SZ_batchAddVar(char* varName, int dataType, void* data, 
+			int errBoundMode, double absErrBound, double relBoundRatio, double pwRelBoundRatio, 
+			size_t r5, size_t r4, size_t r3, size_t r2, size_t r1)
+{	
+	if(sz_varset==NULL)
+	{
+		// first call: build the set; the header node carries no variable, only the next link
+		sz_varset = (SZ_VarSet*)malloc(sizeof(SZ_VarSet));
+		sz_varset->header = (SZ_Variable*)malloc(sizeof(SZ_Variable));
+		sz_varset->header->next = NULL;
+		sz_varset->lastVar = sz_varset->header;
+		sz_varset->count = 0;		
+	}
+	
+	SZ_Variable* var = (SZ_Variable*)malloc(sizeof(SZ_Variable));
+	memset(var, 0, sizeof(SZ_Variable));
+	
+	var->varName = (char*)malloc(strlen(varName)+1);
+	memcpy(var->varName, varName, strlen(varName)+1); // private copy of the name, including the terminator
+	//var->varName = varName;
+	var->dataType = dataType;
+	var->r5 = r5;
+	var->r4 = r4;
+	var->r3 = r3;
+	var->r2 = r2;
+	var->r1 = r1;
+	var->errBoundMode = errBoundMode;
+	var->absErrBound = absErrBound;
+	var->relBoundRatio = relBoundRatio;
+	var->pwRelBoundRatio = pwRelBoundRatio;
+	var->data = data; // caller's buffer is referenced, not copied
+	
+	var->multisteps = (sz_multisteps*)malloc(sizeof(sz_multisteps));
+	memset(var->multisteps, 0, sizeof(sz_multisteps));
+	
+	// hist_data gets a zero-filled buffer of dataLen elements for float/double only; for other types it keeps the zeroed value from the memset above
+	size_t dataLen = computeDataLength(r5, r4, r3, r2, r1);
+	if(dataType==SZ_FLOAT)
+	{
+		var->multisteps->hist_data = (float*)malloc(sizeof(float)*dataLen);
+		memset(var->multisteps->hist_data, 0, sizeof(float)*dataLen);
+	}
+	else if(dataType==SZ_DOUBLE)
+	{
+		var->multisteps->hist_data = (double*)malloc(sizeof(double)*dataLen);
+		memset(var->multisteps->hist_data, 0, sizeof(double)*dataLen);
+	}
+	var->compressedBytes = NULL;
+	var->next = NULL;
+	
+	sz_varset->count ++;
+	sz_varset->lastVar->next = var; // link after the current tail, then advance the tail
+	sz_varset->lastVar = var;
+}
+/* Remove the variable named varName from the global sz_varset; returns whatever SZ_batchDelVar_vset returns. */
+int SZ_batchDelVar(char* varName)
+{
+	/* thin wrapper: the global set is the only difference from the _vset form */
+	return SZ_batchDelVar_vset(sz_varset, varName);
+}
+/* Unlink and free the first variable named varName in vset. Returns SZ_SCES when one was removed, SZ_NSCS otherwise (including a NULL vset). */
+int SZ_batchDelVar_vset(SZ_VarSet* vset, char* varName)
+{
+	if(vset==NULL) // nothing has been registered yet
+		return SZ_NSCS;
+	SZ_Variable* p = vset->header;
+	SZ_Variable* q = p->next;
+	while(q != NULL)
+	{
+		if(strcmp(q->varName, varName)==0)
+		{
+			p->next = q->next;
+			if(vset->lastVar==q) // Fix: keep the tail valid; SZ_batchAddVar appends through lastVar
+				vset->lastVar = p;
+			free_Variable_all(q);
+			vset->count --;
+			return SZ_SCES;
+		}
+		p = q;
+		q = q->next;
+	}
+	return SZ_NSCS;
+}
+/* Find the variable named varName in the global sz_varset; returns NULL when there is no match or no set exists. */
+SZ_Variable* SZ_searchVar(char* varName)
+{
+	SZ_Variable* p;
+	if(sz_varset==NULL) // Fix: nothing has been added yet, so there is no header node to walk from
+		return NULL;
+	for(p = sz_varset->header->next; p!=NULL; p = p->next)
+	{
+		if(strcmp(p->varName, varName)==0)
+			return p;
+	}
+	return NULL;
+}
+/* Return the data pointer of the variable named varName and copy its five dimensions into *r5..*r1; returns NULL and leaves them untouched when the name is unknown. */
+void* SZ_getVarData(char* varName, size_t *r5, size_t *r4, size_t *r3, size_t *r2, size_t *r1)
+{
+	SZ_Variable* v = SZ_searchVar(varName);
+	if(v==NULL) // Fix: an unknown name used to be dereferenced
+		return NULL;
+	*r5 = v->r5; *r4 = v->r4;
+	*r3 = v->r3; *r2 = v->r2;
+	*r1 = v->r1;
+	return (void*)v->data;
+}
+/**
+ * Free the global variable set and reset sz_varset to NULL, so a later SZ_batchAddVar() builds a fresh set instead of writing through the freed one.
+ * int mode: SZ_MAINTAIN_VAR_DATA, SZ_DESTROY_WHOLE_VARSET
+ * */
+void SZ_freeVarSet(int mode)
+{
+	free_VarSet_vset(sz_varset, mode);
+	sz_varset = NULL;
+}
+
+//free_VarSet will completely destroy the SZ_VarSet, so don't do it until you really don't need it any more!
+/**
+ * Free every variable of vset according to mode, then its header node and vset itself. A NULL vset is ignored.
+ * int mode: SZ_MAINTAIN_VAR_DATA, SZ_DESTROY_WHOLE_VARSET
+ * */
+void free_VarSet_vset(SZ_VarSet *vset, int mode)
+{
+	if(vset==NULL)
+		return;
+	SZ_Variable *p = vset->header;
+	while(p->next!=NULL)
+	{
+		SZ_Variable *q = p->next;
+		p->next = q->next;
+		if(mode==SZ_MAINTAIN_VAR_DATA)
+			free_Variable_keepOriginalData(q);
+		else if(mode==SZ_DESTROY_WHOLE_VARSET)
+			free_Variable_all(q); // NOTE(review): any other mode unlinks q without freeing it
+	}
+	free(vset->header); // Fix: release the header of the set being freed, not the global sz_varset's
+	free(vset);
+}
+/* Release a sz_multisteps record together with its hist_data buffer. */
+void free_multisteps(sz_multisteps* multisteps)
+{
+	/* free(NULL) is a no-op, so hist_data needs no guard */
+	free(multisteps->hist_data);
+	free(multisteps);
+}
diff --git a/thirdparty/SZ/sz/src/callZlib.c b/thirdparty/SZ/sz/src/callZlib.c
new file mode 100644
index 0000000000000000000000000000000000000000..0e392b791564b619fa96d60e678b8a6b1888d225
--- /dev/null
+++ b/thirdparty/SZ/sz/src/callZlib.c
@@ -0,0 +1,505 @@
+/**
+ *  @file callZlib.c
+ *  @author Sheng Di
+ *  @date June, 2016
+ *  @brief gzip compressor code: the interface to call zlib
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <zlib.h>
+#include <sz.h>
+
+#if MAX_MEM_LEVEL >= 8
+#define DEF_MEM_LEVEL 8
+#else
+#define DEF_MEM_LEVEL MAX_MEM_LEVEL
+#endif
+
+/* Report a zlib failure on stderr and return SZ_NSCS from the calling function; Z_OK and Z_STREAM_END pass through. do/while(0) makes the macro a single statement. */
+#define CHECK_ERR(err, msg) do { \
+    if ((err) != Z_OK && (err) != Z_STREAM_END) { \
+        fprintf(stderr, "%s error: %d\n", (msg), (err)); \
+        return SZ_NSCS; \
+    } \
+} while (0)
+
+/**
+ * zlib_compress() is only valid for medium-size data compression: it wraps
+ * the one-shot compress2() call. *compressBytes receives a malloc'ed buffer
+ * (caller frees) and the compressed size is returned. Any compress2() failure
+ * prints a message and exits the process.
+ *
+ * Fix: the buffer is sized with compressBound(), the bound zlib documents for
+ * compress2(); deflateBound() was previously queried on a z_stream that had
+ * never been passed to deflateInit().
+ */
+unsigned long zlib_compress(unsigned char* data, unsigned long dataLength, unsigned char** compressBytes, int level)
+{	
+	uLong estCmpLen = compressBound(dataLength);
+	unsigned long outSize = estCmpLen; /* in: capacity, out: bytes produced */
+
+	*compressBytes = (unsigned char*)malloc(sizeof(unsigned char)*estCmpLen);
+	int err = compress2(*compressBytes, &outSize, data, dataLength, level);
+	if(err!=Z_OK)
+	{
+		printf("Error: err_code=%d; the reason may be your data size is too large (>=2^32), which cannot be compressed by standalone zlib_compress. Sol: inflace_init, ....\n", err);
+		exit(0);
+	}
+	return outSize;
+}
+/* Deflate data with a single deflate(Z_FINISH) call into a malloc'ed *compressBytes sized by deflateBound(). Returns stream.total_out; on failure the zlib error code is returned converted to unsigned long. */
+unsigned long zlib_compress2(unsigned char* data, unsigned long dataLength, unsigned char** compressBytes, int level)
+{
+	unsigned long outSize;
+	
+	z_stream stream = {0};
+    int err;
+
+    stream.next_in = data;
+    stream.avail_in = dataLength;
+#ifdef MAXSEG_64K
+    /* Check for source > 64K on 16-bit machine: */
+    if ((uLong)stream.avail_in != dataLength) return Z_BUF_ERROR;
+#endif
+
+    uLong estCmpLen = deflateBound(&stream, dataLength); // NOTE(review): queried before deflateInit2() below has set the stream up
+	*compressBytes = (unsigned char*)malloc(sizeof(unsigned char)*estCmpLen);
+
+    stream.next_out = *compressBytes;
+    stream.avail_out = estCmpLen;
+    //stream.avail_out = dataLength*10;
+    //if ((uLong)stream.avail_out != dataLength*10) return Z_BUF_ERROR;
+
+    stream.zalloc = (alloc_func)0;
+    stream.zfree = (free_func)0;
+    stream.opaque = (voidpf)0;
+//	stream.data_type = Z_TEXT;
+
+    //err = deflateInit(&stream, level); //default  windowBits == 15.
+    int windowBits = 14; //8-15
+    if(confparams_cpr->szMode==SZ_BEST_COMPRESSION)
+		windowBits = 15;
+	
+    err = deflateInit2(&stream, level, Z_DEFLATED, windowBits, DEF_MEM_LEVEL,
+                         Z_DEFAULT_STRATEGY);//Z_FIXED); //Z_DEFAULT_STRATEGY
+    if (err != Z_OK) return err; // NOTE(review): *compressBytes stays allocated on this path
+
+    err = deflate(&stream, Z_FINISH);
+    if (err != Z_STREAM_END) {
+        deflateEnd(&stream);
+        return err == Z_OK ? Z_BUF_ERROR : err; // Z_OK here means the output buffer filled before the stream ended
+    }
+
+    err = deflateEnd(&stream);
+    
+    outSize = stream.total_out;
+    return outSize;
+}
+/* Like zlib_compress2, but deflates into the caller-supplied compressBytes, which is declared to zlib as dataLength bytes long. Returns stream.total_out, or a zlib error code converted to unsigned long. */
+unsigned long zlib_compress3(unsigned char* data, unsigned long dataLength, unsigned char* compressBytes, int level)
+{
+	unsigned long outSize = 0;
+
+	z_stream stream = {0};
+    int err;
+
+    stream.next_in = data;
+    stream.avail_in = dataLength;
+#ifdef MAXSEG_64K
+    /* Check for source > 64K on 16-bit machine: */
+    if ((uLong)stream.avail_in != dataLength) return Z_BUF_ERROR;
+#endif
+
+    stream.next_out = compressBytes;
+    stream.avail_out = dataLength; // output capacity is taken to equal the input size; nothing is allocated here
+    stream.zalloc = (alloc_func)0;
+    stream.zfree = (free_func)0;
+    stream.opaque = (voidpf)0;
+
+    //err = deflateInit(&stream, level); //default  windowBits == 15.
+    int windowBits = 14; //8-15
+    if(confparams_cpr->szMode==SZ_BEST_COMPRESSION)
+		windowBits = 15;
+
+    err = deflateInit2(&stream, level, Z_DEFLATED, windowBits, DEF_MEM_LEVEL,
+                         Z_DEFAULT_STRATEGY);//Z_FIXED); //Z_DEFAULT_STRATEGY
+    if (err != Z_OK) return err;
+
+    err = deflate(&stream, Z_FINISH);
+    if (err != Z_STREAM_END) {
+        deflateEnd(&stream);
+        return err == Z_OK ? Z_BUF_ERROR : err; // Z_OK here means the output buffer filled before the stream ended
+    }
+
+    err = deflateEnd(&stream);
+
+    outSize = stream.total_out;
+    return outSize;
+}
+/* Deflate data in SZ_ZLIB_BUFFER_SIZE steps into a malloc'ed *compressBytes sized by deflateBound(). Returns total_out, or SZ_NSCS (via CHECK_ERR) when zlib reports an error. */
+unsigned long zlib_compress4(unsigned char* data, unsigned long dataLength, unsigned char** compressBytes, int level)
+{
+    z_stream c_stream = {0}; /* compression stream */
+    int err = 0;
+    c_stream.zalloc = (alloc_func)0;
+    c_stream.zfree = (free_func)0;
+    c_stream.opaque = (voidpf)0;
+    int windowBits = 14; //8-15
+    if(confparams_cpr->szMode==SZ_BEST_COMPRESSION)
+		windowBits = 15;
+    
+    err = deflateInit2(&c_stream, level, Z_DEFLATED, windowBits, DEF_MEM_LEVEL,
+                         Z_DEFAULT_STRATEGY);//Z_FIXED); //Z_DEFAULT_STRATEGY
+    CHECK_ERR(err, "deflateInit");
+
+    uLong estCmpLen = deflateBound(&c_stream, dataLength);
+	*compressBytes = (unsigned char*)malloc(sizeof(unsigned char)*estCmpLen);	
+
+    c_stream.next_in  = data;
+    c_stream.next_out = *compressBytes;
+
+    while (c_stream.total_in < dataLength && c_stream.total_out < estCmpLen) {
+        /* Fix: offer zlib only what is really left. A flat SZ_ZLIB_BUFFER_SIZE let the final step read past the end of data (and claim output space beyond the buffer). */
+        unsigned long inLeft = dataLength - c_stream.total_in;
+        unsigned long outLeft = estCmpLen - c_stream.total_out;
+        c_stream.avail_in = inLeft < SZ_ZLIB_BUFFER_SIZE ? inLeft : SZ_ZLIB_BUFFER_SIZE; /* force small buffers */
+        c_stream.avail_out = outLeft < SZ_ZLIB_BUFFER_SIZE ? outLeft : SZ_ZLIB_BUFFER_SIZE;
+        err = deflate(&c_stream, Z_NO_FLUSH);
+        CHECK_ERR(err, "deflate");
+    }
+    /* Finish the stream, still forcing small buffers: */
+    for (;;) {
+        c_stream.avail_out = 1;
+        err = deflate(&c_stream, Z_FINISH);
+        if (err == Z_STREAM_END) break;
+        CHECK_ERR(err, "deflate");
+    }
+    err = deflateEnd(&c_stream);
+    CHECK_ERR(err, "deflateEnd");
+    return c_stream.total_out;	
+}
+/* Deflate data by feeding it to deflate() in SZ_ZLIB_BUFFER_SIZE chunks; the final chunk is sent with Z_FINISH. *compressBytes is malloc'ed with deflateBound() bytes; returns strm.total_out. */
+unsigned long zlib_compress5(unsigned char* data, unsigned long dataLength, unsigned char** compressBytes, int level)
+{
+	int ret, flush;
+	unsigned have;
+	z_stream strm;
+	unsigned char* in = data;
+
+	/* allocate deflate state */
+	strm.zalloc = Z_NULL;
+	strm.zfree = Z_NULL;
+	strm.opaque = Z_NULL;
+	ret = deflateInit(&strm, level);
+	if (ret != Z_OK)
+		return ret; // the zlib error code is returned converted to unsigned long
+
+	size_t p_size = 0, av_in = 0; // p_size: input offered so far, rounded up to whole chunks
+    uLong estCmpLen = deflateBound(&strm, dataLength);
+   	*compressBytes = (unsigned char*)malloc(sizeof(unsigned char)*estCmpLen);	
+	unsigned char* out = *compressBytes; 
+
+	/* compress until end of file */
+	do {		
+		p_size += SZ_ZLIB_BUFFER_SIZE;
+		if(p_size>=dataLength)
+		{
+			av_in = dataLength - (p_size - SZ_ZLIB_BUFFER_SIZE); // last chunk: whatever remains
+			flush = Z_FINISH;
+		}
+		else
+		{
+			av_in = SZ_ZLIB_BUFFER_SIZE;
+			flush = Z_NO_FLUSH;
+		}
+		strm.avail_in = av_in;
+		strm.next_in = in;
+
+		/* run deflate() on input until output buffer not full, finish
+		   compression if all of source has been read in */
+		do {
+			strm.avail_out = SZ_ZLIB_BUFFER_SIZE; // NOTE(review): not capped by the space left in the estCmpLen buffer
+			strm.next_out = out;
+			ret = deflate(&strm, flush);    /* no bad return value */
+
+			have = SZ_ZLIB_BUFFER_SIZE - strm.avail_out;
+			out += have;
+		} while (strm.avail_out == 0);
+
+		in+=av_in;
+
+		/* done when last data in file processed */
+	} while (flush != Z_FINISH);
+
+	/* clean up and return */
+	(void)deflateEnd(&strm);	
+	
+	return strm.total_out;	
+}
+/* One-shot inflate through uncompress(): *oriData receives a malloc'ed buffer of targetOriSize bytes and the number of bytes produced is returned. Any failure prints a message and exits. */
+unsigned long zlib_uncompress(unsigned char* compressBytes, unsigned long cmpSize, unsigned char** oriData, unsigned long targetOriSize)
+{
+	unsigned char* plain = (unsigned char*)malloc(sizeof(unsigned char)*targetOriSize);
+	*oriData = plain;
+
+	unsigned long plainLen = targetOriSize; /* in: capacity, out: bytes produced */
+	int rc = uncompress(plain, &plainLen, compressBytes, cmpSize);
+	if(rc==Z_OK)
+		return plainLen;
+
+	printf("Error: Zlib decompression error; status=%d\n", rc);
+	exit(0);
+}
+/* Inflate compressBytes with a single inflate(Z_FINISH) call into a malloc'ed *oriData of targetOriSize bytes. Returns stream.total_out; on failure returns SZ_NSCS or a zlib error code converted to unsigned long. */
+unsigned long zlib_uncompress2 (unsigned char* compressBytes, unsigned long cmpSize, unsigned char** oriData, unsigned long targetOriSize)
+{
+    z_stream stream = {0};
+
+	unsigned long outSize;
+	*oriData = (unsigned char*)malloc(sizeof(unsigned char)*targetOriSize);
+
+    stream.zalloc = Z_NULL;
+    stream.zfree = Z_NULL;
+    stream.opaque = Z_NULL;
+//	stream.data_type = Z_TEXT;
+
+    stream.next_in = compressBytes;
+    stream.avail_in = cmpSize;
+    /* Check for source > 64K on 16-bit machine: */
+    if ((unsigned long)stream.avail_in != cmpSize) // true when cmpSize does not fit in avail_in
+    {
+		printf("Error: zlib_uncompress2: stream.avail_in != cmpSize");
+		//exit(1);
+		return SZ_NSCS; //-1
+	}
+
+    stream.next_out = *oriData;
+    stream.avail_out = targetOriSize;
+    //if ((uLong)stream.avail_out != *destLen) return Z_BUF_ERROR;
+
+    int err = inflateInit(&stream);
+    //int windowBits = 15;
+    //int err = inflateInit2(&stream, windowBits);
+    if (err != Z_OK)
+    {
+		printf("Error: zlib_uncompress2: err != Z_OK\n");
+		return SZ_NSCS;
+	}
+
+    err = inflate(&stream, Z_FINISH);
+    if (err != Z_STREAM_END) {
+        inflateEnd(&stream);
+        if (err == Z_NEED_DICT || (err == Z_BUF_ERROR && stream.avail_in == 0)) // input ran out or needs a dictionary: reported as corrupt data
+            return Z_DATA_ERROR;
+        return err;
+    }
+    outSize = stream.total_out;
+    inflateEnd(&stream);
+    return outSize;
+}
+
+/**
+ * Inflate a zlib stream in bounded windows, enlarging the output buffer when
+ * the stream turns out to be longer than targetOriSize.
+ *
+ * Input and output windows never exceed the bytes that are really left in
+ * compressBytes and in the allocated output buffer. The output buffer is
+ * only reallocated (doubled) once it is completely full, and its capacity is
+ * tracked from the initial targetOriSize.
+ *
+ * @param compressBytes  compressed input
+ * @param cmpSize        number of bytes in compressBytes
+ * @param oriData        out: receives the malloc'ed/realloc'ed output buffer
+ * @param targetOriSize  initial capacity of the output buffer
+ * @return total number of bytes inflated. The process exits (after printing
+ *         a message) when allocation fails or inflate() reports anything
+ *         other than Z_OK / Z_STREAM_END, including a truncated stream.
+ */
+unsigned long zlib_uncompress3(unsigned char* compressBytes, unsigned long cmpSize, unsigned char** oriData, unsigned long targetOriSize)
+{
+	int status;
+	z_stream z_strm; /* decompression stream */
+	size_t capacity = targetOriSize; /* bytes currently allocated behind *oriData */
+
+	*oriData = (unsigned char*)malloc(sizeof(unsigned char)*targetOriSize);
+	if(NULL == *oriData && targetOriSize != 0) {
+		printf("Error: memory allocation failed for deflate uncompression\n");
+		exit(0);
+	}
+	memset(&z_strm, 0, sizeof(z_strm));
+
+	z_strm.next_in  = compressBytes;
+	z_strm.avail_in = 0;
+	z_strm.next_out = *oriData;
+	z_strm.avail_out = 0;
+
+	status = inflateInit(&z_strm);
+	CHECK_ERR(status, "inflateInit");
+
+	for(;;)
+	{
+		unsigned long inLeft = cmpSize - z_strm.total_in;
+		size_t outLeft;
+
+		/* Output buffer exhausted: double it (or start with one window if it was empty) */
+		if(z_strm.total_out >= capacity) {
+			size_t grown = capacity ? capacity*2 : (size_t)SZ_ZLIB_BUFFER_SIZE;
+			/* grown <= capacity means the doubling overflowed size_t */
+			void *new_outbuf = (grown > capacity) ? realloc(*oriData, grown) : NULL;
+			if(NULL == new_outbuf) {
+				(void)inflateEnd(&z_strm);
+				printf("Error: memory allocation failed for deflate uncompression\n");
+				exit(0);
+			} /* end if */
+			*oriData = (unsigned char*)new_outbuf;
+			capacity = grown;
+			/* the buffer may have moved: re-derive the write position */
+			z_strm.next_out = (*oriData) + z_strm.total_out;
+		} /* end if */
+		outLeft = capacity - z_strm.total_out;
+
+		/* Offer at most one window of input and output, never more than really exists */
+		z_strm.avail_in = (uInt)(inLeft < SZ_ZLIB_BUFFER_SIZE ? inLeft : SZ_ZLIB_BUFFER_SIZE);
+		z_strm.avail_out = (uInt)(outLeft < SZ_ZLIB_BUFFER_SIZE ? outLeft : SZ_ZLIB_BUFFER_SIZE);
+
+		/* Uncompress some data */
+		status = inflate(&z_strm, Z_SYNC_FLUSH);
+
+		/* Check if we are done uncompressing data */
+		if (Z_STREAM_END==status)
+			break;  /*done*/
+
+		/* With no input left inflate() returns Z_BUF_ERROR, which also ends up here */
+		if (Z_OK!=status) {
+			(void)inflateEnd(&z_strm);
+			printf("Error: inflate() failed\n");
+			exit(0);
+		}
+	}
+
+	status = inflateEnd(&z_strm);
+	CHECK_ERR(status, "inflateEnd");
+
+	return z_strm.total_out;
+}
+
+/**
+ * Inflate a zlib stream by feeding the input in SZ_ZLIB_BUFFER_SIZE chunks
+ * and draining each chunk through SZ_ZLIB_BUFFER_SIZE output windows.
+ *
+ * @param compressBytes  compressed input
+ * @param cmpSize        number of bytes in compressBytes
+ * @param oriData        out: receives a freshly malloc'ed buffer of
+ *                       targetOriSize bytes
+ * @param targetOriSize  size of the output allocation
+ * @return strm.total_out after the loop ends; on inflateInit failure or a
+ *         Z_NEED_DICT / Z_DATA_ERROR / Z_MEM_ERROR status the zlib code is
+ *         returned instead (converted to unsigned long).
+ *
+ * NOTE(review): every inflate() call is offered a full SZ_ZLIB_BUFFER_SIZE
+ * output window regardless of how much of targetOriSize is left - confirm
+ * callers always size the buffer generously enough.
+ */
+unsigned long zlib_uncompress4(unsigned char* compressBytes, unsigned long cmpSize, unsigned char** oriData, unsigned long targetOriSize)
+{
+    int ret;
+    unsigned int have;
+    z_stream strm;
+    unsigned char *in = compressBytes;
+    unsigned char *out;
+
+	*oriData = (unsigned char*)malloc(sizeof(unsigned char)*targetOriSize);		
+	out = *oriData;
+
+    /* allocate inflate state */
+    strm.zalloc = Z_NULL;
+    strm.zfree = Z_NULL;
+    strm.opaque = Z_NULL;
+    strm.avail_in = 0;
+    strm.next_in = Z_NULL;
+    ret = inflateInit(&strm);
+    if (ret != Z_OK)
+	{
+        return ret;
+	}
+
+	/* p_size: input bytes scheduled so far; av_in: size of the current input chunk */
+	size_t p_size = 0, av_in = 0;
+    /* decompress until deflate stream ends or end of file */
+    do {
+		p_size += SZ_ZLIB_BUFFER_SIZE;
+		/* last chunk: only the bytes that remain before cmpSize */
+		if(p_size>cmpSize)
+			av_in = cmpSize - (p_size - SZ_ZLIB_BUFFER_SIZE);
+		else
+			av_in = SZ_ZLIB_BUFFER_SIZE;
+		strm.avail_in = av_in;
+        
+        /* input exhausted before Z_STREAM_END was seen */
+        if (strm.avail_in == 0)
+            break;
+        strm.next_in = in;
+
+        /* run inflate() on input until output buffer not full */
+        do {
+            strm.avail_out = SZ_ZLIB_BUFFER_SIZE;
+            strm.next_out = out;
+            ret = inflate(&strm, Z_NO_FLUSH);
+            //assert(ret != Z_STREAM_ERROR);  /* state not clobbered */
+            switch (ret) {
+            case Z_NEED_DICT:
+                ret = Z_DATA_ERROR;     /* and fall through */
+            case Z_DATA_ERROR:
+            case Z_MEM_ERROR:
+                (void)inflateEnd(&strm);
+                return ret;
+            }
+            /* bytes produced by this call; advance the write position past them */
+            have = SZ_ZLIB_BUFFER_SIZE - strm.avail_out;
+            
+            out += have;
+
+        } while (strm.avail_out == 0);
+		
+		/* step to the next input chunk */
+		in+=av_in;
+        /* done when inflate() says it's done */
+    } while (ret != Z_STREAM_END);
+
+    /* clean up and return */
+    (void)inflateEnd(&strm);
+    
+    return strm.total_out;	
+}
+
+/**
+ * Inflate at most the first 65536 bytes of a zlib stream.
+ *
+ * Each inflate() call is offered no more input than is left in
+ * compressBytes and no more output space than is left in the 65536-byte
+ * buffer, so neither buffer can be overrun. The inflate state is released
+ * on every return path.
+ *
+ * @param compressBytes  compressed input
+ * @param cmpSize        number of bytes in compressBytes
+ * @param oriData        out: receives a freshly malloc'ed 65536-byte buffer
+ * @return number of bytes written to *oriData; when inflate() reports an
+ *         error (negative status) this is the count produced before it.
+ */
+unsigned long zlib_uncompress65536bytes(unsigned char* compressBytes, unsigned long cmpSize, unsigned char** oriData)
+{
+	int err;
+	unsigned long targetOriSize = 65536;
+	unsigned long produced;
+	z_stream d_stream = {0}; /* decompression stream */
+
+	*oriData = (unsigned char*)malloc(sizeof(unsigned char)*targetOriSize);
+
+	d_stream.zalloc = (alloc_func)0;
+	d_stream.zfree = (free_func)0;
+	d_stream.opaque = (voidpf)0;
+
+	d_stream.next_in  = compressBytes;
+	d_stream.avail_in = 0;
+	d_stream.next_out = *oriData;
+
+	err = inflateInit(&d_stream);
+	CHECK_ERR(err, "inflateInit");
+
+	while (d_stream.total_out < targetOriSize && d_stream.total_in < cmpSize) {
+		unsigned long inLeft = cmpSize - d_stream.total_in;
+		unsigned long outLeft = targetOriSize - d_stream.total_out;
+
+		/* bounded windows: never advertise bytes that do not exist */
+		d_stream.avail_in = (uInt)(inLeft < SZ_ZLIB_BUFFER_SIZE ? inLeft : SZ_ZLIB_BUFFER_SIZE);
+		d_stream.avail_out = (uInt)(outLeft < SZ_ZLIB_BUFFER_SIZE ? outLeft : SZ_ZLIB_BUFFER_SIZE);
+		err = inflate(&d_stream, Z_SYNC_FLUSH);
+		if (err == Z_STREAM_END) break;
+		if(err<0)
+			break;
+	}
+
+	if(err<0)
+	{
+		/* best-effort result, but still release zlib's internal state */
+		produced = d_stream.total_out;
+		(void)inflateEnd(&d_stream);
+		return produced;
+	}
+	err = inflateEnd(&d_stream);
+
+	CHECK_ERR(err, "inflateEnd");
+
+	return d_stream.total_out;
+}
+
+/**
+ * Inflate a zlib stream window by window until the stream ends, the output
+ * buffer is full, or the input is used up.
+ *
+ * Each inflate() call is offered no more input than is left in
+ * compressBytes and no more output space than is left in the
+ * targetOriSize-byte buffer, so a truncated or oversized stream cannot make
+ * zlib read or write outside either buffer.
+ *
+ * @param compressBytes  compressed input
+ * @param cmpSize        number of bytes in compressBytes
+ * @param oriData        out: receives a freshly malloc'ed buffer of
+ *                       targetOriSize bytes
+ * @param targetOriSize  capacity of the output buffer
+ * @return number of bytes written to *oriData. zlib statuses are passed to
+ *         CHECK_ERR after inflateInit, each inflate and inflateEnd.
+ */
+unsigned long zlib_uncompress5(unsigned char* compressBytes, unsigned long cmpSize, unsigned char** oriData, unsigned long targetOriSize)
+{
+	int err;
+	z_stream d_stream = {0}; /* decompression stream */
+
+	*oriData = (unsigned char*)malloc(sizeof(unsigned char)*targetOriSize);
+
+	d_stream.zalloc = (alloc_func)0;
+	d_stream.zfree = (free_func)0;
+	d_stream.opaque = (voidpf)0;
+
+	d_stream.next_in  = compressBytes;
+	d_stream.avail_in = 0;
+	d_stream.next_out = *oriData;
+
+	err = inflateInit(&d_stream);
+	CHECK_ERR(err, "inflateInit");
+
+	while (d_stream.total_out < targetOriSize && d_stream.total_in < cmpSize) {
+		unsigned long inLeft = cmpSize - d_stream.total_in;
+		unsigned long outLeft = targetOriSize - d_stream.total_out;
+
+		/* bounded windows: never advertise bytes that do not exist */
+		d_stream.avail_in = (uInt)(inLeft < SZ_ZLIB_BUFFER_SIZE ? inLeft : SZ_ZLIB_BUFFER_SIZE);
+		d_stream.avail_out = (uInt)(outLeft < SZ_ZLIB_BUFFER_SIZE ? outLeft : SZ_ZLIB_BUFFER_SIZE);
+		err = inflate(&d_stream, Z_SYNC_FLUSH);
+		if (err == Z_STREAM_END) break;
+		CHECK_ERR(err, "inflate");
+	}
+
+	err = inflateEnd(&d_stream);
+
+	CHECK_ERR(err, "inflateEnd");
+
+	return d_stream.total_out;
+}
diff --git a/thirdparty/SZ/sz/src/conf.c b/thirdparty/SZ/sz/src/conf.c
new file mode 100644
index 0000000000000000000000000000000000000000..8e6959d70f68bff12fae174da79c3cdc5df65638
--- /dev/null
+++ b/thirdparty/SZ/sz/src/conf.c
@@ -0,0 +1,362 @@
+/**
+ *  @file   conf.c
+ *  @author Sheng Di (sdi1@anl.gov or disheng222@gmail.com)
+ *  @date   2015.
+ *  @brief  Configuration loading functions for the SZ library.
+ *  (C) 2015 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <math.h>
+#include "string.h"
+#include "sz.h"
+#include "iniparser.h"
+#include "Huffman.h"
+#include "pastri.h"
+
+/*-------------------------------------------------------------------------*/
+/**
+    @brief      It reads the configuration given in the configuration file.
+    @return     integer         SZ_SCES if successful.
+
+    This function reads the configuration given in the SZ configuration
+    file and sets other required parameters.
+
+ **/
+ 
+/*struct node_t *pool;
+node *qqq;
+node *qq;
+int n_nodes = 0, qend;
+unsigned long **code;
+unsigned char *cout;
+int n_inode;*/ 
+ 
+/**
+ * Round base up to a power of two by smearing the highest set bit of
+ * (base - 1) into every lower bit position and adding one.
+ *
+ * The shifts used are 1, 2, 4, 8 and 16. An input of 0 yields 0, because
+ * base - 1 wraps to all ones and the final increment wraps again.
+ */
+unsigned int roundUpToPowerOf2(unsigned int base)
+{
+  unsigned int v = base - 1;
+  unsigned int shift;
+
+  for (shift = 1; shift <= 16; shift <<= 1)
+    v |= v >> shift;
+
+  return v + 1;
+} 
+ 
+/**
+ * Store the quantization interval count in the global exe_params:
+ * intvCapacity receives the count itself and intvRadius half of it
+ * (integer division).
+ */
+void updateQuantizationInfo(int quant_intervals)
+{
+	int half = quant_intervals/2;
+
+	exe_params->intvRadius = half;
+	exe_params->intvCapacity = quant_intervals;
+} 
+ 
+/**
+ * Derive an absolute error bound from a PSNR target.
+ *
+ * Computes value_range * 10^(-(psnr + 10*log10(1 - 2/3*threshold)) / 20).
+ *
+ * @param psnr         PSNR target
+ * @param threshold    value entering the log10 correction term
+ * @param value_range  range of the data the bound is scaled by
+ */
+double computeABSErrBoundFromPSNR(double psnr, double threshold, double value_range)
+{
+	double adjusted = psnr + 10 * log10(1-2.0/3.0*threshold);
+	double scale = pow(10, adjusted/(-20));
+
+	return value_range * scale;
+} 
+ 
+/*-------------------------------------------------------------------------*/
+/**
+ * @brief Allocate the global SZ parameter blocks and fill them either with
+ *        built-in defaults or from an ini-style configuration file.
+ *
+ * Allocates confparams_cpr and exe_params, detects the host byte order into
+ * sysEndianType, and then:
+ *  - if sz_cfgFile is NULL, installs the built-in defaults (SZ solution,
+ *    PSNR error-bound mode with psnr 90, 65536 max quantization intervals);
+ *  - otherwise parses the file with iniparser and copies the "ENV:" and
+ *    "PARAMETER:" keys into the globals.
+ *
+ * @param sz_cfgFile path of the configuration file, or NULL for defaults
+ * @return SZ_SCES on success; SZ_NSCS on any failure (allocation failure,
+ *         inaccessible or unparsable file, missing or invalid setting).
+ *         The parsed dictionary is released on every path; the two global
+ *         parameter blocks stay allocated.
+ * */
+int SZ_ReadConf(const char* sz_cfgFile) {
+    // Check access to SZ configuration file and load dictionary
+    //record the setting in confparams_cpr
+    confparams_cpr = (sz_params*)malloc(sizeof(sz_params));    
+    exe_params = (sz_exedata*)malloc(sizeof(sz_exedata));
+    if(confparams_cpr == NULL || exe_params == NULL)
+    {
+        printf("[SZ] Error: memory allocation failed for the configuration.\n");
+        return SZ_NSCS;
+    }
+    
+    int x = 1;
+    char sol_name[256];
+    char *modeBuf;
+    char *errBoundMode;
+    char *endianTypeString;
+    dictionary *ini;
+    char *par;
+
+	// Byte-order probe: the first byte of the int 1 is 1 only on little-endian hosts
+	char *y = (char*)&x;
+	
+	if(*y==1)
+		sysEndianType = LITTLE_ENDIAN_SYSTEM;
+	else //=0
+		sysEndianType = BIG_ENDIAN_SYSTEM;
+    
+    // No configuration file: install the built-in defaults and stop here
+    if(sz_cfgFile == NULL)
+    {
+		dataEndianType = LITTLE_ENDIAN_DATA;
+		confparams_cpr->sol_ID = SZ;
+		confparams_cpr->max_quant_intervals = 65536;
+		confparams_cpr->maxRangeRadius = confparams_cpr->max_quant_intervals/2;
+				
+		exe_params->intvCapacity = confparams_cpr->maxRangeRadius*2;
+		exe_params->intvRadius = confparams_cpr->maxRangeRadius;
+		
+		confparams_cpr->quantization_intervals = 0;
+		exe_params->optQuantMode = 1;
+		confparams_cpr->predThreshold = 0.99;
+		confparams_cpr->sampleDistance = 100;
+		
+		confparams_cpr->szMode = SZ_BEST_COMPRESSION;
+		
+		confparams_cpr->gzipMode = 1; //fast mode
+		
+		confparams_cpr->errorBoundMode = PSNR;
+		confparams_cpr->psnr = 90;
+		
+		confparams_cpr->pw_relBoundRatio = 1E-3;
+		confparams_cpr->segment_size = 36;
+		
+		confparams_cpr->pwr_type = SZ_PWR_MIN_TYPE;
+		
+		confparams_cpr->snapshotCmprStep = 5;
+	
+		return SZ_SCES;
+	}
+    
+    if (access(sz_cfgFile, F_OK) != 0)
+    {
+        printf("[SZ] Configuration file NOT accessible.\n");
+        return SZ_NSCS;
+    }
+    
+    //printf("[SZ] Reading SZ configuration file (%s) ...\n", sz_cfgFile);    
+    ini = iniparser_load(sz_cfgFile);
+    if (ini == NULL)
+    {
+        printf("[SZ] Iniparser failed to parse the conf. file.\n");
+        return SZ_NSCS;
+    }
+
+	endianTypeString = iniparser_getstring(ini, "ENV:dataEndianType", "LITTLE_ENDIAN_DATA");
+	if(strcmp(endianTypeString, "LITTLE_ENDIAN_DATA")==0)
+		dataEndianType = LITTLE_ENDIAN_DATA;
+	else if(strcmp(endianTypeString, "BIG_ENDIAN_DATA")==0)
+		dataEndianType = BIG_ENDIAN_DATA;
+	else
+	{
+		printf("Error: Wrong dataEndianType: please set it correctly in sz.config.\n");
+		iniparser_freedict(ini);
+		return SZ_NSCS;
+	}
+
+	// Reading/setting detection parameters
+	
+	par = iniparser_getstring(ini, "ENV:sol_name", NULL);
+	// The default passed above is NULL; never hand NULL to a %s conversion.
+	// An empty name is rejected by the comparison chain below.
+	snprintf(sol_name, 256, "%s", par != NULL ? par : "");
+	
+    if(strcmp(sol_name, "SZ")==0)
+		confparams_cpr->sol_ID = SZ;
+	else if(strcmp(sol_name, "PASTRI")==0)
+		confparams_cpr->sol_ID = PASTRI;
+	else{
+		printf("[SZ] Error: wrong solution name (please check sz.config file)\n");
+		iniparser_freedict(ini);
+		return SZ_NSCS;
+	}
+	
+	if(confparams_cpr->sol_ID==SZ)
+	{
+		int max_quant_intervals = iniparser_getint(ini, "PARAMETER:max_quant_intervals", 65536);
+		confparams_cpr->max_quant_intervals = max_quant_intervals;
+		
+		int quantization_intervals = (int)iniparser_getint(ini, "PARAMETER:quantization_intervals", 0);
+		confparams_cpr->quantization_intervals = quantization_intervals;
+		if(quantization_intervals>0)
+		{
+			// Fixed interval count requested: it also becomes the maximum
+			updateQuantizationInfo(quantization_intervals);
+			confparams_cpr->max_quant_intervals = max_quant_intervals = quantization_intervals;
+			exe_params->optQuantMode = 0;
+		}
+		else //==0
+		{
+			confparams_cpr->maxRangeRadius = max_quant_intervals/2;
+
+			exe_params->intvCapacity = confparams_cpr->maxRangeRadius*2;
+			exe_params->intvRadius = confparams_cpr->maxRangeRadius;
+			
+			exe_params->optQuantMode = 1;
+		}
+		
+		if(quantization_intervals%2!=0)
+		{
+			printf("Error: quantization_intervals must be an even number!\n");
+			iniparser_freedict(ini);
+			return SZ_NSCS;
+		}
+		
+		confparams_cpr->predThreshold = (float)iniparser_getdouble(ini, "PARAMETER:predThreshold", 0);
+		confparams_cpr->sampleDistance = (int)iniparser_getint(ini, "PARAMETER:sampleDistance", 0);
+		
+		modeBuf = iniparser_getstring(ini, "PARAMETER:szMode", NULL);
+		if(modeBuf==NULL)
+		{
+			printf("[SZ] Error: Null szMode setting (please check sz.config file)\n");
+			iniparser_freedict(ini);
+			return SZ_NSCS;					
+		}
+		else if(strcmp(modeBuf, "SZ_BEST_SPEED")==0)
+			confparams_cpr->szMode = SZ_BEST_SPEED;
+		else if(strcmp(modeBuf, "SZ_DEFAULT_COMPRESSION")==0)
+			confparams_cpr->szMode = SZ_DEFAULT_COMPRESSION;
+		else if(strcmp(modeBuf, "SZ_BEST_COMPRESSION")==0)
+			confparams_cpr->szMode = SZ_BEST_COMPRESSION;
+		else
+		{
+			printf("[SZ] Error: Wrong szMode setting (please check sz.config file)\n");
+			iniparser_freedict(ini);
+			return SZ_NSCS;	
+		}
+		
+		modeBuf = iniparser_getstring(ini, "PARAMETER:gzipMode", NULL);
+		if(modeBuf==NULL)
+		{
+			printf("[SZ] Error: Null Gzip mode setting (please check sz.config file)\n");
+			iniparser_freedict(ini);
+			return SZ_NSCS;					
+		}		
+		else if(strcmp(modeBuf, "Gzip_NO_COMPRESSION")==0)
+			confparams_cpr->gzipMode = 0;
+		else if(strcmp(modeBuf, "Gzip_BEST_SPEED")==0)
+			confparams_cpr->gzipMode = 1;
+		else if(strcmp(modeBuf, "Gzip_BEST_COMPRESSION")==0)
+			confparams_cpr->gzipMode = 9;
+		else if(strcmp(modeBuf, "Gzip_DEFAULT_COMPRESSION")==0)
+			confparams_cpr->gzipMode = -1;
+		else
+		{
+			printf("[SZ] Error: Wrong gzip Mode (please check sz.config file)\n");
+			// release the dictionary like every other error path does
+			iniparser_freedict(ini);
+			return SZ_NSCS;
+		}
+		
+		//TODO
+		confparams_cpr->snapshotCmprStep = (int)iniparser_getint(ini, "PARAMETER:snapshotCmprStep", 5);
+				
+		errBoundMode = iniparser_getstring(ini, "PARAMETER:errorBoundMode", NULL);
+		if(errBoundMode==NULL)
+		{
+			printf("[SZ] Error: Null error bound setting (please check sz.config file)\n");
+			iniparser_freedict(ini);
+			return SZ_NSCS;				
+		}
+		else if(strcmp(errBoundMode,"ABS")==0||strcmp(errBoundMode,"abs")==0)
+			confparams_cpr->errorBoundMode=ABS;
+		else if(strcmp(errBoundMode, "REL")==0||strcmp(errBoundMode,"rel")==0)
+			confparams_cpr->errorBoundMode=REL;
+		else if(strcmp(errBoundMode, "ABS_AND_REL")==0||strcmp(errBoundMode, "abs_and_rel")==0)
+			confparams_cpr->errorBoundMode=ABS_AND_REL;
+		else if(strcmp(errBoundMode, "ABS_OR_REL")==0||strcmp(errBoundMode, "abs_or_rel")==0)
+			confparams_cpr->errorBoundMode=ABS_OR_REL;
+		else if(strcmp(errBoundMode, "PW_REL")==0||strcmp(errBoundMode, "pw_rel")==0)
+			confparams_cpr->errorBoundMode=PW_REL;
+		else if(strcmp(errBoundMode, "PSNR")==0||strcmp(errBoundMode, "psnr")==0)
+			confparams_cpr->errorBoundMode=PSNR;
+		else if(strcmp(errBoundMode, "ABS_AND_PW_REL")==0||strcmp(errBoundMode, "abs_and_pw_rel")==0)
+			confparams_cpr->errorBoundMode=ABS_AND_PW_REL;
+		else if(strcmp(errBoundMode, "ABS_OR_PW_REL")==0||strcmp(errBoundMode, "abs_or_pw_rel")==0)
+			confparams_cpr->errorBoundMode=ABS_OR_PW_REL;
+		else if(strcmp(errBoundMode, "REL_AND_PW_REL")==0||strcmp(errBoundMode, "rel_and_pw_rel")==0)
+			confparams_cpr->errorBoundMode=REL_AND_PW_REL;
+		else if(strcmp(errBoundMode, "REL_OR_PW_REL")==0||strcmp(errBoundMode, "rel_or_pw_rel")==0)
+			confparams_cpr->errorBoundMode=REL_OR_PW_REL;
+		else
+		{
+			printf("[SZ] Error: Wrong error bound mode (please check sz.config file)\n");
+			iniparser_freedict(ini);
+			return SZ_NSCS;
+		}
+		
+		confparams_cpr->absErrBound = (double)iniparser_getdouble(ini, "PARAMETER:absErrBound", 0);
+		confparams_cpr->relBoundRatio = (double)iniparser_getdouble(ini, "PARAMETER:relBoundRatio", 0);
+		confparams_cpr->psnr = (double)iniparser_getdouble(ini, "PARAMETER:psnr", 0);
+		confparams_cpr->pw_relBoundRatio = (double)iniparser_getdouble(ini, "PARAMETER:pw_relBoundRatio", 0);
+		confparams_cpr->segment_size = (int)iniparser_getint(ini, "PARAMETER:segment_size", 0);
+		
+		modeBuf = iniparser_getstring(ini, "PARAMETER:pwr_type", "MIN");
+		
+		// Test for NULL before any strcmp so a NULL can never be dereferenced
+		if(modeBuf==NULL) //by default
+			confparams_cpr->pwr_type = SZ_PWR_AVG_TYPE;
+		else if(strcmp(modeBuf, "MIN")==0)
+			confparams_cpr->pwr_type = SZ_PWR_MIN_TYPE;
+		else if(strcmp(modeBuf, "AVG")==0)
+			confparams_cpr->pwr_type = SZ_PWR_AVG_TYPE;
+		else if(strcmp(modeBuf, "MAX")==0)
+			confparams_cpr->pwr_type = SZ_PWR_MAX_TYPE;
+		else
+		{
+			printf("[SZ] Error: Wrong pwr_type setting (please check sz.config file).\n");
+			iniparser_freedict(ini);
+			return SZ_NSCS;	
+		}
+    
+		//initialization for Huffman encoding
+		//SZ_Reset();	
+	}
+	else if(confparams_cpr->sol_ID == PASTRI)
+	{//load parameters for PSTRI
+		pastri_par.bf[0] = (int)iniparser_getint(ini, "PARAMETER:basisFunction_0", 0);		
+		pastri_par.bf[1] = (int)iniparser_getint(ini, "PARAMETER:basisFunction_1", 0);		
+		pastri_par.bf[2] = (int)iniparser_getint(ini, "PARAMETER:basisFunction_2", 0);		
+		pastri_par.bf[3] = (int)iniparser_getint(ini, "PARAMETER:basisFunction_3", 0);
+		pastri_par.numBlocks = (int)iniparser_getint(ini, "PARAMETER:numBlocks", 0);		
+		confparams_cpr->absErrBound = pastri_par.originalEb = (double)iniparser_getdouble(ini, "PARAMETER:absErrBound", 1E-3);
+	}
+	
+    iniparser_freedict(ini);
+    return SZ_SCES;
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+    @brief      Load the SZ configuration and report failure on stdout.
+    @param      sz_cfgFile      path forwarded unchanged to SZ_ReadConf
+    @return     integer         SZ_SCES if SZ_ReadConf succeeded, otherwise
+                                SZ_NSCS (after printing an error message).
+
+    Thin wrapper around SZ_ReadConf: every non-SZ_SCES result is reported
+    and collapsed to SZ_NSCS.
+
+ **/
+/*-------------------------------------------------------------------------*/
+int SZ_LoadConf(const char* sz_cfgFile) {
+    if (SZ_ReadConf(sz_cfgFile) == SZ_SCES)
+        return SZ_SCES;
+
+    printf("[SZ] ERROR: Impossible to read configuration.\n");
+    return SZ_NSCS;
+}
+
+/**
+ * Compare the first three entries of version with the global versionNumber.
+ *
+ * @return 1 when all three entries are equal, 0 at the first mismatch
+ */
+int checkVersion(char* version)
+{
+	int pos = 0;
+
+	while(pos < 3)
+	{
+		if(version[pos]!=versionNumber[pos])
+			return 0;
+		pos++;
+	}
+	return 1;
+}
+
+/**
+ * Allocate and zero the global sz_tsc metadata block, then open
+ * "sz_tsc_metainfo.txt" for writing and emit its header line.
+ *
+ * The process exits with status 1 when the block cannot be allocated or the
+ * file cannot be opened.
+ */
+void initSZ_TSC()
+{
+	sz_tsc = (sz_tsc_metadata*)malloc(sizeof(sz_tsc_metadata));
+	/* memset below would dereference NULL if the allocation failed */
+	if (sz_tsc == NULL)
+	{
+		printf("Failed to allocate memory for the sz_tsc metadata.\n");
+		exit(1);
+	}
+	memset(sz_tsc, 0, sizeof(sz_tsc_metadata));
+	sprintf(sz_tsc->metadata_filename, "sz_tsc_metainfo.txt");
+	sz_tsc->metadata_file = fopen(sz_tsc->metadata_filename, "wb");
+	if (sz_tsc->metadata_file == NULL)
+	{
+		printf("Failed to open sz_tsc_metainfo.txt file for writing metainfo.\n");
+		exit(1);
+	}
+	fputs("#metadata of the time-step based compression\n", sz_tsc->metadata_file);	
+}
+
+/*double fabs(double value)
+{
+	if(value<0)
+		return -value;
+	else
+		return value;
+}*/
diff --git a/thirdparty/SZ/sz/src/dataCompression.c b/thirdparty/SZ/sz/src/dataCompression.c
new file mode 100644
index 0000000000000000000000000000000000000000..0bb5ce1a6140d7bb51a12d21491ceee61ec24287
--- /dev/null
+++ b/thirdparty/SZ/sz/src/dataCompression.c
@@ -0,0 +1,597 @@
+/**
+ *  @file dataCompression.c
+ *  @author Sheng Di
+ *  @date April, 2016
+ *  @brief Shared helpers for data compression: value-range computation, error-bound resolution, byte swapping and exact-value encoding.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include "sz.h"
+#include "DynamicByteArray.h"
+#include "DynamicIntArray.h"
+#include "TightDataPointStorageD.h"
+#include "CompressElement.h"
+#include "dataCompression.h"
+
+/**
+ * Pick a byte width (1, 2, 4 or 8) for a given value-range size.
+ *
+ * Thresholds are inclusive: up to 256 -> 1, up to 65536 -> 2,
+ * up to 4294967296 (2^32) -> 4, anything larger -> 8.
+ *
+ * NOTE(review): if callers pass max-min (as computeRangeSize_int produces),
+ * a range of exactly 256 / 65536 / 2^32 needs one more bit than the chosen
+ * width holds - confirm the inclusive bounds are intended.
+ */
+int computeByteSizePerIntValue(long valueRangeSize)
+{
+	if(valueRangeSize>4294967296) //2^32
+		return 8;
+	if(valueRangeSize>65536)
+		return 4;
+	if(valueRangeSize>256)
+		return 2;
+	return 1;
+}
+
+/**
+ * Find the minimum and the value range (max - min) of an integer array whose
+ * element type is selected by dataType.
+ *
+ * Each branch views oriData with the matching fixed-width type, seeds min
+ * and max from element 0 and then runs the computeMinMax macro. The scratch
+ * variable data_ has the same type as the elements in every branch, so
+ * unsigned 32-bit values above INT32_MAX and negative signed 32-bit values
+ * are compared with their real magnitude.
+ *
+ * NOTE(review): computeMinMax is defined elsewhere; it presumably walks the
+ * remaining elements using the locals i, size, data, data_, min and max -
+ * confirm against its definition.
+ *
+ * @param oriData         array of `size` elements of the selected type
+ * @param dataType        one of the SZ_UINT8 ... SZ_INT64 constants
+ * @param size            number of elements (element 0 is always read)
+ * @param valueRangeSize  out: max - min
+ * @return the minimum value; 0 (with a range of 0) for an unknown dataType
+ */
+long computeRangeSize_int(void* oriData, int dataType, size_t size, int64_t* valueRangeSize)
+{
+	size_t i = 0;
+	long max = 0, min = 0;
+
+	if(dataType==SZ_UINT8)
+	{
+		uint8_t* data = (uint8_t*)oriData;
+		uint8_t data_; 
+		min = data[0], max = min;
+		computeMinMax(data);
+	}
+	else if(dataType == SZ_INT8)
+	{
+		/* int8_t rather than plain char: char may be unsigned on some targets */
+		int8_t* data = (int8_t*)oriData;
+		int8_t data_;
+		min = data[0], max = min;
+		computeMinMax(data);
+	}
+	else if(dataType == SZ_UINT16)
+	{
+		uint16_t* data = (uint16_t*)oriData;
+		uint16_t data_; 
+		min = data[0], max = min;
+		computeMinMax(data);
+	}
+	else if(dataType == SZ_INT16)
+	{ 
+		int16_t* data = (int16_t*)oriData;
+		int16_t data_; 
+		min = data[0], max = min;
+		computeMinMax(data);
+	}
+	else if(dataType == SZ_UINT32)
+	{
+		uint32_t* data = (uint32_t*)oriData;
+		uint32_t data_; 
+		min = data[0], max = min;
+		computeMinMax(data);
+	}
+	else if(dataType == SZ_INT32)
+	{
+		int32_t* data = (int32_t*)oriData;
+		int32_t data_; 
+		min = data[0], max = min;
+		computeMinMax(data);
+	}
+	else if(dataType == SZ_UINT64)
+	{
+		uint64_t* data = (uint64_t*)oriData;
+		uint64_t data_; 
+		min = data[0], max = min;
+		computeMinMax(data);
+	}
+	else if(dataType == SZ_INT64)
+	{
+		int64_t* data = (int64_t *)oriData;
+		int64_t data_; 
+		min = data[0], max = min;
+		computeMinMax(data);
+	}
+
+	*valueRangeSize = max - min;
+	return min;	
+}
+
+/**
+ * Scan a float array for its minimum and maximum.
+ *
+ * @param oriData         array of `size` floats (element 0 is always read)
+ * @param size            number of elements
+ * @param valueRangeSize  out: max - min
+ * @param medianValue     out: min + (max - min)/2, i.e. the mid-point of
+ *                        the range
+ * @return the minimum value
+ */
+float computeRangeSize_float(float* oriData, size_t size, float* valueRangeSize, float* medianValue)
+{
+	float lo = oriData[0];
+	float hi = lo;
+	size_t idx = 1;
+
+	while(idx<size)
+	{
+		float v = oriData[idx++];
+		if(v<lo)
+			lo = v;
+		else if(v>hi)
+			hi = v;
+	}
+
+	*valueRangeSize = hi - lo;
+	*medianValue = lo + *valueRangeSize/2;
+	return lo;
+}
+
+/**
+ * Scan a double array for its minimum and maximum.
+ *
+ * @param oriData         array of `size` doubles (element 0 is always read)
+ * @param size            number of elements
+ * @param valueRangeSize  out: max - min
+ * @param medianValue     out: min + (max - min)/2, i.e. the mid-point of
+ *                        the range
+ * @return the minimum value
+ */
+double computeRangeSize_double(double* oriData, size_t size, double* valueRangeSize, double* medianValue)
+{
+	double lo = oriData[0];
+	double hi = lo;
+	size_t idx = 1;
+
+	while(idx<size)
+	{
+		double v = oriData[idx++];
+		if(v<lo)
+			lo = v;
+		else if(v>hi)
+			hi = v;
+	}
+
+	*valueRangeSize = hi - lo;
+	*medianValue = lo + *valueRangeSize/2;
+	return lo;
+}
+
+/**
+ * Minimum / range / mid-point of a rectangular sub-block of a float array
+ * addressed with five indices.
+ *
+ * Element (i5,i4,i3,i2,i1) lives at
+ * i5*(r4*r3*r2*r1) + i4*(r3*r2*r1) + i3*(r2*r1) + i2*r1 + i1.
+ * The block spans s_k..e_k inclusive in each index; r5 is not used.
+ *
+ * @param valueRangeSize  out: max - min over the sub-block
+ * @param medianValue     out: min + (max - min)/2
+ * @return the minimum value of the sub-block
+ */
+float computeRangeSize_float_subblock(float* oriData, float* valueRangeSize, float* medianValue,
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1,
+size_t s5, size_t s4, size_t s3, size_t s2, size_t s1,
+size_t e5, size_t e4, size_t e3, size_t e2, size_t e1)
+{
+	const size_t stride2 = r1;
+	const size_t stride3 = r2*r1;
+	const size_t stride4 = r3*r2*r1;
+	const size_t stride5 = r4*r3*r2*r1;
+	size_t i1, i2, i3, i4, i5;
+
+	float lo = oriData[s5*stride5 + s4*stride4 + s3*stride3 + s2*stride2 + s1];
+	float hi = lo;
+
+	for (i5 = s5; i5 <= e5; i5++)
+	{
+		for (i4 = s4; i4 <= e4; i4++)
+		{
+			for (i3 = s3; i3 <= e3; i3++)
+			{
+				for (i2 = s2; i2 <= e2; i2++)
+				{
+					for (i1 = s1; i1 <= e1; i1++)
+					{
+						float v = oriData[i5*stride5 + i4*stride4 + i3*stride3 + i2*stride2 + i1];
+						if (v<lo)
+							lo = v;
+						else if (v>hi)
+							hi = v;
+					}
+				}
+			}
+		}
+	}
+
+	*valueRangeSize = hi - lo;
+	*medianValue = lo + *valueRangeSize/2;
+	return lo;
+}
+
+
+/**
+ * Minimum / range / mid-point of a rectangular sub-block of a double array
+ * addressed with five indices.
+ *
+ * Element (i5,i4,i3,i2,i1) lives at
+ * i5*(r4*r3*r2*r1) + i4*(r3*r2*r1) + i3*(r2*r1) + i2*r1 + i1.
+ * The block spans s_k..e_k inclusive in each index; r5 is not used.
+ *
+ * @param valueRangeSize  out: max - min over the sub-block
+ * @param medianValue     out: min + (max - min)/2
+ * @return the minimum value of the sub-block, narrowed to float by the
+ *         declared return type.
+ *
+ * NOTE(review): the float return type drops precision of the double
+ * minimum - confirm whether callers rely on the returned value.
+ */
+float computeRangeSize_double_subblock(double* oriData, double* valueRangeSize, double* medianValue,
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1,
+size_t s5, size_t s4, size_t s3, size_t s2, size_t s1,
+size_t e5, size_t e4, size_t e3, size_t e2, size_t e1)
+{
+	const size_t stride2 = r1;
+	const size_t stride3 = r2*r1;
+	const size_t stride4 = r3*r2*r1;
+	const size_t stride5 = r4*r3*r2*r1;
+	size_t i1, i2, i3, i4, i5;
+
+	double lo = oriData[s5*stride5 + s4*stride4 + s3*stride3 + s2*stride2 + s1];
+	double hi = lo;
+
+	for (i5 = s5; i5 <= e5; i5++)
+	{
+		for (i4 = s4; i4 <= e4; i4++)
+		{
+			for (i3 = s3; i3 <= e3; i3++)
+			{
+				for (i2 = s2; i2 <= e2; i2++)
+				{
+					for (i1 = s1; i1 <= e1; i1++)
+					{
+						double v = oriData[i5*stride5 + i4*stride4 + i3*stride3 + i2*stride2 + i1];
+						if (v<lo)
+							lo = v;
+						else if (v>hi)
+							hi = v;
+					}
+				}
+			}
+		}
+	}
+
+	*valueRangeSize = hi - lo;
+	*medianValue = lo + *valueRangeSize/2;
+	return lo;
+}
+
+
+/** Smaller of two doubles; returns b whenever a<b is false. */
+double min_d(double a, double b)
+{
+	return (a<b) ? a : b;
+}
+
+/** Larger of two doubles; returns b whenever a>b is false. */
+double max_d(double a, double b)
+{
+	return (a>b) ? a : b;
+}
+
+/** Smaller of two floats; returns b whenever a<b is false. */
+float min_f(float a, float b)
+{
+	return (a<b) ? a : b;
+}
+
+/** Larger of two floats; returns b whenever a>b is false. */
+float max_f(float a, float b)
+{
+	return (a>b) ? a : b;
+}
+
+/**
+ * Resolve the effective error bound for double data from the error-bound
+ * mode.
+ *
+ * ABS-family modes yield absErrBound, REL-family modes yield
+ * relBoundRatio*valueRangeSize, ABS_AND_REL the smaller and ABS_OR_REL the
+ * larger of the two, PW_REL yields 0.
+ *
+ * @param status  out: SZ_SCES, or SZ_BERR for an unrecognised mode
+ * @return the resolved bound (0 for an unrecognised mode)
+ */
+double getRealPrecision_double(double valueRangeSize, int errBoundMode, double absErrBound, double relBoundRatio, int *status)
+{
+	int absFamily = (errBoundMode==ABS||errBoundMode==ABS_OR_PW_REL||errBoundMode==ABS_AND_PW_REL);
+	int rc = SZ_SCES;
+	double bound = 0;
+
+	if(absFamily)
+	{
+		bound = absErrBound;
+	}
+	else if(errBoundMode==REL||errBoundMode==REL_OR_PW_REL||errBoundMode==REL_AND_PW_REL)
+	{
+		bound = relBoundRatio*valueRangeSize;
+	}
+	else if(errBoundMode==ABS_AND_REL)
+	{
+		bound = min_d(absErrBound, relBoundRatio*valueRangeSize);
+	}
+	else if(errBoundMode==ABS_OR_REL)
+	{
+		bound = max_d(absErrBound, relBoundRatio*valueRangeSize);
+	}
+	else if(errBoundMode==PW_REL)
+	{
+		bound = 0;
+	}
+	else
+	{
+		printf("Error: error-bound-mode is incorrect!\n");
+		rc = SZ_BERR;
+	}
+
+	*status = rc;
+	return bound;
+}
+
+/**
+ * Resolve the effective error bound for float data from the error-bound
+ * mode.
+ *
+ * ABS-family modes yield absErrBound, REL-family modes yield
+ * relBoundRatio*valueRangeSize, ABS_AND_REL the smaller and ABS_OR_REL the
+ * larger of the two, PW_REL yields 0.
+ *
+ * NOTE(review): min_f/max_f take float arguments, so both bounds are
+ * narrowed to float in the ABS_AND_REL / ABS_OR_REL cases - confirm that
+ * is intended.
+ *
+ * @param status  out: SZ_SCES, or SZ_BERR for an unrecognised mode
+ * @return the resolved bound (0 for an unrecognised mode)
+ */
+double getRealPrecision_float(float valueRangeSize, int errBoundMode, double absErrBound, double relBoundRatio, int *status)
+{
+	int absFamily = (errBoundMode==ABS||errBoundMode==ABS_OR_PW_REL||errBoundMode==ABS_AND_PW_REL);
+	int rc = SZ_SCES;
+	double bound = 0;
+
+	if(absFamily)
+	{
+		bound = absErrBound;
+	}
+	else if(errBoundMode==REL||errBoundMode==REL_OR_PW_REL||errBoundMode==REL_AND_PW_REL)
+	{
+		bound = relBoundRatio*valueRangeSize;
+	}
+	else if(errBoundMode==ABS_AND_REL)
+	{
+		bound = min_f(absErrBound, relBoundRatio*valueRangeSize);
+	}
+	else if(errBoundMode==ABS_OR_REL)
+	{
+		bound = max_f(absErrBound, relBoundRatio*valueRangeSize);
+	}
+	else if(errBoundMode==PW_REL)
+	{
+		bound = 0;
+	}
+	else
+	{
+		printf("Error: error-bound-mode is incorrect!\n");
+		rc = SZ_BERR;
+	}
+
+	*status = rc;
+	return bound;
+}
+
+/**
+ * Resolve the effective error bound for integer data from the error-bound
+ * mode.
+ *
+ * ABS-family modes yield absErrBound, REL-family modes yield
+ * relBoundRatio*valueRangeSize, ABS_AND_REL the smaller and ABS_OR_REL the
+ * larger of the two. PW_REL yields -1 here.
+ *
+ * NOTE(review): min_f/max_f take float arguments, so both bounds are
+ * narrowed to float in the ABS_AND_REL / ABS_OR_REL cases - confirm that
+ * is intended.
+ *
+ * @param status  out: SZ_SCES, or SZ_BERR for an unrecognised mode
+ * @return the resolved bound (0 for an unrecognised mode)
+ */
+double getRealPrecision_int(long valueRangeSize, int errBoundMode, double absErrBound, double relBoundRatio, int *status)
+{
+	int absFamily = (errBoundMode==ABS||errBoundMode==ABS_OR_PW_REL||errBoundMode==ABS_AND_PW_REL);
+	int rc = SZ_SCES;
+	double bound = 0;
+
+	if(absFamily)
+	{
+		bound = absErrBound;
+	}
+	else if(errBoundMode==REL||errBoundMode==REL_OR_PW_REL||errBoundMode==REL_AND_PW_REL)
+	{
+		bound = relBoundRatio*valueRangeSize;
+	}
+	else if(errBoundMode==ABS_AND_REL)
+	{
+		bound = min_f(absErrBound, relBoundRatio*valueRangeSize);
+	}
+	else if(errBoundMode==ABS_OR_REL)
+	{
+		bound = max_f(absErrBound, relBoundRatio*valueRangeSize);
+	}
+	else if(errBoundMode==PW_REL)
+	{
+		bound = -1;
+	}
+	else
+	{
+		printf("Error: error-bound-mode is incorrect!\n");
+		rc = SZ_BERR;
+	}
+
+	*status = rc;
+	return bound;
+}
+
+/** Reverse the order of the 8 bytes in data, in place. */
+void symTransform_8bytes(unsigned char data[8])
+{
+	int lo, hi;
+
+	for(lo = 0, hi = 7; lo < hi; lo++, hi--)
+	{
+		unsigned char held = data[lo];
+		data[lo] = data[hi];
+		data[hi] = held;
+	}
+}
+
+/** Swap the 2 bytes in data, in place. */
+inline void symTransform_2bytes(unsigned char data[2])
+{
+	unsigned char first = data[0];
+	unsigned char second = data[1];
+
+	data[0] = second;
+	data[1] = first;
+}
+
+/** Reverse the order of the 4 bytes in data, in place. */
+inline void symTransform_4bytes(unsigned char data[4])
+{
+	int lo, hi;
+
+	for(lo = 0, hi = 3; lo < hi; lo++, hi--)
+	{
+		unsigned char held = data[lo];
+		data[lo] = data[hi];
+		data[hi] = held;
+	}
+}
+
+/**
+ * Store tgtValue - minValue (truncated to 8 bits) into bytes.
+ * byteSize bytes are copied from the 1-byte offset, so byteSize must be 1.
+ */
+inline void compressInt8Value(int8_t tgtValue, int8_t minValue, int byteSize, unsigned char* bytes)
+{
+	uint8_t offset = (uint8_t)(tgtValue - minValue);
+
+	memcpy(bytes, &offset, byteSize); //byteSize==1
+}
+
+/**
+ * Store tgtValue - minValue (truncated to 16 bits) into bytes.
+ * The offset is serialised with int16ToBytes_bigEndian and the last
+ * byteSize bytes of that 2-byte buffer are copied out.
+ */
+inline void compressInt16Value(int16_t tgtValue, int16_t minValue, int byteSize, unsigned char* bytes)
+{
+	unsigned char be[2];
+	uint16_t offset = (uint16_t)(tgtValue - minValue);
+
+	int16ToBytes_bigEndian(be, offset);
+	memcpy(bytes, &be[2 - byteSize], byteSize);
+}
+
+/**
+ * Store tgtValue - minValue (modulo 2^32) into bytes.
+ * The offset is serialised with int32ToBytes_bigEndian and the last
+ * byteSize bytes of that 4-byte buffer are copied out.
+ *
+ * The subtraction is done on unsigned operands: a signed int32 difference
+ * overflows (undefined behaviour) once the values are more than INT32_MAX
+ * apart, whereas the unsigned difference wraps to the intended bit pattern.
+ */
+inline void compressInt32Value(int32_t tgtValue, int32_t minValue, int byteSize, unsigned char* bytes)
+{
+	uint32_t data = (uint32_t)tgtValue - (uint32_t)minValue;
+	unsigned char tmpBytes[4];
+	int32ToBytes_bigEndian(tmpBytes, data);
+	memcpy(bytes, tmpBytes + 4 - byteSize, byteSize);
+}
+
+/**
+ * Store tgtValue - minValue (modulo 2^64) into bytes.
+ * The offset is serialised with int64ToBytes_bigEndian and the last
+ * byteSize bytes of that 8-byte buffer are copied out.
+ *
+ * The subtraction is done on unsigned operands: a signed int64 difference
+ * overflows (undefined behaviour) once the values are more than INT64_MAX
+ * apart, whereas the unsigned difference wraps to the intended bit pattern.
+ */
+inline void compressInt64Value(int64_t tgtValue, int64_t minValue, int byteSize, unsigned char* bytes)
+{
+	uint64_t data = (uint64_t)tgtValue - (uint64_t)minValue;
+	unsigned char tmpBytes[8];
+	int64ToBytes_bigEndian(tmpBytes, data);
+	memcpy(bytes, tmpBytes + 8 - byteSize, byteSize);
+}
+
+/**
+ * Store tgtValue - minValue (truncated to 8 bits) into bytes.
+ * byteSize bytes are copied from the 1-byte offset, so byteSize must be 1.
+ */
+inline void compressUInt8Value(uint8_t tgtValue, uint8_t minValue, int byteSize, unsigned char* bytes)
+{
+	uint8_t offset = (uint8_t)(tgtValue - minValue);
+
+	memcpy(bytes, &offset, byteSize); //byteSize==1
+}
+
+/**
+ * Store tgtValue - minValue (truncated to 16 bits) into bytes.
+ * The offset is serialised with int16ToBytes_bigEndian and the last
+ * byteSize bytes of that 2-byte buffer are copied out.
+ */
+inline void compressUInt16Value(uint16_t tgtValue, uint16_t minValue, int byteSize, unsigned char* bytes)
+{
+	unsigned char be[2];
+	uint16_t offset = (uint16_t)(tgtValue - minValue);
+
+	int16ToBytes_bigEndian(be, offset);
+	memcpy(bytes, &be[2 - byteSize], byteSize);
+}
+
+/**
+ * Store tgtValue - minValue (modulo 2^32) into bytes.
+ * The offset is serialised with int32ToBytes_bigEndian and the last
+ * byteSize bytes of that 4-byte buffer are copied out.
+ */
+inline void compressUInt32Value(uint32_t tgtValue, uint32_t minValue, int byteSize, unsigned char* bytes)
+{
+	unsigned char be[4];
+	uint32_t offset = tgtValue - minValue;
+
+	int32ToBytes_bigEndian(be, offset);
+	memcpy(bytes, &be[4 - byteSize], byteSize);
+}
+
+/**
+ * Store tgtValue - minValue (modulo 2^64) into bytes.
+ * The offset is serialised with int64ToBytes_bigEndian and the last
+ * byteSize bytes of that 8-byte buffer are copied out.
+ */
+inline void compressUInt64Value(uint64_t tgtValue, uint64_t minValue, int byteSize, unsigned char* bytes)
+{
+	unsigned char be[8];
+	uint64_t offset = tgtValue - minValue;
+
+	int64ToBytes_bigEndian(be, offset);
+	memcpy(bytes, &be[8 - byteSize], byteSize);
+}
+
+/**
+ * Encode one float relative to medianValue, keeping only its leading
+ * reqLength bits.
+ *
+ * The difference tgtValue - medianValue is viewed through the lfloat union;
+ * its full integer image is written to vce->curBytes and vce->curValue.
+ * The low (32 - reqLength) bits are then cleared and the truncated value
+ * plus medianValue is stored in vce->data. `precision` is not used.
+ */
+void compressSingleFloatValue(FloatValueCompressElement *vce, float tgtValue, float precision, float medianValue, 
+		int reqLength, int reqBytesLength, int resiBitsLength)
+{		
+	lfloat lfBuf;
+	int dropBits = 32 - reqLength; /* number of low-order bits to clear */
+	int rawBits;
+
+	lfBuf.value = tgtValue - medianValue;
+	if(dropBits<0)
+		dropBits = 0;
+
+	/* capture the untruncated bit pattern before masking */
+	rawBits = lfBuf.ivalue;
+	intToBytes_bigEndian(vce->curBytes, rawBits);
+
+	lfBuf.ivalue = (lfBuf.ivalue >> dropBits) << dropBits;
+
+	vce->data = lfBuf.value+medianValue;
+	vce->curValue = rawBits;
+	vce->reqBytesLength = reqBytesLength;
+	vce->resiBitsLength = resiBitsLength;
+}
+
+/**
+ * Encode one double relative to medianValue, keeping only its leading
+ * reqLength bits.
+ *
+ * The difference tgtValue - medianValue is viewed through the ldouble
+ * union; its full integer image is written to vce->curBytes and
+ * vce->curValue. The low (64 - reqLength) bits are then cleared and the
+ * truncated value plus medianValue is stored in vce->data. `precision` is
+ * not used.
+ */
+void compressSingleDoubleValue(DoubleValueCompressElement *vce, double tgtValue, double precision, double medianValue, 
+		int reqLength, int reqBytesLength, int resiBitsLength)
+{		
+	ldouble lfBuf;
+	int dropBits = 64 - reqLength; /* number of low-order bits to clear */
+	long rawBits;
+
+	lfBuf.value = tgtValue - medianValue;
+	if(dropBits<0)
+		dropBits = 0;
+
+	/* capture the untruncated bit pattern before masking */
+	rawBits = lfBuf.lvalue;
+	longToBytes_bigEndian(vce->curBytes, rawBits);
+
+	lfBuf.lvalue = (lfBuf.lvalue >> dropBits)<<dropBits;
+
+	vce->data = lfBuf.value+medianValue;
+	vce->curValue = rawBits;
+	vce->reqBytesLength = reqBytesLength;
+	vce->resiBitsLength = resiBitsLength;
+}
+
+/**
+ * Count how many leading bytes of two 8-byte images are identical,
+ * capped at 3.
+ */
+int compIdenticalLeadingBytesCount_double(unsigned char* preBytes, unsigned char* curBytes)
+{
+	int matched = 0;
+
+	while(matched<8 && preBytes[matched]==curBytes[matched])
+		matched++;
+
+	return (matched>3) ? 3 : matched;
+}
+
+/**
+ * Count how many leading bytes of two 4-byte images are identical,
+ * capped at 3.
+ */
+int compIdenticalLeadingBytesCount_float(unsigned char* preBytes, unsigned char* curBytes)
+{
+	int matched = 0;
+
+	while(matched<4 && preBytes[matched]==curBytes[matched])
+		matched++;
+
+	return (matched>3) ? 3 : matched;
+}
+
+//TODO double-check the correctness...
+/**
+ * Append one exactly-stored value, described by lce, to the three output
+ * arrays.
+ *
+ * Always appends lce->leadingZeroBytes to exactLeadNumArray. If
+ * lce->integerMidBytes is non-NULL, its integerMidBytes_Length bytes are
+ * appended to exactMidByteArray. If lce->resMidBitsLength is non-zero,
+ * lce->residualMidBits is appended to resiBitArray.
+ */
+void addExactData(DynamicByteArray *exactMidByteArray, DynamicIntArray *exactLeadNumArray, 
+		DynamicIntArray *resiBitArray, LossyCompressionElement *lce)
+{
+	unsigned char* midBytes;
+	int midCount;
+	int resiBits;
+	int k;
+
+	addDIA_Data(exactLeadNumArray, lce->leadingZeroBytes);
+
+	midBytes = lce->integerMidBytes;
+	midCount = lce->integerMidBytes_Length;
+	resiBits = lce->resMidBitsLength;
+
+	if(midBytes!=NULL)
+	{
+		for(k = 0; k < midCount; k++)
+			addDBA_Data(exactMidByteArray, midBytes[k]);
+	}
+
+	if(resiBits!=0)
+		addDIA_Data(resiBitArray, lce->residualMidBits);
+}
+
+/**
+ * @deprecated
+ * Build the prediction coefficient array for the given layer count.
+ *
+ * Only dimension 1 with 1, 2 or 3 layers produces coefficients
+ * ({1}, {2,-1}, {3,-3,1}); the array is malloc'ed and handed to the caller
+ * through *coeff_array. Dimensions 2 and 3 are accepted but leave
+ * *coeff_array untouched and return 0.
+ *
+ * @param status  out: SZ_SCES, or SZ_DERR when dimension is not 1, 2 or 3
+ * @return: the length of the coefficient array.
+ * */
+int getPredictionCoefficients(int layers, int dimension, int **coeff_array, int *status)
+{
+	size_t size = 0;
+
+	/* assume success; only the unsupported-dimension branch overrides it */
+	*status = SZ_SCES;
+
+	switch(dimension)
+	{
+		case 1:
+			switch(layers)
+			{
+				case 1:
+					*coeff_array = (int*)malloc(sizeof(int));
+					(*coeff_array)[0] = 1;
+					size = 1;
+					break;
+				case 2:
+					*coeff_array = (int*)malloc(2*sizeof(int));
+					(*coeff_array)[0] = 2;
+					(*coeff_array)[1] = -1;
+					size = 2;
+					break;
+				case 3:
+					*coeff_array = (int*)malloc(3*sizeof(int));
+					(*coeff_array)[0] = 3;
+					(*coeff_array)[1] = -3;
+					(*coeff_array)[2] = 1;
+					size = 3;
+					break;
+			}	
+			break;
+		case 2:
+		case 3:
+			/* no coefficients are defined for 2-D / 3-D data */
+			break;
+		default:
+			printf("Error: dimension must be no greater than 3 in the current version.\n");
+			*status = SZ_DERR;
+			break;
+	}
+	return (int)size;
+}
+
+/**
+ * Edge length for a 2-D block: the smallest i >= 1 with i*i > segmentSize,
+ * searched below segmentSize. If no such i exists below segmentSize the
+ * search stops at max(1, segmentSize).
+ */
+int computeBlockEdgeSize_2D(int segmentSize)
+{
+	int edge = 1;
+
+	while(edge<segmentSize && edge*edge<=segmentSize)
+		edge++;
+
+	return edge;
+}
+
+/**
+ * Edge length for a 3-D block: the smallest i >= 1 with i*i*i > segmentSize,
+ * searched below segmentSize. If no such i exists below segmentSize the
+ * search stops at max(1, segmentSize).
+ */
+int computeBlockEdgeSize_3D(int segmentSize)
+{
+	int edge = 1;
+
+	while(edge<segmentSize && edge*edge*edge<=segmentSize)
+		edge++;
+
+	return edge;
+}
+
+//convert random-access version based bytes to output bytes
+/**
+ * Write the header of a random-access stream into raBytes.
+ *
+ * Layout: the first three entries of versionNumber, one flag byte, then the
+ * parameter block produced by convertSZParamsToBytes (MetaDataByteLength
+ * bytes are accounted for it).
+ * The flag byte starts as 0x80, gains 0x40 when exe_params->SZ_SIZE_TYPE is
+ * 8, and is OR-ed with confparams_cpr->szMode shifted left by one.
+ *
+ * @return number of bytes accounted for in raBytes
+ */
+int initRandomAccessBytes(unsigned char* raBytes)
+{
+        int pos = 0;
+        int v;
+        int flags = 0x80; //indicating this is random-access mode
+
+        for (v = 0; v < 3; v++)
+                raBytes[pos++] = versionNumber[v];
+
+        if(exe_params->SZ_SIZE_TYPE==8)
+                flags = (unsigned char) (flags | 0x40);
+        flags = flags | (confparams_cpr->szMode << 1);
+        raBytes[pos++] = flags;
+
+        convertSZParamsToBytes(confparams_cpr, &(raBytes[pos]));
+        pos = pos + MetaDataByteLength;
+
+        return pos;
+}
diff --git a/thirdparty/SZ/sz/src/dictionary.c b/thirdparty/SZ/sz/src/dictionary.c
new file mode 100644
index 0000000000000000000000000000000000000000..3f0f5cfa63a862fa515e9e2d21674ad61b7c2f6f
--- /dev/null
+++ b/thirdparty/SZ/sz/src/dictionary.c
@@ -0,0 +1,398 @@
+/*-------------------------------------------------------------------------*/
+/**
+   @file    dictionary.c
+   @author  N. Devillard
+   @brief   Implements a dictionary for string variables.
+
+   This module implements a simple dictionary object, i.e. a list
+   of string/string associations. This object is useful to store e.g.
+   information retrieved from a configuration file (ini files).
+*/
+/*--------------------------------------------------------------------------*/
+
+/*---------------------------------------------------------------------------
+                                Includes
+ ---------------------------------------------------------------------------*/
+#include "dictionary.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+/** Maximum value size for integers and doubles. */
+#define MAXVALSZ    1024
+
+/** Minimal allocated number of entries in a dictionary */
+#define DICTMINSZ   128
+
+/** Invalid key token */
+#define DICT_INVALID_KEY    ((char*)-1)
+
+/*---------------------------------------------------------------------------
+                            Private functions
+ ---------------------------------------------------------------------------*/
+
+/* Returns a fresh zero-filled buffer of twice 'size' bytes whose first */
+/* 'size' bytes are copied from 'ptr'; 'ptr' is freed on success. */
+/* On allocation failure NULL is returned and 'ptr' is left untouched. */
+static void * mem_double(void * ptr, int size)
+{
+    void * grown = calloc(2*size, 1);
+
+    if (grown!=NULL) {
+        /* Carry the old contents over, then drop the old buffer */
+        memcpy(grown, ptr, size);
+        free(ptr);
+    }
+    return grown ;
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Copy a string into freshly allocated memory
+  @param    s String to copy, may be NULL
+  @return   Heap copy owned by the caller (release with free()), or NULL
+            when s is NULL or the allocation fails
+
+  Local stand-in for strdup() on systems that lack it.
+ */
+/*--------------------------------------------------------------------------*/
+static char * xstrdup(const char * s)
+{
+    size_t  nbytes ;
+    char *  copy ;
+
+    if (s==NULL) return NULL ;
+    nbytes = strlen(s)+1 ;   /* include the terminating NUL */
+    copy = (char*)malloc(nbytes) ;
+    if (copy!=NULL) memcpy(copy, s, nbytes);
+    return copy ;
+}
+
+/*---------------------------------------------------------------------------
+                            Function codes
+ ---------------------------------------------------------------------------*/
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Hash a NUL-terminated string into an unsigned integer.
+  @param    key     String to hash; must not be NULL.
+  @return   Hash of key as an unsigned int.
+
+  Each character is added into the running value, which is then stirred
+  with a shift-add and a shift-xor; three closing rounds scramble the
+  result once more. Different keys may still share a hash, so callers
+  compare the stored key string as well before accepting a match.
+ */
+/*--------------------------------------------------------------------------*/
+unsigned dictionary_hash(const char * key)
+{
+    unsigned        h = 0 ;
+    int             remaining = strlen(key) ;
+    const char  *   p = key ;
+
+    while (remaining-- > 0) {
+        h += (unsigned)*p++ ;
+        h += h<<10 ;
+        h ^= h>>6 ;
+    }
+    /* Closing mix rounds */
+    h += h<<3 ;
+    h ^= h>>11 ;
+    h += h<<15 ;
+    return h ;
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Create a new dictionary object.
+  @param    size    Initial number of slots; raised to DICTMINSZ if smaller.
+  @return   New dictionary (free with dictionary_del()), or NULL on failure.
+
+  Pass size=0 when the number of entries is not known in advance.
+ */
+/*--------------------------------------------------------------------------*/
+dictionary * dictionary_new(int size)
+{
+    dictionary  *   d ;
+
+    if (size<DICTMINSZ) size=DICTMINSZ ;
+    d = (dictionary *)calloc(1, sizeof(dictionary)) ;
+    if (d==NULL) return NULL ;
+    d->size = size ;
+    d->val  = (char **)calloc(size, sizeof(char*));
+    d->key  = (char **)calloc(size, sizeof(char*));
+    d->hash = (unsigned int *)calloc(size, sizeof(unsigned));
+    if (d->val==NULL || d->key==NULL || d->hash==NULL) {
+        /* A half-built dictionary would crash every accessor: undo it */
+        free(d->val); free(d->key); free(d->hash); free(d);
+        return NULL ;
+    }
+    return d ;
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Destroy a dictionary object
+  @param    d   dictionary object to destroy; NULL is accepted and ignored.
+  @return   void
+
+  Frees every stored key and value string, then the three slot arrays,
+  and finally the dictionary structure itself.
+ */
+/*--------------------------------------------------------------------------*/
+void dictionary_del(dictionary * d)
+{
+    int     slot ;
+
+    if (d==NULL) return ;
+
+    /* free(NULL) is a no-op, so empty slots need no special casing */
+    for (slot=0 ; slot<d->size ; slot++) {
+        free(d->key[slot]);
+        free(d->val[slot]);
+    }
+    free(d->val);
+    free(d->key);
+    free(d->hash);
+    free(d);
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Look up the value stored under a key.
+  @param    d       dictionary object to search.
+  @param    key     Key to look for in the dictionary.
+  @param    def     Value handed back when the key is absent.
+  @return   The stored value string, or 'def' when no slot holds 'key'.
+
+  Every slot is visited in order; a slot matches when its cached hash
+  equals the hash of 'key' and the key strings compare equal. The result
+  points at storage owned by the dictionary, so the caller must neither
+  free nor modify it. A key stored with a NULL value yields NULL.
+ */
+/*--------------------------------------------------------------------------*/
+char * dictionary_get(dictionary * d, const char * key, char * def)
+{
+    const unsigned  wanted = dictionary_hash(key);
+    int             slot ;
+
+    for (slot=0 ; slot<d->size ; slot++) {
+        /* Empty slot */
+        if (d->key[slot]==NULL)
+            continue ;
+        /* Cheap rejection on the cached hash */
+        if (d->hash[slot]!=wanted)
+            continue ;
+        /* Hashes can collide, so the strings decide */
+        if (strcmp(key, d->key[slot])==0)
+            return d->val[slot] ;
+    }
+    /* No slot holds this key */
+    return def ;
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Set a value in a dictionary.
+  @param    d       dictionary object to modify.
+  @param    key     Key to modify or add.
+  @param    val     Value to store; NULL is allowed.
+  @return   int     0 if Ok, -1 on a NULL d or key or when out of memory.
+
+  If the key is already present its value is replaced, otherwise the key
+  is added, growing the dictionary when it is full. Both key and value
+  are copied, so the caller keeps ownership of its arguments.
+
+  A key set to NULL stays in the dictionary: dictionary_get returns NULL
+  for it rather than the caller's default.
+
+  On failure the stored entries are left as they were and nothing leaks;
+  a growth that failed half-way only leaves spare capacity behind.
+ */
+/*--------------------------------------------------------------------------*/
+int dictionary_set(dictionary * d, const char * key, const char * val)
+{
+    int         i ;
+    unsigned    hash ;
+    char    *   newkey ;
+    char    *   newval ;
+    void    *   grown ;
+
+    if (d==NULL || key==NULL) return -1 ;
+
+    /* Copy the value up front so a failed allocation changes nothing */
+    newval = val ? xstrdup(val) : NULL ;
+    if (val!=NULL && newval==NULL) return -1 ;
+
+    /* Compute hash for this key */
+    hash = dictionary_hash(key) ;
+    /* Find if value is already in dictionary */
+    if (d->n>0) {
+        for (i=0 ; i<d->size ; i++) {
+            if (d->key[i]==NULL || hash!=d->hash[i])
+                continue ;
+            if (!strcmp(key, d->key[i])) {   /* Same key */
+                free(d->val[i]);
+                d->val[i] = newval ;
+                return 0 ;
+            }
+        }
+    }
+
+    /* Add a new value */
+    newkey = xstrdup(key) ;
+    if (newkey==NULL) goto fail ;
+    if (d->n==d->size) {
+        /* Grow one array at a time; mem_double keeps the old block when
+           it fails, so the old pointer is only replaced on success */
+        grown = mem_double(d->val, d->size * sizeof(char*)) ;
+        if (grown==NULL) goto fail ;
+        d->val = (char **)grown ;
+        grown = mem_double(d->key, d->size * sizeof(char*)) ;
+        if (grown==NULL) goto fail ;
+        d->key = (char **)grown ;
+        grown = mem_double(d->hash, d->size * sizeof(unsigned)) ;
+        if (grown==NULL) goto fail ;
+        d->hash = (unsigned int *)grown ;
+        d->size *= 2 ;
+    }
+
+    /* Insert in the first empty slot, scanning from d->n and wrapping at
+       d->size. Because d->n < d->size this will necessarily terminate. */
+    for (i=d->n ; d->key[i] ; ) {
+        if(++i == d->size) i = 0;
+    }
+    d->key[i]  = newkey ;
+    d->val[i]  = newval ;
+    d->hash[i] = hash ;
+    d->n ++ ;
+    return 0 ;
+fail:
+    free(newkey);
+    free(newval);
+    return -1 ;
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Remove a key and its value from a dictionary
+  @param    d       dictionary object to modify.
+  @param    key     Key to remove; a NULL key is ignored.
+  @return   void
+
+  The first slot whose hash and key string both match is emptied and the
+  entry count is decremented. An unknown key leaves 'd' untouched.
+ */
+/*--------------------------------------------------------------------------*/
+void dictionary_unset(dictionary * d, const char * key)
+{
+    unsigned    wanted ;
+    int         slot ;
+    int         hit ;
+
+    if (key == NULL) {
+        return;
+    }
+
+    /* Locate the slot holding this key, if any */
+    wanted = dictionary_hash(key);
+    hit = -1 ;
+    for (slot=0 ; slot<d->size && hit<0 ; slot++) {
+        if (d->key[slot]==NULL || d->hash[slot]!=wanted)
+            continue ;
+        /* Equal hashes are not proof: confirm on the string */
+        if (strcmp(key, d->key[slot])==0)
+            hit = slot ;
+    }
+
+    if (hit<0)
+        /* Key not found */
+        return ;
+
+    /* Release both strings and mark the slot as free; free(NULL) is a
+       no-op, so a NULL value needs no special case */
+    free(d->key[hit]);
+    d->key[hit] = NULL ;
+    free(d->val[hit]);
+    d->val[hit] = NULL ;
+
+    d->hash[hit] = 0 ;
+    d->n -- ;
+    return ;
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Print the contents of a dictionary to an open stream.
+  @param    d   Dictionary to dump
+  @param    out Opened file pointer; stdout and stderr are fine.
+  @return   void
+
+  Each stored key is printed right-aligned in 20 columns, then a tab and
+  the value in square brackets ("UNDEF" for a NULL value), one per line.
+  An empty dictionary prints "empty dictionary"; NULL d or out does nothing.
+ */
+/*--------------------------------------------------------------------------*/
+void dictionary_dump(dictionary * d, FILE * out)
+{
+    int             slot ;
+    const char  *   shown ;
+
+    if (d==NULL || out==NULL) return ;
+    if (d->n<1) {
+        fprintf(out, "empty dictionary\n");
+        return ;
+    }
+    for (slot=0 ; slot<d->size ; slot++) {
+        if (d->key[slot]==NULL)
+            continue ;
+        shown = d->val[slot] ? d->val[slot] : "UNDEF" ;
+        fprintf(out, "%20s\t[%s]\n", d->key[slot], shown);
+    }
+    return ;
+}
+
+
+/* Test code */
+#ifdef TESTDIC
+#define NVALS 20000
+int main(int argc, char *argv[])
+{
+    dictionary  *   dict ;
+    char        *   found ;
+    char            name[90] ;
+    int             k ;
+
+    /* Self-test: fill, read back and empty a dictionary of NVALS keys */
+    printf("allocating...\n");
+    dict = dictionary_new(0);
+
+    /* Phase 1: insert keys "0000", "0001", ... all mapped to "salut" */
+    printf("setting %d values...\n", NVALS);
+    for (k=0 ; k<NVALS ; k++) {
+        sprintf(name, "%04d", k);
+        dictionary_set(dict, name, "salut");
+    }
+    /* Phase 2: every key must be retrievable */
+    printf("getting %d values...\n", NVALS);
+    for (k=0 ; k<NVALS ; k++) {
+        sprintf(name, "%04d", k);
+        found = dictionary_get(dict, name, DICT_INVALID_KEY);
+        if (found==DICT_INVALID_KEY)
+            printf("cannot get value for key [%s]\n", name);
+    }
+    /* Phase 3: remove every key; the entry count must drop to zero */
+    printf("unsetting %d values...\n", NVALS);
+    for (k=0 ; k<NVALS ; k++) {
+        sprintf(name, "%04d", k);
+        dictionary_unset(dict, name);
+    }
+    if (dict->n != 0)
+        printf("error deleting values\n");
+    printf("deallocating...\n");
+    dictionary_del(dict);
+    return 0 ;
+}
+#endif
+/* vim: set ts=4 et sw=4 tw=75 */
diff --git a/thirdparty/SZ/sz/src/iniparser.c b/thirdparty/SZ/sz/src/iniparser.c
new file mode 100644
index 0000000000000000000000000000000000000000..b076ed1d0f885b419c170eff5924614cd9239350
--- /dev/null
+++ b/thirdparty/SZ/sz/src/iniparser.c
@@ -0,0 +1,774 @@
+
+/*-------------------------------------------------------------------------*/
+/**
+   @file    iniparser.c
+   @author  N. Devillard
+   @brief   Parser for ini files.
+*/
+/*--------------------------------------------------------------------------*/
+/*---------------------------- Includes ------------------------------------*/
+#include <ctype.h>
+#include "iniparser.h"
+
+/*---------------------------- Defines -------------------------------------*/
+#define ASCIILINESZ         (1024)
+#define INI_INVALID_KEY     ((char*)-1)
+
+/*---------------------------------------------------------------------------
+                        Private to this module
+ ---------------------------------------------------------------------------*/
+/**
+ * Outcome of parsing one ini line, as set by iniparser_line() (internal).
+ */
+typedef enum _line_status_ {
+    LINE_UNPROCESSED,   /* initial state, before any test has run */
+    LINE_ERROR,         /* line matched none of the known forms */
+    LINE_EMPTY,         /* nothing left after stripping blanks */
+    LINE_COMMENT,       /* first character is '#' or ';' */
+    LINE_SECTION,       /* "[name]" header */
+    LINE_VALUE          /* "key = value" pair, value possibly empty */
+} line_status ;
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Convert a string to lowercase.
+  @param    s   String to convert.
+  @return   ptr to statically allocated string, NULL if s is NULL.
+
+  The lowercased copy lives in a static buffer that is overwritten by
+  the next call (not re-entrant); do not free or modify it. Only the
+  first ASCIILINESZ characters of the input are kept. Characters are
+  passed to tolower() as unsigned char, as <ctype.h> requires.
+ */
+/*--------------------------------------------------------------------------*/
+static char * strlwc(const char * s)
+{
+    static char l[ASCIILINESZ+1];
+    int i ;
+
+    if (s==NULL) return NULL ;
+    memset(l, 0, ASCIILINESZ+1);
+    /* At most ASCIILINESZ characters are converted; longer input is cut */
+    for (i=0 ; s[i] && i<ASCIILINESZ ; i++) {
+        /* <ctype.h> needs an unsigned char value: a negative char is UB */
+        l[i] = (char)tolower((unsigned char)s[i]);
+    }
+    l[ASCIILINESZ]=(char)0;
+    return l ;
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Remove blanks at the beginning and the end of a string.
+  @param    s   String to parse.
+  @return   ptr to statically allocated string, NULL if s is NULL.
+
+  The result is a copy of the input without its leading and trailing
+  whitespace, cut to at most ASCIILINESZ characters. It lives in a
+  static buffer that the next call overwrites (not re-entrant), so do
+  not free or modify it. Characters are classified with isspace() on
+  their unsigned char value, as <ctype.h> requires, so bytes above
+  0x7F are handled safely.
+ */
+/*--------------------------------------------------------------------------*/
+static char * strstrip(const char * s)
+{
+    static char l[ASCIILINESZ+1];
+    char * last;
+
+    if (s==NULL) return NULL ;
+
+    /* <ctype.h> needs unsigned char values: a negative char is UB */
+    while (*s && isspace((unsigned char)*s)) s++;
+    memset(l, 0, ASCIILINESZ+1);
+    /* The buffer was zeroed, so copying one byte less keeps it terminated */
+    strncpy(l, s, ASCIILINESZ);
+    last = l + strlen(l);
+    /* Walk back over trailing blanks */
+    while (last > l && isspace((unsigned char)*(last-1)))
+        last -- ;
+    *last = (char)0;
+    return (char*)l ;
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Get number of sections in a dictionary
+  @param    d   Dictionary to examine
+  @return   int Number of sections found in dictionary, -1 if d is NULL
+
+  Keys belonging to a section are stored as "section:key" while the
+  section itself is stored as plain "section". The count is therefore
+  the number of stored keys that contain no colon.
+
+  A section name that itself contains a colon is not recognised as a
+  section by this test, so such names should be avoided.
+ */
+/*--------------------------------------------------------------------------*/
+int iniparser_getnsec(dictionary * d)
+{
+    int slot ;
+    int count ;
+
+    if (d==NULL) return -1 ;
+
+    count = 0 ;
+    for (slot=0 ; slot<d->size ; slot++) {
+        const char * name = d->key[slot] ;
+
+        /* Unused slots hold no key at all */
+        if (name==NULL)
+            continue ;
+        /* No colon: this key names a section */
+        if (strchr(name, ':')==NULL)
+            count++ ;
+    }
+    return count ;
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Get name for section n in a dictionary.
+  @param    d   Dictionary to examine
+  @param    n   Section number (from 0 to nsec-1).
+  @return   Pointer to char string
+
+  Sections are the stored keys that contain no colon; they are numbered
+  in slot order starting at 0. The returned pointer refers to the key
+  string held by the dictionary. Do not free or modify it!
+
+  NULL is returned if d is NULL, n is negative or n is out of range.
+ */
+/*--------------------------------------------------------------------------*/
+char * iniparser_getsecname(dictionary * d, int n)
+{
+    int slot ;
+    int skip ;
+
+    if (d==NULL || n<0) return NULL ;
+
+    /* 'skip' counts how many section entries still precede the wanted one */
+    skip = n ;
+    for (slot=0 ; slot<d->size ; slot++) {
+        if (d->key[slot]==NULL)
+            continue ;
+        if (strchr(d->key[slot], ':')!=NULL)
+            continue ;
+        if (skip==0)
+            return d->key[slot] ;
+        skip-- ;
+    }
+    /* Fewer than n+1 sections are stored */
+    return NULL ;
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Dump a dictionary to an opened file pointer.
+  @param    d   Dictionary to dump.
+  @param    f   Opened file pointer to dump to.
+  @return   void
+
+  Debugging aid: every stored key is printed on its own line as
+  [key]=[value], or as [key]=UNDEF when no value is attached. It is OK
+  to pass @c stderr or @c stdout. Nothing is printed when d or f is
+  NULL.
+ */
+/*--------------------------------------------------------------------------*/
+void iniparser_dump(dictionary * d, FILE * f)
+{
+    int     slot ;
+
+    if (d==NULL || f==NULL) return ;
+
+    for (slot=0 ; slot<d->size ; slot++) {
+        if (d->key[slot]==NULL) {
+            /* unused slot */
+        } else if (d->val[slot]==NULL) {
+            fprintf(f, "[%s]=UNDEF\n", d->key[slot]);
+        } else {
+            fprintf(f, "[%s]=[%s]\n", d->key[slot], d->val[slot]);
+        }
+    }
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Save a dictionary to a loadable ini file
+  @param    d   Dictionary to dump
+  @param    f   Opened file pointer to dump to
+  @return   void
+
+  Sections are written one after another; a dictionary without sections
+  is written as plain "key = value" lines, the value left empty if unset.
+ */
+/*--------------------------------------------------------------------------*/
+void iniparser_dump_ini(dictionary * d, FILE * f)
+{
+    int     i ;
+    int     nsec ;
+    char *  secname ;
+
+    if (d==NULL || f==NULL) return ;
+
+    nsec = iniparser_getnsec(d);
+    if (nsec<1) {
+        /* No section in file: dump all keys as they are */
+        for (i=0 ; i<d->size ; i++) {
+            if (d->key[i]==NULL)
+                continue ;
+            /* NULL must never reach %s: print a valueless key as empty */
+            fprintf(f, "%s = %s\n", d->key[i], d->val[i] ? d->val[i] : "");
+        }
+        return ;
+    }
+    for (i=0 ; i<nsec ; i++) {
+        secname = iniparser_getsecname(d, i) ;
+        iniparser_dumpsection_ini(d, secname, f) ;
+    }
+    fprintf(f, "\n");
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Save a dictionary section to a loadable ini file
+  @param    d   Dictionary to dump
+  @param    s   Section name of dictionary to dump
+  @param    f   Opened file pointer to dump to
+  @return   void
+
+  Writes "[s]" followed by one "key = value" line per "s:key" entry.
+  Nothing is written if d or f is NULL or the section is not stored.
+ */
+/*--------------------------------------------------------------------------*/
+void iniparser_dumpsection_ini(dictionary * d, char * s, FILE * f)
+{
+    int     j ;
+    int     seclen ;
+
+    if (d==NULL || f==NULL) return ;
+    if (! iniparser_find_entry(d, s)) return ;
+
+    seclen  = (int)strlen(s);
+    fprintf(f, "\n[%s]\n", s);
+    for (j=0 ; j<d->size ; j++) {
+        if (d->key[j]==NULL)
+            continue ;
+        /* Match the "section:" prefix in place rather than building it in
+           a fixed-size buffer, which a long section name could overflow */
+        if (strncmp(d->key[j], s, seclen)==0 && d->key[j][seclen]==':') {
+            fprintf(f,
+                    "%-30s = %s\n",
+                    d->key[j]+seclen+1,
+                    d->val[j] ? d->val[j] : "");
+        }
+    }
+    fprintf(f, "\n");
+    return ;
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Get the number of keys in a section of a dictionary.
+  @param    d   Dictionary to examine
+  @param    s   Section name of dictionary to examine
+  @return   Number of "s:..." keys; 0 if d is NULL or s is not stored
+ */
+/*--------------------------------------------------------------------------*/
+int iniparser_getsecnkeys(dictionary * d, char * s)
+{
+    int     seclen ;
+    int     nkeys ;
+    int     j ;
+
+    nkeys = 0;
+
+    if (d==NULL) return nkeys;
+    if (! iniparser_find_entry(d, s)) return nkeys;
+
+    seclen  = (int)strlen(s);
+
+    for (j=0 ; j<d->size ; j++) {
+        if (d->key[j]==NULL)
+            continue ;
+        /* Match the "section:" prefix in place rather than building it in
+           a fixed-size buffer, which a long section name could overflow */
+        if (strncmp(d->key[j], s, seclen)==0 && d->key[j][seclen]==':')
+            nkeys++;
+    }
+
+    return nkeys;
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Get the keys stored in a section of a dictionary.
+  @param    d   Dictionary to examine
+  @param    s   Section name of dictionary to examine
+  @return   malloc'ed array of key pointers, or NULL
+
+  The array holds one pointer per "s:..." key, in slot order; use
+  iniparser_getsecnkeys() for its length. The caller frees the array
+  itself, but not the strings, which stay owned by the dictionary.
+
+  NULL is returned if d is NULL, s is not stored, or malloc returns NULL.
+ */
+/*--------------------------------------------------------------------------*/
+char ** iniparser_getseckeys(dictionary * d, char * s)
+{
+    char **keys;
+    int i, j ;
+    int     seclen, nkeys ;
+
+    keys = NULL;
+
+    if (d==NULL) return keys;
+    if (! iniparser_find_entry(d, s)) return keys;
+
+    nkeys = iniparser_getsecnkeys(d, s);
+
+    keys = (char**) malloc(nkeys*sizeof(char*));
+    if (keys==NULL) {
+        /* Out of memory (or a zero-byte request that yielded NULL) */
+        return NULL;
+    }
+
+    seclen  = (int)strlen(s);
+
+    i = 0;
+    for (j=0 ; j<d->size ; j++) {
+        if (d->key[j]==NULL)
+            continue ;
+        /* Match the "section:" prefix in place rather than building it in
+           a fixed-size buffer, which a long section name could overflow */
+        if (strncmp(d->key[j], s, seclen)==0 && d->key[j][seclen]==':') {
+            keys[i] = d->key[j];
+            i++;
+        }
+    }
+
+    return keys;
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Get the string associated to a key
+  @param    d       Dictionary to search
+  @param    key     Key string to look for
+  @param    def     Default value to return if key not found.
+  @return   pointer to the stored string, or def
+
+  The key, given as "section:key", is lowercased before the lookup.
+  'def' is returned when the key is not stored and also when d or key
+  is NULL. A key stored without a value yields NULL.
+  The returned pointer refers to a string owned by the dictionary,
+  do not free or modify it.
+ */
+/*--------------------------------------------------------------------------*/
+char * iniparser_getstring(dictionary * d, const char * key, char * def)
+{
+    char * folded ;
+
+    if (d!=NULL && key!=NULL) {
+        /* The lookup uses the lowercased form of the key; strlwc hands
+           back a static buffer that is only valid until its next call */
+        folded = strlwc(key);
+        return dictionary_get(d, folded, def);
+    }
+    return def ;
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Get the string associated to a key, convert to an int
+  @param    d Dictionary to search
+  @param    key Key string to look for
+  @param    notfound Value to return in case of error
+  @return   integer
+
+  This function queries a dictionary for a key. A key as read from an
+  ini file is given as "section:key". If the key cannot be found, or is
+  stored without a value, the notfound value is returned.
+
+  Supported values for integers include the usual C notation
+  so decimal, octal (starting with 0) and hexadecimal (starting with 0x)
+  are supported. Examples:
+
+  "42"      ->  42
+  "042"     ->  34 (octal -> decimal)
+  "0x42"    ->  66 (hexa  -> decimal)
+
+  Warning: the conversion may overflow in various ways. Conversion is
+  totally outsourced to strtol(), see the associated man page for overflow
+  handling.
+
+  Credits: Thanks to A. Becker for suggesting strtol()
+ */
+/*--------------------------------------------------------------------------*/
+int iniparser_getint(dictionary * d, const char * key, int notfound)
+{
+    char * str = iniparser_getstring(d, key, INI_INVALID_KEY);
+
+    /* A key stored without a value comes back as NULL: nothing to parse */
+    if (str==NULL || str==INI_INVALID_KEY) return notfound ;
+    return (int)strtol(str, NULL, 0);
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Get the string associated to a key, convert to a long
+  @param    d Dictionary to search
+  @param    key Key string to look for
+  @param    notfound Value to return in case of error
+  @return   long, or notfound if the key is missing or has no value
+
+  Credits: This function bases completely on int iniparser_getint and was
+  slightly modified to return long instead of int.
+ */
+/*--------------------------------------------------------------------------*/
+long iniparser_getlint(dictionary * d, const char * key, int notfound)
+{
+    char * str = iniparser_getstring(d, key, INI_INVALID_KEY);
+
+    /* A key stored without a value comes back as NULL: nothing to parse */
+    if (str==NULL || str==INI_INVALID_KEY) return notfound ;
+    return strtol(str, NULL, 0);
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Get the string associated to a key, convert to a double
+  @param    d Dictionary to search
+  @param    key Key string to look for
+  @param    notfound Value to return in case of error
+  @return   double
+
+  This function queries a dictionary for a key. A key as read from an
+  ini file is given as "section:key". If the key cannot be found, or is
+  stored without a value, the notfound value is returned.
+ */
+/*--------------------------------------------------------------------------*/
+double iniparser_getdouble(dictionary * d, const char * key, double notfound)
+{
+    char * str = iniparser_getstring(d, key, INI_INVALID_KEY);
+
+    /* A key stored without a value comes back as NULL: nothing to parse */
+    if (str==NULL || str==INI_INVALID_KEY) return notfound ;
+    return atof(str);
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Get the string associated to a key, convert to a boolean
+  @param    d Dictionary to search
+  @param    key Key string to look for
+  @param    notfound Value to return in case of error
+  @return   integer
+
+  This function queries a dictionary for a key. A key as read from an
+  ini file is given as "section:key". If the key cannot be found, or is
+  stored without a value, the notfound value is returned.
+
+  A true boolean is found if one of the following is matched:
+
+  - A string starting with 'y'
+  - A string starting with 'Y'
+  - A string starting with 't'
+  - A string starting with 'T'
+  - A string starting with '1'
+
+  A false boolean is found if one of the following is matched:
+
+  - A string starting with 'n'
+  - A string starting with 'N'
+  - A string starting with 'f'
+  - A string starting with 'F'
+  - A string starting with '0'
+
+  The notfound value returned if no boolean is identified, does not
+  necessarily have to be 0 or 1.
+ */
+/*--------------------------------------------------------------------------*/
+int iniparser_getboolean(dictionary * d, const char * key, int notfound)
+{
+    char    *   c ;
+
+    c = iniparser_getstring(d, key, INI_INVALID_KEY);
+    /* A key stored without a value comes back as NULL: nothing to test */
+    if (c==NULL || c==INI_INVALID_KEY) return notfound ;
+    switch (c[0]) {
+        case 'y': case 'Y': case '1': case 't': case 'T':
+            return 1 ;
+        case 'n': case 'N': case '0': case 'f': case 'F':
+            return 0 ;
+        default:
+            return notfound ;
+    }
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Finds out if a given entry exists in a dictionary
+  @param    ini     Dictionary to search
+  @param    entry   Name of the entry to look for
+  @return   integer 1 if entry exists, 0 otherwise
+
+  Works for sections too: they are stored as keys with a NULL value,
+  which a plain value lookup cannot tell apart from a missing entry.
+  A NULL ini or entry counts as "not found".
+ */
+/*--------------------------------------------------------------------------*/
+int iniparser_find_entry(
+    dictionary  *   ini,
+    const char  *   entry
+)
+{
+    char * hit ;
+
+    /* The sentinel only comes back when the lookup found nothing */
+    hit = iniparser_getstring(ini, entry, INI_INVALID_KEY) ;
+    return (hit!=INI_INVALID_KEY) ? 1 : 0 ;
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Set an entry in a dictionary.
+  @param    ini     Dictionary to modify.
+  @param    entry   Entry to modify (entry name)
+  @param    val     New value to associate to the entry.
+  @return   int 0 if Ok, -1 otherwise.
+
+  The entry name is lowercased, then handed to dictionary_set(): an
+  existing entry gets the new value, an unknown one is added. val may be NULL.
+ */
+/*--------------------------------------------------------------------------*/
+int iniparser_set(dictionary * ini, const char * entry, const char * val)
+{
+    char * lc_entry = strlwc(entry) ;
+    return dictionary_set(ini, lc_entry, val) ;
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Delete an entry in a dictionary
+  @param    ini     Dictionary to modify
+  @param    entry   Entry to delete (entry name)
+  @return   void
+  The lowercased entry is removed if present; otherwise nothing happens.
+ */
+/*--------------------------------------------------------------------------*/
+void iniparser_unset(dictionary * ini, const char * entry)
+{
+    char * lc_entry = strlwc(entry) ;
+    dictionary_unset(ini, lc_entry);
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Classify one line of an INI file and extract its fields
+  @param    input_line  Input line, may be concatenated multi-line input
+  @param    section     Output: lowercased section name (for LINE_SECTION)
+  @param    key         Output: stripped, lowercased key (for LINE_VALUE)
+  @param    value       Output: stripped value, maybe "" (for LINE_VALUE)
+  @return   LINE_EMPTY, LINE_COMMENT, LINE_SECTION, LINE_VALUE or LINE_ERROR
+ */
+/*--------------------------------------------------------------------------*/
+static line_status iniparser_line(
+    const char * input_line,
+    char * section,
+    char * key,
+    char * value)
+{
+    line_status sta ;
+    char        line[ASCIILINESZ+1];
+    int         len ;
+    /* Work on a stripped, zero-padded copy of at most ASCIILINESZ chars */
+    memset(line, 0, ASCIILINESZ + 1);
+    len = (int)strlen(strstrip(input_line));
+    if (len > ASCIILINESZ)
+        len = ASCIILINESZ;
+    strncpy(line, strstrip(input_line), len);
+    len = (int)strlen(line);
+    /* The tests below run in order; the first one that matches wins */
+    sta = LINE_UNPROCESSED ;
+    if (len<1) {
+        /* Empty line */
+        sta = LINE_EMPTY ;
+    } else if (line[0]=='#' || line[0]==';') {
+        /* Comment line */
+        sta = LINE_COMMENT ;
+    } else if (line[0]=='[' && line[len-1]==']') {
+        /* Section name (no field width: the buffer must fit the line) */
+        sscanf(line, "[%[^]]", section);
+        strcpy(section, strstrip(section));
+        strcpy(section, strlwc(section));
+        sta = LINE_SECTION ;
+    } else if (sscanf (line, "%[^=] = \"%[^\"]\"", key, value) == 2
+           ||  sscanf (line, "%[^=] = '%[^\']'",   key, value) == 2
+           ||  sscanf (line, "%[^=] = %[^;#]",     key, value) == 2) {
+        /* Usual key=value: double-quoted, single-quoted, then bare value
+           cut at the first ';' or '#' */
+        strcpy(key, strstrip(key));
+        strcpy(key, strlwc(key));
+        strcpy(value, strstrip(value));
+        /*
+         * sscanf cannot handle '' or "" as empty values
+         * this is done here
+         */
+        if (!strcmp(value, "\"\"") || (!strcmp(value, "''"))) {
+            value[0]=0 ;
+        }
+        sta = LINE_VALUE ;
+    } else if (sscanf(line, "%[^=] = %[;#]", key, value)==2
+           ||  sscanf(line, "%[^=] %[=]", key, value) == 2) {
+        /*
+         * Special cases, all stored with an empty value:
+         * key=
+         * key=;
+         * key=#
+         */
+        strcpy(key, strstrip(key));
+        strcpy(key, strlwc(key));
+        value[0]=0 ;
+        sta = LINE_VALUE ;
+    } else {
+        /* No form matched: syntax error, both line versions go to stdout */
+        sta = LINE_ERROR ;
+        printf("===== > %s   ===> %s\n", input_line, line);
+    }
+    return sta ;
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Parse an ini file and return an allocated dictionary object
+  @param    ininame Name of the ini file to read.
+  @return   Pointer to newly allocated dictionary, or NULL on failure
+
+  This is the parser for ini files. This function is called, providing
+  the name of the file to be read. It returns a dictionary object that
+  should not be accessed directly, but through accessor functions
+  instead.
+
+  NULL is returned when the file cannot be opened, when a line does not
+  fit in ASCIILINESZ characters, when any line has a syntax error, or
+  when storing an entry in the dictionary fails. Diagnostics go to stderr.
+
+  The returned dictionary must be freed using iniparser_freedict().
+ */
+/*--------------------------------------------------------------------------*/
+dictionary * iniparser_load(const char * ininame)
+{
+    FILE * in ;
+
+    char line    [ASCIILINESZ+1] ;
+    char section [ASCIILINESZ+1] ;
+    char key     [ASCIILINESZ+1] ;
+    char tmp     [(ASCIILINESZ * 2) + 2] ;  /* holds "section:key" in full */
+    char val     [ASCIILINESZ+1] ;
+
+    int  last=0 ;
+    int  len ;
+    int  lineno=0 ;
+    int  errs=0;      /* number of syntax errors seen so far */
+    int  mem_err=0;   /* result of the most recent dictionary_set() */
+
+    dictionary * dict ;
+
+    if ((in=fopen(ininame, "r"))==NULL) {
+        fprintf(stderr, "iniparser: cannot open %s\n", ininame);
+        return NULL ;
+    }
+
+    dict = dictionary_new(0) ;
+    if (!dict) {
+        fclose(in);
+        return NULL ;
+    }
+
+    memset(line,    0, sizeof(line));
+    memset(section, 0, sizeof(section));
+    memset(key,     0, sizeof(key));
+    memset(val,     0, sizeof(val));
+    last=0 ;
+
+    while (fgets(line+last, ASCIILINESZ-last, in)!=NULL) {
+        lineno++ ;
+        len = (int)strlen(line)-1;
+        /* len is -1 when the line begins with a NUL byte; never index with it */
+        if (len<=0)
+            continue;
+        /*
+         * Safety check against buffer overflows. A final line that simply
+         * lacks its trailing newline is not "too long": fgets() stopped at
+         * end of file, not at the end of the buffer.
+         */
+        if (line[len]!='\n' && !feof(in)) {
+            fprintf(stderr,
+                    "iniparser: input line too long in %s (%d)\n",
+                    ininame,
+                    lineno);
+            dictionary_del(dict);
+            fclose(in);
+            return NULL ;
+        }
+        /* Get rid of \n and spaces at end of line */
+        while ((len>=0) &&
+                ((line[len]=='\n') || (isspace((unsigned char)line[len])))) {
+            line[len]=0 ;
+            len-- ;
+        }
+        /* The line was nothing but whitespace: keep the index in bounds */
+        if (len<0) {
+            len=0 ;
+        }
+        /* Detect multi-line */
+        if (line[len]=='\\') {
+            /* Multi-line value: the next fgets() overwrites the backslash */
+            last=len ;
+            continue ;
+        } else {
+            last=0 ;
+        }
+        switch (iniparser_line(line, section, key, val)) {
+            case LINE_EMPTY:
+            case LINE_COMMENT:
+            break ;
+
+            case LINE_SECTION:
+            mem_err = dictionary_set(dict, section, NULL);
+            break ;
+
+            case LINE_VALUE:
+            snprintf(tmp, sizeof(tmp), "%s:%s", section, key);
+            mem_err = dictionary_set(dict, tmp, val) ;
+            break ;
+
+            case LINE_ERROR:
+            fprintf(stderr, "iniparser: syntax error in %s (%d):\n",
+                    ininame,
+                    lineno);
+            fprintf(stderr, "-> %s\n", line);
+            /* Counted separately so a later successful insert cannot clear it */
+            errs++ ;
+            break;
+
+            default:
+            break ;
+        }
+        memset(line, 0, sizeof(line));
+        last=0;
+        if (mem_err<0) {
+            fprintf(stderr, "iniparser: memory allocation failure\n");
+            break ;
+        }
+    }
+    if (errs || mem_err) {
+        dictionary_del(dict);
+        dict = NULL ;
+    }
+    fclose(in);
+    return dict ;
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+  @brief    Free all memory associated to an ini dictionary
+  @param    d Dictionary to free
+  @return   void
+
+  Free all memory associated to an ini dictionary.
+  It is mandatory to call this function before the dictionary object
+  gets out of the current context.
+ */
+/*--------------------------------------------------------------------------*/
+void iniparser_freedict(dictionary * d)
+{
+    /* Thin wrapper: the whole job is delegated to dictionary_del(). */
+    dictionary_del(d);
+}
+
+/* vim: set ts=4 et sw=4 tw=75 */
diff --git a/thirdparty/SZ/sz/src/pastri.c b/thirdparty/SZ/sz/src/pastri.c
new file mode 100644
index 0000000000000000000000000000000000000000..7c6908b5f35360351c74bd994e4cf540cf54953c
--- /dev/null
+++ b/thirdparty/SZ/sz/src/pastri.c
@@ -0,0 +1,87 @@
+#include "pastri.h"
+#include "pastriD.h"
+#include "pastriF.h"
+
+//Read seven whitespace-separated values from a text file into *paramsPtr, in
+//this order: bf[0], bf[1], bf[2], bf[3], originalEb, dataSize, numBlocks.
+//Problems are reported on stdout. A file that cannot be opened trips
+//assert(0); when assertions are compiled out the function returns without
+//touching *paramsPtr.
+void SZ_pastriReadParameters(char paramsFilename[512],pastri_params *paramsPtr){
+  FILE *paramsF;
+  int nRead;
+  paramsF=fopen(paramsFilename,"r");
+
+  if(paramsF==NULL){
+    printf("ERROR: Parameters file cannot be opened.\n");
+    printf("Filename: %s\n",paramsFilename);
+    assert(0);
+    return; //assert() vanishes under NDEBUG; never hand a NULL stream to fscanf()
+  }
+
+  nRead=fscanf(paramsF,"%d %d %d %d %lf %d %d",&paramsPtr->bf[0],&paramsPtr->bf[1],&paramsPtr->bf[2],&paramsPtr->bf[3],&paramsPtr->originalEb,&paramsPtr->dataSize,&paramsPtr->numBlocks);
+  //printf("Params: %d %d %d %d %.3e %d\n",paramsPtr->bf[0],paramsPtr->bf[1],paramsPtr->bf[2],paramsPtr->bf[3],paramsPtr->originalEb,paramsPtr->numBlocks);
+  fclose(paramsF);
+
+  if(nRead!=7){
+    //A short or malformed file leaves some fields unset; say so instead of staying silent.
+    printf("ERROR: Parameters file is incomplete or malformed.\n");
+    printf("Filename: %s\n",paramsFilename);
+  }
+}
+
+//Fill in the derived fields of *p (idxRange[], sbSize, sbNum, bSize, usedEb)
+//from bf[] and originalEb.
+void SZ_pastriPreprocessParameters(pastri_params *p){
+  int k;
+
+  //idxRange[k] = (bf[k]+1)*(bf[k]+2)/2 for each of the four entries
+  for(k=0;k<4;k++){
+    p->idxRange[k]=(p->bf[k]+1)*(p->bf[k]+2)/2;
+  }
+
+  //Sizes built from the index ranges
+  p->sbSize=p->idxRange[2]*p->idxRange[3];
+  p->sbNum=p->idxRange[0]*p->idxRange[1];
+  p->bSize=p->sbSize*p->sbNum;
+
+  //Slightly tightened bound: needed just to eliminate some rounding errors,
+  //with almost no effect on compression rate/ratios.
+  p->usedEb=p->originalEb*0.999;
+}
+
+//Compress p->numBlocks consecutive blocks from originalBuf into a newly
+//allocated buffer. The output starts with a raw copy of *p, followed by the
+//per-block output of pastri_double_Compress() (dataSize 8) or
+//pastri_float_Compress() (dataSize 4).
+//  *compressedBufP  receives the buffer (caller frees); NULL if allocation fails
+//  *compressedBytes receives the number of bytes written (0 on failure)
+void SZ_pastriCompressBatch(pastri_params *p,unsigned char *originalBuf, unsigned char** compressedBufP,size_t *compressedBytes){
+  //Size of one uncompressed block, kept in size_t so large batches do not overflow int arithmetic
+  size_t blockBytes=(size_t)p->bSize*(size_t)p->dataSize;
+  size_t bytePos=0; //Current byte pos in the outBuf
+  int i;
+
+  //Reserve room for the parameter header in addition to the raw payload size
+  (*compressedBufP) = (unsigned char*)calloc(sizeof(pastri_params)+(size_t)p->numBlocks*blockBytes,sizeof(char));
+  if((*compressedBufP)==NULL){
+    printf("ERROR: Cannot allocate the compressed buffer.\n");
+    *compressedBytes=0;
+    return;
+  }
+
+  memcpy(*compressedBufP, p, sizeof(pastri_params));
+  bytePos+=sizeof(pastri_params);
+
+  for(i=0;i<p->numBlocks;i++){
+    int bytes=0; //bytes for this block; stays 0 when dataSize is neither 8 nor 4
+    if(p->dataSize==8){
+      pastri_double_Compress(originalBuf + ((size_t)i*blockBytes),p,(*compressedBufP) + bytePos,&bytes);
+    }else if(p->dataSize==4){
+      pastri_float_Compress(originalBuf + ((size_t)i*blockBytes),p,(*compressedBufP) + bytePos,&bytes);
+    }
+    bytePos+=bytes;
+    //printf("bytes:%d\n",bytes);
+  }
+  *compressedBytes=bytePos;
+  //printf("totalBytesWritten:%d\n",*compressedBytes);
+}
+
+//Inverse of SZ_pastriCompressBatch(): copy the parameter header at the start
+//of compressedBuf into *p, then decode p->numBlocks blocks into a newly
+//allocated buffer.
+//  *decompressedBufP  receives the buffer (caller frees); NULL if allocation fails
+//  *decompressedBytes receives numBlocks*bSize*dataSize (0 on failure)
+void SZ_pastriDecompressBatch(unsigned char*compressedBuf, pastri_params *p, unsigned char** decompressedBufP ,size_t *decompressedBytes){
+  size_t bytePos=0; //Current byte pos in compressedBuf
+  size_t blockBytes;
+  size_t totalBytes;
+  int i;
+
+  memcpy(p, compressedBuf, sizeof(pastri_params));
+  bytePos+=sizeof(pastri_params);
+
+  //Sizes are computed in size_t so large batches do not overflow int arithmetic
+  blockBytes=(size_t)p->bSize*(size_t)p->dataSize;
+  totalBytes=(size_t)p->numBlocks*blockBytes;
+
+  (*decompressedBufP) = (unsigned char*)malloc(totalBytes*sizeof(char));
+  if((*decompressedBufP)==NULL){
+    printf("ERROR: Cannot allocate the decompressed buffer.\n");
+    *decompressedBytes=0;
+    return;
+  }
+
+  for(i=0;i<p->numBlocks;i++){
+    int bytes=0; //bytes consumed by this block; stays 0 when dataSize is neither 8 nor 4
+    if(p->dataSize==8){
+      pastri_double_Decompress(compressedBuf + bytePos,p->dataSize,p,(*decompressedBufP) + ((size_t)i*blockBytes),&bytes);
+    }else if(p->dataSize==4){
+      pastri_float_Decompress(compressedBuf + bytePos,p->dataSize,p,(*decompressedBufP) + ((size_t)i*blockBytes),&bytes);
+    }
+
+    bytePos += bytes;
+    //printf("bytes:%d\n",bytes);
+  }
+  //printf("totalBytesRead:%d\n",bytePos);
+  *decompressedBytes=totalBytes;
+}
+
+//Run pastri_double_Check() or pastri_float_Check() on every block, pairing
+//each original block with the decompressed block at the same offset.
+void SZ_pastriCheckBatch(pastri_params *p,unsigned char*originalBuf,unsigned char*decompressedBuf){
+  int blk;
+  for(blk=0;blk<p->numBlocks;blk++){
+    unsigned char *orig;
+    unsigned char *deco;
+
+    //Only 8-byte and 4-byte elements are handled; anything else is skipped.
+    if(p->dataSize!=8 && p->dataSize!=4){
+      continue;
+    }
+    orig=originalBuf+(blk*p->bSize*p->dataSize);
+    deco=decompressedBuf+(blk*p->bSize*p->dataSize);
+
+    if(p->dataSize==8){
+      pastri_double_Check(orig,p->dataSize,deco,p);
+    }else{
+      pastri_float_Check(orig,p->dataSize,deco,p);
+    }
+  }
+}
diff --git a/thirdparty/SZ/sz/src/rw.c b/thirdparty/SZ/sz/src/rw.c
new file mode 100644
index 0000000000000000000000000000000000000000..91a1d6bc34e9b984df9298f56d5f763e319f588a
--- /dev/null
+++ b/thirdparty/SZ/sz/src/rw.c
@@ -0,0 +1,1068 @@
+/**
+ *  @file rw.c
+ *  @author Sheng Di
+ *  @date April, 2015
+ *  @brief io interface for Fortran
+ *  (C) 2015 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include "rw.h"
+#include "sz.h"
+
+/* Return 1 when access(filePath, F_OK) succeeds (the path exists), else 0. */
+int checkFileExistance(char* filePath)
+{
+	return (access(filePath, F_OK) != -1) ? 1 : 0;
+}
+
+/**
+ * Allocate an m x n array of float as a table of m separately malloc'ed rows.
+ * Elements are left uninitialised.
+ *
+ * @return the row table (release with free2DArray_float(data, m)), or NULL
+ *         if any allocation fails; nothing is leaked in that case
+ */
+float** create2DArray_float(size_t m, size_t n)
+{
+	size_t i=0;
+	/* Request at least one element so that NULL always means "out of memory". */
+	float **data = (float**)malloc(sizeof(float*)*(m ? m : 1));
+	if(data == NULL)
+		return NULL;
+	for(i=0;i<m;i++)
+	{
+		data[i] = (float*)malloc(sizeof(float)*(n ? n : 1));
+		if(data[i] == NULL)
+		{
+			/* Undo the rows allocated so far. */
+			while(i > 0)
+				free(data[--i]);
+			free(data);
+			return NULL;
+		}
+	}
+	return data;
+}
+
+/* Free the m rows of a row-table array, then the table itself. */
+void free2DArray_float(float** data, size_t m)
+{
+	size_t row = 0;
+	while(row < m)
+	{
+		free(data[row]);
+		row++;
+	}
+	free(data);
+}
+
+/**
+ * Allocate a p x m x n array of float: a table of p planes, each a table of
+ * m rows, each row holding n floats. Elements are left uninitialised.
+ *
+ * @return the plane table (release with free3DArray_float(data, p, m)), or
+ *         NULL if any allocation fails; nothing is leaked in that case
+ */
+float*** create3DArray_float(size_t p, size_t m, size_t n)
+{
+	size_t i = 0, j = 0;
+	/* The plane table needs p entries and each plane m row pointers; sizing
+	 * them by m and n overflowed the heap whenever p > m or m > n. */
+	float ***data = (float***)malloc(sizeof(float**)*(p ? p : 1));
+	if(data == NULL)
+		return NULL;
+	for(i=0;i<p;i++)
+	{
+		data[i] = (float**)malloc(sizeof(float*)*(m ? m : 1));
+		if(data[i] == NULL)
+			goto fail_planes;
+		for(j=0;j<m;j++)
+		{
+			data[i][j] = (float*)malloc(sizeof(float)*(n ? n : 1));
+			if(data[i][j] == NULL)
+				goto fail_rows;
+		}
+	}
+	return data;
+
+fail_rows:
+	/* Plane i is partly built: rows [0, j) exist. */
+	while(j > 0)
+		free(data[i][--j]);
+	free(data[i]);
+fail_planes:
+	/* Planes [0, i) are complete. */
+	while(i > 0)
+	{
+		--i;
+		for(j=0;j<m;j++)
+			free(data[i][j]);
+		free(data[i]);
+	}
+	free(data);
+	return NULL;
+}
+
+/* Free a table of p planes with m rows each: rows first, then each plane,
+ * then the plane table. */
+void free3DArray_float(float*** data, size_t p, size_t m)
+{
+	size_t plane = 0;
+	while(plane < p)
+	{
+		float **rows = data[plane];
+		size_t row = 0;
+		while(row < m)
+		{
+			free(rows[row]);
+			row++;
+		}
+		free(rows);
+		plane++;
+	}
+	free(data);
+}
+
+/**
+ * Allocate an m x n array of double as a table of m separately malloc'ed rows.
+ * Elements are left uninitialised.
+ *
+ * @return the row table (release with free2DArray_double(data, m)), or NULL
+ *         if any allocation fails; nothing is leaked in that case
+ */
+double** create2DArray_double(size_t m, size_t n)
+{
+	size_t i=0;
+	/* Request at least one element so that NULL always means "out of memory". */
+	double **data = (double**)malloc(sizeof(double*)*(m ? m : 1));
+	if(data == NULL)
+		return NULL;
+	for(i=0;i<m;i++)
+	{
+		data[i] = (double*)malloc(sizeof(double)*(n ? n : 1));
+		if(data[i] == NULL)
+		{
+			/* Undo the rows allocated so far. */
+			while(i > 0)
+				free(data[--i]);
+			free(data);
+			return NULL;
+		}
+	}
+	return data;
+}
+
+/* Free the m rows of a row-table array, then the table itself. */
+void free2DArray_double(double** data, size_t m)
+{
+	size_t row = 0;
+	while(row < m)
+	{
+		free(data[row]);
+		row++;
+	}
+	free(data);
+}
+
+/**
+ * Allocate a p x m x n array of double: a table of p planes, each a table of
+ * m rows, each row holding n doubles. Elements are left uninitialised.
+ *
+ * @return the plane table (release with free3DArray_double(data, p, m)), or
+ *         NULL if any allocation fails; nothing is leaked in that case
+ */
+double*** create3DArray_double(size_t p, size_t m, size_t n)
+{
+	size_t i = 0, j = 0;
+	/* The plane table needs p entries and each plane m row pointers; sizing
+	 * them by m and n overflowed the heap whenever p > m or m > n. */
+	double ***data = (double***)malloc(sizeof(double**)*(p ? p : 1));
+	if(data == NULL)
+		return NULL;
+	for(i=0;i<p;i++)
+	{
+		data[i] = (double**)malloc(sizeof(double*)*(m ? m : 1));
+		if(data[i] == NULL)
+			goto fail_planes;
+		for(j=0;j<m;j++)
+		{
+			data[i][j] = (double*)malloc(sizeof(double)*(n ? n : 1));
+			if(data[i][j] == NULL)
+				goto fail_rows;
+		}
+	}
+	return data;
+
+fail_rows:
+	/* Plane i is partly built: rows [0, j) exist. */
+	while(j > 0)
+		free(data[i][--j]);
+	free(data[i]);
+fail_planes:
+	/* Planes [0, i) are complete. */
+	while(i > 0)
+	{
+		--i;
+		for(j=0;j<m;j++)
+			free(data[i][j]);
+		free(data[i]);
+	}
+	free(data);
+	return NULL;
+}
+
+/* Free a table of p planes with m rows each: rows first, then each plane,
+ * then the plane table. */
+void free3DArray_double(double*** data, size_t p, size_t m)
+{
+	size_t plane = 0;
+	while(plane < p)
+	{
+		double **rows = data[plane];
+		size_t row = 0;
+		while(row < m)
+		{
+			free(rows[row]);
+			row++;
+		}
+		free(rows);
+		plane++;
+	}
+	free(data);
+}
+
+/**
+ * Return the size in bytes of the file at srcFilePath.
+ *
+ * @param status  out: SZ_SCES on success, SZ_FERR if the file cannot be
+ *                opened or its size cannot be determined
+ * @return the file size, or (size_t)-1 on failure
+ */
+size_t checkFileSize(char *srcFilePath, int *status)
+{
+	long filesize = -1;
+	FILE *pFile = fopen(srcFilePath, "rb");
+	if (pFile == NULL)
+	{
+		printf("Failed to open input file. 1\n");
+		*status = SZ_FERR;
+		return (size_t)-1;
+	}
+	/* ftell() signals failure with -1L; keep the value signed until checked. */
+	if (fseek(pFile, 0, SEEK_END) == 0)
+		filesize = ftell(pFile);
+	fclose(pFile);
+	if (filesize < 0)
+	{
+		printf("Failed to determine the size of the input file.\n");
+		*status = SZ_FERR;
+		return (size_t)-1;
+	}
+	*status = SZ_SCES;
+	return (size_t)filesize;
+}
+
+/**
+ * Read a whole file into a freshly malloc'ed byte buffer.
+ *
+ * @param srcFilePath  file to read
+ * @param byteLength   out: number of bytes read (written once the size is known)
+ * @param status       out: SZ_SCES on success, SZ_FERR on any failure
+ * @return the buffer (caller frees), or NULL on failure
+ */
+unsigned char *readByteData(char *srcFilePath, size_t *byteLength, int *status)
+{
+	FILE *pFile = fopen(srcFilePath, "rb");
+	if (pFile == NULL)
+	{
+		printf("Failed to open input file. 1\n");
+		*status = SZ_FERR;
+		return 0;
+	}
+
+	/* Size the file and rewind the same stream instead of reopening it.
+	 * ftell() reports failure as -1L, so keep the result signed here. */
+	long fileLen = -1;
+	if (fseek(pFile, 0, SEEK_END) == 0)
+		fileLen = ftell(pFile);
+	if (fileLen < 0 || fseek(pFile, 0, SEEK_SET) != 0)
+	{
+		printf("Failed to determine the size of the input file.\n");
+		fclose(pFile);
+		*status = SZ_FERR;
+		return 0;
+	}
+	*byteLength = (size_t)fileLen;
+
+	/* Request at least one byte so an empty file still yields a valid buffer. */
+	unsigned char *byteBuf = (unsigned char *)malloc(*byteLength ? *byteLength : 1);
+	if (byteBuf == NULL || fread(byteBuf, 1, *byteLength, pFile) != *byteLength)
+	{
+		printf("Failed to read input file.\n");
+		free(byteBuf);
+		fclose(pFile);
+		*status = SZ_FERR;
+		return 0;
+	}
+
+	fclose(pFile);
+	*status = SZ_SCES;
+	return byteBuf;
+}
+
+/**
+ * Read a file of 8-byte doubles. When dataEndianType equals sysEndianType the
+ * work is delegated to readDoubleData_systemEndian(); otherwise every element
+ * is passed through symTransform_8bytes() after loading.
+ *
+ * @param nbEle   out: number of elements read
+ * @param status  out: SZ_SCES on success, SZ_FERR on failure
+ * @return malloc'ed array (caller frees), or NULL on failure
+ */
+double *readDoubleData(char *srcFilePath, size_t *nbEle, int *status)
+{
+	int state = SZ_SCES;
+	if(dataEndianType==sysEndianType)
+	{
+		double *daBuf = readDoubleData_systemEndian(srcFilePath, nbEle,&state);
+		*status = state;
+		return daBuf;
+	}
+
+	size_t byteLength;
+	unsigned char* bytes = readByteData(srcFilePath, &byteLength, &state);
+	if(state==SZ_FERR)
+	{
+		*status = SZ_FERR;
+		return NULL;
+	}
+	/* Request at least one byte so that NULL always means "out of memory". */
+	double *daBuf = (double *)malloc(byteLength ? byteLength : 1);
+	if(daBuf==NULL)
+	{
+		free(bytes);
+		*status = SZ_FERR;
+		return NULL;
+	}
+	*nbEle = byteLength/8;
+
+	ldouble buf;
+	size_t i;
+	for(i = 0;i<*nbEle;i++)
+	{
+		memcpy(buf.byte, bytes+i*8, 8);
+		symTransform_8bytes(buf.byte);
+		daBuf[i] = buf.value;
+	}
+	free(bytes);
+	*status = state; /* this path used to return without reporting a status */
+	return daBuf;
+}
+
+
+/* Single bytes have no byte order, so this simply forwards to
+ * readInt8Data_systemEndian() and copies its status to *status. */
+int8_t *readInt8Data(char *srcFilePath, size_t *nbEle, int *status)
+{
+	int state = SZ_SCES;
+	int8_t *daBuf = readInt8Data_systemEndian(srcFilePath, nbEle, &state);
+	*status = state;
+	return daBuf;
+}
+
+/**
+ * Read a file of 2-byte signed integers. When dataEndianType equals
+ * sysEndianType the work is delegated to readInt16Data_systemEndian();
+ * otherwise every element is passed through symTransform_2bytes().
+ *
+ * @param nbEle   out: number of elements read
+ * @param status  out: SZ_SCES on success, SZ_FERR on failure
+ * @return malloc'ed array (caller frees), or NULL on failure
+ */
+int16_t *readInt16Data(char *srcFilePath, size_t *nbEle, int *status)
+{
+	int state = SZ_SCES;
+	if(dataEndianType==sysEndianType)
+	{
+		int16_t *daBuf = readInt16Data_systemEndian(srcFilePath, nbEle, &state);
+		*status = state;
+		return daBuf;
+	}
+
+	size_t byteLength;
+	unsigned char* bytes = readByteData(srcFilePath, &byteLength, &state);
+	if(state == SZ_FERR)
+	{
+		*status = SZ_FERR;
+		return NULL;
+	}
+	/* Request at least one byte so that NULL always means "out of memory". */
+	int16_t *daBuf = (int16_t *)malloc(byteLength ? byteLength : 1);
+	if(daBuf == NULL)
+	{
+		free(bytes);
+		*status = SZ_FERR;
+		return NULL;
+	}
+	*nbEle = byteLength/2;
+
+	lint16 buf;
+	size_t i;
+	for(i = 0;i<*nbEle;i++)
+	{
+		memcpy(buf.byte, bytes+i*2, 2);
+		symTransform_2bytes(buf.byte);
+		daBuf[i] = buf.svalue;
+	}
+	free(bytes);
+	*status = state; /* this path used to return without reporting a status */
+	return daBuf;
+}
+
+/**
+ * Read a file of 2-byte unsigned integers. When dataEndianType equals
+ * sysEndianType the work is delegated to readUInt16Data_systemEndian();
+ * otherwise every element is passed through symTransform_2bytes().
+ *
+ * @param nbEle   out: number of elements read
+ * @param status  out: SZ_SCES on success, SZ_FERR on failure
+ * @return malloc'ed array (caller frees), or NULL on failure
+ */
+uint16_t *readUInt16Data(char *srcFilePath, size_t *nbEle, int *status)
+{
+	int state = SZ_SCES;
+	if(dataEndianType==sysEndianType)
+	{
+		uint16_t *daBuf = readUInt16Data_systemEndian(srcFilePath, nbEle, &state);
+		*status = state;
+		return daBuf;
+	}
+
+	size_t byteLength;
+	unsigned char* bytes = readByteData(srcFilePath, &byteLength, &state);
+	if(state == SZ_FERR)
+	{
+		*status = SZ_FERR;
+		return NULL;
+	}
+	/* Request at least one byte so that NULL always means "out of memory". */
+	uint16_t *daBuf = (uint16_t *)malloc(byteLength ? byteLength : 1);
+	if(daBuf == NULL)
+	{
+		free(bytes);
+		*status = SZ_FERR;
+		return NULL;
+	}
+	*nbEle = byteLength/2;
+
+	lint16 buf;
+	size_t i;
+	for(i = 0;i<*nbEle;i++)
+	{
+		memcpy(buf.byte, bytes+i*2, 2);
+		symTransform_2bytes(buf.byte);
+		daBuf[i] = buf.usvalue;
+	}
+	free(bytes);
+	*status = state; /* this path used to return without reporting a status */
+	return daBuf;
+}
+
+/**
+ * Read a file of 4-byte signed integers. When dataEndianType equals
+ * sysEndianType the work is delegated to readInt32Data_systemEndian();
+ * otherwise every element is passed through symTransform_4bytes().
+ *
+ * @param nbEle   out: number of elements read
+ * @param status  out: SZ_SCES on success, SZ_FERR on failure
+ * @return malloc'ed array (caller frees), or NULL on failure
+ */
+int32_t *readInt32Data(char *srcFilePath, size_t *nbEle, int *status)
+{
+	int state = SZ_SCES;
+	if(dataEndianType==sysEndianType)
+	{
+		int32_t *daBuf = readInt32Data_systemEndian(srcFilePath, nbEle, &state);
+		*status = state;
+		return daBuf;
+	}
+
+	size_t byteLength;
+	unsigned char* bytes = readByteData(srcFilePath, &byteLength, &state);
+	if(state == SZ_FERR)
+	{
+		*status = SZ_FERR;
+		return NULL;
+	}
+	/* Request at least one byte so that NULL always means "out of memory". */
+	int32_t *daBuf = (int32_t *)malloc(byteLength ? byteLength : 1);
+	if(daBuf == NULL)
+	{
+		free(bytes);
+		*status = SZ_FERR;
+		return NULL;
+	}
+	*nbEle = byteLength/4;
+
+	lint32 buf;
+	size_t i;
+	for(i = 0;i<*nbEle;i++)
+	{
+		memcpy(buf.byte, bytes+i*4, 4);
+		symTransform_4bytes(buf.byte);
+		daBuf[i] = buf.ivalue;
+	}
+	free(bytes);
+	*status = state; /* this path used to return without reporting a status */
+	return daBuf;
+}
+
+/**
+ * Read a file of 4-byte unsigned integers. When dataEndianType equals
+ * sysEndianType the work is delegated to readUInt32Data_systemEndian();
+ * otherwise every element is passed through symTransform_4bytes().
+ *
+ * @param nbEle   out: number of elements read
+ * @param status  out: SZ_SCES on success, SZ_FERR on failure
+ * @return malloc'ed array (caller frees), or NULL on failure
+ */
+uint32_t *readUInt32Data(char *srcFilePath, size_t *nbEle, int *status)
+{
+	int state = SZ_SCES;
+	if(dataEndianType==sysEndianType)
+	{
+		uint32_t *daBuf = readUInt32Data_systemEndian(srcFilePath, nbEle, &state);
+		*status = state;
+		return daBuf;
+	}
+
+	size_t byteLength;
+	unsigned char* bytes = readByteData(srcFilePath, &byteLength, &state);
+	if(state == SZ_FERR)
+	{
+		*status = SZ_FERR;
+		return NULL;
+	}
+	/* Request at least one byte so that NULL always means "out of memory". */
+	uint32_t *daBuf = (uint32_t *)malloc(byteLength ? byteLength : 1);
+	if(daBuf == NULL)
+	{
+		free(bytes);
+		*status = SZ_FERR;
+		return NULL;
+	}
+	*nbEle = byteLength/4;
+
+	lint32 buf;
+	size_t i;
+	for(i = 0;i<*nbEle;i++)
+	{
+		memcpy(buf.byte, bytes+i*4, 4);
+		symTransform_4bytes(buf.byte);
+		daBuf[i] = buf.uivalue;
+	}
+	free(bytes);
+	*status = state; /* this path used to return without reporting a status */
+	return daBuf;
+}
+
+/**
+ * Read a file of 8-byte signed integers. When dataEndianType equals
+ * sysEndianType the work is delegated to readInt64Data_systemEndian();
+ * otherwise every element is passed through symTransform_8bytes().
+ *
+ * @param nbEle   out: number of elements read
+ * @param status  out: SZ_SCES on success, SZ_FERR on failure
+ * @return malloc'ed array (caller frees), or NULL on failure
+ */
+int64_t *readInt64Data(char *srcFilePath, size_t *nbEle, int *status)
+{
+	int state = SZ_SCES;
+	if(dataEndianType==sysEndianType)
+	{
+		int64_t *daBuf = readInt64Data_systemEndian(srcFilePath, nbEle, &state);
+		*status = state;
+		return daBuf;
+	}
+
+	size_t byteLength;
+	unsigned char* bytes = readByteData(srcFilePath, &byteLength, &state);
+	if(state == SZ_FERR)
+	{
+		*status = SZ_FERR;
+		return NULL;
+	}
+	/* Request at least one byte so that NULL always means "out of memory". */
+	int64_t *daBuf = (int64_t *)malloc(byteLength ? byteLength : 1);
+	if(daBuf == NULL)
+	{
+		free(bytes);
+		*status = SZ_FERR;
+		return NULL;
+	}
+	*nbEle = byteLength/8;
+
+	lint64 buf;
+	size_t i;
+	for(i = 0;i<*nbEle;i++)
+	{
+		memcpy(buf.byte, bytes+i*8, 8);
+		symTransform_8bytes(buf.byte);
+		daBuf[i] = buf.lvalue;
+	}
+	free(bytes);
+	*status = state; /* this path used to return without reporting a status */
+	return daBuf;
+}
+
+/**
+ * Read a file of 8-byte unsigned integers. When dataEndianType equals
+ * sysEndianType the work is delegated to readUInt64Data_systemEndian();
+ * otherwise every element is passed through symTransform_8bytes().
+ *
+ * @param nbEle   out: number of elements read
+ * @param status  out: SZ_SCES on success, SZ_FERR on failure
+ * @return malloc'ed array (caller frees), or NULL on failure
+ */
+uint64_t *readUInt64Data(char *srcFilePath, size_t *nbEle, int *status)
+{
+	int state = SZ_SCES;
+	if(dataEndianType==sysEndianType)
+	{
+		uint64_t *daBuf = readUInt64Data_systemEndian(srcFilePath, nbEle, &state);
+		*status = state;
+		return daBuf;
+	}
+
+	size_t byteLength;
+	unsigned char* bytes = readByteData(srcFilePath, &byteLength, &state);
+	if(state == SZ_FERR)
+	{
+		*status = SZ_FERR;
+		return NULL;
+	}
+	/* Request at least one byte so that NULL always means "out of memory". */
+	uint64_t *daBuf = (uint64_t *)malloc(byteLength ? byteLength : 1);
+	if(daBuf == NULL)
+	{
+		free(bytes);
+		*status = SZ_FERR;
+		return NULL;
+	}
+	*nbEle = byteLength/8;
+
+	lint64 buf;
+	size_t i;
+	for(i = 0;i<*nbEle;i++)
+	{
+		memcpy(buf.byte, bytes+i*8, 8);
+		symTransform_8bytes(buf.byte);
+		daBuf[i] = buf.ulvalue;
+	}
+	free(bytes);
+	*status = state; /* this path used to return without reporting a status */
+	return daBuf;
+}
+
+
+/**
+ * Read a file of 4-byte floats. When dataEndianType equals sysEndianType the
+ * work is delegated to readFloatData_systemEndian(); otherwise every element
+ * is passed through symTransform_4bytes() after loading.
+ *
+ * @param nbEle   out: number of elements read
+ * @param status  out: SZ_SCES on success, SZ_FERR on failure
+ * @return malloc'ed array (caller frees), or NULL on failure
+ */
+float *readFloatData(char *srcFilePath, size_t *nbEle, int *status)
+{
+	int state = SZ_SCES;
+	if(dataEndianType==sysEndianType)
+	{
+		float *daBuf = readFloatData_systemEndian(srcFilePath, nbEle, &state);
+		*status = state;
+		return daBuf;
+	}
+
+	size_t byteLength;
+	unsigned char* bytes = readByteData(srcFilePath, &byteLength, &state);
+	if(state == SZ_FERR)
+	{
+		*status = SZ_FERR;
+		return NULL;
+	}
+	/* Request at least one byte so that NULL always means "out of memory". */
+	float *daBuf = (float *)malloc(byteLength ? byteLength : 1);
+	if(daBuf == NULL)
+	{
+		free(bytes);
+		*status = SZ_FERR;
+		return NULL;
+	}
+	*nbEle = byteLength/4;
+
+	lfloat buf;
+	size_t i;
+	for(i = 0;i<*nbEle;i++)
+	{
+		memcpy(buf.byte, bytes+i*4, 4);
+		symTransform_4bytes(buf.byte);
+		daBuf[i] = buf.value;
+	}
+	free(bytes);
+	*status = state; /* this path used to return without reporting a status */
+	return daBuf;
+}
+
+/**
+ * Load a file of 8-byte doubles verbatim (no byte-order conversion).
+ *
+ * @param srcFilePath  file to read
+ * @param nbEle        out: number of whole 8-byte elements in the file
+ * @param status       out: SZ_SCES on success, SZ_FERR on failure
+ * @return malloc'ed array the caller must free(), or NULL on failure
+ */
+double *readDoubleData_systemEndian(char *srcFilePath, size_t *nbEle, int *status)
+{
+	FILE *pFile = fopen(srcFilePath, "rb");
+	if (pFile == NULL)
+	{
+		printf("Failed to open input file. 1\n");
+		*status = SZ_FERR;
+		return NULL;
+	}
+
+	/* Size the file and rewind the same stream instead of reopening it.
+	 * ftell() reports failure as -1L, so keep the result signed here. */
+	long fileLen = -1;
+	if (fseek(pFile, 0, SEEK_END) == 0)
+		fileLen = ftell(pFile);
+	if (fileLen < 0 || fseek(pFile, 0, SEEK_SET) != 0)
+	{
+		printf("Error: input file is wrong!\n");
+		fclose(pFile);
+		*nbEle = 0;
+		*status = SZ_FERR;
+		return NULL;
+	}
+
+	size_t inSize = (size_t)fileLen;
+	*nbEle = inSize/8; //only support double in this version
+
+	/* An empty file is accepted here, so request at least one byte. */
+	double *daBuf = (double *)malloc(inSize ? inSize : 1);
+	if (daBuf == NULL || fread(daBuf, 8, *nbEle, pFile) != *nbEle)
+	{
+		printf("Failed to read input file.\n");
+		free(daBuf);
+		fclose(pFile);
+		*status = SZ_FERR;
+		return NULL;
+	}
+
+	fclose(pFile);
+	*status = SZ_SCES;
+	return daBuf;
+}
+
+
+/**
+ * Load a file of 1-byte signed integers verbatim.
+ *
+ * @param srcFilePath  file to read
+ * @param nbEle        out: number of bytes/elements in the file
+ * @param status       out: SZ_SCES on success, SZ_FERR on failure
+ *                     (including an empty file)
+ * @return malloc'ed array the caller must free(), or NULL on failure
+ */
+int8_t *readInt8Data_systemEndian(char *srcFilePath, size_t *nbEle, int *status)
+{
+	FILE *pFile = fopen(srcFilePath, "rb");
+	if (pFile == NULL)
+	{
+		printf("Failed to open input file. 1\n");
+		*status = SZ_FERR;
+		return NULL;
+	}
+
+	/* Size the file and rewind the same stream instead of reopening it.
+	 * ftell() reports failure as -1L, so keep the result signed here. */
+	long fileLen = -1;
+	if (fseek(pFile, 0, SEEK_END) == 0)
+		fileLen = ftell(pFile);
+	if (fileLen <= 0 || fseek(pFile, 0, SEEK_SET) != 0)
+	{
+		/* The error used to be overwritten by SZ_SCES further down. */
+		printf("Error: input file is wrong!\n");
+		fclose(pFile);
+		*nbEle = 0;
+		*status = SZ_FERR;
+		return NULL;
+	}
+
+	size_t inSize = (size_t)fileLen;
+	*nbEle = inSize;
+
+	int8_t *daBuf = (int8_t *)malloc(inSize);
+	if (daBuf == NULL || fread(daBuf, 1, *nbEle, pFile) != *nbEle)
+	{
+		printf("Failed to read input file.\n");
+		free(daBuf);
+		fclose(pFile);
+		*status = SZ_FERR;
+		return NULL;
+	}
+
+	fclose(pFile);
+	*status = SZ_SCES;
+	return daBuf;
+}
+
+
+/**
+ * Load a file of 2-byte signed integers verbatim (no byte-order conversion).
+ *
+ * @param srcFilePath  file to read
+ * @param nbEle        out: number of whole 2-byte elements in the file
+ * @param status       out: SZ_SCES on success, SZ_FERR on failure
+ *                     (including an empty file)
+ * @return malloc'ed array the caller must free(), or NULL on failure
+ */
+int16_t *readInt16Data_systemEndian(char *srcFilePath, size_t *nbEle, int *status)
+{
+	FILE *pFile = fopen(srcFilePath, "rb");
+	if (pFile == NULL)
+	{
+		printf("Failed to open input file. 1\n");
+		*status = SZ_FERR;
+		return NULL;
+	}
+
+	/* Size the file and rewind the same stream instead of reopening it.
+	 * ftell() reports failure as -1L, so keep the result signed here. */
+	long fileLen = -1;
+	if (fseek(pFile, 0, SEEK_END) == 0)
+		fileLen = ftell(pFile);
+	if (fileLen <= 0 || fseek(pFile, 0, SEEK_SET) != 0)
+	{
+		/* The error used to be overwritten by SZ_SCES further down. */
+		printf("Error: input file is wrong!\n");
+		fclose(pFile);
+		*nbEle = 0;
+		*status = SZ_FERR;
+		return NULL;
+	}
+
+	size_t inSize = (size_t)fileLen;
+	*nbEle = inSize/2;
+
+	int16_t *daBuf = (int16_t *)malloc(inSize);
+	if (daBuf == NULL || fread(daBuf, 2, *nbEle, pFile) != *nbEle)
+	{
+		printf("Failed to read input file.\n");
+		free(daBuf);
+		fclose(pFile);
+		*status = SZ_FERR;
+		return NULL;
+	}
+
+	fclose(pFile);
+	*status = SZ_SCES;
+	return daBuf;
+}
+
+/**
+ * Load a file of 2-byte unsigned integers verbatim (no byte-order conversion).
+ *
+ * @param srcFilePath  file to read
+ * @param nbEle        out: number of whole 2-byte elements in the file
+ * @param status       out: SZ_SCES on success, SZ_FERR on failure
+ *                     (including an empty file)
+ * @return malloc'ed array the caller must free(), or NULL on failure
+ */
+uint16_t *readUInt16Data_systemEndian(char *srcFilePath, size_t *nbEle, int *status)
+{
+	FILE *pFile = fopen(srcFilePath, "rb");
+	if (pFile == NULL)
+	{
+		printf("Failed to open input file. 1\n");
+		*status = SZ_FERR;
+		return NULL;
+	}
+
+	/* Size the file and rewind the same stream instead of reopening it.
+	 * ftell() reports failure as -1L, so keep the result signed here. */
+	long fileLen = -1;
+	if (fseek(pFile, 0, SEEK_END) == 0)
+		fileLen = ftell(pFile);
+	if (fileLen <= 0 || fseek(pFile, 0, SEEK_SET) != 0)
+	{
+		/* The error used to be overwritten by SZ_SCES further down. */
+		printf("Error: input file is wrong!\n");
+		fclose(pFile);
+		*nbEle = 0;
+		*status = SZ_FERR;
+		return NULL;
+	}
+
+	size_t inSize = (size_t)fileLen;
+	*nbEle = inSize/2;
+
+	uint16_t *daBuf = (uint16_t *)malloc(inSize);
+	if (daBuf == NULL || fread(daBuf, 2, *nbEle, pFile) != *nbEle)
+	{
+		printf("Failed to read input file.\n");
+		free(daBuf);
+		fclose(pFile);
+		*status = SZ_FERR;
+		return NULL;
+	}
+
+	fclose(pFile);
+	*status = SZ_SCES;
+	return daBuf;
+}
+
+/**
+ * Load a file of 4-byte signed integers verbatim (no byte-order conversion).
+ *
+ * @param srcFilePath  file to read
+ * @param nbEle        out: number of whole 4-byte elements in the file
+ * @param status       out: SZ_SCES on success, SZ_FERR on failure
+ *                     (including an empty file)
+ * @return malloc'ed array the caller must free(), or NULL on failure
+ */
+int32_t *readInt32Data_systemEndian(char *srcFilePath, size_t *nbEle, int *status)
+{
+	FILE *pFile = fopen(srcFilePath, "rb");
+	if (pFile == NULL)
+	{
+		printf("Failed to open input file. 1\n");
+		*status = SZ_FERR;
+		return NULL;
+	}
+
+	/* Size the file and rewind the same stream instead of reopening it.
+	 * ftell() reports failure as -1L, so keep the result signed here. */
+	long fileLen = -1;
+	if (fseek(pFile, 0, SEEK_END) == 0)
+		fileLen = ftell(pFile);
+	if (fileLen <= 0 || fseek(pFile, 0, SEEK_SET) != 0)
+	{
+		/* The error used to be overwritten by SZ_SCES further down. */
+		printf("Error: input file is wrong!\n");
+		fclose(pFile);
+		*nbEle = 0;
+		*status = SZ_FERR;
+		return NULL;
+	}
+
+	size_t inSize = (size_t)fileLen;
+	*nbEle = inSize/4;
+
+	int32_t *daBuf = (int32_t *)malloc(inSize);
+	if (daBuf == NULL || fread(daBuf, 4, *nbEle, pFile) != *nbEle)
+	{
+		printf("Failed to read input file.\n");
+		free(daBuf);
+		fclose(pFile);
+		*status = SZ_FERR;
+		return NULL;
+	}
+
+	fclose(pFile);
+	*status = SZ_SCES;
+	return daBuf;
+}
+
+/**
+ * Load a file of 4-byte unsigned integers verbatim (no byte-order conversion).
+ *
+ * @param srcFilePath  file to read
+ * @param nbEle        out: number of whole 4-byte elements in the file
+ * @param status       out: SZ_SCES on success, SZ_FERR on failure
+ *                     (including an empty file)
+ * @return malloc'ed array the caller must free(), or NULL on failure
+ */
+uint32_t *readUInt32Data_systemEndian(char *srcFilePath, size_t *nbEle, int *status)
+{
+	FILE *pFile = fopen(srcFilePath, "rb");
+	if (pFile == NULL)
+	{
+		printf("Failed to open input file. 1\n");
+		*status = SZ_FERR;
+		return NULL;
+	}
+
+	/* Size the file and rewind the same stream instead of reopening it.
+	 * ftell() reports failure as -1L, so keep the result signed here. */
+	long fileLen = -1;
+	if (fseek(pFile, 0, SEEK_END) == 0)
+		fileLen = ftell(pFile);
+	if (fileLen <= 0 || fseek(pFile, 0, SEEK_SET) != 0)
+	{
+		/* The error used to be overwritten by SZ_SCES further down. */
+		printf("Error: input file is wrong!\n");
+		fclose(pFile);
+		*nbEle = 0;
+		*status = SZ_FERR;
+		return NULL;
+	}
+
+	size_t inSize = (size_t)fileLen;
+	*nbEle = inSize/4;
+
+	uint32_t *daBuf = (uint32_t *)malloc(inSize);
+	if (daBuf == NULL || fread(daBuf, 4, *nbEle, pFile) != *nbEle)
+	{
+		printf("Failed to read input file.\n");
+		free(daBuf);
+		fclose(pFile);
+		*status = SZ_FERR;
+		return NULL;
+	}
+
+	fclose(pFile);
+	*status = SZ_SCES;
+	return daBuf;
+}
+
+/**
+ * Load a file of 8-byte signed integers verbatim (no byte-order conversion).
+ *
+ * @param srcFilePath  file to read
+ * @param nbEle        out: number of whole 8-byte elements in the file
+ * @param status       out: SZ_SCES on success, SZ_FERR on failure
+ *                     (including an empty file)
+ * @return malloc'ed array the caller must free(), or NULL on failure
+ */
+int64_t *readInt64Data_systemEndian(char *srcFilePath, size_t *nbEle, int *status)
+{
+	FILE *pFile = fopen(srcFilePath, "rb");
+	if (pFile == NULL)
+	{
+		printf("Failed to open input file. 1\n");
+		*status = SZ_FERR;
+		return NULL;
+	}
+
+	/* Size the file and rewind the same stream instead of reopening it.
+	 * ftell() reports failure as -1L, so keep the result signed here. */
+	long fileLen = -1;
+	if (fseek(pFile, 0, SEEK_END) == 0)
+		fileLen = ftell(pFile);
+	if (fileLen <= 0 || fseek(pFile, 0, SEEK_SET) != 0)
+	{
+		/* The error used to be overwritten by SZ_SCES further down. */
+		printf("Error: input file is wrong!\n");
+		fclose(pFile);
+		*nbEle = 0;
+		*status = SZ_FERR;
+		return NULL;
+	}
+
+	size_t inSize = (size_t)fileLen;
+	*nbEle = inSize/8;
+
+	int64_t *daBuf = (int64_t *)malloc(inSize);
+	if (daBuf == NULL || fread(daBuf, 8, *nbEle, pFile) != *nbEle)
+	{
+		printf("Failed to read input file.\n");
+		free(daBuf);
+		fclose(pFile);
+		*status = SZ_FERR;
+		return NULL;
+	}
+
+	fclose(pFile);
+	*status = SZ_SCES;
+	return daBuf;
+}
+
+/**
+ * Load a file of 8-byte unsigned integers verbatim (no byte-order conversion).
+ *
+ * @param srcFilePath  file to read
+ * @param nbEle        out: number of whole 8-byte elements in the file
+ * @param status       out: SZ_SCES on success, SZ_FERR on failure
+ *                     (including an empty file)
+ * @return malloc'ed array the caller must free(), or NULL on failure
+ */
+uint64_t *readUInt64Data_systemEndian(char *srcFilePath, size_t *nbEle, int *status)
+{
+	FILE *pFile = fopen(srcFilePath, "rb");
+	if (pFile == NULL)
+	{
+		printf("Failed to open input file. 1\n");
+		*status = SZ_FERR;
+		return NULL;
+	}
+
+	/* Size the file and rewind the same stream instead of reopening it.
+	 * ftell() reports failure as -1L, so keep the result signed here. */
+	long fileLen = -1;
+	if (fseek(pFile, 0, SEEK_END) == 0)
+		fileLen = ftell(pFile);
+	if (fileLen <= 0 || fseek(pFile, 0, SEEK_SET) != 0)
+	{
+		/* The error used to be overwritten by SZ_SCES further down. */
+		printf("Error: input file is wrong!\n");
+		fclose(pFile);
+		*nbEle = 0;
+		*status = SZ_FERR;
+		return NULL;
+	}
+
+	size_t inSize = (size_t)fileLen;
+	*nbEle = inSize/8;
+
+	uint64_t *daBuf = (uint64_t *)malloc(inSize);
+	if (daBuf == NULL || fread(daBuf, 8, *nbEle, pFile) != *nbEle)
+	{
+		printf("Failed to read input file.\n");
+		free(daBuf);
+		fclose(pFile);
+		*status = SZ_FERR;
+		return NULL;
+	}
+
+	fclose(pFile);
+	*status = SZ_SCES;
+	return daBuf;
+}
+
+/**
+ * Load a file of 4-byte floats verbatim (no byte-order conversion).
+ *
+ * @param srcFilePath  file to read
+ * @param nbEle        out: number of whole 4-byte elements in the file
+ * @param status       out: SZ_SCES on success, SZ_FERR on failure
+ *                     (including an empty file)
+ * @return malloc'ed array the caller must free(), or NULL on failure
+ */
+float *readFloatData_systemEndian(char *srcFilePath, size_t *nbEle, int *status)
+{
+	FILE *pFile = fopen(srcFilePath, "rb");
+	if (pFile == NULL)
+	{
+		printf("Failed to open input file. 1\n");
+		*status = SZ_FERR;
+		return NULL;
+	}
+
+	/* Size the file and rewind the same stream instead of reopening it.
+	 * ftell() reports failure as -1L, so keep the result signed here. */
+	long fileLen = -1;
+	if (fseek(pFile, 0, SEEK_END) == 0)
+		fileLen = ftell(pFile);
+	if (fileLen <= 0 || fseek(pFile, 0, SEEK_SET) != 0)
+	{
+		/* The error used to be overwritten by SZ_SCES further down. */
+		printf("Error: input file is wrong!\n");
+		fclose(pFile);
+		*nbEle = 0;
+		*status = SZ_FERR;
+		return NULL;
+	}
+
+	size_t inSize = (size_t)fileLen;
+	*nbEle = inSize/4;
+
+	float *daBuf = (float *)malloc(inSize);
+	if (daBuf == NULL || fread(daBuf, 4, *nbEle, pFile) != *nbEle)
+	{
+		printf("Failed to read input file.\n");
+		free(daBuf);
+		fclose(pFile);
+		*status = SZ_FERR;
+		return NULL;
+	}
+
+	fclose(pFile);
+	*status = SZ_SCES;
+	return daBuf;
+}
+
+/**
+ * Write byteLength bytes to tgtFilePath, replacing any existing file.
+ *
+ * @param status  out: SZ_SCES when every byte was written and the file was
+ *                closed cleanly, SZ_FERR otherwise
+ */
+void writeByteData(unsigned char *bytes, size_t byteLength, char *tgtFilePath, int *status)
+{
+	FILE *pFile = fopen(tgtFilePath, "wb");
+	if (pFile == NULL)
+	{
+		printf("Failed to open input file. 3\n");
+		*status = SZ_FERR;
+		return;
+	}
+
+	size_t written = fwrite(bytes, 1, byteLength, pFile);
+	/* fclose() flushes buffered data, so its result matters as much as fwrite()'s. */
+	int closeRc = fclose(pFile);
+	if (written != byteLength || closeRc != 0)
+	{
+		printf("Failed to write output file.\n");
+		*status = SZ_FERR;
+		return;
+	}
+	*status = SZ_SCES;
+}
+
+/* Write nbEle doubles to tgtFilePath as text, one "%.20G" value per line.
+ * *status is SZ_FERR if the file cannot be opened, SZ_SCES otherwise. */
+void writeDoubleData(double *data, size_t nbEle, char *tgtFilePath, int *status)
+{
+	size_t idx;
+	FILE *pFile = fopen(tgtFilePath, "wb");
+	if (pFile == NULL)
+	{
+		printf("Failed to open input file. 3\n");
+		*status = SZ_FERR;
+		return;
+	}
+
+	/* Format straight into the stream rather than via a scratch buffer. */
+	for(idx = 0; idx < nbEle; idx++)
+		fprintf(pFile, "%.20G\n", data[idx]);
+
+	fclose(pFile);
+	*status = SZ_SCES;
+}
+
+/* Write nbEle floats to tgtFilePath as text, one "%.30G" value per line.
+ * *status is SZ_FERR if the file cannot be opened, SZ_SCES otherwise. */
+void writeFloatData(float *data, size_t nbEle, char *tgtFilePath, int *status)
+{
+	size_t idx;
+	FILE *pFile = fopen(tgtFilePath, "wb");
+	if (pFile == NULL)
+	{
+		printf("Failed to open input file. 3\n");
+		*status = SZ_FERR;
+		return;
+	}
+
+	/* Format straight into the stream rather than via a scratch buffer. */
+	for(idx = 0; idx < nbEle; idx++)
+		fprintf(pFile, "%.30G\n", data[idx]);
+
+	fclose(pFile);
+	*status = SZ_SCES;
+}
+
+/* Dispatch to writeFloatData() or writeDoubleData() according to dataType.
+ * Any other type is rejected with SZ_TERR; otherwise *status carries the
+ * writer's own result. */
+void writeData(void *data, int dataType, size_t nbEle, char *tgtFilePath, int *status)
+{
+	int state = SZ_SCES;
+
+	if(dataType != SZ_FLOAT && dataType != SZ_DOUBLE)
+	{
+		printf("Error: data type cannot be the types other than SZ_FLOAT or SZ_DOUBLE\n");
+		*status = SZ_TERR; //wrong type
+		return;
+	}
+
+	if(dataType == SZ_FLOAT)
+		writeFloatData((float *)data, nbEle, tgtFilePath, &state);
+	else
+		writeDoubleData((double *)data, nbEle, tgtFilePath, &state);
+	*status = state;
+}
+
+/**
+ * Write nbEle floats to tgtFilePath in binary form, exactly as they are laid
+ * out in memory. *status receives writeByteData()'s result.
+ */
+void writeFloatData_inBytes(float *data, size_t nbEle, char* tgtFilePath, int *status)
+{
+	int state = SZ_SCES;
+	/* The old per-byte copy reproduced the in-memory bytes in order, so the
+	 * array can be written directly; this also removes an unchecked malloc(). */
+	size_t byteLength = nbEle*sizeof(float);
+	writeByteData((unsigned char*)data, byteLength, tgtFilePath, &state);
+	*status = state;
+}
+
+/**
+ * Write nbEle doubles to tgtFilePath in binary form, exactly as they are laid
+ * out in memory. *status receives writeByteData()'s result.
+ */
+void writeDoubleData_inBytes(double *data, size_t nbEle, char* tgtFilePath, int *status)
+{
+	int state = SZ_SCES;
+	/* The old per-byte copy reproduced the in-memory bytes in order, so the
+	 * array can be written directly; this also removes an unchecked malloc(). */
+	size_t byteLength = nbEle*sizeof(double);
+	writeByteData((unsigned char*)data, byteLength, tgtFilePath, &state);
+	*status = state;
+}
+
+/**
+ * Serialise stateLength shorts with convertShortArrayToBytes() (2 bytes per
+ * value) and write the result to tgtFilePath.
+ *
+ * @param status  out: writeByteData()'s result, or SZ_FERR if the staging
+ *                buffer cannot be allocated
+ */
+void writeShortData_inBytes(short *states, size_t stateLength, char *tgtFilePath, int *status)
+{
+	int state = SZ_SCES;
+	size_t byteLength = stateLength*2;
+	/* Request at least one byte so that NULL always means "out of memory". */
+	unsigned char* bytes = (unsigned char*)malloc(byteLength ? byteLength : 1);
+	if(bytes == NULL)
+	{
+		*status = SZ_FERR;
+		return;
+	}
+	convertShortArrayToBytes(states, stateLength, bytes);
+	writeByteData(bytes, byteLength, tgtFilePath, &state);
+	free(bytes);
+	*status = state;
+}
+
+/**
+ * Serialise stateLength unsigned shorts with convertUShortArrayToBytes()
+ * (2 bytes per value) and write the result to tgtFilePath.
+ *
+ * @param status  out: writeByteData()'s result, or SZ_FERR if the staging
+ *                buffer cannot be allocated
+ */
+void writeUShortData_inBytes(unsigned short *states, size_t stateLength, char *tgtFilePath, int *status)
+{
+	int state = SZ_SCES;
+	size_t byteLength = stateLength*2;
+	/* Request at least one byte so that NULL always means "out of memory". */
+	unsigned char* bytes = (unsigned char*)malloc(byteLength ? byteLength : 1);
+	if(bytes == NULL)
+	{
+		*status = SZ_FERR;
+		return;
+	}
+	convertUShortArrayToBytes(states, stateLength, bytes);
+	writeByteData(bytes, byteLength, tgtFilePath, &state);
+	free(bytes);
+	*status = state;
+}
+
+/**
+ * Serialise stateLength ints with convertIntArrayToBytes() (4 bytes per
+ * value) and write the result to tgtFilePath.
+ *
+ * @param status  out: writeByteData()'s result, or SZ_FERR if the staging
+ *                buffer cannot be allocated
+ */
+void writeIntData_inBytes(int *states, size_t stateLength, char *tgtFilePath, int *status)
+{
+	int state = SZ_SCES;
+	size_t byteLength = stateLength*4;
+	/* Request at least one byte so that NULL always means "out of memory". */
+	unsigned char* bytes = (unsigned char*)malloc(byteLength ? byteLength : 1);
+	if(bytes == NULL)
+	{
+		*status = SZ_FERR;
+		return;
+	}
+	convertIntArrayToBytes(states, stateLength, bytes);
+	writeByteData(bytes, byteLength, tgtFilePath, &state);
+	free(bytes);
+	*status = state;
+}
+
+/**
+ * Serialise stateLength unsigned ints with convertUIntArrayToBytes()
+ * (4 bytes per value) and write the result to tgtFilePath.
+ *
+ * @param status  out: writeByteData()'s result, or SZ_FERR if the staging
+ *                buffer cannot be allocated
+ */
+void writeUIntData_inBytes(unsigned int *states, size_t stateLength, char *tgtFilePath, int *status)
+{
+	int state = SZ_SCES;
+	size_t byteLength = stateLength*4;
+	/* Request at least one byte so that NULL always means "out of memory". */
+	unsigned char* bytes = (unsigned char*)malloc(byteLength ? byteLength : 1);
+	if(bytes == NULL)
+	{
+		*status = SZ_FERR;
+		return;
+	}
+	convertUIntArrayToBytes(states, stateLength, bytes);
+	writeByteData(bytes, byteLength, tgtFilePath, &state);
+	free(bytes);
+	*status = state;
+}
+
+/**
+ * Serialise stateLength int64_t values with convertLongArrayToBytes()
+ * (8 bytes per value) and write the result to tgtFilePath.
+ *
+ * @param status  out: writeByteData()'s result, or SZ_FERR if the staging
+ *                buffer cannot be allocated
+ */
+void writeLongData_inBytes(int64_t *states, size_t stateLength, char *tgtFilePath, int *status)
+{
+	int state = SZ_SCES;
+	size_t byteLength = stateLength*8;
+	/* Request at least one byte so that NULL always means "out of memory". */
+	unsigned char* bytes = (unsigned char*)malloc(byteLength ? byteLength : 1);
+	if(bytes == NULL)
+	{
+		*status = SZ_FERR;
+		return;
+	}
+	convertLongArrayToBytes(states, stateLength, bytes);
+	writeByteData(bytes, byteLength, tgtFilePath, &state);
+	free(bytes);
+	*status = state;
+}
+
+/**
+ * Serialise stateLength uint64_t values with convertULongArrayToBytes()
+ * (8 bytes per value) and write the result to tgtFilePath.
+ *
+ * @param status  out: writeByteData()'s result, or SZ_FERR if the staging
+ *                buffer cannot be allocated
+ */
+void writeULongData_inBytes(uint64_t *states, size_t stateLength, char *tgtFilePath, int *status)
+{
+	int state = SZ_SCES;
+	size_t byteLength = stateLength*8;
+	/* Request at least one byte so that NULL always means "out of memory". */
+	unsigned char* bytes = (unsigned char*)malloc(byteLength ? byteLength : 1);
+	if(bytes == NULL)
+	{
+		*status = SZ_FERR;
+		return;
+	}
+	convertULongArrayToBytes(states, stateLength, bytes);
+	writeByteData(bytes, byteLength, tgtFilePath, &state);
+	free(bytes);
+	*status = state;
+}
+
+/**
+ * Read a file with readByteData() and convert it to unsigned shorts with
+ * convertByteDataToUShortArray().
+ *
+ * @param dataLength  out: number of 2-byte values (0 on failure)
+ * @param status      out: readByteData()'s status
+ * @return the converted array (ownership is presumably the caller's, as with
+ *         the other readers; confirm against the converter), or NULL when the
+ *         file could not be read
+ */
+unsigned short* readShortData(char *srcFilePath, size_t *dataLength, int *status)
+{
+	size_t byteLength = 0;
+	int state = SZ_SCES;
+	unsigned char * bytes = readByteData(srcFilePath, &byteLength, &state);
+	if(state == SZ_FERR)
+	{
+		/* Nothing was read: do not hand a missing buffer to the converter. */
+		free(bytes);
+		*dataLength = 0;
+		*status = SZ_FERR;
+		return NULL;
+	}
+	*dataLength = byteLength/2;
+	unsigned short* states = convertByteDataToUShortArray(bytes, byteLength);
+	free(bytes);
+	*status = state;
+	return states;
+}
+
+/**
+ * Write nbStr strings to tgtFilePath, one per line.
+ *
+ * @param status  out: SZ_FERR if the file cannot be opened, SZ_SCES otherwise
+ */
+void writeStrings(int nbStr, char *str[], char *tgtFilePath, int *status)
+{
+	int i; /* same type as nbStr, so a negative count simply writes nothing */
+	FILE *pFile = fopen(tgtFilePath, "wb");
+	if (pFile == NULL)
+	{
+		printf("Failed to open input file. 3\n");
+		*status = SZ_FERR;
+		return;
+	}
+
+	for(i = 0;i<nbStr;i++)
+	{
+		/* Write the string directly: formatting it into a fixed 256-byte
+		 * scratch buffer overflowed for strings of 255 characters or more. */
+		fputs(str[i], pFile);
+		fputc('\n', pFile);
+	}
+
+	fclose(pFile);
+	*status = SZ_SCES;
+}
+
+/*
+//@deprecated
+//binToPFM_float is to convert the floating-point data to PFM supported by Jpeg XT
+//But wrong version!
+//In order to do the conversion, we need to use https://github.com/thorfdbg/difftest_ng according to Thomas Richter.
+
+
+void convertToPFM_float(float *data, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, int endianType, char *tgtFilePath, int *status)
+{
+	size_t i, nbEle = computeDataLength(r5, r4, r3, r2, r1);
+	int dim = computeDimension(r5, r4, r3, r2, r1);
+	
+	FILE *pFile = fopen(tgtFilePath, "wb");
+	if (pFile == NULL)
+	{
+		printf("Failed to open input file. 3\n");
+		*status = SZ_NSCS;
+		return;
+	}	
+	fputs("PF\n", pFile);
+	char strBuf[256];
+	switch(dim)
+	{
+	case 1: 
+		sprintf(strBuf, "%zu\n", r1);
+		break;
+	case 2:
+		sprintf(strBuf, "%zu %zu\n", r1, r2);
+		break;
+	case 3:
+		sprintf(strBuf, "%zu %zu %zu\n", r1, r2, r3);
+		break;
+	case 4:
+		sprintf(strBuf, "%zu %zu %zu %zu\n", r1, r2, r3, r4);
+		break;
+	case 5:
+		sprintf(strBuf, "%zu %zu %zu %zu %zu\n", r1, r2, r3, r4, r5);
+		break;
+	}
+	fputs(strBuf, pFile);
+	if(endianType==LITTLE_ENDIAN)
+		fputs("-1.0\n", pFile);
+	else
+		fputs("1.0\n", pFile);
+
+	size_t byteLength = nbEle*sizeof(float);	
+	lfloat buf;	
+	unsigned char* bytes = (unsigned char*)malloc(byteLength);
+	for(i=0;i<nbEle;i++)
+	{
+		buf.value = data[i];
+		bytes[i*4+0] = buf.byte[0];
+		bytes[i*4+1] = buf.byte[1];
+		bytes[i*4+2] = buf.byte[2];
+		bytes[i*4+3] = buf.byte[3];
+	}
+	
+	fwrite(bytes, 1, byteLength, pFile); //write outSize bytes
+	fclose(pFile);
+	
+	free(bytes);
+	*status = SZ_SCES;
+}*/
diff --git a/thirdparty/SZ/sz/src/rwf.c b/thirdparty/SZ/sz/src/rwf.c
new file mode 100644
index 0000000000000000000000000000000000000000..17e0fb4b40d31af385e3fdda0ee72f178605ec15
--- /dev/null
+++ b/thirdparty/SZ/sz/src/rwf.c
@@ -0,0 +1,96 @@
+/**
+ *  @file rwf.c
+ *  @author Sheng Di
+ *  @date April, 2015
+ *  @brief I/O interface for Fortran
+ *  (C) 2015 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "rw.h"
+
+void checkfilesizec_(char *srcFilePath, int *len, size_t *filesize)
+{
+	//srcFilePath carries *len characters and is not assumed to be NUL-terminated: build a C string first
+	const int pathLen = *len;
+	int ierr; //status reported by checkFileSize(); it is not propagated to the caller
+	char cPath[pathLen+1];
+	memcpy(cPath, srcFilePath, pathLen);
+	cPath[pathLen] = '\0';
+	*filesize = checkFileSize(cPath, &ierr);
+}
+
+void readbytefile_(char *srcFilePath, int *len, unsigned char *bytes, size_t *byteLength)
+{
+	//Copies the buffer returned by readByteData() into bytes (*byteLength bytes); bytes is left untouched if no buffer came back.
+	int ierr;
+	char s[*len+1];
+	memcpy(s, srcFilePath, *len); //the path arrives as *len characters without a guaranteed NUL terminator
+	s[*len]='\0';
+	unsigned char *tmp_bytes = readByteData(s, byteLength, &ierr);
+	if(tmp_bytes==NULL) return; //nothing was read (presumably the file could not be opened): no copy, no free
+	memcpy(bytes, tmp_bytes, *byteLength);
+	free(tmp_bytes);
+}
+
+void readdoublefile_(char *srcFilePath, int *len, double *data, size_t *nbEle)
+{
+	//Copies the *nbEle doubles returned by readDoubleData() into data; data is left untouched if no buffer came back.
+	int ierr;
+	char s[*len+1];
+	memcpy(s, srcFilePath, *len); //the path arrives as *len characters without a guaranteed NUL terminator
+	s[*len]='\0';
+	double *tmp_data = readDoubleData(s, nbEle, &ierr);
+	if(tmp_data==NULL) return; //nothing was read (presumably the file could not be opened): no copy, no free
+	memcpy(data, tmp_data, *nbEle*sizeof(double)); //*nbEle counts elements, memcpy needs a byte count
+	free(tmp_data);
+}
+
+void readfloatfile_(char *srcFilePath, int *len, float *data, size_t *nbEle)
+{
+	//Copies the *nbEle floats returned by readFloatData() into data; data is left untouched if no buffer came back.
+	int ierr;
+	char s[*len+1];
+	memcpy(s, srcFilePath, *len); //the path arrives as *len characters without a guaranteed NUL terminator
+	s[*len]='\0';
+	float *tmp_data = readFloatData(s, nbEle, &ierr);
+	if(tmp_data==NULL) return; //nothing was read (presumably the file could not be opened): no copy, no free
+	memcpy(data, tmp_data, *nbEle*sizeof(float)); //*nbEle counts elements, memcpy needs a byte count
+	free(tmp_data);
+}
+
+void writebytefile_(unsigned char *bytes, size_t *byteLength, char *tgtFilePath, int *len)
+{
+	//tgtFilePath carries *len characters and is not assumed to be NUL-terminated: build a C string first
+	const int pathLen = *len;
+	int status; //filled in by writeByteData(); it is not propagated to the caller
+	char cPath[pathLen+1];
+	memcpy(cPath, tgtFilePath, pathLen);
+	cPath[pathLen] = '\0';
+	writeByteData(bytes, *byteLength, cPath, &status);
+}
+
+void writedoublefile_(double *data, size_t *nbEle, char *tgtFilePath, int *len)
+{
+	//tgtFilePath carries *len characters and is not assumed to be NUL-terminated: build a C string first
+	const int pathLen = *len;
+	int status; //filled in by writeDoubleData(); it is not propagated to the caller
+	char cPath[pathLen+1];
+	memcpy(cPath, tgtFilePath, pathLen);
+	cPath[pathLen] = '\0';
+	writeDoubleData(data, *nbEle, cPath, &status);
+}
+
+void writefloatfile_(float *data, size_t *nbEle, char *tgtFilePath, int *len)
+{
+	//tgtFilePath carries *len characters and is not assumed to be NUL-terminated: build a C string first
+	const int pathLen = *len;
+	int status; //filled in by writeFloatData(); it is not propagated to the caller
+	char cPath[pathLen+1];
+	memcpy(cPath, tgtFilePath, pathLen);
+	cPath[pathLen] = '\0';
+	writeFloatData(data, *nbEle, cPath, &status);
+}
diff --git a/thirdparty/SZ/sz/src/sz.c b/thirdparty/SZ/sz/src/sz.c
new file mode 100644
index 0000000000000000000000000000000000000000..97cb00d8a5725090914964e92229bff939cdb1e5
--- /dev/null
+++ b/thirdparty/SZ/sz/src/sz.c
@@ -0,0 +1,1003 @@
+/**
+ *  @file sz.c
+ *  @author Sheng Di and Dingwen Tao
+ *  @date Aug, 2016
+ *  @brief SZ_Init, Compression and Decompression functions
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include "sz.h"
+#include "CompressElement.h"
+#include "DynamicByteArray.h"
+#include "DynamicIntArray.h"
+#include "TightDataPointStorageD.h"
+#include "TightDataPointStorageF.h"
+#include "zlib.h"
+#include "rw.h"
+#include "Huffman.h"
+#include "conf.h"
+//#include "CurveFillingCompressStorage.h"
+
+int versionNumber[4] = {SZ_VER_MAJOR,SZ_VER_MINOR,SZ_VER_BUILD,SZ_VER_REVISION}; //library version: major, minor, build, revision
+//SZ_SIZE_TYPE is not a global any more (formerly: int SZ_SIZE_TYPE = 8;); it is kept in exe_params->SZ_SIZE_TYPE
+
+int dataEndianType = LITTLE_ENDIAN_DATA; //endian type of the data read from disk
+int sysEndianType; //endian type of the running system; detected at run time (see SZ_Init_Params and SZ_decompress)
+
+//The configuration parameters are kept separately for compression and decompression, so that the two do not affect each other when compression and decompression are called alternately.
+sz_params *confparams_cpr = NULL; //used for compression
+sz_params *confparams_dec = NULL; //used for decompression 
+
+sz_exedata *exe_params = NULL; //run-time execution data (e.g. SZ_SIZE_TYPE and the quantization interval settings)
+
+/*The following global variables are designed for time-series based compression.*/
+/*sz_varset is not used in the single-snapshot data compression*/
+SZ_VarSet* sz_varset = NULL;
+sz_multisteps *multisteps = NULL; //set to the multisteps of the variable currently being processed (see SZ_compress_ts/SZ_decompress_ts)
+sz_tsc_metadata *sz_tsc = NULL;
+
+//only for Pastri compressor
+#ifdef PASTRI
+pastri_params pastri_par;
+#endif
+
+HuffmanTree* SZ_Reset()
+{	//Returns whatever createDefaultHuffmanTree() builds; who owns/frees the tree is not visible here -- TODO confirm.
+	return createDefaultHuffmanTree();
+}
+
+int SZ_Init(const char *configFilePath)
+{
+	//Load the configuration; exe_params and confparams_cpr are used right after, so SZ_LoadConf() is relied upon to provide them.
+	if(SZ_LoadConf(configFilePath)==SZ_NSCS)
+		return SZ_NSCS;
+
+	exe_params->SZ_SIZE_TYPE = sizeof(size_t);
+
+	//temporal (time-series) mode needs its own initialisation step
+	if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+		initSZ_TSC();
+
+	return SZ_SCES;
+}
+
+int SZ_Init_Params(sz_params *params)
+{
+	//Initialise the compressor from an in-memory parameter set: params is normalised in place and copied into confparams_cpr.
+	//Returns SZ_SCES, or SZ_NSCS if quantization_intervals is odd or memory cannot be allocated.
+	int x = 1;
+	//validate before any global state is touched
+	if(params->quantization_intervals%2!=0)
+	{
+		printf("Error: quantization_intervals must be an even number!\n");
+		return SZ_NSCS;
+	}
+
+	//exe_params may still be NULL here (SZ_decompress also allocates it on demand)
+	if(exe_params==NULL)
+		exe_params = (sz_exedata*)calloc(1, sizeof(sz_exedata));
+	if(exe_params==NULL)
+		return SZ_NSCS;
+	sysEndianType = (*(char*)&x==1) ? LITTLE_ENDIAN_SYSTEM : BIG_ENDIAN_SYSTEM;
+	exe_params->SZ_SIZE_TYPE = sizeof(size_t);
+
+	// set default values
+	if(params->max_quant_intervals > 0) 
+		params->maxRangeRadius = params->max_quant_intervals/2;
+	else
+		params->max_quant_intervals = params->maxRangeRadius*2;
+	exe_params->intvCapacity = params->maxRangeRadius*2;
+	exe_params->intvRadius = params->maxRangeRadius;
+	if(params->quantization_intervals>0)
+		updateQuantizationInfo(params->quantization_intervals);
+	exe_params->optQuantMode = (params->quantization_intervals>0) ? 0 : 1; //0: fixed # intervals given, 1: to be optimised
+
+	//reuse an existing confparams_cpr instead of leaking it
+	if(confparams_cpr==NULL)
+		confparams_cpr = (sz_params*)malloc(sizeof(sz_params));
+	if(confparams_cpr==NULL)
+		return SZ_NSCS;
+	memmove(confparams_cpr, params, sizeof(sz_params)); //memmove: params may be confparams_cpr itself
+	return SZ_SCES;
+}
+
+/**
+ * Derive the number of dimensions of a data set from its extents.
+ *
+ * The extents are examined in the order r1, r2, r3, r4, r5; the first extent
+ * that is zero ends the count, and whatever follows it is not looked at.
+ *
+ * @param r5 size of the 5th dimension (0 if unused)
+ * @param r4 size of the 4th dimension (0 if unused)
+ * @param r3 size of the 3rd dimension (0 if unused)
+ * @param r2 size of the 2nd dimension (0 if unused)
+ * @param r1 size of the 1st dimension (0 means there is no data at all)
+ * @return the number of leading non-zero extents: 0 when r1 is 0, up to 5 when
+ *         all five extents are non-zero
+ */
+int computeDimension(size_t r5, size_t r4, size_t r3, size_t r2, size_t r1)
+{
+	//guard clauses, lowest dimension first
+	if(r1==0)
+		return 0;
+	if(r2==0)
+		return 1;
+	if(r3==0)
+		return 2;
+	if(r4==0)
+		return 3;
+	if(r5==0)
+		return 4;
+	return 5;
+}
+
+/**
+ * Compute the total number of elements described by up to five extents.
+ *
+ * The product starts with r1 and takes in r2, r3, r4 and r5 in that order,
+ * stopping at the first extent that is zero.
+ *
+ * @param r5 size of the 5th dimension (0 if unused)
+ * @param r4 size of the 4th dimension (0 if unused)
+ * @param r3 size of the 3rd dimension (0 if unused)
+ * @param r2 size of the 2nd dimension (0 if unused)
+ * @param r1 size of the 1st dimension
+ * @return product of the leading non-zero extents, or 0 when r1 is 0; the
+ *         multiplication is done in size_t and is not checked for overflow
+ */
+size_t computeDataLength(size_t r5, size_t r4, size_t r3, size_t r2, size_t r1)
+{
+	const size_t higher[4] = {r2, r3, r4, r5};
+	size_t dataLength = r1;
+	int k;
+
+	if(r1==0)
+		return 0;
+
+	//fold in the higher dimensions up to the first unused (zero) one
+	for(k=0; k<4 && higher[k]!=0; k++)
+		dataLength *= higher[k];
+
+	return dataLength;
+}
+
+/*-------------------------------------------------------------------------*/
+/**
+    @brief      Compress an array of the given dataType (SZ_FLOAT, SZ_DOUBLE, SZ_INT8/16/32/64 or SZ_UINT8/16/32/64); dataType is also recorded in confparams_cpr
+    @param      data           data to be compressed
+    @param      outSize        the size (in bytes) after compression
+    @param      errBoundMode,absErrBound,relBoundRatio,pwrBoundRatio  error-bound settings forwarded to the type-specific compressor (pwrBoundRatio is used for float/double only)
+    @param      r5,r4,r3,r2,r1 the sizes of each dimension (at most 5 dimensions are supported in this version)
+    @return     compressed data (in binary stream, to be released with free()), or NULL(0) for an unsupported dataType
+ **/
+/*-------------------------------------------------------------------------*/
+unsigned char* SZ_compress_args(int dataType, void *data, size_t *outSize, int errBoundMode, double absErrBound, 
+double relBoundRatio, double pwrBoundRatio, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1)
+{
+	//every branch starts from a NULL result so that a compressor which stores nothing cannot hand back garbage
+	confparams_cpr->dataType = dataType;
+	if(dataType==SZ_FLOAT)
+	{
+		unsigned char *newByteData = NULL;
+		
+		SZ_compress_args_float(&newByteData, (float *)data, r5, r4, r3, r2, r1, 
+		outSize, errBoundMode, absErrBound, relBoundRatio, pwrBoundRatio);
+		
+		return newByteData;
+	}
+	else if(dataType==SZ_DOUBLE)
+	{
+		unsigned char *newByteData = NULL;
+		SZ_compress_args_double(&newByteData, (double *)data, r5, r4, r3, r2, r1, 
+		outSize, errBoundMode, absErrBound, relBoundRatio, pwrBoundRatio);
+		
+		return newByteData;
+	}
+	else if(dataType==SZ_INT64)
+	{
+		unsigned char *newByteData = NULL;
+		SZ_compress_args_int64(&newByteData, data, r5, r4, r3, r2, r1, outSize, errBoundMode, absErrBound, relBoundRatio);
+		return newByteData;
+	}		
+	else if(dataType==SZ_INT32) //int type
+	{
+		unsigned char *newByteData = NULL;
+		SZ_compress_args_int32(&newByteData, data, r5, r4, r3, r2, r1, outSize, errBoundMode, absErrBound, relBoundRatio);
+		return newByteData;
+	}
+	else if(dataType==SZ_INT16)
+	{
+		unsigned char *newByteData = NULL;
+		SZ_compress_args_int16(&newByteData, data, r5, r4, r3, r2, r1, outSize, errBoundMode, absErrBound, relBoundRatio);
+		return newByteData;		
+	}
+	else if(dataType==SZ_INT8)
+	{
+		unsigned char *newByteData = NULL;
+		SZ_compress_args_int8(&newByteData, data, r5, r4, r3, r2, r1, outSize, errBoundMode, absErrBound, relBoundRatio);
+		return newByteData;
+	}
+	else if(dataType==SZ_UINT64)
+	{
+		unsigned char *newByteData = NULL;
+		SZ_compress_args_uint64(&newByteData, data, r5, r4, r3, r2, r1, outSize, errBoundMode, absErrBound, relBoundRatio);
+		return newByteData;
+	}		
+	else if(dataType==SZ_UINT32) //int type
+	{
+		unsigned char *newByteData = NULL;
+		SZ_compress_args_uint32(&newByteData, data, r5, r4, r3, r2, r1, outSize, errBoundMode, absErrBound, relBoundRatio);
+		return newByteData;
+	}
+	else if(dataType==SZ_UINT16)
+	{
+		unsigned char *newByteData = NULL;
+		SZ_compress_args_uint16(&newByteData, data, r5, r4, r3, r2, r1, outSize, errBoundMode, absErrBound, relBoundRatio);
+		return newByteData;		
+	}
+	else if(dataType==SZ_UINT8)
+	{
+		unsigned char *newByteData = NULL;
+		SZ_compress_args_uint8(&newByteData, data, r5, r4, r3, r2, r1, outSize, errBoundMode, absErrBound, relBoundRatio);
+		return newByteData;
+	} 	
+	else
+	{
+		printf("Error: dataType can only be SZ_FLOAT, SZ_DOUBLE, SZ_INT8/16/32/64 or SZ_UINT8/16/32/64.\n");
+		return NULL;
+	}
+}
+
+int SZ_compress_args2(int dataType, void *data, unsigned char* compressed_bytes, size_t *outSize, 
+int errBoundMode, double absErrBound, double relBoundRatio, double pwrBoundRatio, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1)
+{	//Like SZ_compress_args(), but the result is copied into the caller-provided compressed_bytes, which must hold *outSize bytes.
+	unsigned char* bytes = SZ_compress_args(dataType, data, outSize, errBoundMode, absErrBound, relBoundRatio, pwrBoundRatio, r5, r4, r3, r2, r1);
+	if(bytes==NULL) return SZ_NSCS; //SZ_compress_args() returns NULL for an unsupported dataType: nothing to copy
+	memcpy(compressed_bytes, bytes, *outSize);
+	free(bytes); 
+	return SZ_SCES;
+}
+
+int SZ_compress_args3(int dataType, void *data, unsigned char* compressed_bytes, size_t *outSize, int errBoundMode, double absErrBound, double relBoundRatio, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1,
+size_t s5, size_t s4, size_t s3, size_t s2, size_t s1,
+size_t e5, size_t e4, size_t e3, size_t e2, size_t e1)
+{
+	/*
+	 * Sub-block compression into the caller-provided compressed_bytes buffer. The r*, s* and e*
+	 * values are forwarded untouched to the *_subblock helpers (presumably array extents and the
+	 * start/end coordinates of the sub-block -- TODO confirm). Only float and double are handled.
+	 */
+	int status = SZ_SCES;
+
+	confparams_cpr->dataType = dataType;
+	switch(dataType)
+	{
+	case SZ_FLOAT:
+		SZ_compress_args_float_subblock(compressed_bytes, (float *)data, 
+			r5, r4, r3, r2, r1, s5, s4, s3, s2, s1, e5, e4, e3, e2, e1,
+			outSize, errBoundMode, absErrBound, relBoundRatio);
+		break;
+	case SZ_DOUBLE:
+		SZ_compress_args_double_subblock(compressed_bytes, (double *)data, 
+			r5, r4, r3, r2, r1, s5, s4, s3, s2, s1, e5, e4, e3, e2, e1,
+			outSize, errBoundMode, absErrBound, relBoundRatio);
+		break;
+	default:
+		printf("Error (in SZ_compress_args3): dataType can only be SZ_FLOAT or SZ_DOUBLE.\n");
+		status = SZ_NSCS;
+		break;
+	}
+	return status;
+}
+
+unsigned char *SZ_compress(int dataType, void *data, size_t *outSize, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1)
+{	
+	//Compress with the error-bound settings currently stored in confparams_cpr.
+	const sz_params *conf = confparams_cpr;
+	return SZ_compress_args(dataType, data, outSize, conf->errorBoundMode, conf->absErrBound, conf->relBoundRatio, conf->pw_relBoundRatio, r5, r4, r3, r2, r1);
+}
+
+//////////////////
+/*-------------------------------------------------------------------------*/
+/**
+    @brief      Compression with a reserved value -- NOT IMPLEMENTED YET
+    @param      data           data to be compressed
+    @param      reservedValue  the reserved value
+    @param      outSize        the size (in bytes) after compression
+    @param      r5,r4,r3,r2,r1 the sizes of each dimension (at most 5 dimensions are supported in this version)
+    @return     does not return: the function prints a notice and terminates the process with exit(0)
+
+ **/
+/*-------------------------------------------------------------------------*/
+unsigned char *SZ_compress_rev_args(int dataType, void *data, void *reservedValue, size_t *outSize, int errBoundMode, double absErrBound, double relBoundRatio, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1)
+{
+	//TODO: compression with a reserved value still has to be implemented
+	printf("SZ compression with reserved data is TO BE DONE LATER.\n");
+	exit(0);
+
+	//not reached: exit() does not return
+	return NULL;
+}
+
+int SZ_compress_rev_args2(int dataType, void *data, void *reservedValue, unsigned char* compressed_bytes, size_t *outSize, int errBoundMode, double absErrBound, double relBoundRatio, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1)
+{
+	confparams_cpr->dataType = dataType; //recorded before delegating
+	unsigned char* revBytes = SZ_compress_rev_args(dataType, data, reservedValue, outSize, errBoundMode, absErrBound, relBoundRatio, r5, r4, r3, r2, r1); //currently exits the process
+	memcpy(compressed_bytes, revBytes, *outSize); //copy into the caller-provided buffer
+	free(revBytes); //historical note: this free was once removed because of a dump error on the MIRA system (PPC architecture)
+	return 0;
+}
+
+unsigned char *SZ_compress_rev(int dataType, void *data, void *reservedValue, size_t *outSize, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1)
+{
+	//TODO: compression with a reserved value still has to be implemented;
+	//for now the function prints a notice and terminates the process with exit(0)
+	printf("SZ compression with reserved data is TO BE DONE LATER.\n");
+	exit(0);
+
+	return NULL; //not reached: exit() does not return
+}
+
+void *SZ_decompress(int dataType, unsigned char *bytes, size_t byteLength, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1)
+{
+	//Decompress bytes (byteLength bytes) as an array of dataType with extents r5..r1. confparams_dec and exe_params are zeroed first.
+	//Returns a newly allocated array (release it with free()), or NULL if dataType is unsupported or memory cannot be allocated.
+	if(confparams_dec==NULL)
+		confparams_dec = (sz_params*)malloc(sizeof(sz_params));
+	if(exe_params==NULL)
+		exe_params = (sz_exedata*)malloc(sizeof(sz_exedata));
+	if(confparams_dec==NULL || exe_params==NULL)
+		return NULL; //out of memory: memset on a NULL pointer must be avoided
+	memset(confparams_dec, 0, sizeof(sz_params));
+	memset(exe_params, 0, sizeof(sz_exedata));
+	
+	int x = 1; //endianness probe: the first byte of 1 is 1 only on a little-endian system
+	sysEndianType = (*(char*)&x==1) ? LITTLE_ENDIAN_SYSTEM : BIG_ENDIAN_SYSTEM;
+	
+	if(dataType == SZ_FLOAT)
+	{
+		float *newFloatData = NULL;
+		SZ_decompress_args_float(&newFloatData, r5, r4, r3, r2, r1, bytes, byteLength);
+		return newFloatData;	
+	}
+	else if(dataType == SZ_DOUBLE)
+	{
+		double *newDoubleData = NULL;
+		SZ_decompress_args_double(&newDoubleData, r5, r4, r3, r2, r1, bytes, byteLength);
+		return newDoubleData;	
+	}
+	else if(dataType == SZ_INT8)
+	{
+		int8_t *newInt8Data = NULL;
+		SZ_decompress_args_int8(&newInt8Data, r5, r4, r3, r2, r1, bytes, byteLength);
+		return newInt8Data;
+	}
+	else if(dataType == SZ_INT16)
+	{
+		int16_t *newInt16Data = NULL;
+		SZ_decompress_args_int16(&newInt16Data, r5, r4, r3, r2, r1, bytes, byteLength);
+		return newInt16Data;
+	}
+	else if(dataType == SZ_INT32)
+	{
+		int32_t *newInt32Data = NULL;
+		SZ_decompress_args_int32(&newInt32Data, r5, r4, r3, r2, r1, bytes, byteLength);
+		return newInt32Data;
+	}
+	else if(dataType == SZ_INT64)
+	{
+		int64_t *newInt64Data = NULL;
+		SZ_decompress_args_int64(&newInt64Data, r5, r4, r3, r2, r1, bytes, byteLength);
+		return newInt64Data;
+	}
+	else if(dataType == SZ_UINT8)
+	{
+		uint8_t *newUInt8Data = NULL;
+		SZ_decompress_args_uint8(&newUInt8Data, r5, r4, r3, r2, r1, bytes, byteLength);
+		return newUInt8Data;
+	}
+	else if(dataType == SZ_UINT16)
+	{
+		uint16_t *newUInt16Data = NULL;
+		SZ_decompress_args_uint16(&newUInt16Data, r5, r4, r3, r2, r1, bytes, byteLength);
+		return newUInt16Data;
+	}
+	else if(dataType == SZ_UINT32)
+	{
+		uint32_t *newUInt32Data = NULL;
+		SZ_decompress_args_uint32(&newUInt32Data, r5, r4, r3, r2, r1, bytes, byteLength);
+		return newUInt32Data;
+	}
+	else if(dataType == SZ_UINT64)
+	{
+		uint64_t *newUInt64Data = NULL;
+		SZ_decompress_args_uint64(&newUInt64Data, r5, r4, r3, r2, r1, bytes, byteLength);
+		return newUInt64Data;
+	}
+	else 
+	{
+		printf("Error: dataType can only be SZ_FLOAT, SZ_DOUBLE, SZ_INT8/16/32/64 or SZ_UINT8/16/32/64.\n");
+		return NULL;	
+	}
+}
+
+/**
+ * Decompress bytes into the caller-provided decompressed_array, which must have room for
+ * computeDataLength(r5,r4,r3,r2,r1) elements of the requested dataType.
+ * @return number of elements, or SZ_NSCS (converted to size_t) if dataType is unsupported or SZ_decompress() returns no data
+ * */
+size_t SZ_decompress_args(int dataType, unsigned char *bytes, size_t byteLength, void* decompressed_array, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1)
+{
+	//every branch does the same: decompress into a temporary array, copy nbEle elements out, free the temporary
+	size_t nbEle = computeDataLength(r5,r4,r3,r2,r1);
+	
+	if(dataType == SZ_FLOAT)
+	{
+		float* data = (float *)SZ_decompress(dataType, bytes, byteLength, r5, r4, r3, r2, r1);
+		if(data==NULL) return SZ_NSCS; //nothing was decompressed: do not copy from NULL
+		memcpy(decompressed_array, data, nbEle*sizeof(float));
+		//note kept from the original code: this free operation seems to not work with BlueG/Q system.
+		free(data);
+	}
+	else if (dataType == SZ_DOUBLE)
+	{
+		double* data = (double *)SZ_decompress(dataType, bytes, byteLength, r5, r4, r3, r2, r1);
+		if(data==NULL) return SZ_NSCS; //nothing was decompressed: do not copy from NULL
+		memcpy(decompressed_array, data, nbEle*sizeof(double));
+		//note kept from the original code: this free operation seems to not work with BlueG/Q system.
+		free(data);
+	}
+	else if(dataType == SZ_INT8)
+	{
+		int8_t* data = (int8_t*)SZ_decompress(dataType, bytes, byteLength, r5, r4, r3, r2, r1);
+		if(data==NULL) return SZ_NSCS;
+		memcpy(decompressed_array, data, nbEle*sizeof(int8_t));
+		free(data);
+	}
+	else if(dataType == SZ_INT16)
+	{
+		int16_t* data = (int16_t*)SZ_decompress(dataType, bytes, byteLength, r5, r4, r3, r2, r1);
+		if(data==NULL) return SZ_NSCS;
+		memcpy(decompressed_array, data, nbEle*sizeof(int16_t));
+		free(data);	
+	}
+	else if(dataType == SZ_INT32)
+	{
+		int32_t* data = (int32_t*)SZ_decompress(dataType, bytes, byteLength, r5, r4, r3, r2, r1);
+		if(data==NULL) return SZ_NSCS;
+		memcpy(decompressed_array, data, nbEle*sizeof(int32_t));
+		free(data);	
+	}
+	else if(dataType == SZ_INT64)
+	{
+		int64_t* data = (int64_t*)SZ_decompress(dataType, bytes, byteLength, r5, r4, r3, r2, r1);
+		if(data==NULL) return SZ_NSCS;
+		memcpy(decompressed_array, data, nbEle*sizeof(int64_t));
+		free(data);		
+	}
+	else if(dataType == SZ_UINT8)
+	{
+		uint8_t* data = (uint8_t*)SZ_decompress(dataType, bytes, byteLength, r5, r4, r3, r2, r1);
+		if(data==NULL) return SZ_NSCS;
+		memcpy(decompressed_array, data, nbEle*sizeof(uint8_t));
+		free(data);
+	}
+	else if(dataType == SZ_UINT16)
+	{
+		uint16_t* data = (uint16_t*)SZ_decompress(dataType, bytes, byteLength, r5, r4, r3, r2, r1);
+		if(data==NULL) return SZ_NSCS;
+		memcpy(decompressed_array, data, nbEle*sizeof(uint16_t));
+		free(data);		
+	}
+	else if(dataType == SZ_UINT32)
+	{
+		uint32_t* data = (uint32_t*)SZ_decompress(dataType, bytes, byteLength, r5, r4, r3, r2, r1);
+		if(data==NULL) return SZ_NSCS;
+		memcpy(decompressed_array, data, nbEle*sizeof(uint32_t));
+		free(data);		
+	}
+	else if(dataType == SZ_UINT64)
+	{
+		uint64_t* data = (uint64_t*)SZ_decompress(dataType, bytes, byteLength, r5, r4, r3, r2, r1);
+		if(data==NULL) return SZ_NSCS;
+		memcpy(decompressed_array, data, nbEle*sizeof(uint64_t));
+		free(data);			
+	}
+	else
+	{ 
+		printf("Error: dataType can only be SZ_FLOAT, SZ_DOUBLE, SZ_INT8/16/32/64 or SZ_UINT8/16/32/64.\n");
+		return SZ_NSCS; //indicating error		
+	}
+
+	return nbEle;
+}
+
+
+sz_metadata* SZ_getMetadata(unsigned char* bytes)
+{	//Parses the header at the start of an SZ-compressed stream into a newly malloc'ed sz_metadata structure, which is returned.
+	int index = 0, i, isConstant, isLossless;
+	size_t dataSeriesLength = 0;
+	int versions[3] = {0,0,0};
+	for (i = 0; i < 3; i++)
+		versions[i] = bytes[index++]; //3 version bytes
+	unsigned char sameRByte = bytes[index++]; //1 flag byte
+	isConstant = sameRByte & 0x01; //bit 0
+	//bits 1-2 of the flag byte would give szMode ((sameRByte & 0x06)>>1); it is not decoded here
+	isLossless = (sameRByte & 0x10)>>4; //bit 4
+	exe_params->SZ_SIZE_TYPE = ((sameRByte & 0x40)>>6)==1?8:4; //bit 6 selects 8- or 4-byte sizes; NOTE(review): exe_params is not checked for NULL
+	//side effect: the parameters parsed from the stream replace the global confparams_dec (the previous one is freed)
+	sz_params* params = convertBytesToSZParams(&(bytes[index]));
+	if(confparams_dec!=NULL)
+		free(confparams_dec);
+	confparams_dec = params;	
+	index += MetaDataByteLength;
+	
+	if(params->dataType!=SZ_FLOAT && params->dataType!= SZ_DOUBLE) //if this type is an Int type
+		index++; //jump to the dataLength info byte address
+	dataSeriesLength = bytesToSize(&(bytes[index]));// 4 or 8	
+	index += exe_params->SZ_SIZE_TYPE;
+	index += 4; //max_quant_intervals (index is not read again after this point)
+	
+	sz_metadata* metadata = (sz_metadata*)malloc(sizeof(struct sz_metadata));
+	//NOTE(review): the malloc result above is used without a NULL check
+	metadata->versionNumber[0] = versions[0];
+	metadata->versionNumber[1] = versions[1];
+	metadata->versionNumber[2] = versions[2];
+	metadata->isConstant = isConstant;
+	metadata->isLossless = isLossless;
+	metadata->sizeType = exe_params->SZ_SIZE_TYPE;
+	metadata->dataSeriesLength = dataSeriesLength;
+	//conf_params aliases the global confparams_dec (which SZ_Finalize() and the next call of this function free)
+	metadata->conf_params = confparams_dec;
+	
+	int defactoNBBins = 0; //real # bins
+	if(isConstant==0 && isLossless==0)
+	{
+		int radExpoL = 0, segmentL = 0, pwrErrBoundBytesL = 0;
+		if(metadata->conf_params->errorBoundMode >= PW_REL)
+		{
+			radExpoL = 1;
+			segmentL = exe_params->SZ_SIZE_TYPE;
+			pwrErrBoundBytesL = 4;
+		}
+		//offset of the node count, built from fixed field sizes; NOTE(review): the extra byte skipped above for Int types is not included here -- confirm
+		int offset_typearray = 3 + 1 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + 4 + radExpoL + segmentL + pwrErrBoundBytesL + 4 + 4 + 1 + 8 
+				+ exe_params->SZ_SIZE_TYPE + exe_params->SZ_SIZE_TYPE + exe_params->SZ_SIZE_TYPE;
+		size_t nodeCount = bytesToInt_bigEndian(bytes+offset_typearray);
+		defactoNBBins = (nodeCount+1)/2;
+	}
+	
+	metadata->defactoNBBins = defactoNBBins;
+	return metadata;
+}
+
+//Print a human-readable summary of metadata (as produced by SZ_getMetadata) to stdout.
+//Besides metadata itself, the globals exe_params->optQuantMode, dataEndianType and sysEndianType are reported.
+//Values that match none of the listed cases (data type, szMode, gzipMode, error-bound mode, pwr type) print nothing for that item.
+void SZ_printMetadata(sz_metadata* metadata)
+{
+	printf("=================SZ Compression Meta Data=================\n");
+	printf("Version:                        \t %d.%d.%d\n", metadata->versionNumber[0], metadata->versionNumber[1], metadata->versionNumber[2]);
+	printf("Constant data?:                 \t %s\n", metadata->isConstant==1?"YES":"NO");
+	printf("Lossless?:                      \t %s\n", metadata->isLossless==1?"YES":"NO");
+	printf("Size type (size of # elements): \t %d bytes\n", metadata->sizeType); 
+	printf("Num of elements:                \t %zu\n", metadata->dataSeriesLength);
+	sz_params* params = metadata->conf_params;
+	switch(params->dataType)
+	{
+	case SZ_FLOAT:
+		printf("Data type:                      \t FLOAT\n");
+		break;
+	case SZ_DOUBLE:
+		printf("Data type:                      \t DOUBLE\n");
+		break;
+	case SZ_INT8:
+		printf("Data type:                      \t INT8\n");
+		break;	
+	case SZ_INT16:
+		printf("Data type:                      \t INT16\n");
+		break;
+	case SZ_INT32:
+		printf("Data type:                      \t INT32\n");
+		break;	
+	case SZ_INT64:
+		printf("Data type:                      \t INT64\n");
+		break;	
+	case SZ_UINT8:
+		printf("Data type:                      \t UINT8\n");
+		break;	
+	case SZ_UINT16:
+		printf("Data type:                      \t UINT16\n");
+		break;
+	case SZ_UINT32:
+		printf("Data type:                      \t UINT32\n");
+		break;	
+	case SZ_UINT64:
+		printf("Data type:                      \t UINT64\n");
+		break;				
+	}
+	if(exe_params->optQuantMode==1)
+	{
+		printf("quantization_intervals:         \t 0\n");
+		printf("max_quant_intervals:            \t %d\n", params->max_quant_intervals);
+		printf("actual used # intervals:        \t %d\n", metadata->defactoNBBins);
+	}
+	else
+	{
+		printf("quantization_intervals:         \t %d\n", params->quantization_intervals);
+		printf("max_quant_intervals:            \t - %d\n", params->max_quant_intervals);		
+	}
+	
+	printf("dataEndianType (prior raw data):\t %s\n", dataEndianType==BIG_ENDIAN_DATA?"BIG_ENDIAN":"LITTLE_ENDIAN");
+	printf("sysEndianType (at compression): \t %s\n", sysEndianType==1?"BIG_ENDIAN":"LITTLE_ENDIAN"); //NOTE(review): assumes BIG_ENDIAN_SYSTEM==1 -- confirm
+	printf("sampleDistance:                 \t %d\n", params->sampleDistance);
+	printf("predThreshold:                  \t %f\n", params->predThreshold);
+	switch(params->szMode)
+	{
+	case SZ_BEST_SPEED:
+		printf("szMode:                         \t SZ_BEST_SPEED (without Gzip)\n");
+		break;
+	case SZ_BEST_COMPRESSION:
+		printf("szMode:                         \t SZ_BEST_COMPRESSION (with Gzip)\n");
+		break;
+	}
+	switch(params->gzipMode)
+	{
+	case Z_BEST_SPEED:
+		printf("gzipMode:                       \t Z_BEST_SPEED\n");
+		break;
+	case Z_DEFAULT_COMPRESSION: //used to be reported as Z_BEST_SPEED (copy-paste slip)
+		printf("gzipMode:                       \t Z_DEFAULT_COMPRESSION\n");
+		break;	
+	case Z_BEST_COMPRESSION:
+		printf("gzipMode:                       \t Z_BEST_COMPRESSION\n");
+		break;
+	}
+	
+	switch(params->errorBoundMode)
+	{
+	case ABS:
+		printf("errBoundMode:                   \t ABS\n");
+		printf("absErrBound:                    \t %f\n", params->absErrBound);
+		break;
+	case REL:
+		printf("errBoundMode:                   \t REL (based on value_range extent)\n");
+		printf("relBoundRatio:                  \t %f\n", params->relBoundRatio);
+		break;
+	case ABS_AND_REL:
+		printf("errBoundMode:                   \t ABS_AND_REL\n");
+		printf("absErrBound:                    \t %f\n", params->absErrBound);
+		printf("relBoundRatio:                  \t %f\n", params->relBoundRatio);
+		break;
+	case ABS_OR_REL:
+		printf("errBoundMode:                   \t ABS_OR_REL\n");
+		printf("absErrBound:                    \t %f\n", params->absErrBound);
+		printf("relBoundRatio:                  \t %f\n", params->relBoundRatio);
+		break;
+	case PSNR:
+		printf("errBoundMode:                   \t PSNR\n");
+		printf("psnr:                           \t %f\n", params->psnr);
+		break;
+	case PW_REL:
+		printf("errBoundMode:                   \t PW_REL\n");
+		break;
+	case ABS_AND_PW_REL:
+		printf("errBoundMode:                   \t ABS_AND_PW_REL\n");
+		printf("absErrBound:                    \t %f\n", params->absErrBound);
+		break;
+	case ABS_OR_PW_REL:
+		printf("errBoundMode:                   \t ABS_OR_PW_REL\n");
+		printf("absErrBound:                    \t %f\n", params->absErrBound);
+		break;
+	case REL_AND_PW_REL:
+		printf("errBoundMode:                   \t REL_AND_PW_REL\n");
+		printf("range_relBoundRatio:            \t %f\n", params->relBoundRatio);
+		break;
+	case REL_OR_PW_REL:
+		printf("errBoundMode:                   \t REL_OR_PW_REL\n");
+		printf("range_relBoundRatio:            \t %f\n", params->relBoundRatio);
+		break;
+	}
+	
+	if(params->errorBoundMode>=PW_REL && params->errorBoundMode<=REL_OR_PW_REL)
+	{
+		printf("pw_relBoundRatio:               \t %f\n", params->pw_relBoundRatio);
+		printf("segment_size:                   \t %d\n", params->segment_size);
+		switch(params->pwr_type)
+		{
+		case SZ_PWR_MIN_TYPE:
+			printf("pwrType:                    \t SZ_PWR_MIN_TYPE\n");
+			break;
+		case SZ_PWR_AVG_TYPE:
+			printf("pwrType:                    \t SZ_PWR_AVG_TYPE\n");
+			break;
+		case SZ_PWR_MAX_TYPE:
+			printf("pwrType:                    \t SZ_PWR_MAX_TYPE\n");
+			break;
+		}
+	}
+}
+
+/*-----------------------------------batch data compression--------------------------------------*/
+
+/**
+ * Copy the extents that are in use into dim[], highest-numbered one first and r1 last.
+ *
+ * The number of entries written is 1 when r2 is 0, 2 when r3 is 0, 3 when r4 is 0,
+ * 4 when r5 is 0 and 5 otherwise (the tests are made in that order). r1 itself is
+ * never tested, so with r2==0 dim[0] receives r1 even when r1 is 0.
+ *
+ * @param dim             output array; it must have room for the entries written,
+ *                        entries beyond those are left untouched
+ * @param r5,r4,r3,r2,r1  sizes of each dimension (0 marks an unused dimension)
+ */
+void filloutDimArray(size_t* dim, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1)
+{
+	const size_t extents[5] = {r5, r4, r3, r2, r1};
+	int rank;
+	int k;
+
+	if(r2==0)
+		rank = 1;
+	else if(r3==0)
+		rank = 2;
+	else if(r4==0)
+		rank = 3;
+	else if(r5==0)
+		rank = 4;
+	else
+		rank = 5;
+
+	for(k=0;k<rank;k++)
+		dim[k] = extents[5-rank+k]; //the last 'rank' entries of extents[], in order
+}
+
+size_t compute_total_batch_size()
+{
+	/* Sum the uncompressed sizes of all variables in sz_varset: 4 bytes per element
+	 * for SZ_FLOAT, 8 bytes per element for every other data type. */
+	size_t totalSize = 0;
+	SZ_Variable* var;
+	//sz_varset->header itself is skipped; only the nodes after it are counted
+	for(var = sz_varset->header->next; var!=NULL; var = var->next)
+	{
+		const size_t eleNum = computeDataLength(var->r5, var->r4, var->r3, var->r2, var->r1);
+		const size_t eleSize = (var->dataType==SZ_FLOAT) ? 4 : 8;
+		totalSize += eleNum*eleSize;
+	}
+	return totalSize;
+}
+
+int isZlibFormat(unsigned char magic1, unsigned char magic2)
+{
+	//Returns 1 if (magic1, magic2) is one of the six two-byte stream headers accepted as zlib output, 0 otherwise.
+	//The BS/DC/BC tags are the annotations of the original code; the 218 entry was tagged "BC+BS" like 1 -- presumably BC+BC, TODO confirm.
+	switch(magic1)
+	{
+	//first byte tagged "DC": 5 (DC+BS), 129 (DC+DC), 222 (DC+BC)
+	case 104:
+		return (magic2==5 || magic2==129 || magic2==222);
+	//first byte tagged "BC": 1 (BC+BS), 156 (BC+DC), 218
+	case 120:
+		return (magic2==1 || magic2==156 || magic2==218);
+	default:
+		return 0;
+	}
+}
+
+void SZ_registerVar(char* varName, int dataType, void* data, 
+			int errBoundMode, double absErrBound, double relBoundRatio, double pwRelBoundRatio, 
+			size_t r5, size_t r4, size_t r3, size_t r2, size_t r1)
+{
+	//Adds the variable to the batch via SZ_batchAddVar() and logs one line describing it to the metadata file (if one is open).
+	if(sz_tsc==NULL)
+		initSZ_TSC();
+	SZ_batchAddVar(varName, dataType, data, 
+			errBoundMode, absErrBound, relBoundRatio, pwRelBoundRatio, r5, r4, r3, r2, r1);
+	//fprintf writes straight to the file, so a long varName cannot overflow a fixed buffer; the dimensions are now all separated by 'X'
+	if(sz_tsc!=NULL && sz_tsc->metadata_file!=NULL)
+		fprintf(sz_tsc->metadata_file, "%d: %s : %zuX%zuX%zuX%zuX%zu : %d : %f : %f : %f\n", sz_varset->count - 1, varName, r5, r4, r3, r2, r1, errBoundMode, absErrBound, relBoundRatio, pwRelBoundRatio);
+}
+
+int SZ_deregisterVar(char* varName)
+{
+	//thin wrapper: the status code of SZ_batchDelVar() is passed through unchanged
+	return SZ_batchDelVar(varName);
+}
+
+#ifdef HAVE_TIMECMPR
+int SZ_compress_ts(unsigned char** newByteData, size_t *outSize)
+{
+	/*
+	 * Compress the current snapshot of every registered variable (sz_varset) and pack the results
+	 * into one newly allocated buffer (*newByteData, *outSize bytes; the caller frees it):
+	 *   [current step: 4 bytes][# variables: 2 bytes], then for each variable
+	 *   [compressType: 1 byte][dataType: 1 byte][compressed size: sizeof(size_t)][compressed bytes].
+	 * One "step ..." line is appended to the metadata file and sz_tsc->currentStep is incremented.
+	 * Returns SZ_SCES on success, SZ_NSCS if memory cannot be allocated.
+	 */
+	confparams_cpr->szMode = SZ_TEMPORAL_COMPRESSION;
+	confparams_cpr->predictionMode = SZ_PREVIOUS_VALUE_ESTIMATE;
+	
+	SZ_VarSet* vset = sz_varset;
+	SZ_Variable* v = NULL;
+	FILE* metaFile = sz_tsc->metadata_file;
+	unsigned char* p = NULL;
+	int i = 0, status = SZ_NSCS;
+	size_t totalSize = 0; //size_t, like the sizes it accumulates
+	size_t *outSize_ = (size_t*)calloc(vset->count, sizeof(size_t));
+	unsigned char** compressBuffer = (unsigned char**)calloc(vset->count, sizeof(unsigned char*));//to store compressed bytes
+	if(vset->count>0 && (outSize_==NULL || compressBuffer==NULL))
+		goto cleanup;
+	
+	//the metadata line is written piece by piece: no scratch buffer and no sprintf onto its own source
+	if(metaFile!=NULL)
+		fprintf(metaFile, "step %d", sz_tsc->currentStep);
+	
+	//v has to advance through the list; re-reading header->next would compress the first variable every time
+	for(i=0, v=vset->header->next; i<vset->count; i++, v=v->next)
+	{
+		multisteps = v->multisteps; //assign the v's multisteps to the global variable 'multisteps', which will be used in the following compression.
+		if(v->dataType==SZ_FLOAT)
+			SZ_compress_args_float(&(compressBuffer[i]), (float*)v->data, v->r5, v->r4, v->r3, v->r2, v->r1, &outSize_[i], v->errBoundMode, v->absErrBound, v->relBoundRatio, v->pwRelBoundRatio);
+		else if(v->dataType==SZ_DOUBLE)
+			SZ_compress_args_double(&(compressBuffer[i]), (double*)v->data, v->r5, v->r4, v->r3, v->r2, v->r1, &outSize_[i], v->errBoundMode, v->absErrBound, v->relBoundRatio, v->pwRelBoundRatio);
+		//any other dataType leaves compressBuffer[i]==NULL and outSize_[i]==0
+
+		if(metaFile!=NULL)
+			fprintf(metaFile, ":%d,%d,%zu", i, multisteps->lastSnapshotStep, outSize_[i]);
+		totalSize += outSize_[i];
+		v->compressType = multisteps->compressionType;
+	}
+	if(metaFile!=NULL)
+		fputs("\n", metaFile);
+	
+	//sizeof(int)==current time step; sizeof(unsigned short)==# variables; 
+	//per variable: 2*sizeof(char)+sizeof(size_t)=={compressionType + datatype + compression_data_size}
+	*outSize = sizeof(int) + sizeof(unsigned short) + totalSize + vset->count*(2*sizeof(unsigned char)+sizeof(size_t));
+	*newByteData = (unsigned char*)malloc(*outSize); 
+	if(*newByteData==NULL)
+		goto cleanup;
+	p = *newByteData;
+
+	intToBytes_bigEndian(p, sz_tsc->currentStep);
+	p+=4;
+	shortToBytes(p, vset->count);
+	p+=2;
+	
+	for(i=0, v=vset->header->next; i<vset->count; i++, v=v->next)
+	{
+		*p++ = (unsigned char)v->compressType; //1 byte
+		*p++ = (unsigned char)v->dataType; //1 byte
+		sizeToBytes(p, outSize_[i]); //size_t
+		p += sizeof(size_t);
+		if(compressBuffer[i]!=NULL)
+			memcpy(p, compressBuffer[i], outSize_[i]); //outSize_[i]
+		p += outSize_[i];
+	}
+	sz_tsc->currentStep ++;	
+	status = SZ_SCES;
+
+cleanup:
+	for(i=0; compressBuffer!=NULL && i<vset->count; i++)
+		free(compressBuffer[i]); //already copied into *newByteData (or abandoned on failure)
+	free(compressBuffer);
+	free(outSize_);
+	return status;
+}
+
+void SZ_decompress_ts(unsigned char *bytes, size_t byteLength)
+{
+	//Decompress one time step (layout as written by SZ_compress_ts) into the data buffers (p->data) of the registered variables.
+	//The stream must contain as many variables as are registered in sz_varset; only SZ_FLOAT and SZ_DOUBLE are handled.
+	//NOTE(review): byteLength is not used for bounds checking, and sz_tsc/sz_varset are assumed to be initialised.
+	if(confparams_dec==NULL)
+		confparams_dec = (sz_params*)malloc(sizeof(sz_params));
+	if(exe_params==NULL)
+		exe_params = (sz_exedata*)malloc(sizeof(sz_exedata));
+	if(confparams_dec==NULL || exe_params==NULL)
+	{
+		printf("Error: failed to allocate memory in SZ_decompress_ts.\n");
+		return;
+	}
+	memset(confparams_dec, 0, sizeof(sz_params));
+	confparams_dec->szMode = SZ_TEMPORAL_COMPRESSION;
+	confparams_dec->predictionMode = SZ_PREVIOUS_VALUE_ESTIMATE;
+	memset(exe_params, 0, sizeof(sz_exedata));
+	int x = 1; //endianness probe: the first byte of 1 is 1 only on a little-endian system
+	sysEndianType = (*(char*)&x==1) ? LITTLE_ENDIAN_SYSTEM : BIG_ENDIAN_SYSTEM;
+	int i = 0;
+	unsigned char* q = bytes;
+	sz_tsc->currentStep = bytesToInt_bigEndian(q); 
+	q += 4;
+	unsigned short nbVars = (unsigned short)bytesToShort(q);
+	q += 2;
+	if(nbVars != sz_varset->count)
+	{
+		printf("Error: the number of variables in the compressed data file is inconsistent with the registered # variables.\n");
+		printf("Specifically, nbVars = %d, sz_varset->count = %d\n", nbVars, sz_varset->count);
+		return;
+	}
+	
+	SZ_Variable* p = sz_varset->header->next; // p is pointed to the first variable.
+	for(i=0;i<sz_varset->count;i++)
+	{
+		float *newFloatData = NULL;
+		double *newDoubleData = NULL;
+		void *decoded = NULL; //whichever typed array was produced for this variable
+		size_t eleSize = 0;
+		multisteps = p->multisteps;
+		size_t dataLen = computeDataLength(p->r5, p->r4, p->r3, p->r2, p->r1);		
+		multisteps->compressionType = *(q++);
+		unsigned char dataType = *(q++);
+		size_t cmpSize = bytesToSize(q);
+		q += sizeof(size_t); //q now points at the compressed bytes of this variable
+		switch(dataType)
+		{
+		case SZ_FLOAT:
+				SZ_decompress_args_float(&newFloatData, p->r5, p->r4, p->r3, p->r2, p->r1, q, cmpSize);
+				decoded = newFloatData;
+				eleSize = sizeof(float);
+				break;
+		case SZ_DOUBLE:
+				SZ_decompress_args_double(&newDoubleData, p->r5, p->r4, p->r3, p->r2, p->r1, q, cmpSize);
+				decoded = newDoubleData;
+				eleSize = sizeof(double);
+				break;
+		default:
+				printf("Error: data type cannot be the types other than SZ_FLOAT or SZ_DOUBLE\n");
+				return;	
+		}
+		if(decoded==NULL)
+		{
+			printf("Error: decompression produced no data for variable %d.\n", i);
+			return;
+		}
+		memcpy(p->data, decoded, dataLen*eleSize);
+		free(decoded);
+		q += cmpSize;
+		p = p->next;
+	}
+}
+#endif
+
+
+void SZ_Finalize()
+{
+	/*
+	 * Release the library's global state: the registered variable set (time-series builds only),
+	 * both configuration structures, the execution parameters, and the metadata file handle.
+	 * Every pointer released here is reset to NULL afterwards.
+	 */
+#ifdef HAVE_TIMECMPR		
+	if(sz_varset!=NULL)
+		SZ_freeVarSet(SZ_MAINTAIN_VAR_DATA);
+#endif
+
+	//free(NULL) is a no-op, so the pointers need no guards
+	free(confparams_dec);
+	confparams_dec = NULL;
+	free(confparams_cpr);
+	confparams_cpr = NULL;
+	free(exe_params);
+	exe_params = NULL;
+	
+#ifdef HAVE_TIMECMPR	
+	if(sz_tsc!=NULL && sz_tsc->metadata_file!=NULL)
+	{
+		fclose(sz_tsc->metadata_file);
+		sz_tsc->metadata_file = NULL; //avoids a second fclose() on the same stream if SZ_Finalize() runs again
+	}
+#endif
+}
diff --git a/thirdparty/SZ/sz/src/sz_double.c b/thirdparty/SZ/sz/src/sz_double.c
new file mode 100644
index 0000000000000000000000000000000000000000..51819bd44f6ad83485a42267d63e24a1bc600319
--- /dev/null
+++ b/thirdparty/SZ/sz/src/sz_double.c
@@ -0,0 +1,3265 @@
+/**
+ *  @file sz_double.c
+ *  @author Sheng Di and Dingwen Tao
+ *  @date Aug, 2016
+ *  @brief Compression functions for double-precision data
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <math.h>
+#include "sz.h"
+#include "CompressElement.h"
+#include "DynamicByteArray.h"
+#include "DynamicIntArray.h"
+#include "TightDataPointStorageD.h"
+#include "sz_double.h"
+#include "sz_double_pwr.h"
+#include "szd_double.h"
+#include "szd_double_pwr.h"
+#include "zlib.h"
+#include "rw.h"
+#include "sz_double_ts.h"
+
+unsigned char* SZ_skip_compress_double(double* data, size_t dataLength, size_t* outSize)
+{	/* Pass-through "compression": returns a malloc'ed byte-for-byte copy of data and reports its size in *outSize. */
+	const size_t nbBytes = dataLength*sizeof(double);
+	unsigned char* copy = (unsigned char*)malloc(nbBytes);
+	*outSize = nbBytes;
+	return (unsigned char*)memcpy(copy, data, nbBytes);
+}
+
+void computeReqLength_double(double realPrecision, short radExpo, int* reqLength, double* medianValue)
+{
+	/* Required length = 12 + (radExpo - precision exponent, i.e. the required mantissa length), kept within [12, 64]. */
+	short precisionExpo = getPrecisionReqLength_double(realPrecision);
+	int nbBits = 12+radExpo - precisionExpo;
+	*reqLength = (nbBits<12) ? 12 : nbBits;
+	if(nbBits>64)
+	{
+		*reqLength = 64;
+		*medianValue = 0; /* the median is reset whenever the cap of 64 is hit */
+	}
+}
+
+unsigned int optimize_intervals_double_1D(double *oriData, size_t dataLength, double realPrecision)
+{
+	/* Estimates the quantization interval count for 1D data: every
+	 * sampleDistance-th point is predicted from its predecessor, the error is
+	 * binned as (err/realPrecision+1)/2, and the smallest radius whose cumulative
+	 * count exceeds the predThreshold share of the samples is doubled and
+	 * rounded up to a power of two (never below 32). */
+	size_t pos, bin;
+	double predicted, absErr;
+	size_t *histogram = (size_t*)calloc(confparams_cpr->maxRangeRadius, sizeof(size_t));
+	size_t nbSamples = dataLength/confparams_cpr->sampleDistance;
+	for(pos=2;pos<dataLength;pos++)
+	{
+		if(pos%confparams_cpr->sampleDistance!=0)
+			continue; /* not a sampled position */
+		predicted = oriData[pos-1];
+		absErr = fabs(predicted - oriData[pos]);
+		bin = (unsigned long)((absErr/realPrecision+1)/2);
+		if(bin>=confparams_cpr->maxRangeRadius)
+			bin = confparams_cpr->maxRangeRadius - 1; /* outliers share the last bin */
+		histogram[bin]++;
+	}
+
+	/* walk the histogram until more than the target number of samples is covered */
+	size_t wanted = nbSamples*confparams_cpr->predThreshold;
+	size_t covered = 0;
+	for(pos=0;pos<confparams_cpr->maxRangeRadius;pos++)
+	{
+		covered += histogram[pos];
+		if(covered>wanted)
+			break;
+	}
+	if(pos>=confparams_cpr->maxRangeRadius)
+		pos = confparams_cpr->maxRangeRadius-1;
+	free(histogram);
+
+	unsigned int nbIntervals = 2*(pos+1);
+	unsigned int rounded = roundUpToPowerOf2(nbIntervals);
+	if(rounded<32)
+		rounded = 32;
+	return rounded;
+}
+
+unsigned int optimize_intervals_double_2D(double *oriData, size_t r1, size_t r2, double realPrecision)
+{
+	/* 2D counterpart of the interval estimator: interior points (row>=1, col>=1)
+	 * with (row+col) divisible by sampleDistance are predicted from their left,
+	 * upper and upper-left neighbours; the binned errors select the power-of-two
+	 * interval count (at least 32) that is returned. */
+	size_t row, col, cur, bin, radius;
+	double predicted, absErr;
+	size_t *histogram = (size_t*)calloc(confparams_cpr->maxRangeRadius, sizeof(size_t));
+	size_t nbSamples = (r1-1)*(r2-1)/confparams_cpr->sampleDistance;
+	for(row=1;row<r1;row++)
+	{
+		for(col=1;col<r2;col++)
+		{
+			if((row+col)%confparams_cpr->sampleDistance!=0)
+				continue; /* not a sampled position */
+			cur = row*r2+col;
+			predicted = oriData[cur-1] + oriData[cur-r2] - oriData[cur-r2-1];
+			absErr = fabs(predicted - oriData[cur]);
+			bin = (unsigned long)((absErr/realPrecision+1)/2);
+			if(bin>=confparams_cpr->maxRangeRadius)
+				bin = confparams_cpr->maxRangeRadius - 1; /* outliers share the last bin */
+			histogram[bin]++;
+		}
+	}
+
+	/* walk the histogram until more than the target number of samples is covered */
+	size_t wanted = nbSamples*confparams_cpr->predThreshold;
+	size_t covered = 0;
+	for(radius=0;radius<confparams_cpr->maxRangeRadius;radius++)
+	{
+		covered += histogram[radius];
+		if(covered>wanted)
+			break;
+	}
+	if(radius>=confparams_cpr->maxRangeRadius)
+		radius = confparams_cpr->maxRangeRadius-1;
+	free(histogram);
+
+	unsigned int nbIntervals = 2*(radius+1);
+	unsigned int rounded = roundUpToPowerOf2(nbIntervals);
+	if(rounded<32)
+		rounded = 32;
+	return rounded;
+}
+
+unsigned int optimize_intervals_double_3D(double *oriData, size_t r1, size_t r2, size_t r3, double realPrecision)
+{
+	/* 3D counterpart of the interval estimator: interior points whose index sum
+	 * (layer+row+col) is divisible by sampleDistance are predicted from the
+	 * seven neighbours one step back along each dimension (strides 1, r3 and
+	 * r2*r3); the binned errors select the power-of-two interval count
+	 * (at least 32) that is returned. */
+	size_t layer, row, col, cur, bin, radius;
+	size_t planeSize=r2*r3;
+	double predicted, absErr;
+	size_t *histogram = (size_t*)calloc(confparams_cpr->maxRangeRadius, sizeof(size_t));
+	size_t nbSamples = (r1-1)*(r2-1)*(r3-1)/confparams_cpr->sampleDistance;
+
+	for(layer=1;layer<r1;layer++)
+	{
+		for(row=1;row<r2;row++)
+		{
+			for(col=1;col<r3;col++)
+			{
+				if((layer+row+col)%confparams_cpr->sampleDistance!=0)
+					continue; /* not a sampled position */
+				cur = layer*planeSize+row*r3+col;
+				predicted = oriData[cur-1] + oriData[cur-r3] + oriData[cur-planeSize]
+				- oriData[cur-1-planeSize] - oriData[cur-r3-1] - oriData[cur-r3-planeSize] + oriData[cur-r3-planeSize-1];
+				absErr = fabs(predicted - oriData[cur]);
+				bin = (absErr/realPrecision+1)/2;
+				if(bin>=confparams_cpr->maxRangeRadius)
+					bin = confparams_cpr->maxRangeRadius - 1; /* outliers share the last bin */
+				histogram[bin]++;
+			}
+		}
+	}
+
+	/* walk the histogram until more than the target number of samples is covered */
+	size_t wanted = nbSamples*confparams_cpr->predThreshold;
+	size_t covered = 0;
+	for(radius=0;radius<confparams_cpr->maxRangeRadius;radius++)
+	{
+		covered += histogram[radius];
+		if(covered>wanted)
+			break;
+	}
+	if(radius>=confparams_cpr->maxRangeRadius)
+		radius = confparams_cpr->maxRangeRadius-1;
+	free(histogram);
+
+	unsigned int nbIntervals = 2*(radius+1);
+	unsigned int rounded = roundUpToPowerOf2(nbIntervals);
+	if(rounded<32)
+		rounded = 32;
+	return rounded;
+}
+
+unsigned int optimize_intervals_double_4D(double *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision)
+{
+	/* Estimates the quantization interval count for r1 x r2 x r3 x r4 data (r4 fastest).
+	 * Sampled interior points are predicted from the seven neighbours one step back
+	 * along the three fastest dimensions (strides 1, r4 and r34); the binned errors
+	 * select the power-of-two interval count (at least 32) that is returned.
+	 * Fix: the second neighbour used stride r3 instead of r4, i.e. the wrong element. */
+	size_t i,j,k,l, index;
+	size_t radiusIndex;
+	size_t r234=r2*r3*r4;
+	size_t r34=r3*r4;
+	double pred_value = 0, pred_err;
+	size_t *intervals = (size_t*)calloc(confparams_cpr->maxRangeRadius, sizeof(size_t));
+	size_t totalSampleSize = (r1-1)*(r2-1)*(r3-1)*(r4-1)/confparams_cpr->sampleDistance;
+	for(i=1;i<r1;i++)
+	{
+		for(j=1;j<r2;j++)
+		{
+			for(k=1;k<r3;k++)
+			{
+				for (l=1;l<r4;l++)
+				{
+					if((i+j+k+l)%confparams_cpr->sampleDistance!=0)
+						continue; /* not a sampled position */
+					index = i*r234+j*r34+k*r4+l;
+					pred_value = oriData[index-1] + oriData[index-r4] + oriData[index-r34]
+							- oriData[index-1-r34] - oriData[index-r4-1] - oriData[index-r4-r34] + oriData[index-r4-r34-1];
+					pred_err = fabs(pred_value - oriData[index]);
+					radiusIndex = (unsigned long)((pred_err/realPrecision+1)/2);
+					if(radiusIndex>=confparams_cpr->maxRangeRadius)
+						radiusIndex = confparams_cpr->maxRangeRadius - 1; /* outliers share the last bin */
+					intervals[radiusIndex]++;
+				}
+			}
+		}
+	}
+	//smallest radius whose cumulative count exceeds the predThreshold share of the samples
+	size_t targetCount = totalSampleSize*confparams_cpr->predThreshold;
+	size_t sum = 0;
+	for(i=0;i<confparams_cpr->maxRangeRadius;i++)
+	{
+		sum += intervals[i];
+		if(sum>targetCount)
+			break;
+	}
+	if(i>=confparams_cpr->maxRangeRadius)
+		i = confparams_cpr->maxRangeRadius-1;
+	unsigned int accIntervals = 2*(i+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+	if(powerOf2<32)
+		powerOf2 = 32;
+	free(intervals);
+	return powerOf2;
+}
+
+TightDataPointStorageD* SZ_compress_double_1D_MDQ(double *oriData, 
+size_t dataLength, double realPrecision, double valueRangeSize, double medianValue_d)
+{ /* Encodes a 1D series of doubles as per-value quantization codes plus exactly stored outliers; returns a newly built TightDataPointStorageD. */
+#ifdef HAVE_TIMECMPR
+	double* decData = NULL;	
+	if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+		decData = (double*)(multisteps->hist_data); /* temporal mode: the reconstructed values are also written here */
+#endif	
+	/* Interval count: estimated from the data when optQuantMode==1, otherwise the current capacity. */
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+		quantization_intervals = optimize_intervals_double_1D_opt(oriData, dataLength, realPrecision);
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	updateQuantizationInfo(quantization_intervals);	
+
+	size_t i;
+	int reqLength;
+	double medianValue = medianValue_d;
+	short radExpo = getExponent_double(valueRangeSize/2);
+
+	computeReqLength_double(realPrecision, radExpo, &reqLength, &medianValue);	
+
+	int* type = (int*) malloc(dataLength*sizeof(int)); /* per-value code: 0 = stored exactly, else intvRadius +/- interval offset */
+		
+	double* spaceFillingValue = oriData; //
+	
+	DynamicIntArray *exactLeadNumArray;
+	new_DIA(&exactLeadNumArray, DynArrayInitLen);
+	
+	DynamicByteArray *exactMidByteArray;
+	new_DBA(&exactMidByteArray, DynArrayInitLen);
+	
+	DynamicIntArray *resiBitArray;
+	new_DIA(&resiBitArray, DynArrayInitLen);
+
+	unsigned char preDataBytes[8];
+	longToBytes_bigEndian(preDataBytes, 0);
+	
+	int reqBytesLength = reqLength/8; /* reqLength split into whole bytes ... */
+	int resiBitsLength = reqLength%8; /* ... and the remaining bits */
+	double last3CmprsData[3] = {0};
+
+	DoubleValueCompressElement *vce = (DoubleValueCompressElement*)malloc(sizeof(DoubleValueCompressElement));
+	LossyCompressionElement *lce = (LossyCompressionElement*)malloc(sizeof(LossyCompressionElement));			
+				
+	//add the first data (always stored exactly, type 0)
+	type[0] = 0;
+	compressSingleDoubleValue(vce, spaceFillingValue[0], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+	updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+	memcpy(preDataBytes,vce->curBytes,8);
+	addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+	listAdd_double(last3CmprsData, vce->data);
+#ifdef HAVE_TIMECMPR	
+	if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+		decData[0] = vce->data;
+#endif		
+	/* NOTE(review): index 1 is read and written unconditionally below - this assumes dataLength >= 2; confirm callers guarantee it. */
+	//add the second data (also always stored exactly)
+	type[1] = 0;
+	compressSingleDoubleValue(vce, spaceFillingValue[1], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+	updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+	memcpy(preDataBytes,vce->curBytes,8);
+	addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+	listAdd_double(last3CmprsData, vce->data);
+#ifdef HAVE_TIMECMPR	
+	if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+		decData[1] = vce->data;
+#endif
+	int state;
+	double checkRadius;
+	double curData;
+	double pred;
+	double predAbsErr;
+	checkRadius = (exe_params->intvCapacity-1)*realPrecision; /* largest prediction error that is still quantized */
+	double interval = 2*realPrecision; /* width of one quantization interval */
+
+	for(i=2;i<dataLength;i++)
+	{				
+		//printf("%.30G\n",last3CmprsData[0]);
+		curData = spaceFillingValue[i];
+		//pred = 2*last3CmprsData[0] - last3CmprsData[1];
+		pred = last3CmprsData[0]; /* presumably the most recent reconstructed value (listAdd_double is defined elsewhere) - TODO confirm */
+		predAbsErr = fabs(curData - pred);	
+		if(predAbsErr<=checkRadius) /* predictable: store the interval offset instead of the value */
+		{
+			state = (predAbsErr/realPrecision+1)/2;
+			if(curData>=pred)
+			{
+				type[i] = exe_params->intvRadius+state;
+				pred = pred + state*interval;
+			}
+			else //curData<pred
+			{
+				type[i] = exe_params->intvRadius-state;
+				pred = pred - state*interval;
+			}
+			listAdd_double(last3CmprsData, pred); /* pred now holds the reconstructed value */
+#ifdef HAVE_TIMECMPR					
+			if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+				decData[i] = pred;			
+#endif	
+			continue;
+		}
+		
+		//unpredictable data processing
+		type[i] = 0;		
+		compressSingleDoubleValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+		updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+		memcpy(preDataBytes,vce->curBytes,8);
+		addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+							
+		listAdd_double(last3CmprsData, vce->data);
+#ifdef HAVE_TIMECMPR
+		if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+			decData[i] = vce->data;
+#endif	
+		
+	}//end of for
+		
+	int exactDataNum = exactLeadNumArray->size; /* number of values that were stored exactly */
+	
+	TightDataPointStorageD* tdps;
+			
+	new_TightDataPointStorageD(&tdps, dataLength, exactDataNum, 
+			type, exactMidByteArray->array, exactMidByteArray->size,  
+			exactLeadNumArray->array,  
+			resiBitArray->array, resiBitArray->size, 
+			resiBitsLength, 
+			realPrecision, medianValue, (char)reqLength, quantization_intervals, NULL, 0, 0);
+	
+//	printf("exactDataNum=%d, expSegmentsInBytes_size=%d, exactMidByteArray->size=%d\n", 
+//			exactDataNum, expSegmentsInBytes_size, exactMidByteArray->size);
+	
+	//free memory
+	free_DIA(exactLeadNumArray);
+	free_DIA(resiBitArray);
+	free(type);
+	free(vce);
+	free(lce);	
+	free(exactMidByteArray); //only the wrapper is freed here; per the original note, exactMidByteArray->array is released later by free_TightDataPointStorageD(tdps)
+	
+	return tdps;	
+}
+
+void SZ_compress_args_double_StoreOriData(double* oriData, size_t dataLength, TightDataPointStorageD* tdps, 
+unsigned char** newByteData, size_t *outSize)
+{
+	/* Lossless fallback: writes a newly malloc'ed stream laid out as
+	 * [3 version bytes][1 flag byte][SZ parameters][element count][doubles]
+	 * into *newByteData, its size into *outSize, and flags tdps as lossless. */
+	const int elemSize = sizeof(double);
+	const size_t streamSize = 3 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + 1 + elemSize*dataLength;
+	unsigned char lengthBytes[8];
+	unsigned char* cursor;
+	size_t n;
+	tdps->isLossless = 1;
+	*newByteData = (unsigned char*)malloc(streamSize);
+	cursor = *newByteData;
+
+	for (n = 0; n < 3; n++)
+		*cursor++ = versionNumber[n];
+
+	/* flag byte: 00010000, with bit 01000000 added when SZ_SIZE_TYPE is 8 */
+	*cursor++ = (exe_params->SZ_SIZE_TYPE==4) ? 16 : 80;
+
+	convertSZParamsToBytes(confparams_cpr, cursor);
+	cursor += MetaDataByteLength;
+
+	sizeToBytes(lengthBytes,dataLength);
+	memcpy(cursor, lengthBytes, exe_params->SZ_SIZE_TYPE); /* 4 or 8 bytes */
+	cursor += exe_params->SZ_SIZE_TYPE;
+
+	if(sysEndianType==BIG_ENDIAN_SYSTEM)
+		memcpy(cursor, oriData, dataLength*elemSize);
+	else
+		for(n=0;n<dataLength;n++,cursor+=elemSize)
+			doubleToBytes(cursor, oriData[n]); /* per-value conversion on non-big-endian hosts */
+	*outSize = streamSize;
+}
+
+
+char SZ_compress_args_double_NoCkRngeNoGzip_1D(unsigned char** newByteData, double *oriData, 
+size_t dataLength, double realPrecision, size_t *outSize, double valueRangeSize, double medianValue_d)
+{	/* Compresses a 1D double array into *newByteData / *outSize; returns 1 for time-series based compression, 0 for snapshot-based. */
+	char compressionType = 0;
+	TightDataPointStorageD* tdps = NULL;
+#ifdef HAVE_TIMECMPR
+	if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+	{
+		int timestep = sz_tsc->currentStep;
+		if(timestep % confparams_cpr->snapshotCmprStep != 0)
+		{
+			tdps = SZ_compress_double_1D_MDQ_ts(oriData, dataLength, multisteps, realPrecision, valueRangeSize, medianValue_d);
+			compressionType = 1; //time-series based compression 
+		}
+		else
+		{	
+			tdps = SZ_compress_double_1D_MDQ(oriData, dataLength, realPrecision, valueRangeSize, medianValue_d);
+			compressionType = 0; //snapshot-based compression
+			multisteps->lastSnapshotStep = timestep; /* every snapshotCmprStep-th step restarts from a full snapshot */
+		}		
+	}
+	else
+#endif
+		tdps = SZ_compress_double_1D_MDQ(oriData, dataLength, realPrecision, valueRangeSize, medianValue_d);			
+	convertTDPStoFlatBytes_double(tdps, newByteData, outSize);
+	if(*outSize>dataLength*sizeof(double)) /* compression did not pay off: store the original values instead */
+	{
+		free(*newByteData); /* StoreOriData overwrites *newByteData with a fresh malloc; release the stream built above so it does not leak */
+		SZ_compress_args_double_StoreOriData(oriData, dataLength, tdps, newByteData, outSize);
+	}
+	free_TightDataPointStorageD(tdps);
+	return compressionType;
+}
+
+TightDataPointStorageD* SZ_compress_double_2D_MDQ(double *oriData, size_t r1, size_t r2, double realPrecision, double valueRangeSize, double medianValue_d)
+{ /* Encodes an r1 x r2 array of doubles (element (i,j) at i*r2+j) as quantization codes plus exactly stored outliers; returns a newly built TightDataPointStorageD. */
+#ifdef HAVE_TIMECMPR	
+	double* decData = NULL;
+	if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+		decData = (double*)(multisteps->hist_data); /* temporal mode: the reconstructed values are also written here */
+#endif	
+	/* Interval count: estimated from the data when optQuantMode==1, otherwise the current capacity. */
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_double_2D_opt(oriData, r1, r2, realPrecision);
+		updateQuantizationInfo(quantization_intervals);
+	}
+	else
+		quantization_intervals = exe_params->intvCapacity;	
+	size_t i,j; 
+	int reqLength;
+	double pred1D, pred2D;
+	double diff = 0.0;
+	double itvNum = 0;
+	double *P0, *P1;
+		
+	size_t dataLength = r1*r2;	
+	/* P1 holds the reconstructed previous row and P0 the row being built; they are swapped after every row. */
+	P0 = (double*)malloc(r2*sizeof(double));
+	memset(P0, 0, r2*sizeof(double));
+	P1 = (double*)malloc(r2*sizeof(double));
+	memset(P1, 0, r2*sizeof(double));
+		
+	double medianValue = medianValue_d;
+	short radExpo = getExponent_double(valueRangeSize/2);
+	computeReqLength_double(realPrecision, radExpo, &reqLength, &medianValue);	
+
+	int* type = (int*) malloc(dataLength*sizeof(int)); /* per-value code: 0 = stored exactly, else interval offset + intvRadius */
+	//type[dataLength]=0;
+		
+	double* spaceFillingValue = oriData; //
+	
+	DynamicIntArray *exactLeadNumArray;
+	new_DIA(&exactLeadNumArray, DynArrayInitLen);
+	
+	DynamicByteArray *exactMidByteArray;
+	new_DBA(&exactMidByteArray, DynArrayInitLen);
+	
+	DynamicIntArray *resiBitArray;
+	new_DIA(&resiBitArray, DynArrayInitLen);
+	
+	type[0] = 0;
+	
+	unsigned char preDataBytes[8];
+	longToBytes_bigEndian(preDataBytes, 0);
+	
+	int reqBytesLength = reqLength/8; /* reqLength split into whole bytes ... */
+	int resiBitsLength = reqLength%8; /* ... and the remaining bits */
+
+	DoubleValueCompressElement *vce = (DoubleValueCompressElement*)malloc(sizeof(DoubleValueCompressElement));
+	LossyCompressionElement *lce = (LossyCompressionElement*)malloc(sizeof(LossyCompressionElement));
+			
+	/* Process Row-0 data 0: always stored exactly */
+	type[0] = 0;
+	compressSingleDoubleValue(vce, spaceFillingValue[0], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+	updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+	memcpy(preDataBytes,vce->curBytes,8);
+	addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+	P1[0] = vce->data;
+#ifdef HAVE_TIMECMPR	
+	if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+		decData[0] = vce->data;
+#endif	
+	/* NOTE(review): element 1 of the data, type and P1 is accessed unconditionally below - this assumes r2 >= 2; confirm callers guarantee it. */
+	/* Process Row-0 data 1: predicted from data 0 */
+	pred1D = P1[0];
+	diff = spaceFillingValue[1] - pred1D;
+
+	itvNum =  fabs(diff)/realPrecision + 1;
+
+	if (itvNum < exe_params->intvCapacity) /* predictable: store the signed interval offset */
+	{
+		if (diff < 0) itvNum = -itvNum;
+		type[1] = (int) (itvNum/2) + exe_params->intvRadius;
+		P1[1] = pred1D + 2 * (type[1] - exe_params->intvRadius) * realPrecision;
+	}
+	else /* unpredictable: store the value itself */
+	{
+		type[1] = 0;
+		compressSingleDoubleValue(vce, spaceFillingValue[1], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+		updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+		memcpy(preDataBytes,vce->curBytes,8);
+		addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+		P1[1] = vce->data;
+	}
+#ifdef HAVE_TIMECMPR	
+	if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+		decData[1] = P1[1];
+#endif
+
+    /* Process Row-0 data 2 --> data r2-1: linear extrapolation from the two previous reconstructed values */
+	for (j = 2; j < r2; j++)
+	{
+		pred1D = 2*P1[j-1] - P1[j-2];
+		diff = spaceFillingValue[j] - pred1D;
+
+		itvNum = fabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[j] = (int) (itvNum/2) + exe_params->intvRadius;
+			P1[j] = pred1D + 2 * (type[j] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[j] = 0;
+			compressSingleDoubleValue(vce, spaceFillingValue[j], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,8);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P1[j] = vce->data;
+		}
+#ifdef HAVE_TIMECMPR	
+		if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+			decData[j] = P1[j];
+#endif		
+	}
+
+	/* Process Row-1 --> Row-r1-1 */
+	size_t index;
+	for (i = 1; i < r1; i++)
+	{	
+		/* Process row-i data 0: predicted from the element directly above */
+		index = i*r2;
+		pred1D = P1[0];
+		diff = spaceFillingValue[index] - pred1D;
+
+		itvNum = fabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+			P0[0] = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[index] = 0;
+			compressSingleDoubleValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,8);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P0[0] = vce->data;
+		}
+#ifdef HAVE_TIMECMPR	
+		if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+			decData[index] = P0[0];
+#endif
+									
+		/* Process row-i data 1 --> r2-1: predicted as left + above - above-left */
+		for (j = 1; j < r2; j++)
+		{
+			index = i*r2+j;
+			pred2D = P0[j-1] + P1[j] - P1[j-1];
+
+			diff = spaceFillingValue[index] - pred2D;
+
+			itvNum = fabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P0[j] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[index] = 0;
+				compressSingleDoubleValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,8);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P0[j] = vce->data;
+			}
+#ifdef HAVE_TIMECMPR	
+			if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+				decData[index] = P0[j];
+#endif			
+		}
+
+		double *Pt; /* swap the row buffers: the row just built becomes the previous row */
+		Pt = P1;
+		P1 = P0;
+		P0 = Pt;
+	}
+		
+	if(r2!=1)	/* NOTE(review): P0 is allocated for every r2 but not freed when r2==1 - looks like a leak; confirm the intent of this guard */
+		free(P0);
+	free(P1);
+	size_t exactDataNum = exactLeadNumArray->size; /* number of values that were stored exactly */
+	
+	TightDataPointStorageD* tdps;
+			
+	new_TightDataPointStorageD(&tdps, dataLength, exactDataNum, 
+			type, exactMidByteArray->array, exactMidByteArray->size,  
+			exactLeadNumArray->array,  
+			resiBitArray->array, resiBitArray->size, 
+			resiBitsLength, 
+			realPrecision, medianValue, (char)reqLength, quantization_intervals, NULL, 0, 0);
+
+/*	int sum =0;
+	for(i=0;i<dataLength;i++)
+		if(type[i]==0) sum++;
+	printf("opt_quantizations=%d, exactDataNum=%d, sum=%d\n",quantization_intervals, exactDataNum, sum);
+
+	for(i=0;i<dataLength;i++)
+		printf("%d ", type[i]);
+	printf("\n");*/
+
+//	printf("exactDataNum=%d, expSegmentsInBytes_size=%d, exactMidByteArray->size=%d\n", 
+//			exactDataNum, expSegmentsInBytes_size, exactMidByteArray->size);
+	
+//	for(i = 3800;i<3844;i++)
+//		printf("exactLeadNumArray->array[%d]=%d\n",i,exactLeadNumArray->array[i]);
+	
+	//free memory
+	free_DIA(exactLeadNumArray);
+	free_DIA(resiBitArray);
+	free(type);	
+	free(vce);
+	free(lce);	
+	free(exactMidByteArray); //only the wrapper is freed here; per the original note, exactMidByteArray->array is released later by free_TightDataPointStorageD(tdps)
+	
+	return tdps;
+}
+
+/**
+ * Compresses an r1 x r2 array of doubles (@r1 is the high dimension, @r2 the low
+ * one) into a flat byte stream returned through *newByteData / *outSize; falls back
+ * to storing the original values when that stream exceeds the raw data size.
+ * Returns 1 for time-series based compression, 0 for snapshot-based compression. */
+char SZ_compress_args_double_NoCkRngeNoGzip_2D(unsigned char** newByteData, double *oriData, size_t r1, size_t r2, double realPrecision, size_t *outSize, double valueRangeSize, double medianValue_d)
+{
+	size_t dataLength = r1*r2;
+	char compressionType = 0;
+	TightDataPointStorageD* tdps = NULL;
+#ifdef HAVE_TIMECMPR
+	if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+	{
+		int timestep = sz_tsc->currentStep;
+		if(timestep % confparams_cpr->snapshotCmprStep != 0)
+		{
+			tdps = SZ_compress_double_1D_MDQ_ts(oriData, dataLength, multisteps, realPrecision, valueRangeSize, medianValue_d);
+			compressionType = 1; //time-series based compression 
+		}
+		else
+		{	
+			tdps = SZ_compress_double_2D_MDQ(oriData, r1, r2, realPrecision, valueRangeSize, medianValue_d);
+			compressionType = 0; //snapshot-based compression
+			multisteps->lastSnapshotStep = timestep; /* every snapshotCmprStep-th step restarts from a full snapshot */
+		}		
+	}
+	else
+#endif
+		tdps = SZ_compress_double_2D_MDQ(oriData, r1, r2, realPrecision, valueRangeSize, medianValue_d);	
+	convertTDPStoFlatBytes_double(tdps, newByteData, outSize);
+	if(*outSize>dataLength*sizeof(double)) /* compression did not pay off: store the original values instead */
+	{
+		free(*newByteData); /* StoreOriData overwrites *newByteData with a fresh malloc; release the stream built above so it does not leak */
+		SZ_compress_args_double_StoreOriData(oriData, dataLength, tdps, newByteData, outSize);
+	}
+	free_TightDataPointStorageD(tdps);
+	return compressionType;
+}
+
+TightDataPointStorageD* SZ_compress_double_3D_MDQ(double *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, double valueRangeSize, double medianValue_d)
+{
+#ifdef HAVE_TIMECMPR
+	double* decData = NULL;
+	if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+		decData = (double*)(multisteps->hist_data);
+#endif		
+
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_double_3D_opt(oriData, r1, r2, r3, realPrecision);
+		updateQuantizationInfo(quantization_intervals);
+	}	
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	size_t i,j,k; 
+	int reqLength;
+	double pred1D, pred2D, pred3D;
+	double diff = 0.0;
+	double itvNum = 0;
+	double *P0, *P1;
+
+	size_t dataLength = r1*r2*r3;
+
+	size_t r23 = r2*r3;
+
+	P0 = (double*)malloc(r23*sizeof(double));
+	P1 = (double*)malloc(r23*sizeof(double));
+
+	double medianValue = medianValue_d;
+	short radExpo = getExponent_double(valueRangeSize/2);
+	computeReqLength_double(realPrecision, radExpo, &reqLength, &medianValue);	
+
+	int* type = (int*) malloc(dataLength*sizeof(int));
+	//type[dataLength]=0;
+
+	double* spaceFillingValue = oriData; //
+
+	DynamicIntArray *exactLeadNumArray;
+	new_DIA(&exactLeadNumArray, DynArrayInitLen);
+
+	DynamicByteArray *exactMidByteArray;
+	new_DBA(&exactMidByteArray, DynArrayInitLen);
+
+	DynamicIntArray *resiBitArray;
+	new_DIA(&resiBitArray, DynArrayInitLen);
+
+	type[0] = 0;
+
+	unsigned char preDataBytes[8];
+	longToBytes_bigEndian(preDataBytes, 0);
+
+	int reqBytesLength = reqLength/8;
+	int resiBitsLength = reqLength%8;
+
+	DoubleValueCompressElement *vce = (DoubleValueCompressElement*)malloc(sizeof(DoubleValueCompressElement));
+	LossyCompressionElement *lce = (LossyCompressionElement*)malloc(sizeof(LossyCompressionElement));
+
+
+	///////////////////////////	Process layer-0 ///////////////////////////
+	/* Process Row-0 data 0*/
+	type[0] = 0;
+	compressSingleDoubleValue(vce, spaceFillingValue[0], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+	updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+	memcpy(preDataBytes,vce->curBytes,8);
+	addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+	P1[0] = vce->data;
+#ifdef HAVE_TIMECMPR	
+		if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+			decData[0] = P1[0];
+#endif
+
+	/* Process Row-0 data 1*/
+	pred1D = P1[0];
+	diff = spaceFillingValue[1] - pred1D;
+
+	itvNum = fabs(diff)/realPrecision + 1;
+
+	if (itvNum < exe_params->intvCapacity)
+	{
+		if (diff < 0) itvNum = -itvNum;
+		type[1] = (int) (itvNum/2) + exe_params->intvRadius;
+		P1[1] = pred1D + 2 * (type[1] - exe_params->intvRadius) * realPrecision;
+	}
+	else
+	{
+		type[1] = 0;
+		compressSingleDoubleValue(vce, spaceFillingValue[1], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+		updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+		memcpy(preDataBytes,vce->curBytes,8);
+		addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+		P1[1] = vce->data;
+	}
+#ifdef HAVE_TIMECMPR	
+	if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+		decData[1] = P1[1];
+#endif
+
+    /* Process Row-0 data 2 --> data r3-1 */
+	for (j = 2; j < r3; j++)
+	{
+		pred1D = 2*P1[j-1] - P1[j-2];
+		diff = spaceFillingValue[j] - pred1D;
+
+		itvNum = fabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[j] = (int) (itvNum/2) + exe_params->intvRadius;
+			P1[j] = pred1D + 2 * (type[j] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[j] = 0;
+			compressSingleDoubleValue(vce, spaceFillingValue[j], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,8);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P1[j] = vce->data;
+		}
+#ifdef HAVE_TIMECMPR	
+		if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+			decData[j] = P1[j];
+#endif		
+	}
+
+	/* Process Row-1 --> Row-r2-1 */
+	size_t index;
+	for (i = 1; i < r2; i++)
+	{
+		/* Process row-i data 0 */
+		index = i*r3;
+		pred1D = P1[index-r3];
+		diff = spaceFillingValue[index] - pred1D;
+
+		itvNum = fabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+			P1[index] = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[index] = 0;
+			compressSingleDoubleValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,8);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P1[index] = vce->data;
+		}
+#ifdef HAVE_TIMECMPR	
+		if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+			decData[index] = P1[index];
+#endif		
+
+		/* Process row-i data 1 --> data r3-1*/
+		for (j = 1; j < r3; j++)
+		{
+			index = i*r3+j;
+			pred2D = P1[index-1] + P1[index-r3] - P1[index-r3-1];
+
+			diff = spaceFillingValue[index] - pred2D;
+
+			itvNum = fabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P1[index] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[index] = 0;
+				compressSingleDoubleValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,8);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P1[index] = vce->data;
+			}
+#ifdef HAVE_TIMECMPR	
+			if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+				decData[index] = P1[index];
+#endif			
+		}
+	}
+
+
+	///////////////////////////	Process layer-1 --> layer-r1-1 ///////////////////////////
+
+	for (k = 1; k < r1; k++)
+	{
+		/* Process Row-0 data 0*/
+		index = k*r23;
+		pred1D = P1[0];
+		diff = spaceFillingValue[index] - pred1D;
+
+		itvNum = fabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+			P0[0] = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[index] = 0;
+			compressSingleDoubleValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,8);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P0[0] = vce->data;
+		}
+#ifdef HAVE_TIMECMPR	
+		if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+			decData[index] = P0[0];
+#endif
+
+	    /* Process Row-0 data 1 --> data r3-1 */
+		for (j = 1; j < r3; j++)
+		{
+			//index = k*r2*r3+j;
+			index ++;
+			pred2D = P0[j-1] + P1[j] - P1[j-1];
+			diff = spaceFillingValue[index] - pred2D;
+
+			itvNum = fabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P0[j] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[index] = 0;
+				compressSingleDoubleValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,8);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P0[j] = vce->data;
+			}
+#ifdef HAVE_TIMECMPR	
+			if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+				decData[index] = P0[j];
+#endif			
+		}
+
+	    /* Process Row-1 --> Row-r2-1 */
+		size_t index2D;
+		for (i = 1; i < r2; i++)
+		{
+			/* Process Row-i data 0 */
+			index = k*r23 + i*r3;
+			index2D = i*r3;
+			pred2D = P0[index2D-r3] + P1[index2D] - P1[index2D-r3];
+			diff = spaceFillingValue[index] - pred2D;
+
+			itvNum = fabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P0[index2D] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[index] = 0;
+				compressSingleDoubleValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,8);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P0[index2D] = vce->data;
+			}
+#ifdef HAVE_TIMECMPR	
+			if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+				decData[index] = P0[index2D];
+#endif			
+
+			/* Process Row-i data 1 --> data r3-1 */
+			for (j = 1; j < r3; j++)
+			{
+				//index = k*r2*r3 + i*r3 + j;
+				index ++;
+				index2D = i*r3 + j;
+				pred3D = P0[index2D-1] + P0[index2D-r3]+ P1[index2D] - P0[index2D-r3-1] - P1[index2D-r3] - P1[index2D-1] + P1[index2D-r3-1];
+				diff = spaceFillingValue[index] - pred3D;
+
+				itvNum = fabs(diff)/realPrecision + 1;
+
+				if (itvNum < exe_params->intvCapacity)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+					P0[index2D] = pred3D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					type[index] = 0;
+					compressSingleDoubleValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+					updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+					memcpy(preDataBytes,vce->curBytes,8);
+					addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+					P0[index2D] = vce->data;
+				}
+#ifdef HAVE_TIMECMPR	
+				if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+					decData[index] = P0[index2D];
+#endif				
+			}
+		}
+
+		double *Pt;
+		Pt = P1;
+		P1 = P0;
+		P0 = Pt;
+	}
+	if(r23!=1)
+		free(P0);
+	free(P1);
+	size_t exactDataNum = exactLeadNumArray->size;
+
+	TightDataPointStorageD* tdps;
+
+	new_TightDataPointStorageD(&tdps, dataLength, exactDataNum,
+			type, exactMidByteArray->array, exactMidByteArray->size,
+			exactLeadNumArray->array,
+			resiBitArray->array, resiBitArray->size,
+			resiBitsLength, 
+			realPrecision, medianValue, (char)reqLength, quantization_intervals, NULL, 0, 0);
+
+//	printf("exactDataNum=%d, expSegmentsInBytes_size=%d, exactMidByteArray->size=%d\n",
+//			exactDataNum, expSegmentsInBytes_size, exactMidByteArray->size);
+
+//	for(i = 3800;i<3844;i++)
+//		printf("exactLeadNumArray->array[%d]=%d\n",i,exactLeadNumArray->array[i]);
+
+	//free memory
+	free_DIA(exactLeadNumArray);
+	free_DIA(resiBitArray);
+	free(type);
+	free(vce);
+	free(lce);	
+	free(exactMidByteArray); //exactMidByteArray->array has been released in free_TightDataPointStorageF(tdps);	
+	
+	return tdps;	
+}
+
+
+/*
+ * Compresses a 3D double array and flattens the result into *newByteData / *outSize.
+ * Returns 1 when the time-series routine was used (HAVE_TIMECMPR builds only), 0 otherwise.
+ */
+char SZ_compress_args_double_NoCkRngeNoGzip_3D(unsigned char** newByteData, double *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t *outSize, double valueRangeSize, double medianValue_d)
+{
+	const size_t nbEle = r1*r2*r3;
+	char cmprType = 0; /* 0: snapshot-based, 1: time-series based */
+	TightDataPointStorageD* storage = NULL;
+#ifdef HAVE_TIMECMPR
+	const int temporalMode = (confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION);
+	const int curStep = temporalMode ? sz_tsc->currentStep : 0;
+	if(temporalMode && curStep % confparams_cpr->snapshotCmprStep != 0)
+	{
+		storage = SZ_compress_double_1D_MDQ_ts(oriData, nbEle, multisteps, realPrecision, valueRangeSize, medianValue_d);
+		cmprType = 1;
+	}
+	else
+	{
+		storage = SZ_compress_double_3D_MDQ(oriData, r1, r2, r3, realPrecision, valueRangeSize, medianValue_d);
+		if(temporalMode)
+			multisteps->lastSnapshotStep = curStep; /* step of the latest snapshot-based compression */
+	}
+#else
+	storage = SZ_compress_double_3D_MDQ(oriData, r1, r2, r3, realPrecision, valueRangeSize, medianValue_d);
+#endif
+
+	convertTDPStoFlatBytes_double(storage, newByteData, outSize);
+	if(*outSize > nbEle*sizeof(double)) /* flattened stream is larger than the raw input */
+		SZ_compress_args_double_StoreOriData(oriData, nbEle, storage, newByteData, outSize);
+	free_TightDataPointStorageD(storage);
+	return cmprType;
+}
+/* Quantizes an r1 x r2 x r3 x r4 double array (r4 varies fastest); each of the r1 slabs is predicted on its own as an r2 x r3 x r4 volume. */
+TightDataPointStorageD* SZ_compress_double_4D_MDQ(double *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, double valueRangeSize, double medianValue_d)
+{
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_double_4D(oriData, r1, r2, r3, r4, realPrecision);
+		updateQuantizationInfo(quantization_intervals);
+	}
+	else
+		quantization_intervals = exe_params->intvCapacity;
+
+	size_t i,j,k; 
+	int reqLength;
+	double pred1D, pred2D, pred3D;
+	double diff = 0.0;
+	double itvNum = 0;
+	double *P0, *P1;
+
+	size_t dataLength = r1*r2*r3*r4;
+
+	size_t r234 = r2*r3*r4;
+	size_t r34 = r3*r4;
+	/* P0/P1: two r3 x r4 buffers of reconstructed values. Layer 0 is written into P1; for k >= 1 the new layer goes to P0 while P1 holds the previous one, then they are swapped. */
+	P0 = (double*)malloc(r34*sizeof(double));
+	P1 = (double*)malloc(r34*sizeof(double));
+
+	double medianValue = medianValue_d;
+	short radExpo = getExponent_double(valueRangeSize/2);
+	computeReqLength_double(realPrecision, radExpo, &reqLength, &medianValue);
+	/* type[idx]: 0 marks a value stored exactly; otherwise the quantization code (signed bin index + intvRadius) */
+	int* type = (int*) malloc(dataLength*sizeof(int));
+
+	double* spaceFillingValue = oriData; //
+
+	DynamicIntArray *exactLeadNumArray;
+	new_DIA(&exactLeadNumArray, DynArrayInitLen);
+
+	DynamicByteArray *exactMidByteArray;
+	new_DBA(&exactMidByteArray, DynArrayInitLen);
+
+	DynamicIntArray *resiBitArray;
+	new_DIA(&resiBitArray, DynArrayInitLen);
+	/* bytes of the most recent exactly-stored value; updateLossyCompElement_Double receives them alongside the new bytes */
+	unsigned char preDataBytes[8];
+	longToBytes_bigEndian(preDataBytes, 0);
+
+	int reqBytesLength = reqLength/8;
+	int resiBitsLength = reqLength%8;
+
+	DoubleValueCompressElement *vce = (DoubleValueCompressElement*)malloc(sizeof(DoubleValueCompressElement));
+	LossyCompressionElement *lce = (LossyCompressionElement*)malloc(sizeof(LossyCompressionElement));
+	/* Every slab l starts over: its first element is always stored exactly and layer 0 only uses in-layer neighbours. */
+
+	size_t l;
+	for (l = 0; l < r1; l++)
+	{
+
+		///////////////////////////	Process layer-0 ///////////////////////////
+		/* Process Row-0 data 0*/
+		size_t index = l*r234;
+		size_t index2D = 0;
+
+		type[index] = 0;
+		compressSingleDoubleValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+		updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+		memcpy(preDataBytes,vce->curBytes,8);
+		addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+		P1[index2D] = vce->data;
+
+		/* Process Row-0 data 1*/
+		index = l*r234+1;
+		index2D = 1;
+
+		pred1D = P1[index2D-1];
+		diff = spaceFillingValue[index] - pred1D;
+		/* |diff| in units of realPrecision, plus one */
+		itvNum = fabs(diff)/realPrecision + 1;
+		/* within capacity: keep only the code and store the quantized reconstruction (not the input value) in P1 */
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+			P1[index2D] = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[index] = 0;
+			compressSingleDoubleValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,8);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P1[index2D] = vce->data;
+		}
+
+		/* Process Row-0 data 2 --> data r4-1 */
+		for (j = 2; j < r4; j++)
+		{
+			index = l*r234+j;
+			index2D = j;
+			/* linear extrapolation from the two preceding reconstructed values of the row */
+			pred1D = 2*P1[index2D-1] - P1[index2D-2];
+			diff = spaceFillingValue[index] - pred1D;
+
+			itvNum = fabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P1[index2D] = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[index] = 0;
+				compressSingleDoubleValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,8);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P1[index2D] = vce->data;
+			}
+		}
+
+		/* Process Row-1 --> Row-r3-1 */
+		for (i = 1; i < r3; i++)
+		{
+			/* Process row-i data 0 */
+			index = l*r234+i*r4;
+			index2D = i*r4;
+			/* first element of a row: predicted from the element at the same column of the previous row */
+			pred1D = P1[index2D-r4];
+			diff = spaceFillingValue[index] - pred1D;
+
+			itvNum = fabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P1[index2D] = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[index] = 0;
+				compressSingleDoubleValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,8);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P1[index2D] = vce->data;
+			}
+
+			/* Process row-i data 1 --> data r4-1*/
+			for (j = 1; j < r4; j++)
+			{
+				index = l*r234+i*r4+j;
+				index2D = i*r4+j;
+				/* 2D prediction from the left, upper and upper-left neighbours of the same layer */
+				pred2D = P1[index2D-1] + P1[index2D-r4] - P1[index2D-r4-1];
+
+				diff = spaceFillingValue[index] - pred2D;
+
+				itvNum = fabs(diff)/realPrecision + 1;
+
+				if (itvNum < exe_params->intvCapacity)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+					P1[index2D] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					type[index] = 0;
+					compressSingleDoubleValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+					updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+					memcpy(preDataBytes,vce->curBytes,8);
+					addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+					P1[index2D] = vce->data;
+				}
+			}
+		}
+
+
+		///////////////////////////	Process layer-1 --> layer-r2-1 ///////////////////////////
+
+		for (k = 1; k < r2; k++)
+		{
+			/* Process Row-0 data 0*/
+			index = l*r234+k*r34;
+			index2D = 0;
+			/* first element of layer k: predicted from the same position in the previous layer */
+			pred1D = P1[index2D];
+			diff = spaceFillingValue[index] - pred1D;
+
+			itvNum = fabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P0[index2D] = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[index] = 0;
+				compressSingleDoubleValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,8);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P0[index2D] = vce->data;
+			}
+
+
+			/* Process Row-0 data 1 --> data r4-1 */
+			for (j = 1; j < r4; j++)
+			{
+				index = l*r234+k*r34+j;
+				index2D = j;
+				/* row 0: left neighbour (P0) plus the change along the row seen in the previous layer (P1) */
+				pred2D = P0[index2D-1] + P1[index2D] - P1[index2D-1];
+				diff = spaceFillingValue[index] - pred2D;
+
+				itvNum = fabs(diff)/realPrecision + 1;
+
+				if (itvNum < exe_params->intvCapacity)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+					P0[index2D] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					type[index] = 0;
+					compressSingleDoubleValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+					updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+					memcpy(preDataBytes,vce->curBytes,8);
+					addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+					P0[index2D] = vce->data;
+				}
+			}
+
+			/* Process Row-1 --> Row-r3-1 */
+			for (i = 1; i < r3; i++)
+			{
+				/* Process Row-i data 0 */
+				index = l*r234+k*r34+i*r4;
+				index2D = i*r4;
+				/* column 0: upper neighbour (P0) plus the change between rows seen in the previous layer (P1) */
+				pred2D = P0[index2D-r4] + P1[index2D] - P1[index2D-r4];
+				diff = spaceFillingValue[index] - pred2D;
+
+				itvNum = fabs(diff)/realPrecision + 1;
+
+				if (itvNum < exe_params->intvCapacity)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+					P0[index2D] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					type[index] = 0;
+					compressSingleDoubleValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+					updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+					memcpy(preDataBytes,vce->curBytes,8);
+					addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+					P0[index2D] = vce->data;
+				}
+
+				/* Process Row-i data 1 --> data r4-1 */
+				for (j = 1; j < r4; j++)
+				{
+					index = l*r234+k*r34+i*r4+j;
+					index2D = i*r4+j;
+					/* 3D prediction from the seven already-processed neighbours in the current (P0) and previous (P1) layers */
+					pred3D = P0[index2D-1] + P0[index2D-r4]+ P1[index2D] - P0[index2D-r4-1] - P1[index2D-r4] - P1[index2D-1] + P1[index2D-r4-1];
+					diff = spaceFillingValue[index] - pred3D;
+
+
+					itvNum = fabs(diff)/realPrecision + 1;
+
+					if (itvNum < exe_params->intvCapacity)
+					{
+						if (diff < 0) itvNum = -itvNum;
+						type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+						P0[index2D] = pred3D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+					}
+					else
+					{
+						type[index] = 0;
+						compressSingleDoubleValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+						updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+						memcpy(preDataBytes,vce->curBytes,8);
+						addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+						P0[index2D] = vce->data;
+					}
+				}
+			}
+			/* the layer just written becomes the previous layer for the next k */
+			double *Pt;
+			Pt = P1;
+			P1 = P0;
+			P0 = Pt;
+		}
+	}
+	/* the two layer buffers are not referenced after this point */
+	free(P0);
+	free(P1);
+	size_t exactDataNum = exactLeadNumArray->size;
+
+	TightDataPointStorageD* tdps;
+	/* hand the type codes and the three streams filled by addExactData to the storage object */
+	new_TightDataPointStorageD(&tdps, dataLength, exactDataNum,
+			type, exactMidByteArray->array, exactMidByteArray->size,
+			exactLeadNumArray->array,
+			resiBitArray->array, resiBitArray->size,
+			resiBitsLength,
+			realPrecision, medianValue, (char)reqLength, quantization_intervals, NULL, 0, 0);
+	/* NOTE(review): type and the DIA wrappers are released although they were passed to tdps - presumably new_TightDataPointStorageD copies/encodes them; confirm */
+	//free memory
+	free_DIA(exactLeadNumArray);
+	free_DIA(resiBitArray);
+	free(type);
+	free(vce);
+	free(lce);
+	free(exactMidByteArray); //exactMidByteArray->array has been released in free_TightDataPointStorageF(tdps);
+
+	return tdps;
+}
+
+
+/* 4D variant: runs SZ_compress_double_4D_MDQ and flattens the result into *newByteData / *outSize; always returns 0. */
+char SZ_compress_args_double_NoCkRngeNoGzip_4D(unsigned char** newByteData, double *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, size_t *outSize, double valueRangeSize, double medianValue_d)
+{
+	const size_t nbEle = r1*r2*r3*r4;
+	TightDataPointStorageD* storage = SZ_compress_double_4D_MDQ(oriData, r1, r2, r3, r4, realPrecision, valueRangeSize, medianValue_d);
+
+	convertTDPStoFlatBytes_double(storage, newByteData, outSize);
+	if(*outSize > nbEle*sizeof(double)) /* flattened stream is larger than the raw input */
+		SZ_compress_args_double_StoreOriData(oriData, nbEle, storage, newByteData, outSize);
+
+	free_TightDataPointStorageD(storage);
+	return 0;
+}
+
+/*
+ * Builds a storage object flagged allSameData that carries only oriData[0]
+ * (8 bytes) plus the series length, and flattens it into *newByteData / *outSize.
+ */
+void SZ_compress_args_double_withinRange(unsigned char** newByteData, double *oriData, size_t dataLength, size_t *outSize)
+{
+	size_t flatSize;
+	TightDataPointStorageD* storage = (TightDataPointStorageD*) malloc(sizeof(TightDataPointStorageD));
+	/* no per-element arrays on this path */
+	storage->rtypeArray = NULL;
+	storage->typeArray = NULL;
+	storage->leadNumArray = NULL;
+	storage->residualMidBits = NULL;
+	storage->pwrErrBoundBytes = NULL;
+	storage->allSameData = 1;
+	storage->isLossless = 0;
+	storage->dataSeriesLength = dataLength;
+
+	/* the single retained value, serialized by doubleToBytes */
+	storage->exactMidBytes = (unsigned char*)malloc(sizeof(unsigned char)*8);
+	doubleToBytes(storage->exactMidBytes, oriData[0]);
+	storage->exactMidBytes_size = 8;
+
+	convertTDPStoFlatBytes_double(storage, newByteData, &flatSize);
+	*outSize = flatSize;
+	free_TightDataPointStorageD(storage);
+}
+
+/*
+ * Compresses oriData; this function itself applies no zlib pass. Unused leading dimensions are 0;
+ * 4D input reuses the 3D routines with r4*r3 as one dimension; 5D input now yields SZ_DERR.
+ */
+int SZ_compress_args_double_wRngeNoGzip(unsigned char** newByteData, double *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio, double pwrErrRatio)
+{
+	int status = SZ_SCES;
+	size_t dataLength = computeDataLength(r5,r4,r3,r2,r1);
+	double valueRangeSize = 0, medianValue = 0;
+	double min = computeRangeSize_double(oriData, dataLength, &valueRangeSize, &medianValue);
+	double max = min+valueRangeSize;
+	double realPrecision = getRealPrecision_double(valueRangeSize, errBoundMode, absErr_Bound, relBoundRatio, &status);
+
+	if(valueRangeSize <= realPrecision)
+		SZ_compress_args_double_withinRange(newByteData, oriData, dataLength, outSize);
+	else if(r5==0&&r4==0&&r3==0&&r2==0)
+	{
+		if(errBoundMode>=PW_REL)
+			SZ_compress_args_double_NoCkRngeNoGzip_1D_pwrgroup(newByteData, oriData, r1, absErr_Bound, relBoundRatio, pwrErrRatio, valueRangeSize, medianValue, outSize);
+		else
+			SZ_compress_args_double_NoCkRngeNoGzip_1D(newByteData, oriData, r1, realPrecision, outSize, valueRangeSize, medianValue);
+	}
+	else if(r5==0&&r4==0&&r3==0)
+	{
+		if(errBoundMode>=PW_REL)
+			SZ_compress_args_double_NoCkRngeNoGzip_2D_pwr(newByteData, oriData, realPrecision, r2, r1, outSize, min, max);
+		else
+			SZ_compress_args_double_NoCkRngeNoGzip_2D(newByteData, oriData, r2, r1, realPrecision, outSize, valueRangeSize, medianValue);
+	}
+	else if(r5==0&&r4==0)
+	{
+		if(errBoundMode>=PW_REL)
+			SZ_compress_args_double_NoCkRngeNoGzip_3D_pwr(newByteData, oriData, realPrecision, r3, r2, r1, outSize, min, max);
+		else
+			SZ_compress_args_double_NoCkRngeNoGzip_3D(newByteData, oriData, r3, r2, r1, realPrecision, outSize, valueRangeSize, medianValue);
+	}
+	else if(r5==0)
+	{
+		if(errBoundMode>=PW_REL)
+			SZ_compress_args_double_NoCkRngeNoGzip_3D_pwr(newByteData, oriData, realPrecision, r4*r3, r2, r1, outSize, min, max);
+		else
+			SZ_compress_args_double_NoCkRngeNoGzip_3D(newByteData, oriData, r4*r3, r2, r1, realPrecision, outSize, valueRangeSize, medianValue);
+	}
+	else
+	{
+		printf("Error: doesn't support 5 dimensions for now.\n");
+		status = SZ_DERR; /* nothing was written to *newByteData / *outSize: do not report success */
+	}
+	return status;
+}
+
+/*
+ * Top-level compression of a double array with up to four dimensions (unused leading
+ * dimensions are 0). Returns SZ_SCES, SZ_DERR for 5D input (no output is produced),
+ * SZ_MERR for an unsupported confparams_cpr->szMode, or the getRealPrecision_double status.
+ * Calls exit(0) for PW_REL with pwr_type == SZ_PWR_AVG_TYPE when r3 != 0.
+ */
+int SZ_compress_args_double(unsigned char** newByteData, double *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio, double pwRelBoundRatio)
+{
+	confparams_cpr->errorBoundMode = errBoundMode;
+	if(errBoundMode==PW_REL)
+	{
+		confparams_cpr->pw_relBoundRatio = pwRelBoundRatio;
+		if(confparams_cpr->pwr_type==SZ_PWR_AVG_TYPE && r3 != 0 )
+		{
+			printf("Error: Current version doesn't support 3D data compression with point-wise relative error bound being based on pwrType=AVG\n");
+			exit(0);
+			return SZ_NSCS;
+		}
+	}
+
+	int status = SZ_SCES;
+	size_t dataLength = computeDataLength(r5,r4,r3,r2,r1);
+
+	if(dataLength <= MIN_NUM_OF_ELEMENTS)
+	{
+		*newByteData = SZ_skip_compress_double(oriData, dataLength, outSize);
+		return status;
+	}
+
+	double valueRangeSize = 0, medianValue = 0;
+	double min = computeRangeSize_double(oriData, dataLength, &valueRangeSize, &medianValue);
+	double max = min+valueRangeSize;
+	double realPrecision = 0;
+
+	if(confparams_cpr->errorBoundMode==PSNR)
+	{
+		confparams_cpr->errorBoundMode = ABS;
+		realPrecision = confparams_cpr->absErrBound = computeABSErrBoundFromPSNR(confparams_cpr->psnr, (double)confparams_cpr->predThreshold, valueRangeSize);
+	}
+	else
+		realPrecision = getRealPrecision_double(valueRangeSize, errBoundMode, absErr_Bound, relBoundRatio, &status);
+
+	if(valueRangeSize <= realPrecision)
+		SZ_compress_args_double_withinRange(newByteData, oriData, dataLength, outSize);
+	else
+	{
+		size_t tmpOutSize = 0;
+		unsigned char* tmpByteData = NULL; /* set by the dimension-specific routine below */
+		if (r2==0)
+		{
+			if(confparams_cpr->errorBoundMode>=PW_REL)
+				SZ_compress_args_double_NoCkRngeNoGzip_1D_pwrgroup(&tmpByteData, oriData, r1, absErr_Bound, relBoundRatio, pwRelBoundRatio, valueRangeSize, medianValue, &tmpOutSize);
+			else
+#ifdef HAVE_TIMECMPR
+				if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+					multisteps->compressionType = SZ_compress_args_double_NoCkRngeNoGzip_1D(&tmpByteData, oriData, r1, realPrecision, &tmpOutSize, valueRangeSize, medianValue);
+				else
+#endif
+					SZ_compress_args_double_NoCkRngeNoGzip_1D(&tmpByteData, oriData, r1, realPrecision, &tmpOutSize, valueRangeSize, medianValue);
+		}
+		else
+		if (r3==0)
+		{
+			if(confparams_cpr->errorBoundMode>=PW_REL)
+				SZ_compress_args_double_NoCkRngeNoGzip_2D_pwr(&tmpByteData, oriData, realPrecision, r2, r1, &tmpOutSize, min, max);
+			else
+#ifdef HAVE_TIMECMPR
+				if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+					multisteps->compressionType = SZ_compress_args_double_NoCkRngeNoGzip_2D(&tmpByteData, oriData, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, medianValue);
+				else
+#endif
+					SZ_compress_args_double_NoCkRngeNoGzip_2D(&tmpByteData, oriData, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, medianValue);
+		}
+		else
+		if (r4==0)
+		{
+			if(confparams_cpr->errorBoundMode>=PW_REL)
+				SZ_compress_args_double_NoCkRngeNoGzip_3D_pwr(&tmpByteData, oriData, realPrecision, r3, r2, r1, &tmpOutSize, min, max);
+			else
+#ifdef HAVE_TIMECMPR
+				if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+					multisteps->compressionType = SZ_compress_args_double_NoCkRngeNoGzip_3D(&tmpByteData, oriData, r3, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, medianValue);
+				else
+#endif
+					SZ_compress_args_double_NoCkRngeNoGzip_3D(&tmpByteData, oriData, r3, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, medianValue);
+		}
+		else
+		if (r5==0)
+		{
+			if(confparams_cpr->errorBoundMode>=PW_REL)
+				SZ_compress_args_double_NoCkRngeNoGzip_3D_pwr(&tmpByteData, oriData, realPrecision, r4*r3, r2, r1, &tmpOutSize, min, max);
+			else
+#ifdef HAVE_TIMECMPR
+				if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+					multisteps->compressionType = SZ_compress_args_double_NoCkRngeNoGzip_4D(&tmpByteData, oriData, r4, r3, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, medianValue);
+				else
+#endif
+					SZ_compress_args_double_NoCkRngeNoGzip_4D(&tmpByteData, oriData, r4, r3, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, medianValue);
+		}
+		else
+		{
+			/* Nothing was compressed: return now rather than zlib-compress/free/publish the unset tmpByteData. */
+			printf("Error: doesn't support 5 dimensions for now.\n");
+			return SZ_DERR;
+		}
+
+		//Call Gzip to do the further compression.
+		if(confparams_cpr->szMode==SZ_BEST_SPEED)
+		{
+			*outSize = tmpOutSize;
+			*newByteData = tmpByteData;
+		}
+		else if(confparams_cpr->szMode==SZ_BEST_COMPRESSION || confparams_cpr->szMode==SZ_DEFAULT_COMPRESSION)
+		{
+			*outSize = zlib_compress5(tmpByteData, tmpOutSize, newByteData, confparams_cpr->gzipMode);
+			free(tmpByteData);
+		}
+		else
+		{
+			/* NOTE(review): SZ_TEMPORAL_COMPRESSION also lands here - confirm whether it should take the zlib path. */
+			printf("Error: Wrong setting of confparams_cpr->szMode in the double compression.\n");
+			free(tmpByteData); /* not handed to the caller on this path, so release it here */
+			status = SZ_MERR;
+		}
+	}
+
+	return status;
+}
+
+//TODO: sub-block compression is incomplete; the paths that are not implemented now report SZ_NSCS instead of SZ_SCES.
+//Compresses the part of oriData selected by the s*/e* bounds (presumably start/end indices per dimension) into compressedBytes.
+int SZ_compress_args_double_subblock(unsigned char* compressedBytes, double *oriData,
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1,
+size_t s5, size_t s4, size_t s3, size_t s2, size_t s1,
+size_t e5, size_t e4, size_t e3, size_t e2, size_t e1,
+size_t *outSize, int errBoundMode, double absErr_Bound, double relBoundRatio)
+{
+	int status = SZ_SCES;
+	double valueRangeSize = 0, medianValue = 0;
+	computeRangeSize_double_subblock(oriData, &valueRangeSize, &medianValue, r5, r4, r3, r2, r1, s5, s4, s3, s2, s1, e5, e4, e3, e2, e1);
+
+	double realPrecision = getRealPrecision_double(valueRangeSize, errBoundMode, absErr_Bound, relBoundRatio, &status);
+
+	if(valueRangeSize <= realPrecision)
+	{
+		//TODO: SZ_compress_args_double_withinRange_subblock();
+		status = SZ_NSCS; /* nothing is written to compressedBytes / *outSize on this path */
+	}
+	else
+	{
+		if (r2==0)
+		{
+			if(errBoundMode==PW_REL)
+			{
+				//TODO: SZ_compress_args_double_NoCkRngeNoGzip_1D_pwr_subblock();
+				printf ("Current subblock version does not support point-wise relative error bound.\n");
+				status = SZ_NSCS;
+			}
+			else
+				SZ_compress_args_double_NoCkRnge_1D_subblock(compressedBytes, oriData, realPrecision, outSize, valueRangeSize, medianValue, r1, s1, e1);
+		}
+		else
+		if (r3==0)
+		{
+			if(errBoundMode==PW_REL)
+			{
+				//TODO: SZ_compress_args_double_NoCkRngeNoGzip_2D_pwr_subblock();
+				printf ("Current subblock version does not support point-wise relative error bound.\n");
+				status = SZ_NSCS;
+			}
+			else
+				SZ_compress_args_double_NoCkRnge_2D_subblock(compressedBytes, oriData, realPrecision, outSize, valueRangeSize, medianValue, r2, r1, s2, s1, e2, e1);
+		}
+		else
+		if (r4==0)
+		{
+			if(errBoundMode==PW_REL)
+			{
+				//TODO: SZ_compress_args_double_NoCkRngeNoGzip_3D_pwr_subblock();
+				printf ("Current subblock version does not support point-wise relative error bound.\n");
+				status = SZ_NSCS;
+			}
+			else
+				SZ_compress_args_double_NoCkRnge_3D_subblock(compressedBytes, oriData, realPrecision, outSize, valueRangeSize, medianValue, r3, r2, r1, s3, s2, s1, e3, e2, e1);
+		}
+		else
+		if (r5==0)
+		{
+			if(errBoundMode==PW_REL)
+			{
+				//TODO: SZ_compress_args_double_NoCkRngeNoGzip_4D_pwr_subblock();
+				printf ("Current subblock version does not support point-wise relative error bound.\n");
+				status = SZ_NSCS;
+			}
+			else
+				SZ_compress_args_double_NoCkRnge_4D_subblock(compressedBytes, oriData, realPrecision, outSize, valueRangeSize, medianValue, r4, r3, r2, r1, s4, s3, s2, s1, e4, e3, e2, e1);
+		}
+		else
+		{
+			printf("Error: doesn't support 5 dimensions for now.\n");
+			status = SZ_DERR; //dimension error
+		}
+	}
+	return status;
+}
+
+/* 1D sub-block: encodes via SZ_compress_double_1D_MDQ_subblock into the caller-provided compressedBytes, as the flat stream
+ * (SZ_BEST_SPEED) or its zlib_compress3 output (SZ_BEST_COMPRESSION / SZ_DEFAULT_COMPRESSION); other modes only print an error. */
+void SZ_compress_args_double_NoCkRnge_1D_subblock(unsigned char* compressedBytes, double *oriData, double realPrecision, size_t *outSize, double valueRangeSize, double medianValue_d,
+size_t r1, size_t s1, size_t e1)
+{
+	unsigned char *flatBytes;
+	size_t flatSize;
+	TightDataPointStorageD* storage = SZ_compress_double_1D_MDQ_subblock(oriData, realPrecision, valueRangeSize, medianValue_d, r1, s1, e1);
+
+	if(confparams_cpr->szMode==SZ_BEST_SPEED)
+	{
+		convertTDPStoFlatBytes_double_args(storage, compressedBytes, outSize);
+	}
+	else if(confparams_cpr->szMode!=SZ_BEST_COMPRESSION && confparams_cpr->szMode!=SZ_DEFAULT_COMPRESSION)
+	{
+		printf ("Error: Wrong setting of confparams_cpr->szMode in the double compression.\n");
+	}
+	else
+	{
+		convertTDPStoFlatBytes_double(storage, &flatBytes, &flatSize);
+		*outSize = zlib_compress3(flatBytes, flatSize, compressedBytes, confparams_cpr->gzipMode);
+		free(flatBytes);
+	}
+	//TODO: call SZ_compress_args_double_StoreOriData when *outSize exceeds dataLength*sizeof(double).
+	free_TightDataPointStorageD(storage);
+}
+
+/* 2D sub-block: encodes via SZ_compress_double_2D_MDQ_subblock into the caller-provided compressedBytes, as the flat stream
+ * (SZ_BEST_SPEED) or its zlib_compress3 output (SZ_BEST_COMPRESSION / SZ_DEFAULT_COMPRESSION); other modes only print an error. */
+void SZ_compress_args_double_NoCkRnge_2D_subblock(unsigned char* compressedBytes, double *oriData, double realPrecision, size_t *outSize, double valueRangeSize, double medianValue_d,
+size_t r2, size_t r1, size_t s2, size_t s1, size_t e2, size_t e1)
+{
+	unsigned char *flatBytes;
+	size_t flatSize;
+	TightDataPointStorageD* storage = SZ_compress_double_2D_MDQ_subblock(oriData, realPrecision, valueRangeSize, medianValue_d, r2, r1, s2, s1, e2, e1);
+
+	if(confparams_cpr->szMode==SZ_BEST_SPEED)
+	{
+		convertTDPStoFlatBytes_double_args(storage, compressedBytes, outSize);
+	}
+	else if(confparams_cpr->szMode!=SZ_BEST_COMPRESSION && confparams_cpr->szMode!=SZ_DEFAULT_COMPRESSION)
+	{
+		printf ("Error: Wrong setting of confparams_cpr->szMode in the double compression.\n");
+	}
+	else
+	{
+		convertTDPStoFlatBytes_double(storage, &flatBytes, &flatSize);
+		*outSize = zlib_compress3(flatBytes, flatSize, compressedBytes, confparams_cpr->gzipMode);
+		free(flatBytes);
+	}
+	//TODO: call SZ_compress_args_double_StoreOriData when *outSize exceeds dataLength*sizeof(double).
+	free_TightDataPointStorageD(storage);
+}
+
+/* 3D sub-block: encodes via SZ_compress_double_3D_MDQ_subblock into the caller-provided compressedBytes, as the flat stream
+ * (SZ_BEST_SPEED) or its zlib_compress3 output (SZ_BEST_COMPRESSION / SZ_DEFAULT_COMPRESSION); other modes only print an error. */
+void SZ_compress_args_double_NoCkRnge_3D_subblock(unsigned char* compressedBytes, double *oriData, double realPrecision, size_t *outSize, double valueRangeSize, double medianValue_d,
+size_t r3, size_t r2, size_t r1, size_t s3, size_t s2, size_t s1, size_t e3, size_t e2, size_t e1)
+{
+	unsigned char *flatBytes;
+	size_t flatSize;
+	TightDataPointStorageD* storage = SZ_compress_double_3D_MDQ_subblock(oriData, realPrecision, valueRangeSize, medianValue_d, r3, r2, r1, s3, s2, s1, e3, e2, e1);
+
+	if(confparams_cpr->szMode==SZ_BEST_SPEED)
+	{
+		convertTDPStoFlatBytes_double_args(storage, compressedBytes, outSize);
+	}
+	else if(confparams_cpr->szMode!=SZ_BEST_COMPRESSION && confparams_cpr->szMode!=SZ_DEFAULT_COMPRESSION)
+	{
+		printf ("Error: Wrong setting of confparams_cpr->szMode in the double compression.\n");
+	}
+	else
+	{
+		convertTDPStoFlatBytes_double(storage, &flatBytes, &flatSize);
+		*outSize = zlib_compress3(flatBytes, flatSize, compressedBytes, confparams_cpr->gzipMode);
+		free(flatBytes);
+	}
+	//TODO: call SZ_compress_args_double_StoreOriData when *outSize exceeds dataLength*sizeof(double).
+	free_TightDataPointStorageD(storage);
+}
+
+/* 4D sub-block: encodes via SZ_compress_double_4D_MDQ_subblock into the caller-provided compressedBytes, as the flat stream
+ * (SZ_BEST_SPEED) or its zlib_compress3 output (SZ_BEST_COMPRESSION / SZ_DEFAULT_COMPRESSION); other modes only print an error. */
+void SZ_compress_args_double_NoCkRnge_4D_subblock(unsigned char* compressedBytes, double *oriData, double realPrecision, size_t *outSize, double valueRangeSize, double medianValue_d,
+size_t r4, size_t r3, size_t r2, size_t r1, size_t s4, size_t s3, size_t s2, size_t s1, size_t e4, size_t e3, size_t e2, size_t e1)
+{
+	unsigned char *flatBytes;
+	size_t flatSize;
+	TightDataPointStorageD* storage = SZ_compress_double_4D_MDQ_subblock(oriData, realPrecision, valueRangeSize, medianValue_d, r4, r3, r2, r1, s4, s3, s2, s1, e4, e3, e2, e1);
+
+	if(confparams_cpr->szMode==SZ_BEST_SPEED)
+	{
+		convertTDPStoFlatBytes_double_args(storage, compressedBytes, outSize);
+	}
+	else if(confparams_cpr->szMode!=SZ_BEST_COMPRESSION && confparams_cpr->szMode!=SZ_DEFAULT_COMPRESSION)
+	{
+		printf ("Error: Wrong setting of confparams_cpr->szMode in the double compression.\n");
+	}
+	else
+	{
+		convertTDPStoFlatBytes_double(storage, &flatBytes, &flatSize);
+		*outSize = zlib_compress3(flatBytes, flatSize, compressedBytes, confparams_cpr->gzipMode);
+		free(flatBytes);
+	}
+	//TODO: call SZ_compress_args_double_StoreOriData when *outSize exceeds dataLength*sizeof(double).
+	free_TightDataPointStorageD(storage);
+}
+
+
+/*
+ * Estimate the number of quantization intervals for the 1D sub-range
+ * [s1, e1] of oriData.
+ *
+ * Every local index i >= 2 that is a multiple of sampleDistance is predicted
+ * as 2*x[i-1] - x[i-2]; the absolute prediction error is converted to a
+ * radius index ((err/realPrecision + 1)/2, capped at maxRangeRadius-1) and
+ * histogrammed.  The first radius whose cumulative count exceeds
+ * totalSampleSize*predThreshold determines the result: 2*(radius+1), passed
+ * through roundUpToPowerOf2() and raised to at least 32.
+ *
+ * r1 is not used.
+ *
+ * Returns the interval count; 32 (the smallest value this function ever
+ * produces) is returned if the histogram cannot be allocated.
+ */
+unsigned int optimize_intervals_double_1D_subblock(double *oriData, double realPrecision, size_t r1, size_t s1, size_t e1)
+{
+	size_t dataLength = e1 - s1 + 1;
+	oriData = oriData + s1;
+
+	size_t i = 0;
+	unsigned long radiusIndex;
+	double pred_value = 0, pred_err, radiusEstimate;
+	/* calloc zero-fills the histogram; bail out instead of dereferencing NULL. */
+	int *intervals = (int*)calloc(confparams_cpr->maxRangeRadius, sizeof(int));
+	if(intervals==NULL)
+		return 32;
+	size_t totalSampleSize = dataLength/confparams_cpr->sampleDistance;
+	for(i=2;i<dataLength;i++)
+	{
+		if(i%confparams_cpr->sampleDistance==0)
+		{
+			pred_value = 2*oriData[i-1] - oriData[i-2];
+			pred_err = fabs(pred_value - oriData[i]);
+			radiusEstimate = (pred_err/realPrecision+1)/2;
+			/* Clamp before converting: turning NaN or a too-large double
+			 * into an integer is undefined behaviour. */
+			if(!(radiusEstimate<(double)confparams_cpr->maxRangeRadius))
+				radiusIndex = confparams_cpr->maxRangeRadius - 1;
+			else
+				radiusIndex = (unsigned long)radiusEstimate;
+			intervals[radiusIndex]++;
+		}
+	}
+	//compute the appropriate number
+	size_t targetCount = totalSampleSize*confparams_cpr->predThreshold;
+	size_t sum = 0;
+	for(i=0;i<confparams_cpr->maxRangeRadius;i++)
+	{
+		sum += intervals[i];
+		if(sum>targetCount)
+			break;
+	}
+
+	/* Threshold never exceeded: fall back to the largest radius. */
+	if(i>=confparams_cpr->maxRangeRadius)
+		i = confparams_cpr->maxRangeRadius-1;
+	unsigned int accIntervals = 2*(i+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+
+	if(powerOf2<32)
+		powerOf2 = 32;
+
+	free(intervals);
+	return powerOf2;
+}
+
+/*
+ * Estimate the number of quantization intervals for the 2D sub-block
+ * rows [s1, e1] x columns [s2, e2] of oriData (row stride r2).
+ *
+ * Points with i > s1, j > s2 and (i+j) a multiple of sampleDistance are
+ * predicted from their left, upper and upper-left neighbours
+ * (left + up - upleft).  The absolute prediction error is converted to a
+ * radius index ((err/realPrecision + 1)/2, capped at maxRangeRadius-1) and
+ * histogrammed.  The first radius whose cumulative count exceeds
+ * totalSampleSize*predThreshold determines the result: 2*(radius+1), passed
+ * through roundUpToPowerOf2() and raised to at least 32.
+ *
+ * r1 is not used.
+ *
+ * Returns the interval count; 32 (the smallest value this function ever
+ * produces) is returned if the histogram cannot be allocated.
+ */
+unsigned int optimize_intervals_double_2D_subblock(double *oriData, double realPrecision, size_t r1, size_t r2, size_t s1, size_t s2, size_t e1, size_t e2)
+{
+	size_t R1 = e1 - s1 + 1;
+	size_t R2 = e2 - s2 + 1;
+
+	size_t i,j, index;
+	unsigned long radiusIndex;
+	double pred_value = 0, pred_err, radiusEstimate;
+	/* calloc zero-fills the histogram; bail out instead of dereferencing NULL. */
+	int *intervals = (int*)calloc(confparams_cpr->maxRangeRadius, sizeof(int));
+	if(intervals==NULL)
+		return 32;
+	size_t totalSampleSize = R1*R2/confparams_cpr->sampleDistance;
+	for(i=s1+1;i<=e1;i++)
+	{
+		for(j=s2+1;j<=e2;j++)
+		{
+			if((i+j)%confparams_cpr->sampleDistance==0)
+			{
+				index = i*r2+j;
+				pred_value = oriData[index-1] + oriData[index-r2] - oriData[index-r2-1];
+				pred_err = fabs(pred_value - oriData[index]);
+				radiusEstimate = (pred_err/realPrecision+1)/2;
+				/* Clamp before converting: turning NaN or a too-large double
+				 * into an integer is undefined behaviour. */
+				if(!(radiusEstimate<(double)confparams_cpr->maxRangeRadius))
+					radiusIndex = confparams_cpr->maxRangeRadius - 1;
+				else
+					radiusIndex = (unsigned long)radiusEstimate;
+				intervals[radiusIndex]++;
+			}
+		}
+	}
+	//compute the appropriate number
+	size_t targetCount = totalSampleSize*confparams_cpr->predThreshold;
+	size_t sum = 0;
+	for(i=0;i<confparams_cpr->maxRangeRadius;i++)
+	{
+		sum += intervals[i];
+		if(sum>targetCount)
+			break;
+	}
+	/* Threshold never exceeded: fall back to the largest radius. */
+	if(i>=confparams_cpr->maxRangeRadius)
+		i = confparams_cpr->maxRangeRadius-1;
+	unsigned int accIntervals = 2*(i+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+
+	if(powerOf2<32)
+		powerOf2 = 32;
+
+	free(intervals);
+	return powerOf2;
+}
+
+/*
+ * Estimate the number of quantization intervals for the 3D sub-block
+ * [s1, e1] x [s2, e2] x [s3, e3] of oriData (strides r2*r3, r3, 1).
+ *
+ * Points with i > s1, j > s2, k > s3 and (i+j+k) a multiple of
+ * sampleDistance are predicted from their seven already-visited neighbours
+ * (sum of the three face neighbours, minus the three edge neighbours, plus
+ * the corner neighbour).  The absolute prediction error is converted to a
+ * radius index ((err/realPrecision + 1)/2, capped at maxRangeRadius-1) and
+ * histogrammed.  The first radius whose cumulative count exceeds
+ * totalSampleSize*predThreshold determines the result: 2*(radius+1), passed
+ * through roundUpToPowerOf2() and raised to at least 32.
+ *
+ * r1 is not used.
+ *
+ * Returns the interval count; 32 (the smallest value this function ever
+ * produces) is returned if the histogram cannot be allocated.
+ */
+unsigned int optimize_intervals_double_3D_subblock(double *oriData, double realPrecision, size_t r1, size_t r2, size_t r3, size_t s1, size_t s2, size_t s3, size_t e1, size_t e2, size_t e3)
+{
+	size_t R1 = e1 - s1 + 1;
+	size_t R2 = e2 - s2 + 1;
+	size_t R3 = e3 - s3 + 1;
+
+	size_t r23 = r2*r3;
+
+	size_t i,j,k, index;
+	unsigned long radiusIndex;
+	double pred_value = 0, pred_err, radiusEstimate;
+	/* calloc zero-fills the histogram; bail out instead of dereferencing NULL. */
+	int *intervals = (int*)calloc(confparams_cpr->maxRangeRadius, sizeof(int));
+	if(intervals==NULL)
+		return 32;
+	size_t totalSampleSize = R1*R2*R3/confparams_cpr->sampleDistance;
+	for(i=s1+1;i<=e1;i++)
+	{
+		for(j=s2+1;j<=e2;j++)
+		{
+			for(k=s3+1;k<=e3;k++)
+			{
+				if((i+j+k)%confparams_cpr->sampleDistance==0)
+				{
+					index = i*r23+j*r3+k;
+					pred_value = oriData[index-1] + oriData[index-r3] + oriData[index-r23]
+					- oriData[index-1-r23] - oriData[index-r3-1] - oriData[index-r3-r23] + oriData[index-r3-r23-1];
+					pred_err = fabs(pred_value - oriData[index]);
+					radiusEstimate = (pred_err/realPrecision+1)/2;
+					/* Clamp before converting: turning NaN or a too-large
+					 * double into an integer is undefined behaviour. */
+					if(!(radiusEstimate<(double)confparams_cpr->maxRangeRadius))
+						radiusIndex = confparams_cpr->maxRangeRadius - 1;
+					else
+						radiusIndex = (unsigned long)radiusEstimate;
+					intervals[radiusIndex]++;
+				}
+			}
+
+		}
+	}
+	//compute the appropriate number
+	size_t targetCount = totalSampleSize*confparams_cpr->predThreshold;
+	size_t sum = 0;
+	for(i=0;i<confparams_cpr->maxRangeRadius;i++)
+	{
+		sum += intervals[i];
+		if(sum>targetCount)
+			break;
+	}
+	/* Threshold never exceeded: fall back to the largest radius. */
+	if(i>=confparams_cpr->maxRangeRadius)
+		i = confparams_cpr->maxRangeRadius-1;
+
+	unsigned int accIntervals = 2*(i+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+
+	if(powerOf2<32)
+		powerOf2 = 32;
+
+	free(intervals);
+	return powerOf2;
+}
+
+/*
+ * Estimate the number of quantization intervals for the 4D sub-block
+ * [s1, e1] x [s2, e2] x [s3, e3] x [s4, e4] of oriData
+ * (strides r2*r3*r4, r3*r4, r4, 1).
+ *
+ * Points with every index strictly greater than its start and
+ * (i+j+k+l) a multiple of sampleDistance are sampled.  The prediction only
+ * looks at neighbours along the three fastest-varying dimensions (offsets
+ * built from 1, r4 and r3*r4); no neighbour along the first dimension is
+ * read.  The absolute prediction error is converted to a radius index
+ * ((err/realPrecision + 1)/2, capped at maxRangeRadius-1) and histogrammed.
+ * The first radius whose cumulative count exceeds
+ * totalSampleSize*predThreshold determines the result: 2*(radius+1), passed
+ * through roundUpToPowerOf2() and raised to at least 32.
+ *
+ * r1 is not used.
+ *
+ * Returns the interval count; 32 (the smallest value this function ever
+ * produces) is returned if the histogram cannot be allocated.
+ */
+unsigned int optimize_intervals_double_4D_subblock(double *oriData, double realPrecision,
+size_t r1, size_t r2, size_t r3, size_t r4, size_t s1, size_t s2, size_t s3, size_t s4, size_t e1, size_t e2, size_t e3, size_t e4)
+{
+	size_t R1 = e1 - s1 + 1;
+	size_t R2 = e2 - s2 + 1;
+	size_t R3 = e3 - s3 + 1;
+	size_t R4 = e4 - s4 + 1;
+
+	size_t r34 = r3*r4;
+	size_t r234 = r2*r3*r4;
+
+	size_t i,j,k,l, index;
+	unsigned long radiusIndex;
+	double pred_value = 0, pred_err, radiusEstimate;
+	/* calloc zero-fills the histogram; bail out instead of dereferencing NULL. */
+	int *intervals = (int*)calloc(confparams_cpr->maxRangeRadius, sizeof(int));
+	if(intervals==NULL)
+		return 32;
+	size_t totalSampleSize = R1*R2*R3*R4/confparams_cpr->sampleDistance;
+	for(i=s1+1;i<=e1;i++)
+	{
+		for(j=s2+1;j<=e2;j++)
+		{
+			for(k=s3+1;k<=e3;k++)
+			{
+				for(l=s4+1;l<=e4;l++)
+				{
+					if((i+j+k+l)%confparams_cpr->sampleDistance==0)
+					{
+						index = i*r234+j*r34+k*r4+l;
+						pred_value = oriData[index-1] + oriData[index-r4] + oriData[index-r34]
+								- oriData[index-1-r34] - oriData[index-r4-1] - oriData[index-r4-r34] + oriData[index-r4-r34-1];
+						pred_err = fabs(pred_value - oriData[index]);
+						radiusEstimate = (pred_err/realPrecision+1)/2;
+						/* Clamp before converting: turning NaN or a too-large
+						 * double into an integer is undefined behaviour. */
+						if(!(radiusEstimate<(double)confparams_cpr->maxRangeRadius))
+							radiusIndex = confparams_cpr->maxRangeRadius - 1;
+						else
+							radiusIndex = (unsigned long)radiusEstimate;
+						intervals[radiusIndex]++;
+					}
+				}
+			}
+
+		}
+	}
+	//compute the appropriate number
+	size_t targetCount = totalSampleSize*confparams_cpr->predThreshold;
+	size_t sum = 0;
+	for(i=0;i<confparams_cpr->maxRangeRadius;i++)
+	{
+		sum += intervals[i];
+		if(sum>targetCount)
+			break;
+	}
+	/* Threshold never exceeded: fall back to the largest radius. */
+	if(i>=confparams_cpr->maxRangeRadius)
+		i = confparams_cpr->maxRangeRadius-1;
+
+	unsigned int accIntervals = 2*(i+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+
+	if(powerOf2<32)
+		powerOf2 = 32;
+
+	free(intervals);
+	return powerOf2;
+}
+
+/*
+ * Encode the 1D sub-range [s1, e1] of oriData into a freshly allocated
+ * TightDataPointStorageD, which the caller must release.
+ *
+ * For each element a code is written to type[]:
+ *   - 0: the value is stored "exactly" through compressSingleDoubleValue()
+ *     and appended to the exact-data arrays;
+ *   - otherwise intvRadius +/- state, where state is the number of
+ *     2*realPrecision steps between the prediction 2*prev - prevprev and the
+ *     value.
+ * The first element, and the second when it exists, are always stored
+ * exactly because the predictor needs two predecessors.
+ *
+ * The interval count comes from optimize_intervals_double_1D_subblock() when
+ * exe_params->optQuantMode is 1, otherwise from exe_params->intvCapacity.
+ * r1 is only forwarded to the optimizer.
+ */
+TightDataPointStorageD* SZ_compress_double_1D_MDQ_subblock(double *oriData, double realPrecision, double valueRangeSize, double medianValue_d,
+size_t r1, size_t s1, size_t e1)
+{
+	size_t dataLength = e1 - s1 + 1;
+
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+		quantization_intervals = optimize_intervals_double_1D_subblock(oriData, realPrecision, r1, s1, e1);
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	updateQuantizationInfo(quantization_intervals);
+
+	size_t i;
+	int reqLength;
+	double medianValue = medianValue_d;
+	short radExpo = getExponent_double(valueRangeSize/2);
+
+	computeReqLength_double(realPrecision, radExpo, &reqLength, &medianValue);
+
+	int* type = (int*) malloc(dataLength*sizeof(int));
+
+	double* spaceFillingValue = oriData + s1; // first element of the sub-range
+
+	DynamicIntArray *exactLeadNumArray;
+	new_DIA(&exactLeadNumArray, DynArrayInitLen);
+
+	DynamicByteArray *exactMidByteArray;
+	new_DBA(&exactMidByteArray, DynArrayInitLen);
+
+	DynamicIntArray *resiBitArray;
+	new_DIA(&resiBitArray, DynArrayInitLen);
+
+	type[0] = 0;
+
+	unsigned char preDataBytes[8];
+	longToBytes_bigEndian(preDataBytes, 0);
+
+	int reqBytesLength = reqLength/8;
+	int resiBitsLength = reqLength%8;
+	double last3CmprsData[3] = {0};
+
+	DoubleValueCompressElement *vce = (DoubleValueCompressElement*)malloc(sizeof(DoubleValueCompressElement));
+	LossyCompressionElement *lce = (LossyCompressionElement*)malloc(sizeof(LossyCompressionElement));
+
+	//add the first data
+	compressSingleDoubleValue(vce, spaceFillingValue[0], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+	updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+	memcpy(preDataBytes,vce->curBytes,8);
+	addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+	listAdd_double(last3CmprsData, vce->data);
+
+	//add the second data
+	//Guarded: for a one-element sub-block type[] holds a single entry, so
+	//touching type[1] / spaceFillingValue[1] would run past the buffers and
+	//emit an exact value that belongs to no element.
+	//NOTE(review): confirm the matching decompressor handles dataLength==1.
+	if(dataLength>1)
+	{
+		type[1] = 0;
+		compressSingleDoubleValue(vce, spaceFillingValue[1], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+		updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+		memcpy(preDataBytes,vce->curBytes,8);
+		addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+		listAdd_double(last3CmprsData, vce->data);
+	}
+
+	int state;
+	double checkRadius;
+	double curData;
+	double pred;
+	double predAbsErr;
+	checkRadius = (exe_params->intvCapacity-1)*realPrecision;
+	double interval = 2*realPrecision;
+
+	for(i=2;i<dataLength;i++)
+	{
+		curData = spaceFillingValue[i];
+		//linear extrapolation from the two most recent reconstructed values
+		pred = 2*last3CmprsData[0] - last3CmprsData[1];
+		predAbsErr = fabs(curData - pred);
+		if(predAbsErr<=checkRadius)
+		{
+			//predictable: record the signed number of intervals and keep
+			//the reconstructed value for later predictions
+			state = (predAbsErr/realPrecision+1)/2;
+			if(curData>=pred)
+			{
+				type[i] = exe_params->intvRadius+state;
+				pred = pred + state*interval;
+			}
+			else //curData<pred
+			{
+				type[i] = exe_params->intvRadius-state;
+				pred = pred - state*interval;
+			}
+			listAdd_double(last3CmprsData, pred);
+			continue;
+		}
+
+		//unpredictable data processing
+		type[i] = 0;
+		compressSingleDoubleValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+		updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+		memcpy(preDataBytes,vce->curBytes,8);
+		addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+
+		listAdd_double(last3CmprsData, vce->data);
+	}//end of for
+
+	size_t exactDataNum = exactLeadNumArray->size;
+
+	TightDataPointStorageD* tdps;
+
+	new_TightDataPointStorageD(&tdps, dataLength, exactDataNum,
+			type, exactMidByteArray->array, exactMidByteArray->size,
+			exactLeadNumArray->array,
+			resiBitArray->array, resiBitArray->size,
+			resiBitsLength,
+			realPrecision, medianValue, (char)reqLength, quantization_intervals, NULL, 0, 0);
+
+	//free memory
+	free_DIA(exactLeadNumArray);
+	free_DIA(resiBitArray);
+	free(type);
+	free(vce);
+	free(lce);
+	//Only the wrapper is freed here; presumably exactMidByteArray->array is
+	//now referenced by tdps and released by free_TightDataPointStorageD(tdps).
+	free(exactMidByteArray);
+
+	return tdps;
+}
+
+
+/*
+ * Encode the 2D sub-block rows [s1, e1] x columns [s2, e2] of oriData
+ * (row stride r2) into a freshly allocated TightDataPointStorageD, which the
+ * caller must release.
+ *
+ * Two index spaces are used throughout:
+ *   gIndex - offset into oriData, computed with the full row stride r2;
+ *   lIndex - offset into type[], laid out compactly as R1 x R2.
+ *
+ * For each element a code is written to type[lIndex]:
+ *   - 0: the value is stored through compressSingleDoubleValue() and
+ *     appended to the exact-data arrays;
+ *   - otherwise intvRadius plus the signed interval number (int)(itvNum/2).
+ *
+ * P1 holds the reconstructed values of the previous row and P0 those of the
+ * row being processed; the two buffers are swapped after every row.
+ *
+ * NOTE(review): the element at column s2+1 of the first row is processed
+ * unconditionally, so P1[1] and type[1] are written even when R2 == 1;
+ * callers apparently must pass sub-blocks with at least two columns.
+ *
+ * updateQuantizationInfo() is only called when exe_params->optQuantMode is 1.
+ * r1 is only forwarded to the optimizer.
+ */
+TightDataPointStorageD* SZ_compress_double_2D_MDQ_subblock(double *oriData, double realPrecision, double valueRangeSize, double medianValue_d,
+size_t r1, size_t r2, size_t s1, size_t s2, size_t e1, size_t e2)
+{
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_double_2D_subblock(oriData, realPrecision, r1, r2, s1, s2, e1, e2);
+		updateQuantizationInfo(quantization_intervals);
+	}
+	else
+		quantization_intervals = exe_params->intvCapacity;
+
+	size_t i,j; 
+	int reqLength;
+	double pred1D, pred2D;
+	double diff = 0.0;
+	double itvNum = 0;
+	double *P0, *P1;
+
+	/* Extents of the sub-block (inclusive bounds). */
+	size_t R1 = e1 - s1 + 1;
+	size_t R2 = e2 - s2 + 1;
+	size_t dataLength = R1*R2;
+
+	/* One row of reconstructed values each, zero-initialized. */
+	P0 = (double*)malloc(R2*sizeof(double));
+	memset(P0, 0, R2*sizeof(double));
+	P1 = (double*)malloc(R2*sizeof(double));
+	memset(P1, 0, R2*sizeof(double));
+
+	double medianValue = medianValue_d;
+	short radExpo = getExponent_double(valueRangeSize/2);
+	computeReqLength_double(realPrecision, radExpo, &reqLength, &medianValue);
+
+	int* type = (int*) malloc(dataLength*sizeof(int));
+
+	double* spaceFillingValue = oriData; // indexed with gIndex (full-array offsets)
+
+	DynamicIntArray *exactLeadNumArray;
+	new_DIA(&exactLeadNumArray, DynArrayInitLen);
+
+	DynamicByteArray *exactMidByteArray;
+	new_DBA(&exactMidByteArray, DynArrayInitLen);
+
+	DynamicIntArray *resiBitArray;
+	new_DIA(&resiBitArray, DynArrayInitLen);
+
+	unsigned char preDataBytes[8];
+	longToBytes_bigEndian(preDataBytes, 0);
+
+	int reqBytesLength = reqLength/8;
+	int resiBitsLength = reqLength%8;
+
+	DoubleValueCompressElement *vce = (DoubleValueCompressElement*)malloc(sizeof(DoubleValueCompressElement));
+	LossyCompressionElement *lce = (LossyCompressionElement*)malloc(sizeof(LossyCompressionElement));
+
+	/* Process Row-s1 data s2*/
+	size_t gIndex;
+	size_t lIndex;
+
+	gIndex = s1*r2+s2;
+	lIndex = 0;
+
+	/* The very first element has no predecessor and is always stored exactly. */
+	type[lIndex] = 0;
+	compressSingleDoubleValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+	updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+	memcpy(preDataBytes,vce->curBytes,8);
+	addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+	P1[0] = vce->data;
+
+	/* Process Row-s1 data s2+1*/
+	gIndex = s1*r2+(s2+1);
+	lIndex = 1;
+
+	/* Only one predecessor is available: predict with the previous value. */
+	pred1D = P1[0];
+	diff = spaceFillingValue[gIndex] - pred1D;
+
+	itvNum =  fabs(diff)/realPrecision + 1;
+
+	if (itvNum < exe_params->intvCapacity)
+	{
+		/* Predictable: store the signed interval number and the value the
+		 * code reconstructs to, so later predictions use the same data. */
+		if (diff < 0) itvNum = -itvNum;
+		type[lIndex] = (int) (itvNum/2) + exe_params->intvRadius;
+		P1[1] = pred1D + 2 * (type[lIndex] - exe_params->intvRadius) * realPrecision;
+	}
+	else
+	{
+		/* Unpredictable: store the value through the exact-data path. */
+		type[lIndex] = 0;
+		compressSingleDoubleValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+		updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+		memcpy(preDataBytes,vce->curBytes,8);
+		addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+		P1[1] = vce->data;
+	}
+
+    /* Process Row-s1 data s2+2 --> data e2 */
+	for (j = 2; j < R2; j++)
+	{
+		gIndex = s1*r2+(s2+j);
+		lIndex = j;
+
+		/* Linear extrapolation from the two previous reconstructed values. */
+		pred1D = 2*P1[j-1] - P1[j-2];
+		diff = spaceFillingValue[gIndex] - pred1D;
+
+		itvNum = fabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[lIndex] = (int) (itvNum/2) + exe_params->intvRadius;
+			P1[j] = pred1D + 2 * (type[lIndex] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[lIndex] = 0;
+			compressSingleDoubleValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,8);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P1[j] = vce->data;
+		}
+	}
+
+	/* Process Row-s1+1 --> Row-e1 */
+	for (i = 1; i < R1; i++)
+	{
+		/* Process row-s1+i data s2 */
+		gIndex = (s1+i)*r2+s2;
+		lIndex = i*R2;
+
+		/* First column: predict from the element directly above. */
+		pred1D = P1[0];
+		diff = spaceFillingValue[gIndex] - pred1D;
+
+		itvNum = fabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[lIndex] = (int) (itvNum/2) + exe_params->intvRadius;
+			P0[0] = pred1D + 2 * (type[lIndex] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[lIndex] = 0;
+			compressSingleDoubleValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,8);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P0[0] = vce->data;
+		}
+
+		/* Process row-s1+i data s2+1 --> e2 */
+		for (j = 1; j < R2; j++)
+		{
+			gIndex = (s1+i)*r2+(s2+j);
+			lIndex = i*R2+j;
+
+			/* 2D prediction: left + above - above-left. */
+			pred2D = P0[j-1] + P1[j] - P1[j-1];
+			diff = spaceFillingValue[gIndex] - pred2D;
+
+			itvNum = fabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[lIndex] = (int) (itvNum/2) + exe_params->intvRadius;
+				P0[j] = pred2D + 2 * (type[lIndex] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[lIndex] = 0;
+				compressSingleDoubleValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,8);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P0[j] = vce->data;
+			}
+		}
+
+		/* The row just finished becomes the "previous row" for the next pass. */
+		double *Pt;
+		Pt = P1;
+		P1 = P0;
+		P0 = Pt;
+	}
+
+	free(P0);
+	free(P1);
+	size_t exactDataNum = exactLeadNumArray->size;
+
+	TightDataPointStorageD* tdps;
+
+	new_TightDataPointStorageD(&tdps, dataLength, exactDataNum,
+			type, exactMidByteArray->array, exactMidByteArray->size,
+			exactLeadNumArray->array,
+			resiBitArray->array, resiBitArray->size,
+			resiBitsLength,
+			realPrecision, medianValue, (char)reqLength, quantization_intervals, NULL, 0, 0);
+
+	//free memory
+	free_DIA(exactLeadNumArray);
+	free_DIA(resiBitArray);
+	free(type);
+	free(vce);
+	free(lce);
+	free(exactMidByteArray); //only the wrapper is freed; presumably ->array is now referenced by tdps and released by free_TightDataPointStorageD(tdps)
+
+	return tdps;
+}
+
+/*
+ * Encode the 3D sub-block [s1, e1] x [s2, e2] x [s3, e3] of oriData
+ * (strides r2*r3, r3, 1) into a freshly allocated TightDataPointStorageD,
+ * which the caller must release.
+ *
+ * Three index spaces are used throughout:
+ *   gIndex  - offset into oriData, computed with the full strides;
+ *   lIndex  - offset into type[], laid out compactly as R1 x R2 x R3;
+ *   index2D - offset inside one R2 x R3 layer buffer (P0 / P1).
+ *
+ * For each element a code is written to type[lIndex]:
+ *   - 0: the value is stored through compressSingleDoubleValue() and
+ *     appended to the exact-data arrays;
+ *   - otherwise intvRadius plus the signed interval number (int)(itvNum/2).
+ *
+ * P1 holds the reconstructed values of the previous layer and P0 those of
+ * the layer being processed; the two buffers are swapped after every layer.
+ * Neither buffer is zero-initialized here.
+ *
+ * NOTE(review): the element at s3+1 of the first row is processed
+ * unconditionally, so P1[1] and type[1] are written even when R3 == 1;
+ * callers apparently must pass sub-blocks whose last extent is at least 2.
+ *
+ * updateQuantizationInfo() is only called when exe_params->optQuantMode is 1.
+ * r1 is only forwarded to the optimizer.
+ */
+TightDataPointStorageD* SZ_compress_double_3D_MDQ_subblock(double *oriData, double realPrecision, double valueRangeSize, double medianValue_d,
+size_t r1, size_t r2, size_t r3, size_t s1, size_t s2, size_t s3, size_t e1, size_t e2, size_t e3)
+{
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_double_3D_subblock(oriData, realPrecision, r1, r2, r3, s1, s2, s3, e1, e2, e3);
+		updateQuantizationInfo(quantization_intervals);
+	}
+	else
+		quantization_intervals = exe_params->intvCapacity;
+
+	size_t i,j,k; 
+	int reqLength;
+	double pred1D, pred2D, pred3D;
+	double diff = 0.0;
+	double itvNum = 0;
+	double *P0, *P1;
+
+	/* Extents of the sub-block (inclusive bounds). */
+	size_t R1 = e1 - s1 + 1;
+	size_t R2 = e2 - s2 + 1;
+	size_t R3 = e3 - s3 + 1;
+	size_t dataLength = R1*R2*R3;
+
+	/* Layer sizes: r23 in the full array, R23 in the sub-block. */
+	size_t r23 = r2*r3;
+	size_t R23 = R2*R3;
+
+	/* One layer of reconstructed values each. */
+	P0 = (double*)malloc(R23*sizeof(double));
+	P1 = (double*)malloc(R23*sizeof(double));
+
+	double medianValue = medianValue_d;
+	short radExpo = getExponent_double(valueRangeSize/2);
+	computeReqLength_double(realPrecision, radExpo, &reqLength, &medianValue);
+
+	int* type = (int*) malloc(dataLength*sizeof(int));
+
+	double* spaceFillingValue = oriData; // indexed with gIndex (full-array offsets)
+
+	DynamicIntArray *exactLeadNumArray;
+	new_DIA(&exactLeadNumArray, DynArrayInitLen);
+
+	DynamicByteArray *exactMidByteArray;
+	new_DBA(&exactMidByteArray, DynArrayInitLen);
+
+	DynamicIntArray *resiBitArray;
+	new_DIA(&resiBitArray, DynArrayInitLen);
+
+	unsigned char preDataBytes[8];
+	longToBytes_bigEndian(preDataBytes, 0);
+
+	int reqBytesLength = reqLength/8;
+	int resiBitsLength = reqLength%8;
+
+	DoubleValueCompressElement *vce = (DoubleValueCompressElement*)malloc(sizeof(DoubleValueCompressElement));
+	LossyCompressionElement *lce = (LossyCompressionElement*)malloc(sizeof(LossyCompressionElement));
+
+
+	///////////////////////////	Process layer-s1 ///////////////////////////
+	/* Process Row-s2 data s3*/
+	size_t gIndex; 	//global index
+	size_t lIndex; 	//local index
+	size_t index2D; 	//local 2D index
+
+	gIndex = s1*r23+s2*r3+s3;
+	lIndex = 0;
+	index2D = 0;
+
+	/* The very first element has no predecessor and is always stored exactly. */
+	type[lIndex] = 0;
+	compressSingleDoubleValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+	updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+	memcpy(preDataBytes,vce->curBytes,8);
+	addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+	P1[index2D] = vce->data;
+
+	/* Process Row-s2 data s3+1*/
+	gIndex = s1*r23+s2*r3+s3+1;
+	lIndex = 1;
+	index2D = 1;
+
+	/* Only one predecessor is available: predict with the previous value. */
+	pred1D = P1[index2D-1];
+	diff = spaceFillingValue[gIndex] - pred1D;
+
+	itvNum = fabs(diff)/realPrecision + 1;
+
+	if (itvNum < exe_params->intvCapacity)
+	{
+		/* Predictable: store the signed interval number and the value the
+		 * code reconstructs to, so later predictions use the same data. */
+		if (diff < 0) itvNum = -itvNum;
+		type[lIndex] = (int) (itvNum/2) + exe_params->intvRadius;
+		P1[index2D] = pred1D + 2 * (type[lIndex] - exe_params->intvRadius) * realPrecision;
+	}
+	else
+	{
+		/* Unpredictable: store the value through the exact-data path. */
+		type[lIndex] = 0;
+		compressSingleDoubleValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+		updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+		memcpy(preDataBytes,vce->curBytes,8);
+		addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+		P1[index2D] = vce->data;
+	}
+
+    /* Process Row-s2 data s3+2 --> data e3 */
+	for (j = 2; j < R3; j++)
+	{
+		gIndex = s1*r23+s2*r3+s3+j;
+		lIndex = j;
+		index2D = j;
+
+		/* Linear extrapolation from the two previous reconstructed values. */
+		pred1D = 2*P1[index2D-1] - P1[index2D-2];
+		diff = spaceFillingValue[gIndex] - pred1D;
+
+		itvNum = fabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[lIndex] = (int) (itvNum/2) + exe_params->intvRadius;
+			P1[index2D] = pred1D + 2 * (type[lIndex] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[lIndex] = 0;
+			compressSingleDoubleValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,8);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P1[index2D] = vce->data;
+		}
+	}
+
+	/* Process Row-s2+1 --> Row-e2 */
+	for (i = 1; i < R2; i++)
+	{
+		/* Process row-s2+i data s3 */
+		gIndex = s1*r23+(s2+i)*r3+s3;
+		lIndex = i*R3;
+		index2D = i*R3;
+
+		/* First column: predict from the element one row up in this layer. */
+		pred1D  = P1[index2D-R3];
+		diff    = spaceFillingValue[gIndex] - pred1D;
+
+		itvNum  = fabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[lIndex] = (int) (itvNum/2) + exe_params->intvRadius;
+			P1[index2D] = pred1D + 2 * (type[lIndex] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[lIndex] = 0;
+			compressSingleDoubleValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,8);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P1[index2D] = vce->data;
+		}
+
+		/* Process row-s2+i data s3+1 --> data e3*/
+		for (j = 1; j < R3; j++)
+		{
+			gIndex = s1*r23+(s2+i)*r3+s3+j;
+			lIndex = i*R3+j;
+			index2D = i*R3+j;
+
+			/* 2D prediction inside the first layer: left + above - above-left. */
+			pred2D  = P1[index2D-1] + P1[index2D-R3] - P1[index2D-R3-1];
+			diff = spaceFillingValue[gIndex] - pred2D;
+
+			itvNum = fabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[lIndex] = (int) (itvNum/2) + exe_params->intvRadius;
+				P1[index2D] = pred2D + 2 * (type[lIndex] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[lIndex] = 0;
+				compressSingleDoubleValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,8);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P1[index2D] = vce->data;
+			}
+		}
+	}
+
+
+	///////////////////////////	Process layer-s1+1 --> layer-e1 ///////////////////////////
+
+	for (k = 1; k < R1; k++)
+	{
+		/* Process Row-s2 data s3*/
+		gIndex = (s1+k)*r23+s2*r3+s3;
+		lIndex = k*R23;
+		index2D = 0;
+
+		/* Layer corner: predict from the same position in the previous layer. */
+		pred1D = P1[index2D];
+		diff = spaceFillingValue[gIndex] - pred1D;
+
+		itvNum = fabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[lIndex] = (int) (itvNum/2) + exe_params->intvRadius;
+			P0[index2D] = pred1D + 2 * (type[lIndex] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[lIndex] = 0;
+			compressSingleDoubleValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,8);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P0[index2D] = vce->data;
+		}
+
+
+	    /* Process Row-s2 data s3+1 --> data e3 */
+		for (j = 1; j < R3; j++)
+		{
+			gIndex = (s1+k)*r23+s2*r3+s3+j;
+			lIndex = k*R23+j;
+			index2D = j;
+
+			/* First row of the layer: 2D prediction across (column, layer). */
+			pred2D = P0[index2D-1] + P1[index2D] - P1[index2D-1];
+			diff = spaceFillingValue[gIndex] - pred2D;
+
+			itvNum = fabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[lIndex] = (int) (itvNum/2) + exe_params->intvRadius;
+				P0[index2D] = pred2D + 2 * (type[lIndex] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[lIndex] = 0;
+				compressSingleDoubleValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,8);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P0[index2D] = vce->data;
+			}
+		}
+
+	    /* Process Row-s2+1 --> Row-e2 */
+		for (i = 1; i < R2; i++)
+		{
+			/* Process Row-s2+i data s3 */
+			gIndex = (s1+k)*r23+(s2+i)*r3+s3;
+			lIndex = k*R23+i*R3;
+			index2D = i*R3;
+
+			/* First column of the row: 2D prediction across (row, layer). */
+			pred2D = P0[index2D-R3] + P1[index2D] - P1[index2D-R3];
+			diff = spaceFillingValue[gIndex] - pred2D;
+
+			itvNum = fabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[lIndex] = (int) (itvNum/2) + exe_params->intvRadius;
+				P0[index2D] = pred2D + 2 * (type[lIndex] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[lIndex] = 0;
+				compressSingleDoubleValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,8);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P0[index2D] = vce->data;
+			}
+
+			/* Process Row-s2+i data s3+1 --> data e3 */
+			for (j = 1; j < R3; j++)
+			{
+				gIndex = (s1+k)*r23+(s2+i)*r3+s3+j;
+				lIndex = k*R23+i*R3+j;
+				index2D = i*R3+j;
+
+				/* Full 3D prediction from the seven already-reconstructed
+				 * neighbours in the current (P0) and previous (P1) layers. */
+				pred3D = P0[index2D-1] + P0[index2D-R3]+ P1[index2D] - P0[index2D-R3-1] - P1[index2D-R3] - P1[index2D-1] + P1[index2D-R3-1];
+				diff = spaceFillingValue[gIndex] - pred3D;
+
+				itvNum = fabs(diff)/realPrecision + 1;
+
+				if (itvNum < exe_params->intvCapacity)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[lIndex] = (int) (itvNum/2) + exe_params->intvRadius;
+					P0[index2D] = pred3D + 2 * (type[lIndex] - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					type[lIndex] = 0;
+					compressSingleDoubleValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+					updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+					memcpy(preDataBytes,vce->curBytes,8);
+					addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+					P0[index2D] = vce->data;
+				}
+			}
+		}
+
+		/* The layer just finished becomes the "previous layer" for the next pass. */
+		double *Pt;
+		Pt = P1;
+		P1 = P0;
+		P0 = Pt;
+	}
+
+	free(P0);
+	free(P1);
+	size_t exactDataNum = exactLeadNumArray->size;
+
+	TightDataPointStorageD* tdps;
+
+	new_TightDataPointStorageD(&tdps, dataLength, exactDataNum,
+			type, exactMidByteArray->array, exactMidByteArray->size,
+			exactLeadNumArray->array,
+			resiBitArray->array, resiBitArray->size,
+			resiBitsLength,
+			realPrecision, medianValue, (char)reqLength, quantization_intervals, NULL, 0, 0);
+
+	//free memory
+	free_DIA(exactLeadNumArray);
+	free_DIA(resiBitArray);
+	free(type);
+	free(vce);
+	free(lce);
+	free(exactMidByteArray); //only the wrapper is freed; presumably ->array is now referenced by tdps and released by free_TightDataPointStorageD(tdps)
+
+	return tdps;
+}
+
+TightDataPointStorageD* SZ_compress_double_4D_MDQ_subblock(double *oriData, double realPrecision, double valueRangeSize, double medianValue_d,
+size_t r1, size_t r2, size_t r3, size_t r4, size_t s1, size_t s2, size_t s3, size_t s4, size_t e1, size_t e2, size_t e3, size_t e4)
+{
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_double_4D_subblock(oriData, realPrecision, r1, r2, r3, r4, s1, s2, s3, s4, e1, e2, e3, e4);
+		updateQuantizationInfo(quantization_intervals);
+	}
+	else
+		quantization_intervals = exe_params->intvCapacity;
+
+	size_t i,j,k; 
+	int reqLength;
+	double pred1D, pred2D, pred3D;
+	double diff = 0.0;
+	double itvNum = 0;
+	double *P0, *P1;
+
+	size_t R1 = e1 - s1 + 1;
+	size_t R2 = e2 - s2 + 1;
+	size_t R3 = e3 - s3 + 1;
+	size_t R4 = e4 - s4 + 1;
+
+	size_t dataLength = R1*R2*R3*R4;
+
+	size_t r34 = r3*r4;
+	size_t r234 = r2*r3*r4;
+	size_t R34 = R3*R4;
+	size_t R234 = R2*R3*R4;
+
+	P0 = (double*)malloc(R34*sizeof(double));
+	P1 = (double*)malloc(R34*sizeof(double));
+
+	double medianValue = medianValue_d;
+	short radExpo = getExponent_double(valueRangeSize/2);
+	computeReqLength_double(realPrecision, radExpo, &reqLength, &medianValue);
+
+	int* type = (int*) malloc(dataLength*sizeof(int));
+
+	double* spaceFillingValue = oriData; //
+
+	DynamicIntArray *exactLeadNumArray;
+	new_DIA(&exactLeadNumArray, DynArrayInitLen);
+
+	DynamicByteArray *exactMidByteArray;
+	new_DBA(&exactMidByteArray, DynArrayInitLen);
+
+	DynamicIntArray *resiBitArray;
+	new_DIA(&resiBitArray, DynArrayInitLen);
+
+	unsigned char preDataBytes[8];
+	longToBytes_bigEndian(preDataBytes, 0);
+
+	int reqBytesLength = reqLength/8;
+	int resiBitsLength = reqLength%8;
+
+	DoubleValueCompressElement *vce = (DoubleValueCompressElement*)malloc(sizeof(DoubleValueCompressElement));
+	LossyCompressionElement *lce = (LossyCompressionElement*)malloc(sizeof(LossyCompressionElement));
+
+	size_t l;
+	for (l = 0; l < R1; l++)
+	{
+
+		///////////////////////////	Process layer-s2 ///////////////////////////
+		/* Process Row-s3 data s4*/
+		size_t gIndex; 	//global index
+		size_t lIndex; 	//local index
+		size_t index2D; 	//local 2D index
+
+		gIndex = (s1+l)*r234+s2*r34+s3*r4+s4;
+		lIndex = l*R234;
+		index2D = 0;
+
+		type[lIndex] = 0;
+		compressSingleDoubleValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+		updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+		memcpy(preDataBytes,vce->curBytes,8);
+		addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+		P1[index2D] = vce->data;
+
+		/* Process Row-s3 data s4+1*/
+		gIndex = (s1+l)*r234+s2*r34+s3*r4+s4+1;
+		lIndex = l*R234+1;
+		index2D = 1;
+
+		pred1D = P1[index2D-1];
+		diff = spaceFillingValue[gIndex] - pred1D;
+
+		itvNum = fabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[lIndex] = (int) (itvNum/2) + exe_params->intvRadius;
+			P1[index2D] = pred1D + 2 * (type[lIndex] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[lIndex] = 0;
+			compressSingleDoubleValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,8);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P1[index2D] = vce->data;
+		}
+
+		/* Process Row-s3 data s4+2 --> data e4 */
+		for (j = 2; j < R4; j++)
+		{
+			gIndex = (s1+l)*r234+s2*r34+s3*r4+s4+j;
+			lIndex = l*R234+j;
+			index2D = j;
+
+			pred1D = 2*P1[index2D-1] - P1[index2D-2];
+			diff = spaceFillingValue[gIndex] - pred1D;
+
+			itvNum = fabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[lIndex] = (int) (itvNum/2) + exe_params->intvRadius;
+				P1[index2D] = pred1D + 2 * (type[lIndex] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[lIndex] = 0;
+				compressSingleDoubleValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,8);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P1[index2D] = vce->data;
+			}
+		}
+
+		/* Process Row-s3+1 --> Row-e3 */
+		for (i = 1; i < R3; i++)
+		{
+			/* Process row-s2+i data s3 */
+			gIndex = (s1+l)*r234+s2*r34+(s3+i)*r4+s4;
+			lIndex = l*R234+i*R4;
+			index2D = i*R4;
+
+			pred1D  = P1[index2D-R4];
+			diff    = spaceFillingValue[gIndex] - pred1D;
+
+			itvNum  = fabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[lIndex] = (int) (itvNum/2) + exe_params->intvRadius;
+				P1[index2D] = pred1D + 2 * (type[lIndex] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[lIndex] = 0;
+				compressSingleDoubleValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,8);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P1[index2D] = vce->data;
+			}
+
+			/* Process row-s3+i data s4+1 --> data e4*/
+			for (j = 1; j < R4; j++)
+			{
+				gIndex = (s1+l)*r234+s2*r34+(s3+i)*r4+s4+j;
+				lIndex = l*R234+i*R4+j;
+				index2D = i*R4+j;
+
+				pred2D  = P1[index2D-1] + P1[index2D-R4] - P1[index2D-R4-1];
+				diff = spaceFillingValue[gIndex] - pred2D;
+
+				itvNum = fabs(diff)/realPrecision + 1;
+
+				if (itvNum < exe_params->intvCapacity)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[lIndex] = (int) (itvNum/2) + exe_params->intvRadius;
+					P1[index2D] = pred2D + 2 * (type[lIndex] - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					type[lIndex] = 0;
+					compressSingleDoubleValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+					updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+					memcpy(preDataBytes,vce->curBytes,8);
+					addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+					P1[index2D] = vce->data;
+				}
+			}
+		}
+
+
+		///////////////////////////	Process layer-s2+1 --> layer-e2 ///////////////////////////
+
+		for (k = 1; k < R2; k++)
+		{
+			/* Process Row-s3 data s4*/
+			gIndex = (s1+l)*r234+(s2+k)*r34+s3*r4+s4;
+			lIndex = l*R234+k*R34;
+			index2D = 0;
+
+			pred1D = P1[index2D];
+			diff = spaceFillingValue[gIndex] - pred1D;
+
+			itvNum = fabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[lIndex] = (int) (itvNum/2) + exe_params->intvRadius;
+				P0[index2D] = pred1D + 2 * (type[lIndex] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[lIndex] = 0;
+				compressSingleDoubleValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,8);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P0[index2D] = vce->data;
+			}
+
+
+			/* Process Row-s3 data s4+1 --> data e4 */
+			for (j = 1; j < R4; j++)
+			{
+				gIndex = (s1+l)*r234+(s2+k)*r34+s3*r4+s4+j;
+				lIndex = l*R234+k*R34+j;
+				index2D = j;
+
+				pred2D = P0[index2D-1] + P1[index2D] - P1[index2D-1];
+				diff = spaceFillingValue[gIndex] - pred2D;
+
+				itvNum = fabs(diff)/realPrecision + 1;
+
+				if (itvNum < exe_params->intvCapacity)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[lIndex] = (int) (itvNum/2) + exe_params->intvRadius;
+					P0[index2D] = pred2D + 2 * (type[lIndex] - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					type[lIndex] = 0;
+					compressSingleDoubleValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+					updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+					memcpy(preDataBytes,vce->curBytes,8);
+					addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+					P0[index2D] = vce->data;
+				}
+			}
+
+			/* Process Row-s3+1 --> Row-e3 */
+			for (i = 1; i < R3; i++)
+			{
+				/* Process Row-s3+i data s4 */
+				gIndex = (s1+l)*r234+(s2+k)*r34+(s3+i)*r4+s4;
+				lIndex = l*R234+k*R34+i*R4;
+				index2D = i*R4;
+
+				pred2D = P0[index2D-R4] + P1[index2D] - P1[index2D-R4];
+				diff = spaceFillingValue[gIndex] - pred2D;
+
+				itvNum = fabs(diff)/realPrecision + 1;
+
+				if (itvNum < exe_params->intvCapacity)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[lIndex] = (int) (itvNum/2) + exe_params->intvRadius;
+					P0[index2D] = pred2D + 2 * (type[lIndex] - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					type[lIndex] = 0;
+					compressSingleDoubleValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+					updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+					memcpy(preDataBytes,vce->curBytes,8);
+					addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+					P0[index2D] = vce->data;
+				}
+
+				/* Process Row-s3+i data s4+1 --> data e4 */
+				for (j = 1; j < R4; j++)
+				{
+					gIndex = (s1+l)*r234+(s2+k)*r34+(s3+i)*r4+s4+j;
+					lIndex = l*R234+k*R34+i*R4+j;
+					index2D = i*R4+j;
+
+//					printf ("global index = %d, local index = %d\n", gIndex, lIndex);
+
+					pred3D = P0[index2D-1] + P0[index2D-R4]+ P1[index2D] - P0[index2D-R4-1] - P1[index2D-R4] - P1[index2D-1] + P1[index2D-R4-1];
+					diff = spaceFillingValue[gIndex] - pred3D;
+
+					itvNum = fabs(diff)/realPrecision + 1;
+
+					if (itvNum < exe_params->intvCapacity)
+					{
+						if (diff < 0) itvNum = -itvNum;
+						type[lIndex] = (int) (itvNum/2) + exe_params->intvRadius;
+						P0[index2D] = pred3D + 2 * (type[lIndex] - exe_params->intvRadius) * realPrecision;
+					}
+					else
+					{
+						type[lIndex] = 0;
+						compressSingleDoubleValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+						updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+						memcpy(preDataBytes,vce->curBytes,8);
+						addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+						P0[index2D] = vce->data;
+					}
+				}
+			}
+
+			double *Pt;
+			Pt = P1;
+			P1 = P0;
+			P0 = Pt;
+		}
+	}
+
+	free(P0);
+	free(P1);
+	size_t exactDataNum = exactLeadNumArray->size;
+
+	TightDataPointStorageD* tdps;
+
+	new_TightDataPointStorageD(&tdps, dataLength, exactDataNum,
+			type, exactMidByteArray->array, exactMidByteArray->size,
+			exactLeadNumArray->array,
+			resiBitArray->array, resiBitArray->size,
+			resiBitsLength,
+			realPrecision, medianValue, (char)reqLength, quantization_intervals, NULL, 0, 0);
+
+	//free memory
+	free_DIA(exactLeadNumArray);
+	free_DIA(resiBitArray);
+	free(type);
+	free(vce);
+	free(lce);
+	free(exactMidByteArray); //exactMidByteArray->array has been released in free_TightDataPointStorageF(tdps);
+
+	return tdps;
+}
+
+/**
+ * Fast variant of optimize_intervals_double_3D(): samples an r1 x r2 x r3 array (r3 varies fastest), histograms the
+ * 7-neighbour prediction error in units of realPrecision, and returns a quantization interval count of at least 32.
+ * */
+unsigned int optimize_intervals_double_3D_opt(double *oriData, size_t r1, size_t r2, size_t r3, double realPrecision){	
+	size_t i;
+	size_t radiusIndex;
+	size_t r23=r2*r3; // number of elements in one r1-slice
+	double pred_value = 0, pred_err;
+	size_t *intervals = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t)); // histogram, one bin per radius; NOTE(review): malloc result is not checked
+	memset(intervals, 0, confparams_cpr->maxRangeRadius*sizeof(size_t));
+	size_t totalSampleSize = 0;
+
+	size_t offset_count = confparams_cpr->sampleDistance - 2; // count r3 offset
+	size_t offset_count_2;
+	double * data_pos = oriData + r23 + r3 + offset_count; // first sample sits in slice 1, row 1; NOTE(review): its corner neighbour is at index offset_count-1, which presumes sampleDistance >= 3
+	size_t n1_count = 1, n2_count = 1; // count i,j sum
+	size_t len = r1 * r2 * r3;
+	while(data_pos - oriData < len){
+		totalSampleSize++;
+		pred_value = data_pos[-1] + data_pos[-r3] + data_pos[-r23] - data_pos[-1-r23] - data_pos[-r3-1] - data_pos[-r3-r23] + data_pos[-r3-r23-1]; // offsets are negated size_t values, so the indexing relies on unsigned wraparound
+		pred_err = fabs(pred_value - *data_pos);
+		radiusIndex = (pred_err/realPrecision+1)/2; // error expressed in bins of width 2*realPrecision
+		if(radiusIndex>=confparams_cpr->maxRangeRadius)
+		{
+			radiusIndex = confparams_cpr->maxRangeRadius - 1;
+			// errors beyond the histogram range are all counted in the last bin
+		}
+		intervals[radiusIndex]++;
+		// advance to the next sample: step sampleDistance along r3; once the row end is passed,
+		// jump into the following row with a start offset derived from n1_count + n2_count
+		offset_count += confparams_cpr->sampleDistance;
+		if(offset_count >= r3){
+			n2_count ++;
+			if(n2_count == r2){
+				n1_count ++;
+				n2_count = 1;
+				data_pos += r3; // extra row step when the row counter wraps; presumably skips row 0 of the next slice
+			}
+			offset_count_2 = (n1_count + n2_count) % confparams_cpr->sampleDistance;
+			data_pos += (r3 + confparams_cpr->sampleDistance - offset_count) + (confparams_cpr->sampleDistance - offset_count_2);
+			offset_count = (confparams_cpr->sampleDistance - offset_count_2);
+			if(offset_count == 0) offset_count ++;
+		}
+		else data_pos += confparams_cpr->sampleDistance;
+	}	
+	// the first radius whose cumulative sample count exceeds predThreshold * totalSampleSize
+	// determines the result: 2*(radius+1) intervals, passed through roundUpToPowerOf2()
+	// and raised to 32 if smaller
+	//compute the appropriate number
+	size_t targetCount = totalSampleSize*confparams_cpr->predThreshold;
+	size_t sum = 0;
+	for(i=0;i<confparams_cpr->maxRangeRadius;i++)
+	{
+		sum += intervals[i];
+		if(sum>targetCount)
+			break;
+	}
+	if(i>=confparams_cpr->maxRangeRadius) // threshold never exceeded (e.g. no samples): fall back to the largest radius
+		i = confparams_cpr->maxRangeRadius-1;
+	unsigned int accIntervals = 2*(i+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+
+	if(powerOf2<32)
+		powerOf2 = 32;
+	free(intervals);
+	//printf("targetCount=%d, sum=%d, totalSampleSize=%d, ratio=%f, accIntervals=%d, powerOf2=%d\n", targetCount, sum, totalSampleSize, (double)sum/(double)totalSampleSize, accIntervals, powerOf2);
+	return powerOf2;
+}
+/* Samples an r1 x r2 array (r2 varies fastest), histograms the 3-neighbour prediction error in units of realPrecision, and returns a quantization interval count of at least 32. */
+unsigned int optimize_intervals_double_2D_opt(double *oriData, size_t r1, size_t r2, double realPrecision)
+{	
+	size_t i;
+	size_t radiusIndex;
+	double pred_value = 0, pred_err;
+	size_t *intervals = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t)); // histogram, one bin per radius; NOTE(review): malloc result is not checked
+	memset(intervals, 0, confparams_cpr->maxRangeRadius*sizeof(size_t));
+	size_t totalSampleSize = 0;//(r1-1)*(r2-1)/confparams_cpr->sampleDistance;
+	// sampling starts in row 1 so that the row-above neighbours read in the loop exist
+	size_t offset_count = confparams_cpr->sampleDistance - 1; // count r2 offset
+	size_t offset_count_2;
+	double * data_pos = oriData + r2 + offset_count;
+	size_t n1_count = 1; // count i sum
+	size_t len = r1 * r2;
+	while(data_pos - oriData < len){
+		totalSampleSize++;
+		pred_value = data_pos[-1] + data_pos[-r2] - data_pos[-r2-1]; // left + above - above-left; offsets are negated size_t values (unsigned wraparound)
+		pred_err = fabs(pred_value - *data_pos);
+		radiusIndex = (unsigned long)((pred_err/realPrecision+1)/2); // error expressed in bins of width 2*realPrecision
+		if(radiusIndex>=confparams_cpr->maxRangeRadius)
+			radiusIndex = confparams_cpr->maxRangeRadius - 1; // out-of-range errors share the last bin
+		intervals[radiusIndex]++;
+		// advance: step sampleDistance along the row; past the row end, jump into the next row with an offset derived from n1_count
+		offset_count += confparams_cpr->sampleDistance;
+		if(offset_count >= r2){
+			n1_count ++;
+			offset_count_2 = n1_count % confparams_cpr->sampleDistance;
+			data_pos += (r2 + confparams_cpr->sampleDistance - offset_count) + (confparams_cpr->sampleDistance - offset_count_2);
+			offset_count = (confparams_cpr->sampleDistance - offset_count_2);
+			if(offset_count == 0) offset_count ++;
+		}
+		else data_pos += confparams_cpr->sampleDistance;
+	}
+	// the first radius whose cumulative sample count exceeds predThreshold * totalSampleSize determines the result
+	//compute the appropriate number
+	size_t targetCount = totalSampleSize*confparams_cpr->predThreshold;
+	size_t sum = 0;
+	for(i=0;i<confparams_cpr->maxRangeRadius;i++)
+	{
+		sum += intervals[i];
+		if(sum>targetCount)
+			break;
+	}
+	if(i>=confparams_cpr->maxRangeRadius) // threshold never exceeded (e.g. no samples): fall back to the largest radius
+		i = confparams_cpr->maxRangeRadius-1;
+	unsigned int accIntervals = 2*(i+1); // two intervals per radius step
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+
+	if(powerOf2<32) // never report fewer than 32 intervals
+		powerOf2 = 32;
+
+	free(intervals);
+	return powerOf2;
+}
+
+/* Chooses the number of quantization intervals for 1D double data: samples every sampleDistance-th element
+ * starting at index 2, histograms |x[p-1]-x[p]| in units of realPrecision and returns at least 32. */
+unsigned int optimize_intervals_double_1D_opt(double *oriData, size_t dataLength, double realPrecision)
+{
+	const size_t radiusCap = confparams_cpr->maxRangeRadius;
+	size_t *histogram = (size_t*)malloc(radiusCap*sizeof(size_t));
+	memset(histogram, 0, radiusCap*sizeof(size_t));
+
+	/* Histogram the sampled prediction errors; out-of-range errors land in the last bin. */
+	size_t sampleCount = 0;
+	size_t pos;
+	for(pos = 2; pos < dataLength; pos += confparams_cpr->sampleDistance)
+	{
+		double err = fabs(oriData[pos-1] - oriData[pos]);
+		size_t bin = (unsigned long)((err/realPrecision+1)/2);
+		if(bin >= radiusCap)
+			bin = radiusCap - 1;
+		histogram[bin]++;
+		sampleCount++;
+	}
+
+	/* Find the first radius at which the running total exceeds the target share of samples. */
+	size_t targetCount = sampleCount*confparams_cpr->predThreshold;
+	size_t running = 0;
+	size_t radius = 0;
+	while(radius < radiusCap)
+	{
+		running += histogram[radius];
+		if(running > targetCount)
+			break;
+		radius++;
+	}
+	if(radius >= radiusCap)
+		radius = radiusCap - 1;
+
+	/* Two intervals per radius step, passed through roundUpToPowerOf2() and floored at 32. */
+	unsigned int accIntervals = 2*(radius+1);
+	unsigned int result = roundUpToPowerOf2(accIntervals);
+	free(histogram);
+	if(result < 32)
+		return 32;
+	return result;
+}
diff --git a/thirdparty/SZ/sz/src/sz_double_pwr.c b/thirdparty/SZ/sz/src/sz_double_pwr.c
new file mode 100644
index 0000000000000000000000000000000000000000..59be38cc48fa85bfc9ce147c2307beeb145fcc27
--- /dev/null
+++ b/thirdparty/SZ/sz/src/sz_double_pwr.c
@@ -0,0 +1,1774 @@
+/**
+ *  @file sz_double_pwr.c
+ *  @author Sheng Di
+ *  @date Aug, 2016
+ *  @brief SZ_Init, Compression and Decompression functions
+ * This file contains the compression/decompression functions related to point-wise relative errors
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <math.h>
+#include "sz.h"
+#include "CompressElement.h"
+#include "DynamicByteArray.h"
+#include "DynamicIntArray.h"
+#include "TightDataPointStorageD.h"
+#include "sz_double.h"
+#include "sz_double_pwr.h"
+#include "zlib.h"
+#include "rw.h"
+/* Computes one precision per segment of confparams_cpr->segment_size elements; stores the 2-byte-truncated value in pwrErrBound and its two leading bytes in pwrErrBoundBytes. */
+void compute_segment_precisions_double_1D(double *oriData, size_t dataLength, double* pwrErrBound, unsigned char* pwrErrBoundBytes, double globalPrecision)
+{
+	size_t i = 0, j = 0, k = 0; // i: element index, j: next slot in pwrErrBound, k: next slot in pwrErrBoundBytes
+	double realPrecision = oriData[0]!=0?fabs(confparams_cpr->pw_relBoundRatio*oriData[0]):confparams_cpr->pw_relBoundRatio; // seed for the first segment (already scaled by pw_relBoundRatio)
+	double approxPrecision;
+	unsigned char realPrecBytes[8];
+	double curPrecision;
+	double curValue;
+	double sum = 0; // running sum of |value| over the current segment, used by SZ_PWR_AVG_TYPE
+	for(i=0;i<dataLength;i++)
+	{
+		curValue = oriData[i];
+		if(i%confparams_cpr->segment_size==0&&i>0)
+		{
+			// segment boundary: finalize the precision of the segment that just ended
+			if(confparams_cpr->pwr_type==SZ_PWR_AVG_TYPE)
+			{
+				realPrecision = sum/confparams_cpr->segment_size;
+				sum = 0;
+			}
+			realPrecision *= confparams_cpr->pw_relBoundRatio;
+			if(confparams_cpr->errorBoundMode==ABS_AND_PW_REL||confparams_cpr->errorBoundMode==REL_AND_PW_REL)
+				realPrecision = realPrecision<globalPrecision?realPrecision:globalPrecision; // "AND" modes: take the smaller of the two bounds
+			else if(confparams_cpr->errorBoundMode==ABS_OR_PW_REL||confparams_cpr->errorBoundMode==REL_OR_PW_REL)
+				realPrecision = realPrecision<globalPrecision?globalPrecision:realPrecision; // "OR" modes: take the larger of the two bounds
+			// keep only the first two bytes written by doubleToBytes(); the remaining six are zeroed
+			doubleToBytes(realPrecBytes, realPrecision);
+			memset(&realPrecBytes[2], 0, 6);
+			approxPrecision = bytesToDouble(realPrecBytes);
+			//put the realPrecision in double* pwrErBound
+			pwrErrBound[j++] = approxPrecision;
+			//put the two bytes in pwrErrBoundBytes
+			pwrErrBoundBytes[k++] = realPrecBytes[0];
+			pwrErrBoundBytes[k++] = realPrecBytes[1];
+			// restart the running min/max from the first element of the new segment
+			realPrecision = fabs(curValue);
+		}
+		// fold the current element into the running statistic (zero values are skipped)
+		if(curValue!=0)
+		{
+			curPrecision = fabs(curValue);
+			// which statistic is tracked depends on confparams_cpr->pwr_type
+			switch(confparams_cpr->pwr_type)
+			{
+			case SZ_PWR_MIN_TYPE: 
+				if(realPrecision>curPrecision)
+					realPrecision = curPrecision;	
+				break;
+			case SZ_PWR_AVG_TYPE:
+				sum += curPrecision;
+				break;
+			case SZ_PWR_MAX_TYPE:
+				if(realPrecision<curPrecision)
+					realPrecision = curPrecision;					
+				break;
+			}
+		}
+	}
+	if(confparams_cpr->pwr_type==SZ_PWR_AVG_TYPE) // trailing (possibly partial) segment
+	{
+		int size = dataLength%confparams_cpr->segment_size==0?confparams_cpr->segment_size:dataLength%confparams_cpr->segment_size;
+		realPrecision = sum/size;		
+	}	// NOTE(review): unlike the in-loop path, the trailing segment is not multiplied by pw_relBoundRatio here - confirm this is intended
+	if(confparams_cpr->errorBoundMode==ABS_AND_PW_REL||confparams_cpr->errorBoundMode==REL_AND_PW_REL)
+		realPrecision = realPrecision<globalPrecision?realPrecision:globalPrecision; 
+	else if(confparams_cpr->errorBoundMode==ABS_OR_PW_REL||confparams_cpr->errorBoundMode==REL_OR_PW_REL)
+		realPrecision = realPrecision<globalPrecision?globalPrecision:realPrecision;	
+	doubleToBytes(realPrecBytes, realPrecision);
+	memset(&realPrecBytes[2], 0, 6);
+	approxPrecision = bytesToDouble(realPrecBytes);
+	//put the realPrecision in double* pwrErBound
+	pwrErrBound[j++] = approxPrecision;
+	//put the two bytes in pwrErrBoundBytes
+	pwrErrBoundBytes[k++] = realPrecBytes[0];
+	pwrErrBoundBytes[k++] = realPrecBytes[1];
+}
+
+/* Estimates the quantization interval count for 1D double data with per-segment precisions: every
+ * sampleDistance-th element is compared with its predecessor and the error is binned using the precision most
+ * recently taken from pwrErrBound (one entry is consumed each time the index is a multiple of segment_size). */
+unsigned int optimize_intervals_double_1D_pwr(double *oriData, size_t dataLength, double* pwrErrBound)
+{
+	size_t pos, nextSeg = 0;
+	double segPrecision = pwrErrBound[nextSeg++];
+	int *histogram = (int*)malloc(confparams_cpr->maxRangeRadius*sizeof(int));
+	memset(histogram, 0, confparams_cpr->maxRangeRadius*sizeof(int));
+	int totalSampleSize = dataLength/confparams_cpr->sampleDistance;
+
+	for(pos = 2; pos < dataLength; pos++)
+	{
+		if(pos%confparams_cpr->segment_size == 0)
+			segPrecision = pwrErrBound[nextSeg++];
+		if(pos%confparams_cpr->sampleDistance != 0)
+			continue; /* not a sampled position */
+
+		double err = fabs(oriData[pos-1] - oriData[pos]);
+		unsigned long bin = (unsigned long)((err/segPrecision+1)/2);
+		if(bin >= confparams_cpr->maxRangeRadius)
+			bin = confparams_cpr->maxRangeRadius - 1;
+		histogram[bin]++;
+	}
+
+	/* Smallest radius whose cumulative count exceeds the predThreshold share of the nominal sample count. */
+	size_t targetCount = totalSampleSize*confparams_cpr->predThreshold;
+	size_t running = 0;
+	size_t radius = 0;
+	while(radius < confparams_cpr->maxRangeRadius)
+	{
+		running += histogram[radius];
+		if(running > targetCount)
+			break;
+		radius++;
+	}
+	if(radius >= confparams_cpr->maxRangeRadius)
+		radius = confparams_cpr->maxRangeRadius-1;
+
+	unsigned int accIntervals = 2*(radius+1);
+	unsigned int result = roundUpToPowerOf2(accIntervals);
+	free(histogram);
+	/* never report fewer than 32 intervals */
+	return result < 32 ? 32 : result;
+}
+/* Computes one precision per edgeSize x edgeSize block of an r1 x r2 array (row-major); stores the 2-byte-truncated value in pwrErrBound and its two leading bytes in pwrErrBoundBytes. */
+void compute_segment_precisions_double_2D(double *oriData, double* pwrErrBound, 
+size_t r1, size_t r2, size_t R2, size_t edgeSize, unsigned char* pwrErrBoundBytes, double Min, double Max, double globalPrecision)
+{
+	size_t i = 0, j = 0, k = 0, p = 0, index = 0, J = 0; //I=-1,J=-1 if they are needed
+	double realPrecision; 
+	double approxPrecision;
+	unsigned char realPrecBytes[8];
+	double curValue, curAbsValue;
+	double* statAbsValues = (double*)malloc(R2*sizeof(double)); // one running statistic per block column (indexed by J)
+	// seeds: MIN starts from the largest |value|, MAX from the smallest, AVG from 0
+	double max = fabs(Min)<fabs(Max)?fabs(Max):fabs(Min); //get the max abs value.
+	double min = fabs(Min)<fabs(Max)?fabs(Min):fabs(Max);
+	for(i=0;i<R2;i++)
+	{
+		if(confparams_cpr->pwr_type == SZ_PWR_MIN_TYPE)
+			statAbsValues[i] = max;
+		else if(confparams_cpr->pwr_type == SZ_PWR_MAX_TYPE)
+			statAbsValues[i] = min;
+		else
+			statAbsValues[i] = 0; //for SZ_PWR_AVG_TYPE
+	}
+	for(i=0;i<r1;i++)
+	{
+		for(j=0;j<r2;j++)
+		{
+			index = i*r2+j;
+			curValue = oriData[index];				
+			if(((i%edgeSize==edgeSize-1 || i==r1-1) &&j%edgeSize==0&&j>0) || (i%edgeSize==0&&j==0&&i>0)) // a block is complete: emit its precision (J still refers to that block)
+			{
+				if(confparams_cpr->pwr_type==SZ_PWR_AVG_TYPE)
+				{
+					int a = edgeSize, b = edgeSize; // block height and width used as the averaging divisor
+					if(j==0)
+					{
+						if(r2%edgeSize==0) 
+							b = edgeSize;
+						else
+							b = r2%edgeSize;
+					}
+					if(i==r1-1)
+					{
+						if(r1%edgeSize==0)
+							a = edgeSize;
+						else
+							a = r1%edgeSize;
+					}
+					realPrecision = confparams_cpr->pw_relBoundRatio*statAbsValues[J]/(a*b);
+				}
+				else
+					realPrecision = confparams_cpr->pw_relBoundRatio*statAbsValues[J];
+
+				if(confparams_cpr->errorBoundMode==ABS_AND_PW_REL||confparams_cpr->errorBoundMode==REL_AND_PW_REL)
+					realPrecision = realPrecision<globalPrecision?realPrecision:globalPrecision; // "AND" modes: smaller bound wins
+				else if(confparams_cpr->errorBoundMode==ABS_OR_PW_REL||confparams_cpr->errorBoundMode==REL_OR_PW_REL)
+					realPrecision = realPrecision<globalPrecision?globalPrecision:realPrecision; // "OR" modes: larger bound wins
+				// keep only the first two bytes written by doubleToBytes(); the remaining six are zeroed
+				doubleToBytes(realPrecBytes, realPrecision);
+				memset(&realPrecBytes[2], 0, 6);
+				approxPrecision = bytesToDouble(realPrecBytes);
+				//put the realPrecision in double* pwrErBound
+				pwrErrBound[p++] = approxPrecision;
+				//put the two bytes in pwrErrBoundBytes
+				pwrErrBoundBytes[k++] = realPrecBytes[0];
+				pwrErrBoundBytes[k++] = realPrecBytes[1];	
+				// re-seed the slot so the block below in the same column starts fresh
+				if(confparams_cpr->pwr_type == SZ_PWR_MIN_TYPE)
+					statAbsValues[J] = max;
+				else if(confparams_cpr->pwr_type == SZ_PWR_MAX_TYPE)
+					statAbsValues[J] = min;
+				else
+					statAbsValues[J] = 0; //for SZ_PWR_AVG_TYPE		
+			}	
+			if(j==0) // track the block column of the current element
+				J = 0;
+			else if(j%edgeSize==0)
+				J++;			
+			if(curValue!=0)
+			{
+				curAbsValue = fabs(curValue);
+				// fold the element into its block-column statistic (zero values are skipped)
+				switch(confparams_cpr->pwr_type)
+				{
+				case SZ_PWR_MIN_TYPE: 
+					if(statAbsValues[J]>curAbsValue)
+						statAbsValues[J] = curAbsValue;	
+					break;
+				case SZ_PWR_AVG_TYPE:
+					statAbsValues[J] += curAbsValue;
+					break;
+				case SZ_PWR_MAX_TYPE:
+					if(statAbsValues[J]<curAbsValue)
+						statAbsValues[J] = curAbsValue;					
+					break;
+				}
+			}
+		}
+	}
+	// the last block (bottom-right) is still pending after the loops: emit it here
+	if(confparams_cpr->pwr_type==SZ_PWR_AVG_TYPE)
+	{
+		int a = edgeSize, b = edgeSize;
+		if(r2%edgeSize==0) 
+			b = edgeSize;
+		else
+			b = r2%edgeSize;
+		if(r1%edgeSize==0)
+			a = edgeSize;
+		else
+			a = r1%edgeSize;
+		realPrecision = confparams_cpr->pw_relBoundRatio*statAbsValues[J]/(a*b);
+	}
+	else
+		realPrecision = confparams_cpr->pw_relBoundRatio*statAbsValues[J];		
+
+	if(confparams_cpr->errorBoundMode==ABS_AND_PW_REL||confparams_cpr->errorBoundMode==REL_AND_PW_REL)
+		realPrecision = realPrecision<globalPrecision?realPrecision:globalPrecision; 
+	else if(confparams_cpr->errorBoundMode==ABS_OR_PW_REL||confparams_cpr->errorBoundMode==REL_OR_PW_REL)
+		realPrecision = realPrecision<globalPrecision?globalPrecision:realPrecision;
+	// a double has 8 bytes: clear all six trailing bytes (as every other path does), not just bytes 2 and 3
+	doubleToBytes(realPrecBytes, realPrecision);
+	memset(&realPrecBytes[2], 0, 6);
+	approxPrecision = bytesToDouble(realPrecBytes);
+	//put the realPrecision in double* pwrErBound
+	pwrErrBound[p++] = approxPrecision;
+	//put the two bytes in pwrErrBoundBytes
+	pwrErrBoundBytes[k++] = realPrecBytes[0];
+	pwrErrBoundBytes[k++] = realPrecBytes[1];	
+	
+	free(statAbsValues);
+}
+
+/**
+ * Estimates the quantization interval count for an r1 x r2 double array (row-major) whose precision varies per
+ * edgeSize x edgeSize block: pwrErrBound holds one precision per block, R2 blocks per block row.
+ */
+unsigned int optimize_intervals_double_2D_pwr(double *oriData, size_t r1, size_t r2, size_t R2, size_t edgeSize, double* pwrErrBound)
+{	
+	size_t i = 0,j = 0, index, I=0, J=0;
+	double realPrecision = pwrErrBound[0];	
+	unsigned long radiusIndex;
+	double pred_value = 0, pred_err;
+	int *intervals = (int*)malloc(confparams_cpr->maxRangeRadius*sizeof(int));
+	memset(intervals, 0, confparams_cpr->maxRangeRadius*sizeof(int));
+	size_t totalSampleSize = (r1-1)*(r2-1)/confparams_cpr->sampleDistance;
+	size_t ir2;
+	for(i=1;i<r1;i++)
+	{
+		ir2 = i*r2;
+		// Derive the block coordinates directly from the element coordinates. The previous running
+		// counters reset J only when i crossed a block boundary, so J kept growing from row to row.
+		I = i/edgeSize;
+		for(j=1;j<r2;j++)
+		{
+			index = ir2+j;
+			if((i+j)%confparams_cpr->sampleDistance==0) // sampled position
+			{
+				J = j/edgeSize;
+				realPrecision = pwrErrBound[I*R2+J];
+				pred_value = oriData[index-1] + oriData[index-r2] - oriData[index-r2-1];
+				pred_err = fabs(pred_value - oriData[index]);
+				radiusIndex = (unsigned long)((pred_err/realPrecision+1)/2);
+				if(radiusIndex>=confparams_cpr->maxRangeRadius)
+					radiusIndex = confparams_cpr->maxRangeRadius - 1; // out-of-range errors share the last bin
+				intervals[radiusIndex]++;
+			}			
+		}
+	}
+	//compute the appropriate number
+	size_t targetCount = totalSampleSize*confparams_cpr->predThreshold;
+	size_t sum = 0;
+	for(i=0;i<confparams_cpr->maxRangeRadius;i++)
+	{
+		sum += intervals[i];
+		if(sum>targetCount)
+			break;
+	}
+	if(i>=confparams_cpr->maxRangeRadius)
+		i = confparams_cpr->maxRangeRadius-1;
+	unsigned int accIntervals = 2*(i+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+
+	if(powerOf2<32) // never report fewer than 32 intervals
+		powerOf2 = 32;
+
+	free(intervals);
+	//printf("confparams_cpr->maxRangeRadius = %d, accIntervals=%d, powerOf2=%d\n", confparams_cpr->maxRangeRadius, accIntervals, powerOf2);
+	return powerOf2;
+}
+/* Computes one precision per edgeSize^3 block of an r1 x r2 x r3 array (r3 fastest); stores the 2-byte-truncated value in pwrErrBound and its two leading bytes in pwrErrBoundBytes. */
+void compute_segment_precisions_double_3D(double *oriData, double* pwrErrBound, 
+size_t r1, size_t r2, size_t r3, size_t R2, size_t R3, size_t edgeSize, unsigned char* pwrErrBoundBytes, double Min, double Max, double globalPrecision)
+{
+	size_t i = 0, j = 0, k = 0, p = 0, q = 0, index = 0, J = 0, K = 0; //I=-1,J=-1 if they are needed
+	size_t r23 = r2*r3, ir, jr;
+	double realPrecision; 
+	double approxPrecision;
+	unsigned char realPrecBytes[8];
+	double curValue, curAbsValue;
+	// NOTE(review): globalPrecision and confparams_cpr->errorBoundMode are never consulted in this function, unlike the 1D/2D versions - confirm
+	double** statAbsValues = create2DArray_double(R2, R3); // running statistic per (J,K) block position within the current slab
+	double max = fabs(Min)<fabs(Max)?fabs(Max):fabs(Min); //get the max abs value.	
+	double min = fabs(Min)<fabs(Max)?fabs(Min):fabs(Max);
+	for(i=0;i<R2;i++)
+		for(j=0;j<R3;j++)
+		{
+			if(confparams_cpr->pwr_type == SZ_PWR_MIN_TYPE)
+				statAbsValues[i][j] = max;
+			else if(confparams_cpr->pwr_type == SZ_PWR_MAX_TYPE)
+				statAbsValues[i][j] = min;
+			else
+				statAbsValues[i][j] = 0;
+		}
+	for(i=0;i<r1;i++)
+	{
+		ir = i*r23;		
+		if(i%edgeSize==0&&i>0) // new slab begins: emit the block still pending from the previous slab (J,K hold their last values)
+		{
+			realPrecision = confparams_cpr->pw_relBoundRatio*statAbsValues[J][K];
+			doubleToBytes(realPrecBytes, realPrecision);
+			memset(&realPrecBytes[2], 0, 6);
+			approxPrecision = bytesToDouble(realPrecBytes);
+			//put the realPrecision in double* pwrErBound
+			pwrErrBound[p++] = approxPrecision;
+			//put the two bytes in pwrErrBoundBytes
+			// after emitting, the slot is re-seeded for MIN/MAX (nothing is reset for other types)
+			pwrErrBoundBytes[q++] = realPrecBytes[0];
+			pwrErrBoundBytes[q++] = realPrecBytes[1];
+			if(confparams_cpr->pwr_type == SZ_PWR_MIN_TYPE)
+				statAbsValues[J][K] = max;
+			else if(confparams_cpr->pwr_type == SZ_PWR_MAX_TYPE)
+				statAbsValues[J][K] = min;
+		}		
+		for(j=0;j<r2;j++)
+		{
+			jr = j*r3;
+			if((i%edgeSize==edgeSize-1 || i == r1-1)&&j%edgeSize==0&&j>0) // last layer of the slab, new block row: emit the block pending from the previous block row
+			{
+				realPrecision = confparams_cpr->pw_relBoundRatio*statAbsValues[J][K];
+				doubleToBytes(realPrecBytes, realPrecision);
+				memset(&realPrecBytes[2], 0, 6);
+				approxPrecision = bytesToDouble(realPrecBytes);
+				//put the realPrecision in double* pwrErBound
+				pwrErrBound[p++] = approxPrecision;
+				//put the two bytes in pwrErrBoundBytes
+				// after emitting, the slot is re-seeded for MIN/MAX
+				pwrErrBoundBytes[q++] = realPrecBytes[0];
+				pwrErrBoundBytes[q++] = realPrecBytes[1];
+				if(confparams_cpr->pwr_type == SZ_PWR_MIN_TYPE)
+					statAbsValues[J][K] = max;
+				else if(confparams_cpr->pwr_type == SZ_PWR_MAX_TYPE)
+					statAbsValues[J][K] = min;			
+			}
+			// track the block row of the current j
+			if(j==0)
+				J = 0;
+			else if(j%edgeSize==0)
+				J++;					
+			
+			for(k=0;k<r3;k++)
+			{
+				index = ir+jr+k;				
+				curValue = oriData[index];				
+				if((i%edgeSize==edgeSize-1 || i == r1-1)&&(j%edgeSize==edgeSize-1||j==r2-1)&&k%edgeSize==0&&k>0) // last layer and last row of a block, new block column: emit the block just finished
+				{
+					realPrecision = confparams_cpr->pw_relBoundRatio*statAbsValues[J][K];
+					doubleToBytes(realPrecBytes, realPrecision);
+					memset(&realPrecBytes[2], 0, 6);
+					approxPrecision = bytesToDouble(realPrecBytes);
+					//put the realPrecision in double* pwrErBound
+					pwrErrBound[p++] = approxPrecision;
+					//put the two bytes in pwrErrBoundBytes
+					// after emitting, the slot is re-seeded for MIN/MAX
+					pwrErrBoundBytes[q++] = realPrecBytes[0];
+					pwrErrBoundBytes[q++] = realPrecBytes[1];
+					
+					if(confparams_cpr->pwr_type == SZ_PWR_MIN_TYPE)
+						statAbsValues[J][K] = max;
+					else if(confparams_cpr->pwr_type == SZ_PWR_MAX_TYPE)
+						statAbsValues[J][K] = min;	
+				}	
+
+				if(k==0) // track the block column of the current k
+					K = 0;
+				else if(k%edgeSize==0)
+					K++;
+				// NOTE(review): only MIN and MAX are accumulated below; for SZ_PWR_AVG_TYPE the statistic stays at its initial 0 - confirm
+				if(curValue!=0)
+				{
+					curAbsValue = fabs(curValue);
+					if(confparams_cpr->pwr_type == SZ_PWR_MIN_TYPE)
+					{
+						if(statAbsValues[J][K]>curAbsValue)
+						{
+							statAbsValues[J][K] = curAbsValue;
+						}
+					}
+					else if(confparams_cpr->pwr_type == SZ_PWR_MAX_TYPE)
+					{
+						if(statAbsValues[J][K]<curAbsValue)
+						{
+							statAbsValues[J][K] = curAbsValue;
+						}
+					}
+				}
+			}			
+		}
+	}	
+	// the very last block is still pending after the loops: emit it here
+	realPrecision = confparams_cpr->pw_relBoundRatio*statAbsValues[J][K];
+	doubleToBytes(realPrecBytes, realPrecision);
+	memset(&realPrecBytes[2], 0, 6);
+	approxPrecision = bytesToDouble(realPrecBytes);
+	//put the realPrecision in double* pwrErBound
+	pwrErrBound[p++] = approxPrecision;
+	//put the two bytes in pwrErrBoundBytes
+	pwrErrBoundBytes[q++] = realPrecBytes[0];
+	pwrErrBoundBytes[q++] = realPrecBytes[1];
+	
+	free2DArray_double(statAbsValues, R2);
+}
+
+/**
+ * Estimates the quantization interval count for an r1 x r2 x r3 double array (r3 fastest) whose precision varies
+ * per edgeSize^3 block. pwrErrBound holds one precision per block, ordered by block layer, then block row, then
+ * block column, with R2 block rows and R3 block columns per layer.
+ * Sampled positions are those where (i+j+k) is a multiple of confparams_cpr->sampleDistance.
+ */
+unsigned int optimize_intervals_double_3D_pwr(double *oriData, size_t r1, size_t r2, size_t r3, size_t R2, size_t R3, size_t edgeSize, double* pwrErrBound)
+{	
+	size_t i,j,k, ir,jr,index, I = 0,J=0,K=0;
+	double realPrecision = pwrErrBound[0];		
+	unsigned long radiusIndex;
+	size_t r23=r2*r3;
+	size_t R23 = R2*R3;
+	double pred_value = 0, pred_err;
+	int *intervals = (int*)malloc(confparams_cpr->maxRangeRadius*sizeof(int));
+	memset(intervals, 0, confparams_cpr->maxRangeRadius*sizeof(int));
+	size_t totalSampleSize = (r1-1)*(r2-1)*(r3-1)/confparams_cpr->sampleDistance;
+	for(i=1;i<r1;i++)
+	{
+		ir = i*r23;
+		// Block coordinates are derived directly from the element coordinates. The previous running
+		// counters were not reset for every row/column, so J and K drifted past the block grid.
+		I = i/edgeSize;
+		for(j=1;j<r2;j++)
+		{
+			jr = j*r3;
+			J = j/edgeSize;
+			for(k=1;k<r3;k++)
+			{
+				index = ir+jr+k;
+				if((i+j+k)%confparams_cpr->sampleDistance==0)
+				{
+					K = k/edgeSize;
+					// a block row holds R3 entries, so the row stride is R3 (not R2)
+					realPrecision = pwrErrBound[I*R23+J*R3+K];					
+					pred_value = oriData[index-1] + oriData[index-r3] + oriData[index-r23] 
+					- oriData[index-1-r23] - oriData[index-r3-1] - oriData[index-r3-r23] + oriData[index-r3-r23-1];
+					pred_err = fabs(pred_value - oriData[index]);
+					radiusIndex = (unsigned long)((pred_err/realPrecision+1)/2);
+					if(radiusIndex>=confparams_cpr->maxRangeRadius)
+						radiusIndex = confparams_cpr->maxRangeRadius - 1; // out-of-range errors share the last bin
+					intervals[radiusIndex]++;
+				}
+			}
+		}
+	}
+	//compute the appropriate number
+	size_t targetCount = totalSampleSize*confparams_cpr->predThreshold;
+	size_t sum = 0;
+	for(i=0;i<confparams_cpr->maxRangeRadius;i++)
+	{
+		sum += intervals[i];
+		if(sum>targetCount)
+			break;
+	}
+	if(i>=confparams_cpr->maxRangeRadius)
+		i = confparams_cpr->maxRangeRadius-1;
+	unsigned int accIntervals = 2*(i+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+
+	if(powerOf2<32) // never report fewer than 32 intervals
+		powerOf2 = 32;
+	
+	free(intervals);
+	//printf("accIntervals=%d, powerOf2=%d\n", accIntervals, powerOf2);
+	return powerOf2;
+}
+
+/**
+ * Compresses a 1D array of doubles under point-wise-relative error bounds: the data are
+ * cut into segments of confparams_cpr->segment_size values, each with its own error bound,
+ * and every value is either quantized against the previously reconstructed value or stored
+ * through the exact (unpredictable) path.
+ *
+ * @param newByteData     receives the buffer holding the compressed stream
+ * @param oriData         input values; at least one element is required
+ * @param globalPrecision forwarded to compute_segment_precisions_double_1D
+ * @param dataLength      number of elements in oriData
+ * @param outSize         receives the size in bytes of *newByteData
+ * @param min,max         value range of the data; the larger magnitude sizes the exact encoding
+ */
+void SZ_compress_args_double_NoCkRngeNoGzip_1D_pwr(unsigned char** newByteData, double *oriData, double globalPrecision, 
+size_t dataLength, size_t *outSize, double min, double max)
+{
+	size_t pwrLength = dataLength%confparams_cpr->segment_size==0?dataLength/confparams_cpr->segment_size:dataLength/confparams_cpr->segment_size+1;
+	double* pwrErrBound = (double*)malloc(sizeof(double)*pwrLength);
+	size_t pwrErrBoundBytes_size = sizeof(unsigned char)*pwrLength*2;
+	unsigned char* pwrErrBoundBytes = (unsigned char*)malloc(pwrErrBoundBytes_size);
+	
+	compute_segment_precisions_double_1D(oriData, dataLength, pwrErrBound, pwrErrBoundBytes, globalPrecision);
+
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_double_1D_pwr(oriData, dataLength, pwrErrBound);	
+		updateQuantizationInfo(quantization_intervals);
+	}
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	size_t i = 0, j = 0;
+	int reqLength;
+	double realPrecision = pwrErrBound[j++];	
+	double medianValue = 0;
+	double radius = fabs(max)<fabs(min)?fabs(min):fabs(max);
+	short radExpo = getExponent_double(radius);
+	
+	computeReqLength_double(realPrecision, radExpo, &reqLength, &medianValue);	
+
+	//type[i]==0 marks a value stored through the exact path; otherwise it is intvRadius +/- the step count
+	int* type = (int*) malloc(dataLength*sizeof(int));
+	double* spaceFillingValue = oriData; //
+	
+	DynamicByteArray *resiBitLengthArray;
+	new_DBA(&resiBitLengthArray, DynArrayInitLen);
+	
+	DynamicIntArray *exactLeadNumArray;
+	new_DIA(&exactLeadNumArray, DynArrayInitLen);
+	
+	DynamicByteArray *exactMidByteArray;
+	new_DBA(&exactMidByteArray, DynArrayInitLen);
+	
+	DynamicIntArray *resiBitArray;
+	new_DIA(&resiBitArray, DynArrayInitLen);
+	
+	type[0] = 0;
+	
+	unsigned char preDataBytes[8] = {0};
+	intToBytes_bigEndian(preDataBytes, 0);
+	
+	int reqBytesLength = reqLength/8;
+	int resiBitsLength = reqLength%8;
+	double last3CmprsData[3] = {0};
+
+	DoubleValueCompressElement *vce = (DoubleValueCompressElement*)malloc(sizeof(DoubleValueCompressElement));
+	LossyCompressionElement *lce = (LossyCompressionElement*)malloc(sizeof(LossyCompressionElement));
+						
+	//add the first data	
+	addDBA_Data(resiBitLengthArray, (unsigned char)resiBitsLength);
+	compressSingleDoubleValue(vce, spaceFillingValue[0], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+	updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+	memcpy(preDataBytes,vce->curBytes,8);
+	addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+	listAdd_double(last3CmprsData, vce->data);
+	
+	//add the second data; guarded because a one-element input has neither type[1] nor oriData[1]
+	if(dataLength>1)
+	{
+		type[1] = 0;
+		addDBA_Data(resiBitLengthArray, (unsigned char)resiBitsLength);			
+		compressSingleDoubleValue(vce, spaceFillingValue[1], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+		updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+		memcpy(preDataBytes,vce->curBytes,8);
+		addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+		listAdd_double(last3CmprsData, vce->data);
+	}
+	
+	int state;
+	double checkRadius;
+	double curData;
+	double pred;
+	double predAbsErr;
+	checkRadius = (exe_params->intvCapacity-1)*realPrecision;
+	double interval = 2*realPrecision;
+	int updateReqLength = 0; //a marker: 1 means already updated
+	
+	for(i=2;i<dataLength;i++)
+	{
+		curData = spaceFillingValue[i];
+		if(i%confparams_cpr->segment_size==0)
+		{
+			realPrecision = pwrErrBound[j++];
+			checkRadius = (exe_params->intvCapacity-1)*realPrecision;
+			interval = 2*realPrecision;
+			updateReqLength = 0;
+		}
+		//previous-value prediction (last3CmprsData[0] is presumably the newest entry pushed by listAdd_double)
+		pred = last3CmprsData[0];
+		predAbsErr = fabs(curData - pred);	
+		if(predAbsErr<checkRadius)
+		{
+			state = (predAbsErr/realPrecision+1)/2;
+			if(curData>=pred)
+			{
+				type[i] = exe_params->intvRadius+state;
+				pred = pred + state*interval;
+			}
+			else //curData<pred
+			{
+				type[i] = exe_params->intvRadius-state;
+				pred = pred - state*interval;
+			}
+			listAdd_double(last3CmprsData, pred);			
+			continue;
+		}
+		
+		//unpredictable data processing		
+		if(updateReqLength==0)
+		{
+			computeReqLength_double(realPrecision, radExpo, &reqLength, &medianValue);				
+			reqBytesLength = reqLength/8;
+			resiBitsLength = reqLength%8;
+			updateReqLength = 1;		
+		}
+		
+		type[i] = 0;
+		addDBA_Data(resiBitLengthArray, (unsigned char)resiBitsLength);
+		
+		compressSingleDoubleValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+		updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+		memcpy(preDataBytes,vce->curBytes,8);
+		addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+
+		listAdd_double(last3CmprsData, vce->data);	
+	}//end of for
+		
+	int exactDataNum = exactLeadNumArray->size;
+	
+	TightDataPointStorageD* tdps;
+			
+	new_TightDataPointStorageD2(&tdps, dataLength, exactDataNum, 
+			type, exactMidByteArray->array, exactMidByteArray->size,  
+			exactLeadNumArray->array,  
+			resiBitArray->array, resiBitArray->size, 
+			resiBitLengthArray->array, resiBitLengthArray->size, 
+			realPrecision, medianValue, (char)reqLength, quantization_intervals, pwrErrBoundBytes, pwrErrBoundBytes_size, radExpo);
+
+	//free memory (exactMidByteArray is released last: its array is still referenced through tdps)
+	free_DBA(resiBitLengthArray);
+	free_DIA(exactLeadNumArray);
+	free_DIA(resiBitArray);
+	free(type);
+		
+	convertTDPStoFlatBytes_double(tdps, newByteData, outSize);
+	
+	//fall back to storing the raw values when the stream is larger than the input itself
+	int doubleSize=sizeof(double);
+	if(*outSize>dataLength*doubleSize)
+	{
+		size_t k = 0, i;
+		tdps->isLossless = 1;
+		size_t totalByteLength = 3 + exe_params->SZ_SIZE_TYPE + 1 + doubleSize*dataLength;
+		//release the stream built above before replacing it (assumes convertTDPStoFlatBytes_double malloc'ed it)
+		free(*newByteData);
+		*newByteData = (unsigned char*)malloc(totalByteLength);
+		
+		for (i = 0; i < 3; i++)//3
+			(*newByteData)[k++] = versionNumber[i];
+		
+		if(exe_params->SZ_SIZE_TYPE==4)
+		{
+			(*newByteData)[k++] = 16;	//=00010000	
+		}
+		else 
+		{
+			(*newByteData)[k++] = 80;
+		}
+		//write dataLength big-endian over all SZ_SIZE_TYPE bytes; a 4-byte int write left the upper half unset for 8
+		for (i = 0; i < exe_params->SZ_SIZE_TYPE; i++)//4 or 8
+			(*newByteData)[k++] = (unsigned char)((unsigned long long)dataLength >> (8*(exe_params->SZ_SIZE_TYPE-1-i)));
+
+		if(sysEndianType==BIG_ENDIAN_SYSTEM)
+			memcpy((*newByteData)+4+exe_params->SZ_SIZE_TYPE, oriData, dataLength*doubleSize);
+		else
+		{
+			unsigned char* p = (*newByteData)+4+exe_params->SZ_SIZE_TYPE;
+			for(i=0;i<dataLength;i++,p+=doubleSize)
+				doubleToBytes(p, oriData[i]);
+		}
+		*outSize = totalByteLength;
+	}
+	
+	free(pwrErrBound);
+	
+	free(vce);
+	free(lce);
+	free_TightDataPointStorageD(tdps);
+	free(exactMidByteArray);
+}
+
+
+/**
+ * Compresses an r1 x r2 array of doubles (r1 is the high/slow dimension, r2 the low/fast one)
+ * under per-block point-wise-relative error bounds; *newByteData and *outSize receive the stream.
+ * globalPrecision, min and max go to compute_segment_precisions_double_2D; min/max also size the exact encoding.
+ */
+void SZ_compress_args_double_NoCkRngeNoGzip_2D_pwr(unsigned char** newByteData, double *oriData, double globalPrecision, size_t r1, size_t r2,
+size_t *outSize, double min, double max)
+{
+	size_t dataLength=r1*r2;
+	int blockEdgeSize = computeBlockEdgeSize_2D(confparams_cpr->segment_size);
+	size_t R1 = 1+(r1-1)/blockEdgeSize; //number of blocks along r1
+	size_t R2 = 1+(r2-1)/blockEdgeSize; //number of blocks along r2
+	double* pwrErrBound = (double*)malloc(sizeof(double)*R1*R2);
+	size_t pwrErrBoundBytes_size = sizeof(unsigned char)*R1*R2*2;
+	unsigned char* pwrErrBoundBytes = (unsigned char*)malloc(pwrErrBoundBytes_size);
+	//one error bound (and two encoded bytes) per block of blockEdgeSize x blockEdgeSize values
+	compute_segment_precisions_double_2D(oriData, pwrErrBound, r1, r2, R2, blockEdgeSize, pwrErrBoundBytes, min, max, globalPrecision);
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_double_2D_pwr(oriData, r1, r2, R2, blockEdgeSize, pwrErrBound);
+		updateQuantizationInfo(quantization_intervals);
+	}
+	else
+		quantization_intervals = exe_params->intvCapacity;	
+	//printf("quantization_intervals=%d\n",quantization_intervals);
+	
+	size_t i=0,j=0,I=0,J=0; //i,j: element row/column; I,J: block row/column
+	int reqLength;
+	double realPrecision = pwrErrBound[I*R2+J];	
+	double pred1D, pred2D;
+	double diff = 0.0;
+	double itvNum = 0;
+	double *P0, *P1;
+	//P1 holds the previous row of reconstructed values, P0 the row currently being produced
+	P0 = (double*)malloc(r2*sizeof(double));
+	memset(P0, 0, r2*sizeof(double));
+	P1 = (double*)malloc(r2*sizeof(double));
+	memset(P1, 0, r2*sizeof(double));
+		
+	double medianValue = 0;
+	double radius = fabs(max)<fabs(min)?fabs(min):fabs(max);
+	short radExpo = getExponent_double(radius);
+	int updateReqLength = 1; //1: reqLength already matches the current realPrecision
+	
+	computeReqLength_double(realPrecision, radExpo, &reqLength, &medianValue);
+
+	int* type = (int*) malloc(dataLength*sizeof(int));
+	//type[i]==0 marks a value stored through the exact path; otherwise it holds a quantization code
+		
+	double* spaceFillingValue = oriData; //
+	
+	DynamicByteArray *resiBitLengthArray;
+	new_DBA(&resiBitLengthArray, DynArrayInitLen);
+	
+	DynamicIntArray *exactLeadNumArray;
+	new_DIA(&exactLeadNumArray, DynArrayInitLen);
+	
+	DynamicByteArray *exactMidByteArray;
+	new_DBA(&exactMidByteArray, DynArrayInitLen);
+	
+	DynamicIntArray *resiBitArray;
+	new_DIA(&resiBitArray, DynArrayInitLen);
+	
+	type[0] = 0;
+	
+	unsigned char preDataBytes[8];
+	longToBytes_bigEndian(preDataBytes, 0);
+	
+	int reqBytesLength = reqLength/8;
+	int resiBitsLength = reqLength%8;
+
+	DoubleValueCompressElement *vce = (DoubleValueCompressElement*)malloc(sizeof(DoubleValueCompressElement));
+	LossyCompressionElement *lce = (LossyCompressionElement*)malloc(sizeof(LossyCompressionElement));
+			
+	/* Process Row-0 data 0: the first value always goes through the exact path */
+	type[0] = 0;
+	addDBA_Data(resiBitLengthArray, (unsigned char)resiBitsLength);
+	compressSingleDoubleValue(vce, spaceFillingValue[0], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+	updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+	memcpy(preDataBytes,vce->curBytes,8);
+	addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+	P1[0] = vce->data;
+	//NOTE(review): element 1 is read and P1[1] written unconditionally; looks out of range when r2==1 - confirm
+	/* Process Row-0 data 1*/
+	pred1D = P1[0];
+	diff = spaceFillingValue[1] - pred1D;
+
+	itvNum =  fabs(diff)/realPrecision + 1;
+
+	if (itvNum < exe_params->intvCapacity)
+	{
+		if (diff < 0) itvNum = -itvNum;
+		type[1] = (int) (itvNum/2) + exe_params->intvRadius;
+		P1[1] = pred1D + 2 * (type[1] - exe_params->intvRadius) * realPrecision;
+	}
+	else
+	{		
+		type[1] = 0;
+
+		addDBA_Data(resiBitLengthArray, (unsigned char)resiBitsLength);
+		compressSingleDoubleValue(vce, spaceFillingValue[1], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+		updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+		memcpy(preDataBytes,vce->curBytes,8);
+		addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+		P1[1] = vce->data;
+	}
+
+    /* Process Row-0 data 2 --> data r2-1 */
+	for (j = 2; j < r2; j++)
+	{
+		if(j%blockEdgeSize==0)
+		{
+			J++;
+			realPrecision = pwrErrBound[I*R2+J];
+			updateReqLength = 0;
+		}
+		//linear extrapolation from the two previous reconstructed values of the row
+		pred1D = 2*P1[j-1] - P1[j-2];
+		diff = spaceFillingValue[j] - pred1D;
+
+		itvNum = fabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[j] = (int) (itvNum/2) + exe_params->intvRadius;
+			P1[j] = pred1D + 2 * (type[j] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			if(updateReqLength==0)
+			{
+				computeReqLength_double(realPrecision, radExpo, &reqLength, &medianValue);
+				reqBytesLength = reqLength/8;
+				resiBitsLength = reqLength%8;
+				updateReqLength = 1;
+			}
+
+			type[j] = 0;
+
+			addDBA_Data(resiBitLengthArray, (unsigned char)resiBitsLength);
+			compressSingleDoubleValue(vce, spaceFillingValue[j], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,8);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P1[j] = vce->data;
+		}
+	}
+
+	/* Process Row-1 --> Row-r1-1 */
+	size_t index;
+	for (i = 1; i < r1; i++)
+	{	
+		/* Process row-i data 0 */
+		index = i*r2;
+		J = 0;
+		if(i%blockEdgeSize==0)
+			I++;
+		realPrecision = pwrErrBound[I*R2+J]; //J==0
+		updateReqLength = 0;
+		
+		pred1D = P1[0]; //predict from the reconstructed value directly above
+		diff = spaceFillingValue[index] - pred1D;
+
+		itvNum = fabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+			P0[0] = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			if(updateReqLength==0)
+			{
+				computeReqLength_double(realPrecision, radExpo, &reqLength, &medianValue);
+				reqBytesLength = reqLength/8;
+				resiBitsLength = reqLength%8;
+				updateReqLength = 1;
+			}
+			
+			type[index] = 0;
+
+			addDBA_Data(resiBitLengthArray, (unsigned char)resiBitsLength);
+			compressSingleDoubleValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,8);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P0[0] = vce->data;
+		}
+									
+		/* Process row-i data 1 --> r2-1*/
+		for (j = 1; j < r2; j++)
+		{
+			index = i*r2+j;
+			if(j%blockEdgeSize==0)
+			{
+				J++;
+				realPrecision = pwrErrBound[I*R2+J];
+				updateReqLength = 0;
+			}
+			pred2D = P0[j-1] + P1[j] - P1[j-1]; //left + above - above-left
+
+			diff = spaceFillingValue[index] - pred2D;
+
+			itvNum = fabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P0[j] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				if(updateReqLength==0)
+				{
+					computeReqLength_double(realPrecision, radExpo, &reqLength, &medianValue);
+					reqBytesLength = reqLength/8;
+					resiBitsLength = reqLength%8;
+					updateReqLength = 1;
+				}
+
+				type[index] = 0;
+
+				addDBA_Data(resiBitLengthArray, (unsigned char)resiBitsLength);
+				compressSingleDoubleValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,8);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P0[j] = vce->data;
+			}
+		}
+		//swap the row buffers: the row just produced becomes the previous row
+		double *Pt;
+		Pt = P1;
+		P1 = P0;
+		P0 = Pt;
+	}
+	//NOTE(review): P0 is not freed when r2==1; looks like a leak (or a workaround) - confirm
+	if(r2!=1)	
+		free(P0);
+	free(P1);
+	int exactDataNum = exactLeadNumArray->size;
+	
+	TightDataPointStorageD* tdps;
+			
+	new_TightDataPointStorageD2(&tdps, dataLength, exactDataNum, 
+			type, exactMidByteArray->array, exactMidByteArray->size,  
+			exactLeadNumArray->array,  
+			resiBitArray->array, resiBitArray->size, 
+			resiBitLengthArray->array, resiBitLengthArray->size, 
+			realPrecision, medianValue, (char)reqLength, quantization_intervals, pwrErrBoundBytes, pwrErrBoundBytes_size, radExpo);
+	
+	//free memory
+	free_DBA(resiBitLengthArray);
+	free_DIA(exactLeadNumArray);
+	free_DIA(resiBitArray);
+	free(type);
+		
+	convertTDPStoFlatBytes_double(tdps, newByteData, outSize);
+
+	free(pwrErrBound);
+	
+	free(vce);
+	free(lce);
+	free_TightDataPointStorageD(tdps);	
+	free(exactMidByteArray); //only the wrapper struct; its array was passed to tdps above
+}
+
+/**
+ * Compresses an r1 x r2 x r3 array of doubles (r3 varies fastest) under per-block
+ * point-wise-relative error bounds: each value is predicted from already-reconstructed
+ * neighbours and either quantized or, when the prediction misses, stored exactly.
+ * *newByteData and *outSize receive the stream; min/max size the exact encoding.
+ */
+void SZ_compress_args_double_NoCkRngeNoGzip_3D_pwr(unsigned char** newByteData, double *oriData, double globalPrecision, 
+size_t r1, size_t r2, size_t r3, size_t *outSize, double min, double max)
+{
+	size_t dataLength=r1*r2*r3;
+	int blockEdgeSize = computeBlockEdgeSize_3D(confparams_cpr->segment_size);
+	size_t R1 = 1+(r1-1)/blockEdgeSize;
+	size_t R2 = 1+(r2-1)/blockEdgeSize;
+	size_t R3 = 1+(r3-1)/blockEdgeSize;
+	double* pwrErrBound = (double*)malloc(sizeof(double)*R1*R2*R3);
+	size_t pwrErrBoundBytes_size = sizeof(unsigned char)*R1*R2*R3*2;
+	unsigned char* pwrErrBoundBytes = (unsigned char*)malloc(pwrErrBoundBytes_size);	
+	//one error bound (and two encoded bytes) per cubic block of edge blockEdgeSize: R1*R2*R3 in total
+	compute_segment_precisions_double_3D(oriData, pwrErrBound, r1, r2, r3, R2, R3, blockEdgeSize, pwrErrBoundBytes, min, max, globalPrecision);	
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_double_3D_pwr(oriData, r1, r2, r3, R2, R3, blockEdgeSize, pwrErrBound);
+		updateQuantizationInfo(quantization_intervals);
+	}	
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	size_t i=0,j=0,k=0, I = 0, J = 0, K = 0; //k,i,j walk r1,r2,r3; K,I,J are the matching block indices
+	int reqLength;
+	double realPrecision = pwrErrBound[0];		
+	double pred1D, pred2D, pred3D;
+	double diff = 0.0;
+	double itvNum = 0;
+	double *P0, *P1; //P1: layer 0, later the previous layer; P0: the layer being produced
+	size_t r23 = r2*r3;
+	size_t R23 = R2*R3;
+	P0 = (double*)malloc(r23*sizeof(double));
+	P1 = (double*)malloc(r23*sizeof(double));
+	double radius = fabs(max)<fabs(min)?fabs(min):fabs(max);
+	double medianValue = 0;
+	short radExpo = getExponent_double(radius);
+	int updateReqLength = 0;
+	computeReqLength_double(realPrecision, radExpo, &reqLength, &medianValue);	
+
+	int* type = (int*) malloc(dataLength*sizeof(int));
+
+	double* spaceFillingValue = oriData; //
+	
+	DynamicByteArray *resiBitLengthArray;
+	new_DBA(&resiBitLengthArray, DynArrayInitLen);
+
+	DynamicIntArray *exactLeadNumArray;
+	new_DIA(&exactLeadNumArray, DynArrayInitLen);
+
+	DynamicByteArray *exactMidByteArray;
+	new_DBA(&exactMidByteArray, DynArrayInitLen);
+
+	DynamicIntArray *resiBitArray;
+	new_DIA(&resiBitArray, DynArrayInitLen);
+
+	type[0] = 0;
+
+	unsigned char preDataBytes[8];
+	longToBytes_bigEndian(preDataBytes, 0);
+
+	int reqBytesLength = reqLength/8;
+	int resiBitsLength = reqLength%8;
+
+	DoubleValueCompressElement *vce = (DoubleValueCompressElement*)malloc(sizeof(DoubleValueCompressElement));
+	LossyCompressionElement *lce = (LossyCompressionElement*)malloc(sizeof(LossyCompressionElement));
+
+	///////////////////////////	Process layer-0 ///////////////////////////
+	/* Process Row-0 data 0: the first value always goes through the exact path */
+	type[0] = 0;
+	addDBA_Data(resiBitLengthArray, (unsigned char)resiBitsLength);
+	compressSingleDoubleValue(vce, spaceFillingValue[0], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+	updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+	memcpy(preDataBytes,vce->curBytes,8);
+	addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+	P1[0] = vce->data;
+	//NOTE(review): element 1 is handled unconditionally, so r3>=2 appears to be assumed - confirm
+	/* Process Row-0 data 1*/
+	pred1D = P1[0];
+	diff = spaceFillingValue[1] - pred1D;
+
+	itvNum = fabs(diff)/realPrecision + 1;
+
+	if (itvNum < exe_params->intvCapacity)
+	{
+		if (diff < 0) itvNum = -itvNum;
+		type[1] = (int) (itvNum/2) + exe_params->intvRadius;
+		P1[1] = pred1D + 2 * (type[1] - exe_params->intvRadius) * realPrecision;
+	}
+	else
+	{
+		if(updateReqLength==0)
+		{
+			computeReqLength_double(realPrecision, radExpo, &reqLength, &medianValue);
+			reqBytesLength = reqLength/8;
+			resiBitsLength = reqLength%8;
+			updateReqLength = 1;
+		}		
+		
+		type[1] = 0;
+
+		addDBA_Data(resiBitLengthArray, (unsigned char)resiBitsLength);
+		compressSingleDoubleValue(vce, spaceFillingValue[1], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+		updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+		memcpy(preDataBytes,vce->curBytes,8);
+		addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+		P1[1] = vce->data;
+	}
+
+    /* Process Row-0 data 2 --> data r3-1 */
+	for (j = 2; j < r3; j++)
+	{
+		if(j%blockEdgeSize==0)
+		{
+			J++;
+			realPrecision = pwrErrBound[J];
+			updateReqLength = 0;
+		}		
+		pred1D = 2*P1[j-1] - P1[j-2]; //linear extrapolation from the two previous reconstructed values
+		diff = spaceFillingValue[j] - pred1D;
+
+		itvNum = fabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[j] = (int) (itvNum/2) + exe_params->intvRadius;
+			P1[j] = pred1D + 2 * (type[j] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			if(updateReqLength==0)
+			{
+				computeReqLength_double(realPrecision, radExpo, &reqLength, &medianValue);
+				reqBytesLength = reqLength/8;
+				resiBitsLength = reqLength%8;
+				updateReqLength = 1;
+			}			
+
+			type[j] = 0;
+
+			addDBA_Data(resiBitLengthArray, (unsigned char)resiBitsLength);
+			compressSingleDoubleValue(vce, spaceFillingValue[j], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,8);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P1[j] = vce->data;
+		}
+	}
+
+	/* Process Row-1 --> Row-r2-1 */
+	size_t index;
+	K = 0;
+	for (i = 1; i < r2; i++)
+	{
+		/* Process row-i data 0 */
+		index = i*r3;	
+
+		J = 0;
+		if(i%blockEdgeSize==0)
+			I++;
+		realPrecision = pwrErrBound[I*R3+J]; //J==0
+		updateReqLength = 0;
+
+		pred1D = P1[index-r3]; //reconstructed value directly above, in the previous row
+		diff = spaceFillingValue[index] - pred1D;
+
+		itvNum = fabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+			P1[index] = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			if(updateReqLength==0)
+			{
+				computeReqLength_double(realPrecision, radExpo, &reqLength, &medianValue);
+				reqBytesLength = reqLength/8;
+				resiBitsLength = reqLength%8;
+				updateReqLength = 1;
+			}		
+						
+			type[index] = 0;
+
+			addDBA_Data(resiBitLengthArray, (unsigned char)resiBitsLength);
+			compressSingleDoubleValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,8);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P1[index] = vce->data;
+		}
+
+		/* Process row-i data 1 --> data r3-1*/
+		for (j = 1; j < r3; j++) //note that this j refers to fastest dimension (lowest order)
+		{
+			index = i*r3+j;		
+			if(j%blockEdgeSize==0)
+			{
+				J++;
+				realPrecision = pwrErrBound[I*R3+J];
+				updateReqLength = 0;
+			}			
+		
+			pred2D = P1[index-1] + P1[index-r3] - P1[index-r3-1]; //left + above - above-left
+
+			diff = spaceFillingValue[index] - pred2D;
+
+			itvNum = fabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P1[index] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				if(updateReqLength==0)
+				{
+					computeReqLength_double(realPrecision, radExpo, &reqLength, &medianValue);
+					reqBytesLength = reqLength/8;
+					resiBitsLength = reqLength%8;
+					updateReqLength = 1;
+				}						
+				
+				type[index] = 0;
+
+				addDBA_Data(resiBitLengthArray, (unsigned char)resiBitsLength);
+				compressSingleDoubleValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,8);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P1[index] = vce->data;
+			}
+		}
+	}
+
+	///////////////////////////	Process layer-1 --> layer-r1-1 ///////////////////////////
+
+	for (k = 1; k < r1; k++)
+	{
+		/* Process Row-0 data 0*/
+		index = k*r23;			
+		I = 0;
+		J = 0;
+		if(k%blockEdgeSize==0)
+			K++;
+		realPrecision = pwrErrBound[K*R23]; //J==0
+		updateReqLength = 0;
+		
+		pred1D = P1[0]; //same position in the previous layer
+		diff = spaceFillingValue[index] - pred1D;
+
+		itvNum = fabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+			P0[0] = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			if(updateReqLength==0)
+			{
+				computeReqLength_double(realPrecision, radExpo, &reqLength, &medianValue);
+				reqBytesLength = reqLength/8;
+				resiBitsLength = reqLength%8;
+				updateReqLength = 1;
+			}					
+			
+			type[index] = 0;
+
+			addDBA_Data(resiBitLengthArray, (unsigned char)resiBitsLength);
+			compressSingleDoubleValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,8);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P0[0] = vce->data;
+		}
+
+	    /* Process Row-0 data 1 --> data r3-1 */
+		for (j = 1; j < r3; j++)
+		{
+			index = k*r23+j;	
+
+			if(j%blockEdgeSize==0)
+			{
+				J++;
+				realPrecision = pwrErrBound[K*R23+J];
+				updateReqLength = 0;			
+			}					
+			pred2D = P0[j-1] + P1[j] - P1[j-1]; //left (this layer) + same spot (previous layer) - its left neighbour
+			diff = spaceFillingValue[index] - pred2D;
+
+			itvNum = fabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P0[j] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				if(updateReqLength==0)
+				{
+					computeReqLength_double(realPrecision, radExpo, &reqLength, &medianValue);
+					reqBytesLength = reqLength/8;
+					resiBitsLength = reqLength%8;
+					updateReqLength = 1;
+				}						
+				
+				type[index] = 0;
+
+				addDBA_Data(resiBitLengthArray, (unsigned char)resiBitsLength);
+				compressSingleDoubleValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,8);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P0[j] = vce->data;
+			}
+		}
+
+	    /* Process Row-1 --> Row-r2-1 */
+		size_t index2D;
+		for (i = 1; i < r2; i++)
+		{
+			/* Process Row-i data 0 */
+			index = k*r23 + i*r3;
+
+			J = 0;
+			if(i%blockEdgeSize==0)
+				I++;
+			realPrecision = pwrErrBound[K*R23+I*R3+J]; //J==0
+			updateReqLength = 0;			
+			
+			index2D = i*r3; //position inside one layer, used to address P0/P1
+			pred2D = P0[index2D-r3] + P1[index2D] - P1[index2D-r3];
+			diff = spaceFillingValue[index] - pred2D;
+
+			itvNum = fabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P0[index2D] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				if(updateReqLength==0)
+				{
+					computeReqLength_double(realPrecision, radExpo, &reqLength, &medianValue);
+					reqBytesLength = reqLength/8;
+					resiBitsLength = reqLength%8;
+					updateReqLength = 1;
+				}						
+				
+				type[index] = 0;
+
+				addDBA_Data(resiBitLengthArray, (unsigned char)resiBitsLength);
+				compressSingleDoubleValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,8);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P0[index2D] = vce->data;
+			}
+
+			/* Process Row-i data 1 --> data r3-1 */
+			for (j = 1; j < r3; j++)
+			{
+				index = k*r23 + i*r3 + j;
+				
+				if(j%blockEdgeSize==0)
+				{
+					J++;
+					realPrecision = pwrErrBound[K*R23+I*R3+J];
+					updateReqLength = 0;			
+				}							
+				index2D = i*r3 + j;
+				pred3D = P0[index2D-1] + P0[index2D-r3]+ P1[index2D] - P0[index2D-r3-1] - P1[index2D-r3] - P1[index2D-1] + P1[index2D-r3-1]; //7-neighbour prediction
+				diff = spaceFillingValue[index] - pred3D;
+
+				itvNum = fabs(diff)/realPrecision + 1;
+
+				if (itvNum < exe_params->intvCapacity)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+					P0[index2D] = pred3D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					if(updateReqLength==0)
+					{
+						computeReqLength_double(realPrecision, radExpo, &reqLength, &medianValue);
+						reqBytesLength = reqLength/8;
+						resiBitsLength = reqLength%8;
+						updateReqLength = 1;
+					}							
+					
+					type[index] = 0;
+
+					addDBA_Data(resiBitLengthArray, (unsigned char)resiBitsLength);
+					compressSingleDoubleValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+					updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+					memcpy(preDataBytes,vce->curBytes,8);
+					addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+					P0[index2D] = vce->data;
+				}
+			}
+		}
+		//swap the layer buffers: the layer just produced becomes the previous layer
+		double *Pt;
+		Pt = P1;
+		P1 = P0;
+		P0 = Pt;
+	}
+	if(r23!=1) //NOTE(review): P0 is not freed when r23==1; looks like a leak (or a workaround) - confirm
+		free(P0);
+	free(P1);
+	int exactDataNum = exactLeadNumArray->size;
+
+	TightDataPointStorageD* tdps;
+
+	new_TightDataPointStorageD2(&tdps, dataLength, exactDataNum,
+			type, exactMidByteArray->array, exactMidByteArray->size,
+			exactLeadNumArray->array,
+			resiBitArray->array, resiBitArray->size,
+			resiBitLengthArray->array, resiBitLengthArray->size, 
+			realPrecision, medianValue, (char)reqLength, quantization_intervals, pwrErrBoundBytes, pwrErrBoundBytes_size, radExpo);
+
+	convertTDPStoFlatBytes_double(tdps, newByteData, outSize);
+
+	//free memory
+	free_DBA(resiBitLengthArray);
+	free_DIA(exactLeadNumArray);
+	free_DIA(resiBitArray);
+	free(type);
+
+	free(pwrErrBound);
+
+	free(vce);
+	free(lce);
+	free_TightDataPointStorageD(tdps);
+	free(exactMidByteArray); //only the wrapper struct; its array was passed to tdps above
+}
+
+/**
+ * Allocates four zero-filled tables of GROUP_COUNT entries each and hands them back through
+ * the out-parameters: two tables of double (pos/neg groups) and two of int (pos/neg flags).
+ * Nothing in this function releases the tables again.
+ */
+void createRangeGroups_double(double** posGroups, double** negGroups, int** posFlags, int** negFlags)
+{
+	//calloc returns zeroed storage, standing in for the malloc + memset(0) pair
+	*posGroups = (double*)calloc(GROUP_COUNT, sizeof(double));
+	*negGroups = (double*)calloc(GROUP_COUNT, sizeof(double));
+	*posFlags = (int*)calloc(GROUP_COUNT, sizeof(int));
+	*negFlags = (int*)calloc(GROUP_COUNT, sizeof(int));
+}
+
+/**
+ * Delta-encodes the per-value group IDs, Huffman-encodes the resulting symbols and stores
+ * the encoded bytes and their size in tdps->pwrErrBoundBytes / pwrErrBoundBytes_size.
+ *
+ * @param groupID one group ID per value; tdps->dataSeriesLength entries are read
+ * @param tdps    storage object that receives the encoded buffer
+ */
+void compressGroupIDArray_double(char* groupID, TightDataPointStorageD* tdps)
+{
+	const size_t count = tdps->dataSeriesLength;
+	const int deltaShift = 2*(GROUP_COUNT + 2);
+	int* symbols = (int*)malloc(count*sizeof(int));
+	unsigned char* encoded = NULL;
+	size_t encodedSize, pos;
+
+	//the first symbol is the ID itself plus GROUP_COUNT, so that it cannot be negative
+	symbols[0] = groupID[0]+GROUP_COUNT;
+	//each later symbol is the change relative to the previous ID, shifted by deltaShift
+	for(pos=1; pos<count; pos++)
+		symbols[pos] = (groupID[pos] - groupID[pos-1]) + deltaShift;
+
+	HuffmanTree* tree = SZ_Reset();
+	encode_withTree(tree, symbols, count, &encoded, &encodedSize);
+	SZ_ReleaseHuffman(tree);
+	free(symbols);
+	tdps->pwrErrBoundBytes = encoded; //groupIDArray
+	tdps->pwrErrBoundBytes_size = encodedSize;
+}
+
+/**
+ * Compresses a 1D double array with the group-based point-wise-relative scheme.
+ *
+ * The first value is always stored through compressSingleDoubleValue()/addExactData().
+ * Every later value gets a group number from computeGroupNum_double() and is then either
+ * stored the same way (type[i] = 0) or encoded as a quantization code relative to the
+ * entry kept for its group (groups[grpNum]), using that group's bound from
+ * generateGroupErrBounds(). The per-point group IDs are encoded by
+ * compressGroupIDArray_double() and attached to the returned storage object.
+ *
+ * @param oriData        input values; oriData[0..dataLength-1] are read, so dataLength must be >= 1
+ * @param dataLength     number of input values
+ * @param errBoundMode   forwarded to getRealPrecision_double() and generateGroupErrBounds()
+ * @param absErrBound    forwarded to getRealPrecision_double()
+ * @param relBoundRatio  forwarded to getRealPrecision_double()
+ * @param pwrErrRatio    point-wise relative ratio; also replaces a negative result of getRealPrecision_double()
+ * @param valueRangeSize forwarded to getRealPrecision_double(); half of it determines radExpo
+ * @param medianValue_f  starting median value handed to computeReqLength_double()
+ * @return the new TightDataPointStorageD (the caller in this file releases it with free_TightDataPointStorageD())
+ * @note exe_params->intvRadius is overwritten as a side effect.
+ */
+TightDataPointStorageD* SZ_compress_double_1D_MDQ_pwrGroup(double* oriData, size_t dataLength, int errBoundMode, 
+double absErrBound, double relBoundRatio, double pwrErrRatio, double valueRangeSize, double medianValue_f)
+{
+	size_t i;
+	double *posGroups, *negGroups, *groups;
+	double pos_01_group = 0, neg_01_group = 0; //[0,1] and [-1,0]
+	int *posFlags, *negFlags, *flags;
+	int pos_01_flag = 0, neg_01_flag = 0;
+	createRangeGroups_double(&posGroups, &negGroups, &posFlags, &negFlags);
+	size_t nbBins = (size_t)(1/pwrErrRatio);
+	if(nbBins%2==1)
+		nbBins++;
+	exe_params->intvRadius = nbBins;
+
+	int reqLength, status;
+	double medianValue = medianValue_f;
+	double realPrecision = (double)getRealPrecision_double(valueRangeSize, errBoundMode, absErrBound, relBoundRatio, &status);
+	if(realPrecision<0)
+		realPrecision = pwrErrRatio;
+	double realGroupPrecision; //precision (error) based on group ID
+	getPrecisionReqLength_double(realPrecision);
+	short radExpo = getExponent_double(valueRangeSize/2);
+	short lastGroupNum = 0, groupNum, grpNum = 0;
+	
+	double* groupErrorBounds = generateGroupErrBounds(errBoundMode, realPrecision, pwrErrRatio);
+	exe_params->intvRadius = generateGroupMaxIntervalCount(groupErrorBounds);
+	
+	computeReqLength_double(realPrecision, radExpo, &reqLength, &medianValue);
+
+	int* type = (int*) malloc(dataLength*sizeof(int));
+	char *groupID = (char*) malloc(dataLength*sizeof(char));
+	char *gp = groupID;
+		
+	double* spaceFillingValue = oriData; 
+	
+	DynamicIntArray *exactLeadNumArray;
+	new_DIA(&exactLeadNumArray, DynArrayInitLen);
+	
+	DynamicByteArray *exactMidByteArray;
+	new_DBA(&exactMidByteArray, DynArrayInitLen);
+	
+	DynamicIntArray *resiBitArray;
+	new_DIA(&resiBitArray, DynArrayInitLen);
+	
+	unsigned char preDataBytes[8];
+	memset(preDataBytes, 0, sizeof(preDataBytes)); //clear the whole 8-byte buffer; the former intToBytes_bigEndian(preDataBytes, 0) is the initializer the float code uses for its 4-byte buffer
+	
+	int reqBytesLength = reqLength/8;
+	int resiBitsLength = reqLength%8;
+
+	DoubleValueCompressElement *vce = (DoubleValueCompressElement*)malloc(sizeof(DoubleValueCompressElement));
+	LossyCompressionElement *lce = (LossyCompressionElement*)malloc(sizeof(LossyCompressionElement));
+			
+	int state;
+	double curData, decValue;
+	double pred;
+	double predAbsErr;
+	double interval = 0;
+	
+	//add the first data	
+	type[0] = 0;
+	compressSingleDoubleValue(vce, spaceFillingValue[0], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+	updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+	memcpy(preDataBytes,vce->curBytes,8);
+	addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+	
+	curData = spaceFillingValue[0];
+	groupNum = computeGroupNum_double(vce->data);
+	//pick the table for this value: the positive/negative group arrays, or the single [0,1] / [-1,0] slot when groupNum is -1
+	if(curData > 0 && groupNum >= 0)
+	{
+		groups = posGroups;
+		flags = posFlags;
+		grpNum = groupNum;
+	}
+	else if(curData < 0 && groupNum >= 0)
+	{
+		groups = negGroups;
+		flags = negFlags;
+		grpNum = groupNum;
+	}
+	else if(curData >= 0 && groupNum == -1)
+	{
+		groups = &pos_01_group;
+		flags = &pos_01_flag;
+		grpNum = 0;
+	}
+	else //curData < 0 && groupNum == -1
+	{
+		groups = &neg_01_group;
+		flags = &neg_01_flag;
+		grpNum = 0;
+	}
+		
+	listAdd_double_group(groups, flags, groupNum, spaceFillingValue[0], vce->data, gp);
+	gp++;
+	
+	for(i=1;i<dataLength;i++)
+	{
+		curData = oriData[i];
+		
+		groupNum = computeGroupNum_double(curData);
+		
+		if(curData > 0 && groupNum >= 0)
+		{
+			groups = posGroups;
+			flags = posFlags;
+			grpNum = groupNum;
+		}
+		else if(curData < 0 && groupNum >= 0)
+		{
+			groups = negGroups;
+			flags = negFlags;
+			grpNum = groupNum;
+		}
+		else if(curData >= 0 && groupNum == -1)
+		{
+			groups = &pos_01_group;
+			flags = &pos_01_flag;
+			grpNum = 0;
+		}
+		else //curData < 0 && groupNum == -1
+		{
+			groups = &neg_01_group;
+			flags = &neg_01_flag;
+			grpNum = 0;
+		}
+
+		if(groupNum>=GROUP_COUNT)
+		{
+			type[i] = 0;
+			compressSingleDoubleValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,8);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			listAdd_double_group(groups, flags, lastGroupNum, curData, vce->data, gp);	//set the group number to be last one in order to get the groupID array as smooth as possible.		
+		}
+		else if(flags[grpNum]==0) //the dec value may not be in the same group
+		{	
+			type[i] = 0;
+			compressSingleDoubleValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,8);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			decValue = vce->data;	
+			listAdd_double_group(groups, flags, groupNum, curData, decValue, gp);
+			lastGroupNum = curData>0?groupNum + 2: -(groupNum+2);
+		}
+		else //if flags[groupNum]==1, the dec value must be in the same group
+		{
+			pred = groups[grpNum];
+			predAbsErr = fabs(curData - pred);
+			realGroupPrecision = groupErrorBounds[grpNum]; //compute real error bound
+			interval = realGroupPrecision*2;
+			state = (predAbsErr/realGroupPrecision+1)/2;
+			if(curData>=pred)
+			{
+				type[i] = exe_params->intvRadius+state;
+				decValue = pred + state*interval;
+			}
+			else //curData<pred
+			{
+				type[i] = exe_params->intvRadius-state;
+				decValue = pred - state*interval;
+			}
+			//a reconstruction on the other side of zero than the input is replaced by 0
+			if((decValue>0&&curData<0)||(decValue<0&&curData>=0))
+				decValue = 0;
+			//store the value exactly when the reconstruction is further than the group's bound from the input
+			if(fabs(curData-decValue)>realGroupPrecision)
+			{	
+				type[i] = 0;
+				compressSingleDoubleValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,8);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+
+				decValue = vce->data;	
+			}
+			
+			listAdd_double_group(groups, flags, groupNum, curData, decValue, gp);			
+			lastGroupNum = curData>=0?groupNum + 2: -(groupNum+2);			
+		}
+		gp++;	
+
+	}
+	
+	int exactDataNum = exactLeadNumArray->size;
+	
+	TightDataPointStorageD* tdps;
+			
+
+	new_TightDataPointStorageD(&tdps, dataLength, exactDataNum, 
+			type, exactMidByteArray->array, exactMidByteArray->size,  
+			exactLeadNumArray->array,  
+			resiBitArray->array, resiBitArray->size, 
+			resiBitsLength, 
+			realPrecision, medianValue, (char)reqLength, nbBins, NULL, 0, radExpo);	
+	
+	compressGroupIDArray_double(groupID, tdps);
+	
+	free(posGroups);
+	free(negGroups);
+	free(posFlags);
+	free(negFlags);
+	free(groupID);
+	free(groupErrorBounds);
+	
+	free_DIA(exactLeadNumArray);
+	free_DIA(resiBitArray);
+	free(type);	
+	free(vce);
+	free(lce);	
+	free(exactMidByteArray); //only the wrapper is freed here; exactMidByteArray->array was handed to tdps above
+	
+	return tdps;
+}
+
+/* Group-based point-wise-relative compression of a 1D double array into a flat byte buffer (*newByteData, *outSize). */
+void SZ_compress_args_double_NoCkRngeNoGzip_1D_pwrgroup(unsigned char** newByteData, double *oriData,
+size_t dataLength, double absErrBound, double relBoundRatio, double pwrErrRatio, double valueRangeSize, double medianValue_f, size_t *outSize)
+{
+        TightDataPointStorageD* tdps = SZ_compress_double_1D_MDQ_pwrGroup(oriData, dataLength, confparams_cpr->errorBoundMode, 
+        absErrBound, relBoundRatio, pwrErrRatio, 
+        valueRangeSize, medianValue_f);
+        convertTDPStoFlatBytes_double(tdps, newByteData, outSize);
+        //Larger than the input: store the raw values instead. oriData holds dataLength values (the former dataLength+2 announced two more).
+        //NOTE(review): the buffer written by convertTDPStoFlatBytes_double() is presumably leaked when the fallback replaces *newByteData -- confirm.
+        if(*outSize>dataLength*sizeof(double))
+                SZ_compress_args_double_StoreOriData(oriData, dataLength, tdps, newByteData, outSize);
+        free_TightDataPointStorageD(tdps);
+}
diff --git a/thirdparty/SZ/sz/src/sz_double_ts.c b/thirdparty/SZ/sz/src/sz_double_ts.c
new file mode 100644
index 0000000000000000000000000000000000000000..b83562e05a525f87cabb2825f41e91820f5e2ba5
--- /dev/null
+++ b/thirdparty/SZ/sz/src/sz_double_ts.c
@@ -0,0 +1,190 @@
+/**
+ *  @file sz_double_ts.c
+ *  @author Sheng Di and Dingwen Tao
+ *  @date Aug, 2016
+ *  @brief Time-step based compression of 1D double data (prediction from the previous time step's values).
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <math.h>
+#include "sz.h"
+#include "CompressElement.h"
+#include "DynamicByteArray.h"
+#include "DynamicIntArray.h"
+#include "TightDataPointStorageD.h"
+#include "zlib.h"
+#include "rw.h"
+#include "sz_double_ts.h"
+
+/**
+ * Estimates the number of quantization intervals for time-step based compression of 1D double data.
+ * Each index i >= 2 that is a multiple of confparams_cpr->sampleDistance is sampled: the error of predicting
+ * oriData[i] by preData[i] is turned into an interval radius (capped at maxRangeRadius-1) and counted.
+ * The first radius whose cumulative count exceeds predThreshold*(dataLength/sampleDistance) gives 2*(radius+1)
+ * intervals, which is passed through roundUpToPowerOf2() and raised to at least 32.
+ * @param oriData current values; @param dataLength their count; @param preData per-index predictions
+ * @param realPrecision error bound (an interval is 2*realPrecision wide); @return interval count (>= 32)
+ */
+unsigned int optimize_intervals_double_1D_ts(double *oriData, size_t dataLength, double* preData, double realPrecision)
+{
+	size_t pos, radius, bin;
+	size_t *histogram = (size_t*)calloc(confparams_cpr->maxRangeRadius, sizeof(size_t));
+
+	//sampling pass: histogram of interval radii
+	for(pos=2; pos<dataLength; pos++)
+	{
+		if(pos%confparams_cpr->sampleDistance!=0)
+			continue;
+		double predErr = fabs(preData[pos] - oriData[pos]);
+		radius = (unsigned long)((predErr/realPrecision+1)/2);
+		if(radius>=confparams_cpr->maxRangeRadius)
+			radius = confparams_cpr->maxRangeRadius - 1;
+		histogram[radius]++;
+	}
+
+	//compute the appropriate number
+	size_t wanted = (dataLength/confparams_cpr->sampleDistance)*confparams_cpr->predThreshold;
+	size_t covered = 0;
+	for(bin=0; bin<confparams_cpr->maxRangeRadius; bin++)
+		if((covered += histogram[bin])>wanted)
+			break;
+	if(bin>=confparams_cpr->maxRangeRadius)
+		bin = confparams_cpr->maxRangeRadius-1;
+
+	free(histogram);
+	unsigned int accIntervals = 2*(bin+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+	return powerOf2<32 ? 32 : powerOf2;
+}
+
+/**
+ * Time-step based compression of a 1D double array: every value from index 2 on is predicted by the
+ * value at the same index of the previous step (multisteps->hist_data) and encoded either as a
+ * quantization code or, when the prediction error is too large, as exact data.
+ * On return hist_data holds this step's reconstructed values, ready to predict the next step.
+ *
+ * @param oriData        values of the current time step (dataLength entries)
+ * @param dataLength     number of values
+ * @param multisteps     time-series state; hist_data is read as the prediction and then overwritten
+ * @param realPrecision  absolute error bound; one quantization interval is 2*realPrecision wide
+ * @param valueRangeSize value range of the data (half of it determines radExpo)
+ * @param medianValue_d  starting median value handed to computeReqLength_double()
+ * @return the new TightDataPointStorageD describing the compressed step
+ */
+TightDataPointStorageD* SZ_compress_double_1D_MDQ_ts(double *oriData, size_t dataLength, sz_multisteps* multisteps,
+double realPrecision, double valueRangeSize, double medianValue_d)
+{
+	double* preStepData = (double*)(multisteps->hist_data);
+	//reconstructed (decompressed) value of every point of this step; becomes the next step's history
+	double* decData = (double*)malloc(sizeof(double)*dataLength);
+	memset(decData, 0, sizeof(double)*dataLength);
+	
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+		quantization_intervals = optimize_intervals_double_1D_ts(oriData, dataLength, preStepData, realPrecision);
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	updateQuantizationInfo(quantization_intervals);	
+
+	size_t i;
+	int reqLength;
+	double medianValue = medianValue_d;
+	short radExpo = getExponent_double(valueRangeSize/2);
+	computeReqLength_double(realPrecision, radExpo, &reqLength, &medianValue);	
+
+	int* type = (int*) malloc(dataLength*sizeof(int));
+	double* spaceFillingValue = oriData; //alias of the input array
+	
+	DynamicIntArray *exactLeadNumArray;
+	new_DIA(&exactLeadNumArray, DynArrayInitLen);
+	
+	DynamicByteArray *exactMidByteArray;
+	new_DBA(&exactMidByteArray, DynArrayInitLen);
+	
+	DynamicIntArray *resiBitArray;
+	new_DIA(&resiBitArray, DynArrayInitLen);
+
+	unsigned char preDataBytes[8];
+	longToBytes_bigEndian(preDataBytes, 0);
+	
+	int reqBytesLength = reqLength/8;
+	int resiBitsLength = reqLength%8;
+
+	DoubleValueCompressElement *vce = (DoubleValueCompressElement*)malloc(sizeof(DoubleValueCompressElement));
+	LossyCompressionElement *lce = (LossyCompressionElement*)malloc(sizeof(LossyCompressionElement));			
+				
+	//the first two values are always stored as exact data (bounded by dataLength so that short inputs stay in range)
+	for(i=0;i<2 && i<dataLength;i++)
+	{
+		type[i] = 0;
+		compressSingleDoubleValue(vce, spaceFillingValue[i], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+		updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+		memcpy(preDataBytes,vce->curBytes,8);
+		addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+		decData[i] = vce->data;
+	}
+	
+	int state = 0;
+	double curData = 0, pred = 0, predAbsErr = 0;
+	double checkRadius = (exe_params->intvCapacity-1)*realPrecision; //largest prediction error that is still quantized
+	double interval = 2*realPrecision;
+
+	for(i=2;i<dataLength;i++)
+	{				
+		curData = spaceFillingValue[i];
+		pred = preStepData[i];
+		predAbsErr = fabs(curData - pred);	
+		if(predAbsErr<=checkRadius)
+		{
+			state = (predAbsErr/realPrecision+1)/2;
+			if(curData>=pred)
+			{
+				type[i] = exe_params->intvRadius+state;
+				pred = pred + state*interval;
+			}
+			else //curData<pred
+			{
+				type[i] = exe_params->intvRadius-state;
+				pred = pred - state*interval;
+			}
+			decData[i] = pred; //record the reconstruction; otherwise the history would keep 0 for every predictable point
+			continue;
+		}
+		
+		//unpredictable data processing
+		type[i] = 0;		
+		compressSingleDoubleValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+		updateLossyCompElement_Double(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+		memcpy(preDataBytes,vce->curBytes,8);
+		addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+		decData[i] = vce->data;
+	}//end of for
+		
+	size_t exactDataNum = exactLeadNumArray->size;
+	TightDataPointStorageD* tdps;
+	new_TightDataPointStorageD(&tdps, dataLength, exactDataNum, 
+			type, exactMidByteArray->array, exactMidByteArray->size,  
+			exactLeadNumArray->array,  
+			resiBitArray->array, resiBitArray->size, 
+			resiBitsLength, 
+			realPrecision, medianValue, (char)reqLength, quantization_intervals, NULL, 0, 0);
+	
+	//free memory
+	free_DIA(exactLeadNumArray);
+	free_DIA(resiBitArray);
+	free(type);	
+	free(vce);
+	free(lce);	
+	free(exactMidByteArray); //only the wrapper is freed here; exactMidByteArray->array was handed to tdps above
+	memcpy(preStepData, decData, dataLength*sizeof(double)); //update the whole history (elements are doubles, not floats)
+	free(decData);
+	return tdps;
+}
+
+
diff --git a/thirdparty/SZ/sz/src/sz_float.c b/thirdparty/SZ/sz/src/sz_float.c
new file mode 100644
index 0000000000000000000000000000000000000000..c0a2a18b36adaa0bd9cd84d943606e7fcde8e792
--- /dev/null
+++ b/thirdparty/SZ/sz/src/sz_float.c
@@ -0,0 +1,4038 @@
+/**
+ *  @file sz_float.c
+ *  @author Sheng Di and Dingwen Tao
+ *  @date Aug, 2016
+ *  @brief SZ_Init, Compression and Decompression functions
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <math.h>
+#include "sz.h"
+#include "CompressElement.h"
+#include "DynamicByteArray.h"
+#include "DynamicIntArray.h"
+#include "TightDataPointStorageF.h"
+#include "sz_float.h"
+#include "sz_float_pwr.h"
+#include "szd_float.h"
+#include "szd_float_pwr.h"
+#include "zlib.h"
+#include "rw.h"
+#include "sz_float_ts.h"
+
+unsigned char* SZ_skip_compress_float(float* data, size_t dataLength, size_t* outSize)
+{	//Returns a malloc'ed byte copy of data (dataLength floats) and sets *outSize to its size; returns NULL if the allocation fails.
+	*outSize = dataLength*sizeof(float);
+	unsigned char* out = (unsigned char*)malloc(*outSize);
+	if(out != NULL) memcpy(out, data, *outSize); //never hand memcpy a NULL destination; the caller sees the NULL result
+	return out;
+}
+/**
+ * Estimates the number of quantization intervals for 1D float data.
+ * Each index i >= 2 that is a multiple of confparams_cpr->sampleDistance is sampled: the error of
+ * predicting oriData[i] by its predecessor oriData[i-1] is turned into an interval radius
+ * (capped at maxRangeRadius-1) and counted in a histogram. The first radius whose cumulative
+ * count exceeds predThreshold*(dataLength/sampleDistance) yields 2*(radius+1) intervals.
+ * @param oriData       input values (dataLength entries)
+ * @param dataLength    number of values
+ * @param realPrecision error bound; one interval is 2*realPrecision wide
+ * @return interval count after roundUpToPowerOf2(), raised to at least 32
+ */
+unsigned int optimize_intervals_float_1D(float *oriData, size_t dataLength, double realPrecision)
+{
+	size_t pos, radius, bin;
+	size_t *histogram = (size_t*)calloc(confparams_cpr->maxRangeRadius, sizeof(size_t));
+
+	//sampling pass: histogram of interval radii
+	for(pos=2; pos<dataLength; pos++)
+	{
+		if(pos%confparams_cpr->sampleDistance!=0)
+			continue;
+		float predErr = fabs(oriData[pos-1] - oriData[pos]);
+		radius = (unsigned long)((predErr/realPrecision+1)/2);
+		if(radius>=confparams_cpr->maxRangeRadius)
+			radius = confparams_cpr->maxRangeRadius - 1;
+		histogram[radius]++;
+	}
+
+	//compute the appropriate number
+	size_t wanted = (dataLength/confparams_cpr->sampleDistance)*confparams_cpr->predThreshold;
+	size_t covered = 0;
+	for(bin=0; bin<confparams_cpr->maxRangeRadius; bin++)
+		if((covered += histogram[bin])>wanted)
+			break;
+	if(bin>=confparams_cpr->maxRangeRadius)
+		bin = confparams_cpr->maxRangeRadius-1;
+
+	free(histogram);
+	unsigned int accIntervals = 2*(bin+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+	return powerOf2<32 ? 32 : powerOf2;
+}
+
+/**
+ * Estimates the number of quantization intervals for 2D float data (r1 rows of r2 values).
+ * Points with both indices >= 1 and (i+j) a multiple of confparams_cpr->sampleDistance are sampled; each is
+ * predicted as left + upper - upper-left neighbour and the error is counted per interval radius (capped at
+ * maxRangeRadius-1). The first radius whose cumulative count exceeds predThreshold*totalSampleSize wins.
+ * @param oriData input values, indexed as i*r2 + j; @param r1,r2 dimension sizes; @param realPrecision error bound
+ * @return 2*(radius+1) after roundUpToPowerOf2(), raised to at least 32
+ */
+unsigned int optimize_intervals_float_2D(float *oriData, size_t r1, size_t r2, double realPrecision)
+{
+	size_t row, col, i, index, radiusIndex;
+	size_t *intervals = (size_t*)calloc(confparams_cpr->maxRangeRadius, sizeof(size_t));
+	size_t totalSampleSize = (r1-1)*(r2-1)/confparams_cpr->sampleDistance;
+
+	//float max = oriData[0];
+	//float min = oriData[0];
+
+	//sampling pass: histogram of interval radii
+	for(row=1;row<r1;row++)
+	{
+		for(col=1;col<r2;col++)
+		{
+			if((row+col)%confparams_cpr->sampleDistance!=0)
+				continue;
+			index = row*r2+col;
+			float pred_value = oriData[index-1] + oriData[index-r2] - oriData[index-r2-1];
+			float pred_err = fabs(pred_value - oriData[index]);
+			radiusIndex = (unsigned long)((pred_err/realPrecision+1)/2);
+			if(radiusIndex>=confparams_cpr->maxRangeRadius)
+				radiusIndex = confparams_cpr->maxRangeRadius - 1;
+			intervals[radiusIndex]++;
+
+			//	if (max < oriData[index]) max = oriData[index];
+			//	if (min > oriData[index]) min = oriData[index];
+		}
+	}
+	//compute the appropriate number
+	size_t targetCount = totalSampleSize*confparams_cpr->predThreshold;
+	size_t sum = 0;
+	for(i=0;i<confparams_cpr->maxRangeRadius;i++)
+		if((sum += intervals[i])>targetCount)
+			break;
+	if(i>=confparams_cpr->maxRangeRadius)
+		i = confparams_cpr->maxRangeRadius-1;
+	unsigned int accIntervals = 2*(i+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+	if(powerOf2<32)
+		powerOf2 = 32;
+
+	//	struct timeval costStart, costEnd;
+	//	double cost_est = 0;
+	//
+	//	gettimeofday(&costStart, NULL);
+	//
+	//	//compute estimate of bit-rate and distortion
+	//	double est_br = 0;
+	//	double est_psnr = 0;
+	//	double c1 = log2(targetCount)+1;
+	//	double c2 = -20.0*log10(realPrecision) + 20.0*log10(max-min) + 10.0*log10(3);
+	//
+	//	for (i = 0; i < powerOf2/2; i++)
+	//	{
+	//		int count = intervals[i];
+	//		if (count != 0)
+	//			est_br += count*log2(count);
+	//		est_psnr += count;
+	//	}
+	//
+	//	//compute estimate of bit-rate
+	//	est_br -= c1*est_psnr;
+	//	est_br /= totalSampleSize;
+	//	est_br = -est_br;
+	//
+	//	//compute estimate of psnr
+	//	est_psnr /= totalSampleSize;
+	//	printf ("sum of P(i) = %lf\n", est_psnr);
+	//	est_psnr = -10.0*log10(est_psnr);
+	//	est_psnr += c2;
+	//
+	//	printf ("estimate bitrate = %.2f\n", est_br);
+	//	printf ("estimate psnr = %.2f\n",est_psnr);
+	//
+	//	gettimeofday(&costEnd, NULL);
+	//	cost_est = ((costEnd.tv_sec*1000000+costEnd.tv_usec)-(costStart.tv_sec*1000000+costStart.tv_usec))/1000000.0;
+	//
+	//	printf ("analysis time = %f\n", cost_est);
+
+	free(intervals);
+	return powerOf2;
+}
+
+/**
+ * Estimates the number of quantization intervals for 3D float data (r1 x r2 x r3, r3 varying fastest).
+ * Interior points (all three indices >= 1) whose index sum is a multiple of confparams_cpr->sampleDistance
+ * are sampled. Each is predicted from seven neighbouring values (the expression below) and the
+ * prediction error is counted per interval radius, capped at maxRangeRadius-1.
+ * The first radius whose cumulative count exceeds predThreshold*totalSampleSize yields 2*(radius+1) intervals.
+ * @param oriData       input values, indexed as i*r2*r3 + j*r3 + k
+ * @param r1,r2,r3      dimension sizes
+ * @param realPrecision error bound; one interval is 2*realPrecision wide
+ * @return interval count after roundUpToPowerOf2(), raised to at least 32
+ */
+unsigned int optimize_intervals_float_3D(float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision)
+{
+	size_t plane, row, col, i, index, radiusIndex;
+	size_t r23=r2*r3;
+	size_t *intervals = (size_t*)calloc(confparams_cpr->maxRangeRadius, sizeof(size_t));
+	size_t totalSampleSize = (r1-1)*(r2-1)*(r3-1)/confparams_cpr->sampleDistance;
+
+	//float max = oriData[0];
+	//float min = oriData[0];
+
+	//sampling pass: histogram of interval radii
+	for(plane=1;plane<r1;plane++)
+	{
+		for(row=1;row<r2;row++)
+		{
+			for(col=1;col<r3;col++)
+			{
+				if((plane+row+col)%confparams_cpr->sampleDistance!=0)
+					continue;
+				index = plane*r23+row*r3+col;
+				float pred_value = oriData[index-1] + oriData[index-r3] + oriData[index-r23] 
+				- oriData[index-1-r23] - oriData[index-r3-1] - oriData[index-r3-r23] + oriData[index-r3-r23-1];
+				float pred_err = fabs(pred_value - oriData[index]);
+				radiusIndex = (pred_err/realPrecision+1)/2;
+				if(radiusIndex>=confparams_cpr->maxRangeRadius)
+					radiusIndex = confparams_cpr->maxRangeRadius - 1;
+				intervals[radiusIndex]++;
+
+				//	if (max < oriData[index]) max = oriData[index];
+				//	if (min > oriData[index]) min = oriData[index];
+			}
+		}
+	}
+	//compute the appropriate number
+	size_t targetCount = totalSampleSize*confparams_cpr->predThreshold;
+	size_t sum = 0;
+	for(i=0;i<confparams_cpr->maxRangeRadius;i++)
+		if((sum += intervals[i])>targetCount)
+			break;
+	if(i>=confparams_cpr->maxRangeRadius)
+		i = confparams_cpr->maxRangeRadius-1;
+	unsigned int accIntervals = 2*(i+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+	if(powerOf2<32)
+		powerOf2 = 32;
+
+	//	struct timeval costStart, costEnd;
+	//	double cost_est = 0;
+	//
+	//	gettimeofday(&costStart, NULL);
+	//
+	//	//compute estimate of bit-rate and distortion
+	//	double est_br = 0;
+	//	double est_psnr = 0;
+	//	double c1 = log2(targetCount)+1;
+	//	double c2 = -20.0*log10(realPrecision) + 20.0*log10(max-min) + 10.0*log10(3);
+	//
+	//	for (i = 0; i < powerOf2/2; i++)
+	//	{
+	//		int count = intervals[i];
+	//		if (count != 0)
+	//			est_br += count*log2(count);
+	//		est_psnr += count;
+	//	}
+	//
+	//	//compute estimate of bit-rate
+	//	est_br -= c1*est_psnr;
+	//	est_br /= totalSampleSize;
+	//	est_br = -est_br;
+	//
+	//	//compute estimate of psnr
+	//	est_psnr /= totalSampleSize;
+	//	printf ("sum of P(i) = %lf\n", est_psnr);
+	//	est_psnr = -10.0*log10(est_psnr);
+	//	est_psnr += c2;
+	//
+	//	printf ("estimate bitrate = %.2f\n", est_br);
+	//	printf ("estimate psnr = %.2f\n",est_psnr);
+	//
+	//	gettimeofday(&costEnd, NULL);
+	//	cost_est = ((costEnd.tv_sec*1000000+costEnd.tv_usec)-(costStart.tv_sec*1000000+costStart.tv_usec))/1000000.0;
+	//
+	//	printf ("analysis time = %f\n", cost_est);
+
+	free(intervals);
+	return powerOf2;
+}
+
+
+/**
+ * Estimates the number of quantization intervals for 4D float data (r1 x r2 x r3 x r4, r4 varying fastest).
+ * Interior points with (i+j+k+l) a multiple of confparams_cpr->sampleDistance are sampled; each is predicted
+ * from seven neighbours in the three fastest dimensions and the error is counted per interval radius.
+ * @param oriData input values; @param r1,r2,r3,r4 dimension sizes; @param realPrecision error bound
+ * @return 2*(radius+1) for the first radius covering predThreshold of the samples, after roundUpToPowerOf2(), at least 32
+ */
+unsigned int optimize_intervals_float_4D(float *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision)
+{
+	size_t i,j,k,l, index, radiusIndex;
+	size_t r234=r2*r3*r4;
+	size_t r34=r3*r4;
+	float pred_value = 0, pred_err;
+	size_t *intervals = (size_t*)calloc(confparams_cpr->maxRangeRadius, sizeof(size_t));
+	size_t totalSampleSize = (r1-1)*(r2-1)*(r3-1)*(r4-1)/confparams_cpr->sampleDistance;
+	for(i=1;i<r1;i++)
+	{
+		for(j=1;j<r2;j++)
+		{
+			for(k=1;k<r3;k++)
+			{
+				for (l=1;l<r4;l++)
+				{
+					if((i+j+k+l)%confparams_cpr->sampleDistance!=0)
+						continue;
+					index = i*r234+j*r34+k*r4+l;
+					//neighbour strides are 1 (l-1), r4 (k-1) and r34 (j-1); the k-1 neighbour is index-r4, not index-r3
+					pred_value = oriData[index-1] + oriData[index-r4] + oriData[index-r34]
+							- oriData[index-1-r34] - oriData[index-r4-1] - oriData[index-r4-r34] + oriData[index-r4-r34-1];
+					pred_err = fabs(pred_value - oriData[index]);
+					radiusIndex = (unsigned long)((pred_err/realPrecision+1)/2);
+					if(radiusIndex>=confparams_cpr->maxRangeRadius)
+						radiusIndex = confparams_cpr->maxRangeRadius - 1;
+					intervals[radiusIndex]++;
+				}
+			}
+		}
+	}
+	//compute the appropriate number
+	size_t targetCount = totalSampleSize*confparams_cpr->predThreshold;
+	size_t sum = 0;
+	for(i=0;i<confparams_cpr->maxRangeRadius;i++)
+		if((sum += intervals[i])>targetCount)
+			break;
+	if(i>=confparams_cpr->maxRangeRadius)
+		i = confparams_cpr->maxRangeRadius-1;
+	unsigned int accIntervals = 2*(i+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+	if(powerOf2<32)
+		powerOf2 = 32;
+	free(intervals);
+	return powerOf2;
+}
+
+TightDataPointStorageF* SZ_compress_float_1D_MDQ(float *oriData, 
+size_t dataLength, double realPrecision, float valueRangeSize, float medianValue_f)
+{	//Compresses a 1D float array: each value from index 2 on is predicted from last3CmprsData[0] and stored as a quantization code (type[i] != 0) or as exact data (type[i] == 0).
+#ifdef HAVE_TIMECMPR	
+	float* decData = NULL;
+	if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+		decData = (float*)(multisteps->hist_data);
+#endif	
+	//In temporal mode decData aliases multisteps->hist_data, so the decData[i] stores below record the reconstructed values there.
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+		quantization_intervals = optimize_intervals_float_1D_opt(oriData, dataLength, realPrecision);
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	updateQuantizationInfo(quantization_intervals);	
+	//NOTE: indices 0 and 1 are accessed unconditionally below, so dataLength must be at least 2.
+	size_t i;
+	int reqLength;
+	float medianValue = medianValue_f;
+	short radExpo = getExponent_float(valueRangeSize/2);
+	
+	computeReqLength_float(realPrecision, radExpo, &reqLength, &medianValue);	
+
+	int* type = (int*) malloc(dataLength*sizeof(int));
+		
+	float* spaceFillingValue = oriData; //alias of the input array
+	
+	DynamicIntArray *exactLeadNumArray;
+	new_DIA(&exactLeadNumArray, DynArrayInitLen);
+	
+	DynamicByteArray *exactMidByteArray;
+	new_DBA(&exactMidByteArray, DynArrayInitLen);
+	
+	DynamicIntArray *resiBitArray;
+	new_DIA(&resiBitArray, DynArrayInitLen);
+	
+	unsigned char preDataBytes[4];
+	intToBytes_bigEndian(preDataBytes, 0);
+	
+	int reqBytesLength = reqLength/8;
+	int resiBitsLength = reqLength%8;
+	float last3CmprsData[3] = {0};
+
+	FloatValueCompressElement *vce = (FloatValueCompressElement*)malloc(sizeof(FloatValueCompressElement));
+	LossyCompressionElement *lce = (LossyCompressionElement*)malloc(sizeof(LossyCompressionElement));
+				
+	//add the first data: always stored as exact data (type 0)
+	type[0] = 0;
+	compressSingleFloatValue(vce, spaceFillingValue[0], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+	updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+	memcpy(preDataBytes,vce->curBytes,4);
+	addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+	listAdd_float(last3CmprsData, vce->data);
+#ifdef HAVE_TIMECMPR	
+	if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+		decData[0] = vce->data;
+#endif		
+		
+	//add the second data: also always stored as exact data (type 0)
+	type[1] = 0;
+	compressSingleFloatValue(vce, spaceFillingValue[1], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+	updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+	memcpy(preDataBytes,vce->curBytes,4);
+	addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+	listAdd_float(last3CmprsData, vce->data);
+#ifdef HAVE_TIMECMPR	
+	if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+		decData[1] = vce->data;
+#endif
+	int state;
+	double checkRadius;
+	float curData;
+	float pred;
+	float predAbsErr;
+	checkRadius = (exe_params->intvCapacity-1)*realPrecision;
+	double interval = 2*realPrecision;
+	//checkRadius: largest prediction error that is still quantized; interval: width of one quantization bin
+	for(i=2;i<dataLength;i++)
+	{	
+		curData = spaceFillingValue[i];
+		//pred = 2*last3CmprsData[0] - last3CmprsData[1];
+		pred = last3CmprsData[0];
+		predAbsErr = fabs(curData - pred);	
+		if(predAbsErr<=checkRadius)
+		{
+			state = (predAbsErr/realPrecision+1)/2;
+			if(curData>=pred)
+			{
+				type[i] = exe_params->intvRadius+state;
+				pred = pred + state*interval;
+			}
+			else //curData<pred
+			{
+				type[i] = exe_params->intvRadius-state;
+				pred = pred - state*interval;
+			}
+				
+			//double-check the prediction error in case of machine-epsilon impact	
+			if(fabs(curData-pred)>realPrecision)
+			{	
+				type[i] = 0;				
+				compressSingleFloatValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,4);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);		
+				
+				listAdd_float(last3CmprsData, vce->data);	
+#ifdef HAVE_TIMECMPR					
+				if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+					decData[i] = vce->data;
+#endif					
+			}
+			else
+			{
+				listAdd_float(last3CmprsData, pred);
+#ifdef HAVE_TIMECMPR					
+				if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+					decData[i] = pred;			
+#endif	
+			}	
+			continue;
+		}
+		
+		//unpredictable data processing		
+		type[i] = 0;		
+		compressSingleFloatValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+		updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+		memcpy(preDataBytes,vce->curBytes,4);
+		addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+
+		listAdd_float(last3CmprsData, vce->data);
+#ifdef HAVE_TIMECMPR
+		if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+			decData[i] = vce->data;
+#endif	
+		
+	}//end of for
+		
+//	char* expSegmentsInBytes;
+//	int expSegmentsInBytes_size = convertESCToBytes(esc, &expSegmentsInBytes);
+	size_t exactDataNum = exactLeadNumArray->size;
+	
+	TightDataPointStorageF* tdps;
+			
+	new_TightDataPointStorageF(&tdps, dataLength, exactDataNum, 
+			type, exactMidByteArray->array, exactMidByteArray->size,  
+			exactLeadNumArray->array,  
+			resiBitArray->array, resiBitArray->size, 
+			resiBitsLength,
+			realPrecision, medianValue, (char)reqLength, quantization_intervals, NULL, 0, 0);
+
+//sdi:Debug
+/*	int sum =0;
+	for(i=0;i<dataLength;i++)
+		if(type[i]==0) sum++;
+	printf("opt_quantizations=%d, exactDataNum=%d, sum=%d\n",quantization_intervals, exactDataNum, sum);*/
+	
+	//free memory
+	free_DIA(exactLeadNumArray);
+	free_DIA(resiBitArray);
+	free(type);	
+	free(vce);
+	free(lce);	
+	free(exactMidByteArray); //exactMidByteArray->array has been released in free_TightDataPointStorageF(tdps);
+	
+	return tdps;
+}
+
+/**
+ * Builds the "lossless" fallback stream: a header followed by the raw float values (copied directly on
+ * big-endian systems, written through floatToBytes() otherwise). Layout: 3 version bytes, 1 flag byte (16 for
+ * SZ_SIZE_TYPE 4, otherwise 80), MetaDataByteLength parameter bytes, SZ_SIZE_TYPE bytes of dataLength, the values.
+ * Sets tdps->isLossless; *newByteData receives a new malloc'ed buffer of *outSize bytes (its old value is overwritten).
+ */
+void SZ_compress_args_float_StoreOriData(float* oriData, size_t dataLength, TightDataPointStorageF* tdps, 
+unsigned char** newByteData, size_t *outSize)
+{
+	const size_t payloadBytes = sizeof(float)*dataLength;
+	const size_t totalByteLength = 3 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + 1 + payloadBytes;
+	unsigned char dsLengthBytes[8];
+	unsigned char* cursor;
+	size_t idx;
+
+	tdps->isLossless = 1;
+	cursor = *newByteData = (unsigned char*)malloc(totalByteLength);
+
+	for (idx = 0; idx < 3; idx++)//3
+		*cursor++ = versionNumber[idx];
+	*cursor++ = (exe_params->SZ_SIZE_TYPE==4) ? 16 : 80; //00010000, or 01010000 where 01000000 indicates the SZ_SIZE_TYPE=8
+	convertSZParamsToBytes(confparams_cpr, cursor);
+	cursor += MetaDataByteLength;
+
+	sizeToBytes(dsLengthBytes,dataLength); //SZ_SIZE_TYPE: 4 or 8
+	for (idx = 0; idx < exe_params->SZ_SIZE_TYPE; idx++)
+		*cursor++ = dsLengthBytes[idx];
+
+	if(sysEndianType==BIG_ENDIAN_SYSTEM)
+		memcpy(cursor, oriData, payloadBytes);
+	else
+		for(idx=0;idx<dataLength;idx++,cursor+=sizeof(float))
+			floatToBytes(cursor, oriData[idx]);
+	*outSize = totalByteLength;
+}
+
+/* 1D float compression driver: builds a TightDataPointStorageF from oriData[0..dataLength), serializes it into *newByteData / *outSize, and stores the raw values instead when the serialized form is larger than the input. */
+/* Returns 1 when time-series based compression was used (temporal mode on a non-snapshot step), otherwise 0 (snapshot-based compression). */
+char SZ_compress_args_float_NoCkRngeNoGzip_1D(unsigned char** newByteData, float *oriData, size_t dataLength, double realPrecision, size_t *outSize, float valueRangeSize, float medianValue_f)
+{		
+	char compressionType = 0;	
+	TightDataPointStorageF* tdps = NULL;	
+#ifdef HAVE_TIMECMPR
+	if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+	{
+		int timestep = sz_tsc->currentStep;
+		if(timestep % confparams_cpr->snapshotCmprStep != 0)
+		{
+			tdps = SZ_compress_float_1D_MDQ_ts(oriData, dataLength, multisteps, realPrecision, valueRangeSize, medianValue_f);
+			compressionType = 1; //time-series based compression 
+		}
+		else
+		{	
+			tdps = SZ_compress_float_1D_MDQ(oriData, dataLength, realPrecision, valueRangeSize, medianValue_f);
+			compressionType = 0; //snapshot-based compression
+			multisteps->lastSnapshotStep = timestep;
+		}		
+	}
+	else
+#endif
+		tdps = SZ_compress_float_1D_MDQ(oriData, dataLength, realPrecision, valueRangeSize, medianValue_f);	
+
+	convertTDPStoFlatBytes_float(tdps, newByteData, outSize);
+	//the raw fallback covers exactly the dataLength values held by oriData (as in the 2D/3D drivers); a larger count would read past the input
+	if(*outSize>dataLength*sizeof(float))
+		SZ_compress_args_float_StoreOriData(oriData, dataLength, tdps, newByteData, outSize);
+	
+	free_TightDataPointStorageF(tdps);
+	return compressionType;
+}
+// Compresses an r1 x r2 float array (element (i,j) at oriData[i*r2+j]) into a newly built TightDataPointStorageF: each value is predicted from already reconstructed neighbours and either quantized or stored exactly.
+TightDataPointStorageF* SZ_compress_float_2D_MDQ(float *oriData, size_t r1, size_t r2, double realPrecision, float valueRangeSize, float medianValue_f)
+{
+#ifdef HAVE_TIMECMPR
+	float* decData = NULL;	//temporal mode only: points at multisteps->hist_data and receives each reconstructed value
+	if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+		decData = (float*)(multisteps->hist_data);
+#endif	
+	//interval count: derived from the data when optQuantMode==1, otherwise the configured intvCapacity
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_float_2D_opt(oriData, r1, r2, realPrecision);
+		updateQuantizationInfo(quantization_intervals);
+	}	
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	size_t i,j; 
+	int reqLength;
+	float pred1D, pred2D;
+	float diff = 0.0;
+	double itvNum = 0;
+	float *P0, *P1; //reconstructed rows: P1 = previous row (row 0 itself while row 0 is processed), P0 = row being produced
+		
+	size_t dataLength = r1*r2;	
+	
+	P0 = (float*)malloc(r2*sizeof(float));
+	memset(P0, 0, r2*sizeof(float));
+	P1 = (float*)malloc(r2*sizeof(float));
+	memset(P1, 0, r2*sizeof(float));
+		
+	float medianValue = medianValue_f;
+	short radExpo = getExponent_float(valueRangeSize/2);
+	computeReqLength_float(realPrecision, radExpo, &reqLength, &medianValue);	
+
+	int* type = (int*) malloc(dataLength*sizeof(int)); //one code per element: 0 = stored exactly, otherwise a quantization bin offset by intvRadius
+	//type[dataLength]=0;
+		
+	float* spaceFillingValue = oriData; //
+
+	DynamicIntArray *exactLeadNumArray;
+	new_DIA(&exactLeadNumArray, DynArrayInitLen);
+	
+	DynamicByteArray *exactMidByteArray;
+	new_DBA(&exactMidByteArray, DynArrayInitLen);
+	
+	DynamicIntArray *resiBitArray;
+	new_DIA(&resiBitArray, DynArrayInitLen);
+	
+	type[0] = 0;
+	unsigned char preDataBytes[4];
+	intToBytes_bigEndian(preDataBytes, 0);
+	
+	int reqBytesLength = reqLength/8; //reqLength split into whole bytes ...
+	int resiBitsLength = reqLength%8; //... and the remaining bits
+
+	FloatValueCompressElement *vce = (FloatValueCompressElement*)malloc(sizeof(FloatValueCompressElement));
+	LossyCompressionElement *lce = (LossyCompressionElement*)malloc(sizeof(LossyCompressionElement));
+	//the very first value has no predecessor, so it is always stored exactly (type 0)
+	/* Process Row-0 data 0*/
+	type[0] = 0;
+	compressSingleFloatValue(vce, spaceFillingValue[0], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+	updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+	memcpy(preDataBytes,vce->curBytes,4);
+	addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+	P1[0] = vce->data;
+#ifdef HAVE_TIMECMPR	
+	if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+		decData[0] = vce->data;
+#endif	
+	//NOTE(review): element 1 of row 0 is handled unconditionally below, so r2 >= 2 seems to be assumed (P0/P1 hold only r2 floats) - confirm callers never pass r2 == 1
+	float curData;
+
+	/* Process Row-0 data 1*/
+	pred1D = P1[0];
+	curData = spaceFillingValue[1];
+	diff = curData - pred1D;
+
+	itvNum =  fabs(diff)/realPrecision + 1;
+	//below the interval capacity the value is encoded as a quantization bin; otherwise it is stored exactly
+	if (itvNum < exe_params->intvCapacity)
+	{
+		if (diff < 0) itvNum = -itvNum;
+		type[1] = (int) (itvNum/2) + exe_params->intvRadius;
+		P1[1] = pred1D + 2 * (type[1] - exe_params->intvRadius) * realPrecision;	
+
+		//guarantee the compression error bound against machine-epsilon effects: fall back to exact storage
+		if(fabs(spaceFillingValue[1]-P1[1])>realPrecision)
+		{	
+			type[1] = 0;			
+			compressSingleFloatValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,4);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);	
+			
+			P1[1] = vce->data;
+		}		
+	}
+	else
+	{
+		type[1] = 0;
+		compressSingleFloatValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+		updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+		memcpy(preDataBytes,vce->curBytes,4);
+		addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+		P1[1] = vce->data;
+	}
+#ifdef HAVE_TIMECMPR	
+	if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+		decData[1] = P1[1];
+#endif
+	//rest of row 0: predict by linear extrapolation from the two previous reconstructed values
+    /* Process Row-0 data 2 --> data r2-1 */
+	for (j = 2; j < r2; j++)
+	{
+		pred1D = 2*P1[j-1] - P1[j-2];
+		curData = spaceFillingValue[j];
+		diff = curData - pred1D;
+
+		itvNum = fabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[j] = (int) (itvNum/2) + exe_params->intvRadius;
+			P1[j] = pred1D + 2 * (type[j] - exe_params->intvRadius) * realPrecision;
+		
+			//guarantee the compression error bound against machine-epsilon effects: fall back to exact storage
+			if(fabs(curData-P1[j])>realPrecision)
+			{	
+				type[j] = 0;				
+				compressSingleFloatValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,4);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);	
+				
+				P1[j] = vce->data;	
+			}
+		}
+		else
+		{
+			type[j] = 0;
+			compressSingleFloatValue(vce,curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,4);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P1[j] = vce->data;
+		}
+#ifdef HAVE_TIMECMPR	
+		if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+			decData[j] = P1[j];
+#endif		
+	}
+	//rows 1..r1-1: first element predicted from the element above, the others from left + above - above-left
+	/* Process Row-1 --> Row-r1-1 */
+	size_t index;
+	for (i = 1; i < r1; i++)
+	{	
+		/* Process row-i data 0 */
+		index = i*r2;
+		pred1D = P1[0];
+		curData = spaceFillingValue[index];
+		diff = curData - pred1D;
+
+		itvNum = fabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+			P0[0] = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+
+			//guarantee the compression error bound against machine-epsilon effects: fall back to exact storage
+			if(fabs(curData-P0[0])>realPrecision)
+			{	
+				type[index] = 0;				
+				compressSingleFloatValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,4);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);	
+				
+				P0[0] = vce->data;	
+			}
+		}
+		else
+		{
+			type[index] = 0;
+			compressSingleFloatValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,4);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P0[0] = vce->data;
+		}
+#ifdef HAVE_TIMECMPR	
+		if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+			decData[index] = P0[0];
+#endif
+									
+		/* Process row-i data 1 --> r2-1*/
+		for (j = 1; j < r2; j++)
+		{
+			index = i*r2+j;
+			pred2D = P0[j-1] + P1[j] - P1[j-1];
+
+			curData = spaceFillingValue[index];
+			diff = curData - pred2D;
+
+			itvNum = fabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P0[j] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			
+				//guarantee the compression error bound against machine-epsilon effects: fall back to exact storage
+				if(fabs(curData-P0[j])>realPrecision)
+				{	
+					type[index] = 0;					
+					compressSingleFloatValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+					updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+					memcpy(preDataBytes,vce->curBytes,4);
+					addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);	
+					
+					P0[j] = vce->data;	
+				}			
+			}
+			else
+			{
+				type[index] = 0;
+				compressSingleFloatValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,4);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P0[j] = vce->data;
+			}
+#ifdef HAVE_TIMECMPR	
+			if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+				decData[index] = P0[j];
+#endif			
+		}
+		//swap the row buffers: the row just produced becomes the "previous row" of the next iteration
+		float *Pt;
+		Pt = P1;
+		P1 = P0;
+		P0 = Pt;
+	}
+	//NOTE(review): P0 is deliberately not freed when r2==1; this looks tied to the r2 >= 2 assumption noted above - confirm
+	if(r2!=1)
+		free(P0);
+	free(P1);			
+	size_t exactDataNum = exactLeadNumArray->size;
+	
+	TightDataPointStorageF* tdps;
+	//hand the type codes and the three exact-value arrays to the storage object
+	new_TightDataPointStorageF(&tdps, dataLength, exactDataNum, 
+			type, exactMidByteArray->array, exactMidByteArray->size,  
+			exactLeadNumArray->array,  
+			resiBitArray->array, resiBitArray->size, 
+			resiBitsLength, 
+			realPrecision, medianValue, (char)reqLength, quantization_intervals, NULL, 0, 0);
+
+//	printf("exactDataNum=%d, expSegmentsInBytes_size=%d, exactMidByteArray->size=%d\n", 
+//			exactDataNum, expSegmentsInBytes_size, exactMidByteArray->size);
+	
+//	for(i = 3800;i<3844;i++)
+//		printf("exactLeadNumArray->array[%d]=%d\n",i,exactLeadNumArray->array[i]);
+	
+	//free memory
+	free_DIA(exactLeadNumArray);
+	free_DIA(resiBitArray);
+	free(type); //presumably new_TightDataPointStorageF has already consumed the codes - TODO confirm
+	free(vce);
+	free(lce);
+	free(exactMidByteArray); //exactMidByteArray->array has been released in free_TightDataPointStorageF(tdps);
+	
+	return tdps;	
+}
+
+/**
+ * 2D float compression driver: selects the compression routine, serializes the
+ * result into *newByteData / *outSize and returns the compression type used.
+ * Note: @r1 is high dimension, @r2 is low dimension (element (i,j) is at i*r2+j).
+ * */
+char SZ_compress_args_float_NoCkRngeNoGzip_2D(unsigned char** newByteData, float *oriData, size_t r1, size_t r2, double realPrecision, size_t *outSize, float valueRangeSize, float medianValue_f)
+{
+	const size_t dataLength = r1*r2;
+	TightDataPointStorageF* tdps = NULL;
+	char compressionType = 0; //0: snapshot-based compression, 1: time-series based compression
+
+#ifdef HAVE_TIMECMPR
+	const int temporalMode = (confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION);
+	const int timestep = temporalMode ? sz_tsc->currentStep : 0;
+
+	if(temporalMode && timestep % confparams_cpr->snapshotCmprStep != 0)
+	{
+		//non-snapshot step in temporal mode: time-series based compression over the flattened array
+		compressionType = 1;
+		tdps = SZ_compress_float_1D_MDQ_ts(oriData, dataLength, multisteps, realPrecision, valueRangeSize, medianValue_f);
+	}
+	else
+	{
+		tdps = SZ_compress_float_2D_MDQ(oriData, r1, r2, realPrecision, valueRangeSize, medianValue_f);
+		if(temporalMode)
+			multisteps->lastSnapshotStep = timestep; //record the step of this snapshot
+	}
+#else
+	tdps = SZ_compress_float_2D_MDQ(oriData, r1, r2, realPrecision, valueRangeSize, medianValue_f);
+#endif
+
+	convertTDPStoFlatBytes_float(tdps, newByteData, outSize);
+
+	//serialized form larger than the raw floats: store the original values instead
+	if(dataLength*sizeof(float) < *outSize)
+		SZ_compress_args_float_StoreOriData(oriData, dataLength, tdps, newByteData, outSize);
+
+	free_TightDataPointStorageF(tdps);
+	return compressionType;
+}
+// Compresses an r1 x r2 x r3 float array (element (k,i,j) at oriData[k*r2*r3 + i*r3 + j]) into a newly built TightDataPointStorageF: each value is predicted from already reconstructed neighbours and either quantized or stored exactly.
+TightDataPointStorageF* SZ_compress_float_3D_MDQ(float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, float valueRangeSize, float medianValue_f)
+{
+#ifdef HAVE_TIMECMPR	
+	float* decData = NULL; //temporal mode only: points at multisteps->hist_data and receives each reconstructed value
+	if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+		decData = (float*)(multisteps->hist_data);
+#endif		
+	//interval count: derived from the data when optQuantMode==1, otherwise the configured intvCapacity
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_float_3D_opt(oriData, r1, r2, r3, realPrecision);
+		updateQuantizationInfo(quantization_intervals);
+	}	
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	size_t i,j,k; 
+	int reqLength;
+	float pred1D, pred2D, pred3D;
+	float diff = 0.0;
+	double itvNum = 0;
+	float *P0, *P1; //reconstructed r2*r3 planes: P1 = previous layer (layer 0 itself while layer 0 is processed), P0 = layer being produced
+
+	size_t dataLength = r1*r2*r3;
+	size_t r23 = r2*r3;
+	P0 = (float*)malloc(r23*sizeof(float));
+	P1 = (float*)malloc(r23*sizeof(float));
+
+	float medianValue = medianValue_f;
+	short radExpo = getExponent_float(valueRangeSize/2);
+	computeReqLength_float(realPrecision, radExpo, &reqLength, &medianValue);	
+
+	int* type = (int*) malloc(dataLength*sizeof(int)); //one code per element: 0 = stored exactly, otherwise a quantization bin offset by intvRadius
+
+	float* spaceFillingValue = oriData; //
+
+	DynamicIntArray *exactLeadNumArray;
+	new_DIA(&exactLeadNumArray, DynArrayInitLen);
+
+	DynamicByteArray *exactMidByteArray;
+	new_DBA(&exactMidByteArray, DynArrayInitLen);
+
+	DynamicIntArray *resiBitArray;
+	new_DIA(&resiBitArray, DynArrayInitLen);
+
+	unsigned char preDataBytes[4];
+	intToBytes_bigEndian(preDataBytes, 0);
+	
+	int reqBytesLength = reqLength/8; //reqLength split into whole bytes ...
+	int resiBitsLength = reqLength%8; //... and the remaining bits
+
+	FloatValueCompressElement *vce = (FloatValueCompressElement*)malloc(sizeof(FloatValueCompressElement));
+	LossyCompressionElement *lce = (LossyCompressionElement*)malloc(sizeof(LossyCompressionElement));
+
+	//layer 0 is predicted inside its own plane (1D predictors on row 0, 2D predictor on later rows); results are kept in P1
+	///////////////////////////	Process layer-0 ///////////////////////////
+	/* Process Row-0 data 0*/
+	type[0] = 0;
+	compressSingleFloatValue(vce, spaceFillingValue[0], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+	updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+	memcpy(preDataBytes,vce->curBytes,4);
+	addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+	P1[0] = vce->data;
+#ifdef HAVE_TIMECMPR	
+		if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+			decData[0] = P1[0];
+#endif
+	//NOTE(review): element 1 is handled unconditionally below, so at least two elements per plane seem to be assumed (P0/P1 hold only r2*r3 floats) - confirm callers never pass r2*r3 == 1
+	float curData;
+
+	/* Process Row-0 data 1*/
+	pred1D = P1[0];
+	curData = spaceFillingValue[1];
+	diff = curData - pred1D;
+
+	itvNum = fabs(diff)/realPrecision + 1;
+	//below the interval capacity the value is encoded as a quantization bin; otherwise it is stored exactly
+	if (itvNum < exe_params->intvCapacity)
+	{
+		if (diff < 0) itvNum = -itvNum;
+		type[1] = (int) (itvNum/2) + exe_params->intvRadius;
+		P1[1] = pred1D + 2 * (type[1] - exe_params->intvRadius) * realPrecision;
+		
+		//guarantee the compression error bound against machine-epsilon effects: fall back to exact storage
+		if(fabs(curData-P1[1])>realPrecision)
+		{	
+			type[1] = 0;			
+			compressSingleFloatValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,4);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);	
+			
+			P1[1] = vce->data;	
+		}				
+	}
+	else
+	{
+		type[1] = 0;
+		compressSingleFloatValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+		updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+		memcpy(preDataBytes,vce->curBytes,4);
+		addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+		P1[1] = vce->data;
+	}
+#ifdef HAVE_TIMECMPR	
+	if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+		decData[1] = P1[1];
+#endif
+	//rest of row 0: predict by linear extrapolation from the two previous reconstructed values
+    /* Process Row-0 data 2 --> data r3-1 */
+	for (j = 2; j < r3; j++)
+	{
+		pred1D = 2*P1[j-1] - P1[j-2];
+		curData = spaceFillingValue[j];
+		diff = curData - pred1D;
+
+		itvNum = fabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[j] = (int) (itvNum/2) + exe_params->intvRadius;
+			P1[j] = pred1D + 2 * (type[j] - exe_params->intvRadius) * realPrecision;
+			
+			//guarantee the compression error bound against machine-epsilon effects: fall back to exact storage
+			if(fabs(curData-P1[j])>realPrecision)
+			{	
+				type[j] = 0;				
+				compressSingleFloatValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,4);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);	
+				
+				P1[j] = vce->data;	
+			}			
+		}
+		else
+		{
+			type[j] = 0;
+			compressSingleFloatValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,4);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P1[j] = vce->data;
+		}
+#ifdef HAVE_TIMECMPR	
+		if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+			decData[j] = P1[j];
+#endif		
+	}
+	//rows 1..r2-1 of layer 0: first element predicted from the element above, the others from left + above - above-left
+	/* Process Row-1 --> Row-r2-1 */
+	size_t index;
+	for (i = 1; i < r2; i++)
+	{
+		/* Process row-i data 0 */
+		index = i*r3;	
+		pred1D = P1[index-r3];
+		curData = spaceFillingValue[index];
+		diff = curData - pred1D;
+
+		itvNum = fabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+			P1[index] = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			
+			//guarantee the compression error bound against machine-epsilon effects: fall back to exact storage
+			if(fabs(curData-P1[index])>realPrecision)
+			{	
+				type[index] = 0;				
+				compressSingleFloatValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,4);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);	
+				
+				P1[index] = vce->data;	
+			}			
+		}
+		else
+		{
+			type[index] = 0;
+			compressSingleFloatValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,4);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P1[index] = vce->data;
+		}
+#ifdef HAVE_TIMECMPR	
+		if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+			decData[index] = P1[index];
+#endif		
+
+		/* Process row-i data 1 --> data r3-1*/
+		for (j = 1; j < r3; j++)
+		{
+			index = i*r3+j;
+			pred2D = P1[index-1] + P1[index-r3] - P1[index-r3-1];
+
+			curData = spaceFillingValue[index];
+			diff = curData - pred2D;
+
+			itvNum = fabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P1[index] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				
+				//guarantee the compression error bound against machine-epsilon effects: fall back to exact storage
+				if(fabs(curData-P1[index])>realPrecision)
+				{	
+					type[index] = 0;					
+					compressSingleFloatValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+					updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+					memcpy(preDataBytes,vce->curBytes,4);
+					addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);	
+					
+					P1[index] = vce->data;	
+				}				
+			}
+			else
+			{
+				type[index] = 0;
+				compressSingleFloatValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,4);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P1[index] = vce->data;
+			}
+#ifdef HAVE_TIMECMPR	
+			if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+				decData[index] = P1[index];
+#endif			
+		}
+	}
+
+	//later layers: element (0,0) is predicted from the same position in the previous layer, row 0 and column 0 from a 2D stencil spanning both layers, interior elements from the 7-neighbour 3D stencil
+	///////////////////////////	Process layer-1 --> layer-r1-1 ///////////////////////////
+
+	for (k = 1; k < r1; k++)
+	{
+		/* Process Row-0 data 0*/
+		index = k*r23;
+		pred1D = P1[0];
+		curData = spaceFillingValue[index];
+		diff = curData - pred1D;
+
+		itvNum = fabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+			P0[0] = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			
+			//guarantee the compression error bound against machine-epsilon effects: fall back to exact storage
+			if(fabs(curData-P0[0])>realPrecision)
+			{	
+				type[index] = 0;				
+				compressSingleFloatValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,4);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);	
+				
+				P0[0] = vce->data;	
+			}			
+		}
+		else
+		{
+			type[index] = 0;
+			compressSingleFloatValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,4);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P0[0] = vce->data;
+		}
+#ifdef HAVE_TIMECMPR	
+		if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+			decData[index] = P0[0];
+#endif
+
+	    /* Process Row-0 data 1 --> data r3-1 */
+		for (j = 1; j < r3; j++)
+		{
+			//index = k*r2*r3+j;
+			index ++;
+			pred2D = P0[j-1] + P1[j] - P1[j-1];
+			curData = spaceFillingValue[index];
+			diff = spaceFillingValue[index] - pred2D;
+
+			itvNum = fabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P0[j] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				//guarantee the compression error bound against machine-epsilon effects: fall back to exact storage
+				if(fabs(curData-P0[j])>realPrecision)
+				{	
+					type[index] = 0;					
+					compressSingleFloatValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+					updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+					memcpy(preDataBytes,vce->curBytes,4);
+					addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);	
+					
+					P0[j] = vce->data;	
+				}
+			}
+			else
+			{
+				type[index] = 0;
+				compressSingleFloatValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,4);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P0[j] = vce->data;
+			}
+#ifdef HAVE_TIMECMPR	
+			if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+				decData[index] = P0[j];
+#endif			
+		}
+
+	    /* Process Row-1 --> Row-r2-1 */
+		size_t index2D;
+		for (i = 1; i < r2; i++)
+		{
+			/* Process Row-i data 0 */
+			index = k*r23 + i*r3;
+			index2D = i*r3;		
+			pred2D = P0[index2D-r3] + P1[index2D] - P1[index2D-r3];
+			curData = spaceFillingValue[index];
+			diff = spaceFillingValue[index] - pred2D;
+
+			itvNum = fabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P0[index2D] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				//guarantee the compression error bound against machine-epsilon effects: fall back to exact storage
+				if(fabs(curData-P0[index2D])>realPrecision)
+				{	
+					type[index] = 0;					
+					compressSingleFloatValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+					updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+					memcpy(preDataBytes,vce->curBytes,4);
+					addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);	
+					
+					P0[index2D] = vce->data;	
+				}				
+			}
+			else
+			{
+				type[index] = 0;
+				compressSingleFloatValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,4);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P0[index2D] = vce->data;
+			}
+#ifdef HAVE_TIMECMPR	
+			if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+				decData[index] = P0[index2D];
+#endif			
+
+			/* Process Row-i data 1 --> data r3-1 */
+			for (j = 1; j < r3; j++)
+			{
+//				if(k==63&&i==43&&j==27)
+//					printf("i=%d\n", i);
+				//index = k*r2*r3 + i*r3 + j;			
+				index ++;
+				index2D = i*r3 + j;
+				pred3D = P0[index2D-1] + P0[index2D-r3]+ P1[index2D] - P0[index2D-r3-1] - P1[index2D-r3] - P1[index2D-1] + P1[index2D-r3-1];
+				curData = spaceFillingValue[index];
+				diff = curData - pred3D;
+
+				itvNum = fabs(diff)/realPrecision + 1;
+
+				if (itvNum < exe_params->intvCapacity)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+					P0[index2D] = pred3D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+					
+					//guarantee the compression error bound against machine-epsilon effects: fall back to exact storage
+					if(fabs(curData-P0[index2D])>realPrecision)
+					{	
+						type[index] = 0;						
+						compressSingleFloatValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+						updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+						memcpy(preDataBytes,vce->curBytes,4);
+						addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);	
+						
+						P0[index2D] = vce->data;	
+					}					
+				}
+				else
+				{
+					type[index] = 0;
+					compressSingleFloatValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+					updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+					memcpy(preDataBytes,vce->curBytes,4);
+					addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+					P0[index2D] = vce->data;
+				}
+#ifdef HAVE_TIMECMPR	
+				if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+					decData[index] = P0[index2D];
+#endif				
+			}
+		}
+		//swap the plane buffers: the layer just produced becomes the "previous layer" of the next iteration
+		float *Pt;
+		Pt = P1;
+		P1 = P0;
+		P0 = Pt;
+	}
+	if(r23!=1) //NOTE(review): P0 is deliberately not freed when r2*r3==1; this looks tied to the plane-size assumption noted above - confirm
+		free(P0);
+	free(P1);
+	size_t exactDataNum = exactLeadNumArray->size;
+
+	TightDataPointStorageF* tdps;
+	//hand the type codes and the three exact-value arrays to the storage object
+	new_TightDataPointStorageF(&tdps, dataLength, exactDataNum,
+			type, exactMidByteArray->array, exactMidByteArray->size,
+			exactLeadNumArray->array,
+			resiBitArray->array, resiBitArray->size,
+			resiBitsLength, 
+			realPrecision, medianValue, (char)reqLength, quantization_intervals, NULL, 0, 0);
+
+//sdi:Debug
+/*	int sum =0;
+	for(i=0;i<dataLength;i++)
+		if(type[i]==0) sum++;
+	printf("opt_quantizations=%d, exactDataNum=%d, sum=%d\n",quantization_intervals, exactDataNum, sum);*/
+
+
+//	printf("exactDataNum=%d, expSegmentsInBytes_size=%d, exactMidByteArray->size=%d\n",
+//			exactDataNum, expSegmentsInBytes_size, exactMidByteArray->size);
+
+	//free memory
+	free_DIA(exactLeadNumArray);
+	free_DIA(resiBitArray);
+	free(type);	//presumably new_TightDataPointStorageF has already consumed the codes - TODO confirm
+	free(vce);
+	free(lce);
+	free(exactMidByteArray); //exactMidByteArray->array has been released in free_TightDataPointStorageF(tdps);
+	
+	return tdps;	
+}
+
+/* 3D float compression driver: selects the compression routine, serializes the result into *newByteData / *outSize and returns the compression type used. */
+char SZ_compress_args_float_NoCkRngeNoGzip_3D(unsigned char** newByteData, float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t *outSize, float valueRangeSize, float medianValue_f)
+{
+	const size_t dataLength = r1*r2*r3;
+	TightDataPointStorageF* tdps = NULL;
+	char compressionType = 0; //0: snapshot-based compression, 1: time-series based compression
+
+#ifdef HAVE_TIMECMPR
+	const int temporalMode = (confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION);
+	const int timestep = temporalMode ? sz_tsc->currentStep : 0;
+
+	if(temporalMode && timestep % confparams_cpr->snapshotCmprStep != 0)
+	{
+		//non-snapshot step in temporal mode: time-series based compression over the flattened array
+		compressionType = 1;
+		tdps = SZ_compress_float_1D_MDQ_ts(oriData, dataLength, multisteps, realPrecision, valueRangeSize, medianValue_f);
+	}
+	else
+	{
+		tdps = SZ_compress_float_3D_MDQ(oriData, r1, r2, r3, realPrecision, valueRangeSize, medianValue_f);
+		if(temporalMode)
+			multisteps->lastSnapshotStep = timestep; //record the step of this snapshot
+	}
+#else
+	tdps = SZ_compress_float_3D_MDQ(oriData, r1, r2, r3, realPrecision, valueRangeSize, medianValue_f);
+#endif
+
+	convertTDPStoFlatBytes_float(tdps, newByteData, outSize);
+
+	//serialized form larger than the raw floats: store the original values instead
+	if(dataLength*sizeof(float) < *outSize)
+		SZ_compress_args_float_StoreOriData(oriData, dataLength, tdps, newByteData, outSize);
+
+	free_TightDataPointStorageF(tdps);
+	return compressionType;
+}
+
+
+TightDataPointStorageF* SZ_compress_float_4D_MDQ(float *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, float valueRangeSize, float medianValue_f)
+{	
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_float_4D(oriData, r1, r2, r3, r4, realPrecision);
+		updateQuantizationInfo(quantization_intervals);
+	}
+	else
+		quantization_intervals = exe_params->intvCapacity;
+
+	size_t i,j,k; 
+	int reqLength;
+	float pred1D, pred2D, pred3D;
+	float diff = 0.0;
+	double itvNum = 0;
+	float *P0, *P1;
+
+	size_t dataLength = r1*r2*r3*r4;
+
+	size_t r234 = r2*r3*r4;
+	size_t r34 = r3*r4;
+
+	P0 = (float*)malloc(r34*sizeof(float));
+	P1 = (float*)malloc(r34*sizeof(float));
+
+	float medianValue = medianValue_f;
+	short radExpo = getExponent_float(valueRangeSize/2);
+	computeReqLength_float(realPrecision, radExpo, &reqLength, &medianValue);
+
+	int* type = (int*) malloc(dataLength*sizeof(int));
+
+	float* spaceFillingValue = oriData; //
+
+	DynamicIntArray *exactLeadNumArray;
+	new_DIA(&exactLeadNumArray, DynArrayInitLen);
+
+	DynamicByteArray *exactMidByteArray;
+	new_DBA(&exactMidByteArray, DynArrayInitLen);
+
+	DynamicIntArray *resiBitArray;
+	new_DIA(&resiBitArray, DynArrayInitLen);
+
+	unsigned char preDataBytes[4];
+	intToBytes_bigEndian(preDataBytes, 0);
+
+	int reqBytesLength = reqLength/8;
+	int resiBitsLength = reqLength%8;
+
+	FloatValueCompressElement *vce = (FloatValueCompressElement*)malloc(sizeof(FloatValueCompressElement));
+	LossyCompressionElement *lce = (LossyCompressionElement*)malloc(sizeof(LossyCompressionElement));
+
+
+	size_t l;
+	for (l = 0; l < r1; l++)
+	{
+
+		///////////////////////////	Process layer-0 ///////////////////////////
+		/* Process Row-0 data 0*/
+		size_t index = l*r234;
+		size_t index2D = 0;
+
+		type[index] = 0;
+		compressSingleFloatValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+		updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+		memcpy(preDataBytes,vce->curBytes,4);
+		addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+		P1[index2D] = vce->data;
+
+		/* Process Row-0 data 1*/
+		index = l*r234+1;
+		index2D = 1;
+
+		pred1D = P1[index2D-1];
+		diff = spaceFillingValue[index] - pred1D;
+
+		itvNum = fabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+			P1[index2D] = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[index] = 0;
+			compressSingleFloatValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,4);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P1[index2D] = vce->data;
+		}
+
+		/* Process Row-0 data 2 --> data r4-1 */
+		for (j = 2; j < r4; j++)
+		{
+			index = l*r234+j;
+			index2D = j;
+
+			pred1D = 2*P1[index2D-1] - P1[index2D-2];
+			diff = spaceFillingValue[index] - pred1D;
+
+			itvNum = fabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P1[index2D] = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[index] = 0;
+				compressSingleFloatValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,4);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P1[index2D] = vce->data;
+			}
+		}
+
+		/* Process Row-1 --> Row-r3-1 */
+		for (i = 1; i < r3; i++)
+		{
+			/* Process row-i data 0 */
+			index = l*r234+i*r4;
+			index2D = i*r4;
+
+			pred1D = P1[index2D-r4];
+			diff = spaceFillingValue[index] - pred1D;
+
+			itvNum = fabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P1[index2D] = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[index] = 0;
+				compressSingleFloatValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,4);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P1[index2D] = vce->data;
+			}
+
+			/* Process row-i data 1 --> data r4-1*/
+			for (j = 1; j < r4; j++)
+			{
+				index = l*r234+i*r4+j;
+				index2D = i*r4+j;
+
+				pred2D = P1[index2D-1] + P1[index2D-r4] - P1[index2D-r4-1];
+
+				diff = spaceFillingValue[index] - pred2D;
+
+				itvNum = fabs(diff)/realPrecision + 1;
+
+				if (itvNum < exe_params->intvCapacity)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+					P1[index2D] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					type[index] = 0;
+					compressSingleFloatValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+					updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+					memcpy(preDataBytes,vce->curBytes,4);
+					addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+					P1[index2D] = vce->data;
+				}
+			}
+		}
+
+
+		///////////////////////////	Process layer-1 --> layer-r2-1 ///////////////////////////
+
+		for (k = 1; k < r2; k++)
+		{
+			/* Process Row-0 data 0*/
+			index = l*r234+k*r34;
+			index2D = 0;
+
+			pred1D = P1[index2D];
+			diff = spaceFillingValue[index] - pred1D;
+
+			itvNum = fabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P0[index2D] = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[index] = 0;
+				compressSingleFloatValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,4);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P0[index2D] = vce->data;
+			}
+
+			/* Process Row-0 data 1 --> data r4-1 */
+			for (j = 1; j < r4; j++)
+			{
+				index = l*r234+k*r34+j;
+				index2D = j;
+
+				pred2D = P0[index2D-1] + P1[index2D] - P1[index2D-1];
+				diff = spaceFillingValue[index] - pred2D;
+
+				itvNum = fabs(diff)/realPrecision + 1;
+
+				if (itvNum < exe_params->intvCapacity)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+					P0[index2D] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					type[index] = 0;
+					compressSingleFloatValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+					updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+					memcpy(preDataBytes,vce->curBytes,4);
+					addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+					P0[index2D] = vce->data;
+				}
+			}
+
+			/* Process Row-1 --> Row-r3-1 */
+			for (i = 1; i < r3; i++)
+			{
+				/* Process Row-i data 0 */
+				index = l*r234+k*r34+i*r4;
+				index2D = i*r4;
+
+				pred2D = P0[index2D-r4] + P1[index2D] - P1[index2D-r4];
+				diff = spaceFillingValue[index] - pred2D;
+
+				itvNum = fabs(diff)/realPrecision + 1;
+
+				if (itvNum < exe_params->intvCapacity)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+					P0[index2D] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					type[index] = 0;
+					compressSingleFloatValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+					updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+					memcpy(preDataBytes,vce->curBytes,4);
+					addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+					P0[index2D] = vce->data;
+				}
+
+				/* Process Row-i data 1 --> data r4-1 */
+				for (j = 1; j < r4; j++)
+				{
+					index = l*r234+k*r34+i*r4+j;
+					index2D = i*r4+j;
+
+					pred3D = P0[index2D-1] + P0[index2D-r4]+ P1[index2D] - P0[index2D-r4-1] - P1[index2D-r4] - P1[index2D-1] + P1[index2D-r4-1];
+					diff = spaceFillingValue[index] - pred3D;
+
+
+					itvNum = fabs(diff)/realPrecision + 1;
+
+					if (itvNum < exe_params->intvCapacity)
+					{
+						if (diff < 0) itvNum = -itvNum;
+						type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+						P0[index2D] = pred3D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+					}
+					else
+					{
+						type[index] = 0;
+						compressSingleFloatValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+						updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+						memcpy(preDataBytes,vce->curBytes,4);
+						addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+						P0[index2D] = vce->data;
+					}
+				}
+			}
+
+			float *Pt;
+			Pt = P1;
+			P1 = P0;
+			P0 = Pt;
+		}
+	}
+
+	free(P0);
+	free(P1);
+	size_t exactDataNum = exactLeadNumArray->size;
+
+	TightDataPointStorageF* tdps;
+
+	new_TightDataPointStorageF(&tdps, dataLength, exactDataNum,
+			type, exactMidByteArray->array, exactMidByteArray->size,
+			exactLeadNumArray->array,
+			resiBitArray->array, resiBitArray->size,
+			resiBitsLength,
+			realPrecision, medianValue, (char)reqLength, quantization_intervals, NULL, 0, 0);
+
+	//free memory
+	free_DIA(exactLeadNumArray);
+	free_DIA(resiBitArray);
+	free(type);
+	free(vce);
+	free(lce);
+	free(exactMidByteArray); //exactMidByteArray->array has been released in free_TightDataPointStorageF(tdps);
+
+	return tdps;
+}
+
+/*
+ * Compress a 4D float array of r1*r2*r3*r4 elements and flatten the result
+ * into *newByteData, reporting its length in *outSize.
+ *
+ * If the flattened stream is larger than the raw input
+ * (element count * sizeof(float)), SZ_compress_args_float_StoreOriData is
+ * called instead so the original data is stored.
+ *
+ * Always returns 0.
+ */
+char SZ_compress_args_float_NoCkRngeNoGzip_4D(unsigned char** newByteData, float *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, size_t *outSize, float valueRangeSize, float medianValue_f)
+{
+	TightDataPointStorageF* tdps = SZ_compress_float_4D_MDQ(oriData, r1, r2, r3, r4, realPrecision, valueRangeSize, medianValue_f);
+
+	convertTDPStoFlatBytes_float(tdps, newByteData, outSize);
+
+	/* Keep the element count in size_t: an int would truncate (or turn
+	 * negative) for large arrays, which breaks both the size comparison
+	 * below and the length handed to the store-original fallback. */
+	size_t dataLength = r1*r2*r3*r4;
+	if(*outSize>dataLength*sizeof(float))
+		SZ_compress_args_float_StoreOriData(oriData, dataLength, tdps, newByteData, outSize);
+
+	free_TightDataPointStorageF(tdps);
+
+	return 0;
+}
+
+/*
+ * Build the compressed stream for a series that is represented by a single
+ * value: the storage object is flagged allSameData and carries only the
+ * 4 bytes of oriData[0]. Callers in this file use it when the value range
+ * does not exceed the error bound.
+ *
+ * newByteData receives the flattened bytes, outSize their length.
+ */
+void SZ_compress_args_float_withinRange(unsigned char** newByteData, float *oriData, size_t dataLength, size_t *outSize)
+{
+	TightDataPointStorageF* tdps = (TightDataPointStorageF*) malloc(sizeof(TightDataPointStorageF));
+
+	/* Header-like fields. */
+	tdps->allSameData = 1;
+	tdps->isLossless = 0;
+	tdps->dataSeriesLength = dataLength;
+
+	/* None of the per-element arrays are populated in this mode. */
+	tdps->rtypeArray = NULL;
+	tdps->typeArray = NULL;
+	tdps->leadNumArray = NULL;
+	tdps->residualMidBits = NULL;
+	tdps->pwrErrBoundBytes = NULL;
+
+	/* The only payload: the first input value, serialized to 4 bytes. */
+	tdps->exactMidBytes_size = 4;
+	tdps->exactMidBytes = (unsigned char*)malloc(sizeof(unsigned char)*4);
+	floatToBytes(tdps->exactMidBytes, oriData[0]);
+
+	size_t flatSize;
+	convertTDPStoFlatBytes_float(tdps, newByteData, &flatSize);
+	*outSize = flatSize;
+
+	free_TightDataPointStorageF(tdps);
+}
+
+/*
+ * Compress a float array without the final gzip stage, computing the value
+ * range here first.
+ *
+ * r5..r1      dimension sizes; unused leading dimensions are passed as 0
+ *             (r2==0 selects 1D, r3==0 selects 2D, and so on).
+ * errBoundMode, absErr_Bound, relBoundRatio
+ *             forwarded to getRealPrecision_float to derive the precision.
+ * pwrErrRatio only used by the 1D point-wise-relative path.
+ * newByteData / outSize receive the compressed stream and its length.
+ *
+ * Returns the status produced by getRealPrecision_float (initially SZ_SCES),
+ * or SZ_DERR when r5 != 0, in which case nothing is compressed and the
+ * outputs are set to NULL / 0.
+ *
+ * 4D input (r5==0, r4!=0) is handled by the 3D routines with the two
+ * outermost dimensions merged into r4*r3.
+ */
+int SZ_compress_args_float_wRngeNoGzip(unsigned char** newByteData, float *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio, double pwrErrRatio)
+{
+	int status = SZ_SCES;
+	size_t dataLength = computeDataLength(r5,r4,r3,r2,r1);
+	float valueRangeSize = 0, medianValue = 0;
+	
+	float min = computeRangeSize_float(oriData, dataLength, &valueRangeSize, &medianValue);
+	float max = min+valueRangeSize;
+	double realPrecision = getRealPrecision_float(valueRangeSize, errBoundMode, absErr_Bound, relBoundRatio, &status);
+		
+	if(valueRangeSize <= realPrecision)
+	{
+		/* The whole range fits inside the error bound. */
+		SZ_compress_args_float_withinRange(newByteData, oriData, dataLength, outSize);
+	}
+	else
+	{
+		if(r5==0&&r4==0&&r3==0&&r2==0)
+		{
+			if(errBoundMode>=PW_REL)
+			{	
+				SZ_compress_args_float_NoCkRngeNoGzip_1D_pwrgroup(newByteData, oriData, r1, absErr_Bound, relBoundRatio, pwrErrRatio, valueRangeSize, medianValue, outSize);
+			}
+			else
+				SZ_compress_args_float_NoCkRngeNoGzip_1D(newByteData, oriData, r1, realPrecision, outSize, valueRangeSize, medianValue);
+		}
+		else if(r5==0&&r4==0&&r3==0)
+		{
+			if(errBoundMode>=PW_REL)
+				SZ_compress_args_float_NoCkRngeNoGzip_2D_pwr(newByteData, oriData, realPrecision, r2, r1, outSize, min, max);
+			else
+				SZ_compress_args_float_NoCkRngeNoGzip_2D(newByteData, oriData, r2, r1, realPrecision, outSize, valueRangeSize, medianValue);
+		}
+		else if(r5==0&&r4==0)
+		{
+			if(errBoundMode>=PW_REL)
+				SZ_compress_args_float_NoCkRngeNoGzip_3D_pwr(newByteData, oriData, realPrecision, r3, r2, r1, outSize, min, max);
+			else
+				SZ_compress_args_float_NoCkRngeNoGzip_3D(newByteData, oriData, r3, r2, r1, realPrecision, outSize, valueRangeSize, medianValue);
+		}
+		else if(r5==0)
+		{
+			/* 4D data: the two outermost dimensions are merged (r4*r3). */
+			if(errBoundMode>=PW_REL)
+				SZ_compress_args_float_NoCkRngeNoGzip_3D_pwr(newByteData, oriData, realPrecision, r4*r3, r2, r1, outSize, min, max);
+			else
+				SZ_compress_args_float_NoCkRngeNoGzip_3D(newByteData, oriData, r4*r3, r2, r1, realPrecision, outSize, valueRangeSize, medianValue);
+		}
+		else
+		{
+			/* r5 != 0: previously this fell through silently, leaving the
+			 * outputs unset while still reporting success. */
+			printf("Error: doesn't support 5 dimensions for now.\n");
+			*newByteData = NULL;
+			*outSize = 0;
+			status = SZ_DERR; //dimension error
+		}
+	}
+	return status;
+}
+
+/*
+ * Top-level float compression entry point.
+ *
+ * r5..r1      dimension sizes; unused leading dimensions are passed as 0
+ *             (r2==0 selects 1D, r3==0 2D, r4==0 3D, r5==0 4D).
+ * errBoundMode, absErr_Bound, relBoundRatio, pwRelBoundRatio
+ *             error-bound configuration; errBoundMode is also stored in
+ *             confparams_cpr->errorBoundMode.
+ * newByteData / outSize receive the compressed stream and its length.
+ *
+ * Flow: tiny inputs (<= MIN_NUM_OF_ELEMENTS) bypass compression; data whose
+ * range fits inside the precision is stored as a single value; everything
+ * else is dispatched by dimensionality and then, depending on
+ * confparams_cpr->szMode, either returned as is (SZ_BEST_SPEED) or passed
+ * through zlib_compress5.
+ *
+ * Returns SZ_SCES (or whatever getRealPrecision_float wrote into status),
+ * SZ_DERR for 5-D input, SZ_MERR for an unknown szMode. On SZ_DERR and
+ * SZ_MERR the outputs are set to NULL / 0.
+ *
+ * NOTE(review): the unsupported PW_REL + AVG + 3D combination terminates the
+ * whole process via exit(0), i.e. with a success exit code; the return after
+ * it is unreachable. Left unchanged here because callers may rely on it.
+ */
+int SZ_compress_args_float(unsigned char** newByteData, float *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio, double pwRelBoundRatio)
+{
+	confparams_cpr->errorBoundMode = errBoundMode;
+	if(errBoundMode==PW_REL)
+	{
+		confparams_cpr->pw_relBoundRatio = pwRelBoundRatio;	
+		if(confparams_cpr->pwr_type==SZ_PWR_AVG_TYPE && r3 != 0 )
+		{
+			printf("Error: Current version doesn't support 3D data compression with point-wise relative error bound being based on pwrType=AVG\n");
+			exit(0);
+			return SZ_NSCS;
+		}
+	}
+	int status = SZ_SCES;
+	size_t dataLength = computeDataLength(r5,r4,r3,r2,r1);
+	
+	if(dataLength <= MIN_NUM_OF_ELEMENTS)
+	{
+		*newByteData = SZ_skip_compress_float(oriData, dataLength, outSize);
+		return status;
+	}
+	
+	float valueRangeSize = 0, medianValue = 0;
+	
+	float min = computeRangeSize_float(oriData, dataLength, &valueRangeSize, &medianValue);
+	float max = min+valueRangeSize;
+	double realPrecision = 0; 
+	
+	if(confparams_cpr->errorBoundMode==PSNR)
+	{
+		/* PSNR mode is converted into an equivalent absolute bound. */
+		confparams_cpr->errorBoundMode = ABS;
+		realPrecision = confparams_cpr->absErrBound = computeABSErrBoundFromPSNR(confparams_cpr->psnr, (double)confparams_cpr->predThreshold, (double)valueRangeSize);
+	}
+	else
+		realPrecision = getRealPrecision_float(valueRangeSize, errBoundMode, absErr_Bound, relBoundRatio, &status);
+		
+	if(valueRangeSize <= realPrecision)
+	{
+		SZ_compress_args_float_withinRange(newByteData, oriData, dataLength, outSize);
+	}
+	else
+	{
+		size_t tmpOutSize = 0;
+		/* Initialised so that no path can ever hand an indeterminate
+		 * pointer to free() or to the caller. */
+		unsigned char* tmpByteData = NULL;
+		
+		if (r2==0)
+		{
+			if(confparams_cpr->errorBoundMode>=PW_REL)
+			{
+				SZ_compress_args_float_NoCkRngeNoGzip_1D_pwrgroup(&tmpByteData, oriData, r1, absErr_Bound, relBoundRatio, pwRelBoundRatio, 
+				valueRangeSize, medianValue, &tmpOutSize);
+			}
+			else
+#ifdef HAVE_TIMECMPR
+				if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)
+					multisteps->compressionType = SZ_compress_args_float_NoCkRngeNoGzip_1D(&tmpByteData, oriData, r1, realPrecision, &tmpOutSize, valueRangeSize, medianValue);
+				else
+#endif				
+					SZ_compress_args_float_NoCkRngeNoGzip_1D(&tmpByteData, oriData, r1, realPrecision, &tmpOutSize, valueRangeSize, medianValue);
+		}
+		else
+		if (r3==0)
+		{			
+			if(confparams_cpr->errorBoundMode>=PW_REL)
+				SZ_compress_args_float_NoCkRngeNoGzip_2D_pwr(&tmpByteData, oriData, realPrecision, r2, r1, &tmpOutSize, min, max);
+			else
+#ifdef HAVE_TIMECMPR
+				if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)				
+					multisteps->compressionType = SZ_compress_args_float_NoCkRngeNoGzip_2D(&tmpByteData, oriData, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, medianValue);
+				else
+#endif
+					SZ_compress_args_float_NoCkRngeNoGzip_2D(&tmpByteData, oriData, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, medianValue);
+		}
+		else
+		if (r4==0)
+		{
+			if(confparams_cpr->errorBoundMode>=PW_REL)
+				SZ_compress_args_float_NoCkRngeNoGzip_3D_pwr(&tmpByteData, oriData, realPrecision, r3, r2, r1, &tmpOutSize, min, max);
+			else
+#ifdef HAVE_TIMECMPR
+				if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)				
+					multisteps->compressionType = SZ_compress_args_float_NoCkRngeNoGzip_3D(&tmpByteData, oriData, r3, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, medianValue);
+				else
+#endif
+					SZ_compress_args_float_NoCkRngeNoGzip_3D(&tmpByteData, oriData, r3, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, medianValue);
+		}
+		else
+		if (r5==0)
+		{
+			/* No 4D point-wise-relative routine yet: the 3D one is used with
+			 * the two outermost dimensions merged (r4*r3). */
+			if(confparams_cpr->errorBoundMode>=PW_REL)		
+				SZ_compress_args_float_NoCkRngeNoGzip_3D_pwr(&tmpByteData, oriData, realPrecision, r4*r3, r2, r1, &tmpOutSize, min, max);
+			else
+#ifdef HAVE_TIMECMPR
+				if(confparams_cpr->szMode == SZ_TEMPORAL_COMPRESSION)				
+					multisteps->compressionType = SZ_compress_args_float_NoCkRngeNoGzip_4D(&tmpByteData, oriData, r4, r3, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, medianValue);
+				else
+#endif
+					SZ_compress_args_float_NoCkRngeNoGzip_4D(&tmpByteData, oriData, r4, r3, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, medianValue);
+		}
+		else
+		{
+			printf("Error: doesn't support 5 dimensions for now.\n");
+			/* Nothing was compressed, so there is no buffer to post-process.
+			 * Falling through (as before) would pass an uninitialised
+			 * pointer to the caller or to zlib_compress5()/free(). */
+			*newByteData = NULL;
+			*outSize = 0;
+			return SZ_DERR; //dimension error
+		}
+		//Call Gzip to do the further compression.
+		if(confparams_cpr->szMode==SZ_BEST_SPEED)
+		{
+			/* Ownership of the intermediate buffer moves to the caller. */
+			*outSize = tmpOutSize;
+			*newByteData = tmpByteData;
+		}
+		else if(confparams_cpr->szMode==SZ_BEST_COMPRESSION || confparams_cpr->szMode==SZ_DEFAULT_COMPRESSION || confparams_cpr->szMode==SZ_TEMPORAL_COMPRESSION)
+		{
+			*outSize = zlib_compress5(tmpByteData, tmpOutSize, newByteData, confparams_cpr->gzipMode);
+			free(tmpByteData);
+		}
+		else
+		{
+			printf("Error: Wrong setting of confparams_cpr->szMode in the float compression.\n");
+			/* The intermediate buffer is neither returned nor gzipped on
+			 * this path; release it instead of leaking it. */
+			free(tmpByteData);
+			*newByteData = NULL;
+			*outSize = 0;
+			status = SZ_MERR; //mode error			
+		}
+	}
+	
+	return status;
+}
+
+
+/*
+ * Derive the number of bits to keep per float value.
+ *
+ * The length is 9 + radExpo - reqExpo, where reqExpo comes from
+ * getPrecisionReqLength_double(realPrecision), clamped to [9, 32].
+ * When the upper clamp is hit, *medianValue is reset to 0.
+ *
+ * reqLength   out: resulting bit length.
+ * medianValue in/out: only modified when the length saturates at 32.
+ */
+void computeReqLength_float(double realPrecision, short radExpo, int* reqLength, float* medianValue)
+{
+	short reqExpo = getPrecisionReqLength_double(realPrecision);
+	int bits = 9 + radExpo - reqExpo; /* radExpo - reqExpo == required mantissa length */
+
+	if(bits < 9)
+	{
+		bits = 9;
+	}
+	else if(bits > 32)
+	{
+		bits = 32;
+		*medianValue = 0;
+	}
+
+	*reqLength = bits;
+}
+
+//TODO
+/*
+ * Compress a sub-block of a float array.
+ *
+ * r5..r1  dimension sizes of the full array; unused leading dimensions are 0.
+ * s5..s1  start indices and e5..e1 end indices of the sub-block.
+ * compressedBytes / outSize  destination buffer and resulting length.
+ *
+ * Returns the status written by getRealPrecision_float (initially SZ_SCES),
+ * or SZ_DERR for 5-D input.
+ *
+ * Not implemented yet (nothing is written to the outputs in these cases):
+ *   - value range not larger than the precision;
+ *   - point-wise relative error bounds (errBoundMode >= PW_REL), which only
+ *     print a message.
+ */
+int SZ_compress_args_float_subblock(unsigned char* compressedBytes, float *oriData,
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1,
+size_t s5, size_t s4, size_t s3, size_t s2, size_t s1,
+size_t e5, size_t e4, size_t e3, size_t e2, size_t e1,
+size_t *outSize, int errBoundMode, double absErr_Bound, double relBoundRatio)
+{
+	int status = SZ_SCES;
+	float valueRangeSize = 0, medianValue = 0;
+	computeRangeSize_float_subblock(oriData, &valueRangeSize, &medianValue, r5, r4, r3, r2, r1, s5, s4, s3, s2, s1, e5, e4, e3, e2, e1);
+
+	double realPrecision = getRealPrecision_float(valueRangeSize, errBoundMode, absErr_Bound, relBoundRatio, &status);
+
+	/* TODO: SZ_compress_args_float_withinRange_subblock() */
+	if(valueRangeSize <= realPrecision)
+		return status;
+
+	/* Number of dimensions: the first zero extent (from r2 upwards) ends it. */
+	int nbDims;
+	if (r2==0)
+		nbDims = 1;
+	else if (r3==0)
+		nbDims = 2;
+	else if (r4==0)
+		nbDims = 3;
+	else if (r5==0)
+		nbDims = 4;
+	else
+		nbDims = 5;
+
+	if (nbDims==5)
+	{
+		printf("Error: doesn't support 5 dimensions for now.\n");
+		status = SZ_DERR; //dimension error
+		return status;
+	}
+
+	/* TODO: point-wise relative variants of the sub-block compressors. */
+	if(errBoundMode>=PW_REL)
+	{
+		printf ("Current subblock version does not support point-wise relative error bound.\n");
+		return status;
+	}
+
+	switch (nbDims)
+	{
+	case 1:
+		SZ_compress_args_float_NoCkRnge_1D_subblock(compressedBytes, oriData, realPrecision, outSize, valueRangeSize, medianValue, r1, s1, e1);
+		break;
+	case 2:
+		SZ_compress_args_float_NoCkRnge_2D_subblock(compressedBytes, oriData, realPrecision, outSize, valueRangeSize, medianValue, r2, r1, s2, s1, e2, e1);
+		break;
+	case 3:
+		SZ_compress_args_float_NoCkRnge_3D_subblock(compressedBytes, oriData, realPrecision, outSize, valueRangeSize, medianValue, r3, r2, r1, s3, s2, s1, e3, e2, e1);
+		break;
+	default: /* 4 */
+		SZ_compress_args_float_NoCkRnge_4D_subblock(compressedBytes, oriData, realPrecision, outSize, valueRangeSize, medianValue, r4, r3, r2, r1, s4, s3, s2, s1, e4, e3, e2, e1);
+		break;
+	}
+
+	return status;
+}
+
+/*
+ * Compress the 1D sub-range [s1, e1] of a float array of length r1.
+ *
+ * compressedBytes is passed by value, so the result is written into memory
+ * supplied by the caller (required capacity is not visible from here).
+ * outSize receives the number of bytes produced.
+ *
+ * SZ_BEST_SPEED writes the flattened stream directly; SZ_BEST_COMPRESSION and
+ * SZ_DEFAULT_COMPRESSION flatten into a temporary buffer and run
+ * zlib_compress3 on it. Any other szMode only prints an error and leaves the
+ * outputs untouched.
+ */
+void SZ_compress_args_float_NoCkRnge_1D_subblock(unsigned char* compressedBytes, float *oriData, double realPrecision, size_t *outSize, float valueRangeSize, float medianValue_f,
+size_t r1, size_t s1, size_t e1)
+{
+	TightDataPointStorageF* tdps = SZ_compress_float_1D_MDQ_subblock(oriData, realPrecision, valueRangeSize, medianValue_f, r1, s1, e1);
+
+	if (confparams_cpr->szMode==SZ_BEST_SPEED)
+		convertTDPStoFlatBytes_float_args(tdps, compressedBytes, outSize);
+	else if(confparams_cpr->szMode==SZ_BEST_COMPRESSION || confparams_cpr->szMode==SZ_DEFAULT_COMPRESSION)
+	{
+		unsigned char *tmpCompBytes;
+		size_t tmpOutSize;
+		convertTDPStoFlatBytes_float(tdps, &tmpCompBytes, &tmpOutSize);
+		*outSize = zlib_compress3(tmpCompBytes, tmpOutSize, compressedBytes, confparams_cpr->gzipMode);
+		free(tmpCompBytes);
+	}
+	else
+	{
+		/* This is the float path; the message used to say "double". */
+		printf ("Error: Wrong setting of confparams_cpr->szMode in the float compression.\n");
+	}
+
+	//TODO: fall back to storing the original data when the compressed
+	//      output is larger than the raw sub-block.
+
+	free_TightDataPointStorageF(tdps);
+}
+
+/*
+ * Compress a 2D sub-block (start s2/s1, end e2/e1) of an r2 x r1 float array.
+ *
+ * compressedBytes is passed by value, so the result is written into memory
+ * supplied by the caller (required capacity is not visible from here).
+ * outSize receives the number of bytes produced.
+ *
+ * SZ_BEST_SPEED writes the flattened stream directly; SZ_BEST_COMPRESSION and
+ * SZ_DEFAULT_COMPRESSION flatten into a temporary buffer and run
+ * zlib_compress3 on it. Any other szMode only prints an error and leaves the
+ * outputs untouched.
+ */
+void SZ_compress_args_float_NoCkRnge_2D_subblock(unsigned char* compressedBytes, float *oriData, double realPrecision, size_t *outSize, float valueRangeSize, float medianValue_f,
+size_t r2, size_t r1, size_t s2, size_t s1, size_t e2, size_t e1)
+{
+	TightDataPointStorageF* tdps = SZ_compress_float_2D_MDQ_subblock(oriData, realPrecision, valueRangeSize, medianValue_f, r2, r1, s2, s1, e2, e1);
+
+	if (confparams_cpr->szMode==SZ_BEST_SPEED)
+		convertTDPStoFlatBytes_float_args(tdps, compressedBytes, outSize);
+	else if(confparams_cpr->szMode==SZ_BEST_COMPRESSION || confparams_cpr->szMode==SZ_DEFAULT_COMPRESSION)
+	{
+		unsigned char *tmpCompBytes;
+		size_t tmpOutSize;
+		convertTDPStoFlatBytes_float(tdps, &tmpCompBytes, &tmpOutSize);
+		*outSize = zlib_compress3(tmpCompBytes, tmpOutSize, compressedBytes, confparams_cpr->gzipMode);
+		free(tmpCompBytes);
+	}
+	else
+	{
+		/* This is the float path; the message used to say "double". */
+		printf ("Error: Wrong setting of confparams_cpr->szMode in the float compression.\n");
+	}
+
+	//TODO: fall back to storing the original data when the compressed
+	//      output is larger than the raw sub-block.
+
+	free_TightDataPointStorageF(tdps);
+}
+
+/*
+ * Compress a 3D sub-block (start s3/s2/s1, end e3/e2/e1) of an
+ * r3 x r2 x r1 float array.
+ *
+ * compressedBytes is passed by value, so the result is written into memory
+ * supplied by the caller (required capacity is not visible from here).
+ * outSize receives the number of bytes produced.
+ *
+ * SZ_BEST_SPEED writes the flattened stream directly; SZ_BEST_COMPRESSION and
+ * SZ_DEFAULT_COMPRESSION flatten into a temporary buffer and run
+ * zlib_compress3 on it. Any other szMode only prints an error and leaves the
+ * outputs untouched.
+ */
+void SZ_compress_args_float_NoCkRnge_3D_subblock(unsigned char* compressedBytes, float *oriData, double realPrecision, size_t *outSize, float valueRangeSize, float medianValue_f,
+size_t r3, size_t r2, size_t r1, size_t s3, size_t s2, size_t s1, size_t e3, size_t e2, size_t e1)
+{
+	TightDataPointStorageF* tdps = SZ_compress_float_3D_MDQ_subblock(oriData, realPrecision, valueRangeSize, medianValue_f, r3, r2, r1, s3, s2, s1, e3, e2, e1);
+
+	if (confparams_cpr->szMode==SZ_BEST_SPEED)
+		convertTDPStoFlatBytes_float_args(tdps, compressedBytes, outSize);
+	else if(confparams_cpr->szMode==SZ_BEST_COMPRESSION || confparams_cpr->szMode==SZ_DEFAULT_COMPRESSION)
+	{
+		unsigned char *tmpCompBytes;
+		size_t tmpOutSize;
+		convertTDPStoFlatBytes_float(tdps, &tmpCompBytes, &tmpOutSize);
+		*outSize = zlib_compress3(tmpCompBytes, tmpOutSize, compressedBytes, confparams_cpr->gzipMode);
+		free(tmpCompBytes);
+	}
+	else
+	{
+		/* This is the float path; the message used to say "double". */
+		printf ("Error: Wrong setting of confparams_cpr->szMode in the float compression.\n");
+	}
+
+	//TODO: fall back to storing the original data when the compressed
+	//      output is larger than the raw sub-block.
+
+	free_TightDataPointStorageF(tdps);
+}
+
+/*
+ * Compress a 4D sub-block (start s4..s1, end e4..e1) of an
+ * r4 x r3 x r2 x r1 float array.
+ *
+ * compressedBytes is passed by value, so the result is written into memory
+ * supplied by the caller (required capacity is not visible from here).
+ * outSize receives the number of bytes produced.
+ *
+ * SZ_BEST_SPEED writes the flattened stream directly; SZ_BEST_COMPRESSION and
+ * SZ_DEFAULT_COMPRESSION flatten into a temporary buffer and run
+ * zlib_compress3 on it. Any other szMode only prints an error and leaves the
+ * outputs untouched.
+ */
+void SZ_compress_args_float_NoCkRnge_4D_subblock(unsigned char* compressedBytes, float *oriData, double realPrecision, size_t *outSize, float valueRangeSize, float medianValue_f,
+size_t r4, size_t r3, size_t r2, size_t r1, size_t s4, size_t s3, size_t s2, size_t s1, size_t e4, size_t e3, size_t e2, size_t e1)
+{
+	TightDataPointStorageF* tdps = SZ_compress_float_4D_MDQ_subblock(oriData, realPrecision, valueRangeSize, medianValue_f, r4, r3, r2, r1, s4, s3, s2, s1, e4, e3, e2, e1);
+
+	if (confparams_cpr->szMode==SZ_BEST_SPEED)
+		convertTDPStoFlatBytes_float_args(tdps, compressedBytes, outSize);
+	else if(confparams_cpr->szMode==SZ_BEST_COMPRESSION || confparams_cpr->szMode==SZ_DEFAULT_COMPRESSION)
+	{
+		unsigned char *tmpCompBytes;
+		size_t tmpOutSize;
+		convertTDPStoFlatBytes_float(tdps, &tmpCompBytes, &tmpOutSize);
+		*outSize = zlib_compress3(tmpCompBytes, tmpOutSize, compressedBytes, confparams_cpr->gzipMode);
+		free(tmpCompBytes);
+	}
+	else
+	{
+		/* This is the float path; the message used to say "double". */
+		printf ("Error: Wrong setting of confparams_cpr->szMode in the float compression.\n");
+	}
+
+	//TODO: fall back to storing the original data when the compressed
+	//      output is larger than the raw sub-block.
+
+	free_TightDataPointStorageF(tdps);
+
+}
+
+/*
+ * Estimate how many quantization intervals are needed for the 1D sub-range
+ * [s1, e1] of oriData.
+ *
+ * Every sampleDistance-th element (from offset 2 on) is predicted from its
+ * two predecessors as 2*x[i-1] - x[i-2]; the prediction error, measured in
+ * units of realPrecision, is binned into a histogram with maxRangeRadius
+ * bins (the last bin collects overflow). The smallest radius whose
+ * cumulative count exceeds predThreshold * (sample count) is doubled and
+ * rounded up to a power of two.
+ *
+ * r1 is not used. Returns the interval count, never less than 32.
+ */
+unsigned int optimize_intervals_float_1D_subblock(float *oriData, double realPrecision, size_t r1, size_t s1, size_t e1)
+{
+	size_t blkLen = e1 - s1 + 1;
+	float *blk = oriData + s1;
+
+	int *hist = (int*)malloc(confparams_cpr->maxRangeRadius*sizeof(int));
+	memset(hist, 0, confparams_cpr->maxRangeRadius*sizeof(int));
+
+	size_t pos;
+	for(pos=2;pos<blkLen;pos++)
+	{
+		if(pos%confparams_cpr->sampleDistance!=0)
+			continue;
+
+		float predicted = 2*blk[pos-1] - blk[pos-2];
+		float absErr = fabs(predicted - blk[pos]);
+		unsigned long bin = (unsigned long)((absErr/realPrecision+1)/2);
+		if(bin>=confparams_cpr->maxRangeRadius)
+			bin = confparams_cpr->maxRangeRadius - 1;
+		hist[bin]++;
+	}
+
+	/* Find the first radius whose cumulative count passes the target. */
+	size_t sampleCount = blkLen/confparams_cpr->sampleDistance;
+	size_t targetCount = sampleCount*confparams_cpr->predThreshold;
+	size_t covered = 0;
+	size_t radius = 0;
+	while(radius<confparams_cpr->maxRangeRadius)
+	{
+		covered += hist[radius];
+		if(covered>targetCount)
+			break;
+		radius++;
+	}
+	if(radius>=confparams_cpr->maxRangeRadius)
+		radius = confparams_cpr->maxRangeRadius-1;
+
+	free(hist);
+
+	unsigned int accIntervals = 2*(radius+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+
+	return powerOf2<32 ? 32 : powerOf2;
+}
+
+/*
+ * Estimate how many quantization intervals are needed for a 2D sub-block.
+ *
+ * The array is indexed as i*r2 + j; the sub-block spans i in [s1, e1] and
+ * j in [s2, e2]. Points with i > s1, j > s2 and (i+j) divisible by
+ * sampleDistance are predicted from three already-visited neighbours
+ * (left + upper - upper-left); the prediction error, in units of
+ * realPrecision, is binned into a histogram with maxRangeRadius bins (the
+ * last bin collects overflow). The smallest radius whose cumulative count
+ * exceeds predThreshold * (sample count) is doubled and rounded up to a
+ * power of two.
+ *
+ * r1 is not used. Returns the interval count, never less than 32.
+ */
+unsigned int optimize_intervals_float_2D_subblock(float *oriData, double realPrecision, size_t r1, size_t r2, size_t s1, size_t s2, size_t e1, size_t e2)
+{
+	size_t rows = e1 - s1 + 1;
+	size_t cols = e2 - s2 + 1;
+
+	int *hist = (int*)malloc(confparams_cpr->maxRangeRadius*sizeof(int));
+	memset(hist, 0, confparams_cpr->maxRangeRadius*sizeof(int));
+
+	size_t row, col;
+	for(row=s1+1;row<=e1;row++)
+	{
+		for(col=s2+1;col<=e2;col++)
+		{
+			if((row+col)%confparams_cpr->sampleDistance!=0)
+				continue;
+
+			size_t at = row*r2+col;
+			float predicted = oriData[at-1] + oriData[at-r2] - oriData[at-r2-1];
+			float absErr = fabs(predicted - oriData[at]);
+			unsigned long bin = (unsigned long)((absErr/realPrecision+1)/2);
+			if(bin>=confparams_cpr->maxRangeRadius)
+				bin = confparams_cpr->maxRangeRadius - 1;
+			hist[bin]++;
+		}
+	}
+
+	/* Find the first radius whose cumulative count passes the target. */
+	size_t sampleCount = rows*cols/confparams_cpr->sampleDistance;
+	size_t targetCount = sampleCount*confparams_cpr->predThreshold;
+	size_t covered = 0;
+	size_t radius = 0;
+	while(radius<confparams_cpr->maxRangeRadius)
+	{
+		covered += hist[radius];
+		if(covered>targetCount)
+			break;
+		radius++;
+	}
+	if(radius>=confparams_cpr->maxRangeRadius)
+		radius = confparams_cpr->maxRangeRadius-1;
+
+	free(hist);
+
+	unsigned int accIntervals = 2*(radius+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+
+	return powerOf2<32 ? 32 : powerOf2;
+}
+
+/*
+ * Estimate how many quantization intervals are needed for a 3D sub-block.
+ *
+ * The array is indexed as i*r2*r3 + j*r3 + k; the sub-block spans
+ * i in [s1, e1], j in [s2, e2], k in [s3, e3]. Points strictly past the
+ * starting faces whose (i+j+k) is divisible by sampleDistance are predicted
+ * from their seven already-visited neighbours; the prediction error, in
+ * units of realPrecision, is binned into a histogram with maxRangeRadius
+ * bins (the last bin collects overflow). The smallest radius whose
+ * cumulative count exceeds predThreshold * (sample count) is doubled and
+ * rounded up to a power of two.
+ *
+ * r1 is not used. Returns the interval count, never less than 32.
+ */
+unsigned int optimize_intervals_float_3D_subblock(float *oriData, double realPrecision, size_t r1, size_t r2, size_t r3, size_t s1, size_t s2, size_t s3, size_t e1, size_t e2, size_t e3)
+{
+	size_t len1 = e1 - s1 + 1;
+	size_t len2 = e2 - s2 + 1;
+	size_t len3 = e3 - s3 + 1;
+
+	size_t r23 = r2*r3; /* stride of the outermost dimension */
+
+	int *hist = (int*)malloc(confparams_cpr->maxRangeRadius*sizeof(int));
+	memset(hist, 0, confparams_cpr->maxRangeRadius*sizeof(int));
+
+	size_t a, b, c;
+	for(a=s1+1;a<=e1;a++)
+	{
+		for(b=s2+1;b<=e2;b++)
+		{
+			for(c=s3+1;c<=e3;c++)
+			{
+				if((a+b+c)%confparams_cpr->sampleDistance!=0)
+					continue;
+
+				size_t at = a*r23+b*r3+c;
+				float predicted = oriData[at-1] + oriData[at-r3] + oriData[at-r23]
+				- oriData[at-1-r23] - oriData[at-r3-1] - oriData[at-r3-r23] + oriData[at-r3-r23-1];
+				float absErr = fabs(predicted - oriData[at]);
+				unsigned long bin = (unsigned long)((absErr/realPrecision+1)/2);
+				if(bin>=confparams_cpr->maxRangeRadius)
+					bin = confparams_cpr->maxRangeRadius - 1;
+				hist[bin]++;
+			}
+		}
+	}
+
+	/* Find the first radius whose cumulative count passes the target. */
+	size_t sampleCount = len1*len2*len3/confparams_cpr->sampleDistance;
+	size_t targetCount = sampleCount*confparams_cpr->predThreshold;
+	size_t covered = 0;
+	size_t radius = 0;
+	while(radius<confparams_cpr->maxRangeRadius)
+	{
+		covered += hist[radius];
+		if(covered>targetCount)
+			break;
+		radius++;
+	}
+	if(radius>=confparams_cpr->maxRangeRadius)
+		radius = confparams_cpr->maxRangeRadius-1;
+
+	free(hist);
+
+	unsigned int accIntervals = 2*(radius+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+
+	return powerOf2<32 ? 32 : powerOf2;
+}
+
+/*
+ * Estimate how many quantization intervals are needed for a 4D sub-block.
+ *
+ * The array is indexed as i*r2*r3*r4 + j*r3*r4 + k*r4 + l; the sub-block
+ * spans i in [s1, e1], j in [s2, e2], k in [s3, e3], l in [s4, e4]. Points
+ * strictly past the starting faces whose (i+j+k+l) is divisible by
+ * sampleDistance are predicted from seven neighbours taken along the three
+ * innermost dimensions only (offsets 1, r4 and r3*r4); the prediction error,
+ * in units of realPrecision, is binned into a histogram with maxRangeRadius
+ * bins (the last bin collects overflow). The smallest radius whose
+ * cumulative count exceeds predThreshold * (sample count) is doubled and
+ * rounded up to a power of two.
+ *
+ * r1 is not used. Returns the interval count, never less than 32.
+ */
+unsigned int optimize_intervals_float_4D_subblock(float *oriData, double realPrecision,
+size_t r1, size_t r2, size_t r3, size_t r4, size_t s1, size_t s2, size_t s3, size_t s4, size_t e1, size_t e2, size_t e3, size_t e4)
+{
+	size_t len1 = e1 - s1 + 1;
+	size_t len2 = e2 - s2 + 1;
+	size_t len3 = e3 - s3 + 1;
+	size_t len4 = e4 - s4 + 1;
+
+	/* Strides of the second-outermost and outermost dimensions. */
+	size_t r34 = r3*r4;
+	size_t r234 = r2*r3*r4;
+
+	int *hist = (int*)malloc(confparams_cpr->maxRangeRadius*sizeof(int));
+	memset(hist, 0, confparams_cpr->maxRangeRadius*sizeof(int));
+
+	size_t a, b, c, d;
+	for(a=s1+1;a<=e1;a++)
+	{
+		for(b=s2+1;b<=e2;b++)
+		{
+			for(c=s3+1;c<=e3;c++)
+			{
+				for(d=s4+1;d<=e4;d++)
+				{
+					if((a+b+c+d)%confparams_cpr->sampleDistance!=0)
+						continue;
+
+					size_t at = a*r234+b*r34+c*r4+d;
+					float predicted = oriData[at-1] + oriData[at-r4] + oriData[at-r34]
+								- oriData[at-1-r34] - oriData[at-r4-1] - oriData[at-r4-r34] + oriData[at-r4-r34-1];
+					float absErr = fabs(predicted - oriData[at]);
+					unsigned long bin = (unsigned long)((absErr/realPrecision+1)/2);
+					if(bin>=confparams_cpr->maxRangeRadius)
+						bin = confparams_cpr->maxRangeRadius - 1;
+					hist[bin]++;
+				}
+			}
+		}
+	}
+
+	/* Find the first radius whose cumulative count passes the target. */
+	size_t sampleCount = len1*len2*len3*len4/confparams_cpr->sampleDistance;
+	size_t targetCount = sampleCount*confparams_cpr->predThreshold;
+	size_t covered = 0;
+	size_t radius = 0;
+	while(radius<confparams_cpr->maxRangeRadius)
+	{
+		covered += hist[radius];
+		if(covered>targetCount)
+			break;
+		radius++;
+	}
+	if(radius>=confparams_cpr->maxRangeRadius)
+		radius = confparams_cpr->maxRangeRadius-1;
+
+	free(hist);
+
+	unsigned int accIntervals = 2*(radius+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+
+	return powerOf2<32 ? 32 : powerOf2;
+}
+
+/*
+ * Compress the 1-D range oriData[s1..e1] (bounds inclusive).
+ *
+ * oriData         base of the full array; the sub-block starts at oriData + s1
+ * realPrecision   quantization bins are 2*realPrecision wide
+ * valueRangeSize  used only to derive the exponent given to computeReqLength_float
+ * medianValue_f   initial median; computeReqLength_float may adjust the local copy
+ * r1              length of the full array (only forwarded to the interval optimizer)
+ * s1, e1          inclusive start/end indices of the sub-block
+ *
+ * The first point (and the second, when the block has one) always goes
+ * through the exact-value path. Every later point is predicted by linear
+ * extrapolation from the two most recent stored values. If the absolute
+ * prediction error is within (intvCapacity-1)*realPrecision the point is
+ * recorded as a quantization code in type[] (offset by exe_params->intvRadius);
+ * otherwise type[] is 0 and the value goes through the exact-value path.
+ *
+ * Returns the storage object filled in by new_TightDataPointStorageF.
+ */
+TightDataPointStorageF* SZ_compress_float_1D_MDQ_subblock(float *oriData, double realPrecision, float valueRangeSize, float medianValue_f,
+size_t r1, size_t s1, size_t e1)
+{
+	size_t dataLength = e1 - s1 + 1;
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+		quantization_intervals = optimize_intervals_float_1D_subblock(oriData, realPrecision, r1, s1, e1);
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	updateQuantizationInfo(quantization_intervals);
+
+	size_t i;
+	int reqLength;
+	float medianValue = medianValue_f;
+	short radExpo = getExponent_float(valueRangeSize/2);
+
+	computeReqLength_float(realPrecision, radExpo, &reqLength, &medianValue);
+
+	/* One quantization code per point of the sub-block. */
+	int* type = (int*) malloc(dataLength*sizeof(int));
+
+	float* spaceFillingValue = oriData + s1;
+
+	DynamicIntArray *exactLeadNumArray;
+	new_DIA(&exactLeadNumArray, DynArrayInitLen);
+
+	DynamicByteArray *exactMidByteArray;
+	new_DBA(&exactMidByteArray, DynArrayInitLen);
+
+	DynamicIntArray *resiBitArray;
+	new_DIA(&resiBitArray, DynArrayInitLen);
+
+	type[0] = 0;
+
+	unsigned char preDataBytes[4];
+	intToBytes_bigEndian(preDataBytes, 0);
+
+	int reqBytesLength = reqLength/8;
+	int resiBitsLength = reqLength%8;
+	float last3CmprsData[3] = {0};
+
+	FloatValueCompressElement *vce = (FloatValueCompressElement*)malloc(sizeof(FloatValueCompressElement));
+	LossyCompressionElement *lce = (LossyCompressionElement*)malloc(sizeof(LossyCompressionElement));
+
+	//add the first data
+	compressSingleFloatValue(vce, spaceFillingValue[0], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+	updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+	memcpy(preDataBytes,vce->curBytes,4);
+	addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+	listAdd_float(last3CmprsData, vce->data);
+
+	//add the second data
+	//A single-point block (e1 == s1) has no second element: type[] holds one
+	//entry and spaceFillingValue[1] lies outside the block, so skip this step.
+	if(dataLength>1)
+	{
+		type[1] = 0;
+		compressSingleFloatValue(vce, spaceFillingValue[1], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+		updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+		memcpy(preDataBytes,vce->curBytes,4);
+		addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+		listAdd_float(last3CmprsData, vce->data);
+	}
+
+	int state;
+	double checkRadius;
+	float curData;
+	float pred;
+	float predAbsErr;
+	checkRadius = (exe_params->intvCapacity-1)*realPrecision;
+	double interval = 2*realPrecision;
+
+	for(i=2;i<dataLength;i++)
+	{
+		curData = spaceFillingValue[i];
+		//linear extrapolation from the two most recently stored values
+		pred = 2*last3CmprsData[0] - last3CmprsData[1];
+		predAbsErr = fabs(curData - pred);
+		if(predAbsErr<=checkRadius)
+		{
+			//predictable: record the signed bin index, offset by intvRadius
+			state = (predAbsErr/realPrecision+1)/2;
+			if(curData>=pred)
+			{
+				type[i] = exe_params->intvRadius+state;
+				pred = pred + state*interval;
+			}
+			else
+			{
+				type[i] = exe_params->intvRadius-state;
+				pred = pred - state*interval;
+			}
+
+			listAdd_float(last3CmprsData, pred);
+			continue;
+		}
+
+		//unpredictable data processing
+		type[i] = 0;
+		compressSingleFloatValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+		updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+		memcpy(preDataBytes,vce->curBytes,4);
+		addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+
+		listAdd_float(last3CmprsData, vce->data);
+	}
+
+	size_t exactDataNum = exactLeadNumArray->size;
+
+	TightDataPointStorageF* tdps;
+
+	new_TightDataPointStorageF(&tdps, dataLength, exactDataNum,
+			type, exactMidByteArray->array, exactMidByteArray->size,
+			exactLeadNumArray->array,
+			resiBitArray->array, resiBitArray->size,
+			resiBitsLength,
+			realPrecision, medianValue, (char)reqLength, quantization_intervals, NULL, 0, 0);
+
+	//free memory
+	free_DIA(exactLeadNumArray);
+	free_DIA(resiBitArray);
+	free(type);
+	free(vce);
+	free(lce);
+	//Only the wrapper struct is freed here; per the original note, exactMidByteArray->array
+	//is released in free_TightDataPointStorageF(tdps).
+	free(exactMidByteArray);
+
+	return tdps;
+}
+
+/*
+ * Compress the rectangular sub-block rows s1..e1, columns s2..e2 (bounds
+ * inclusive) of a 2-D float array stored row by row with row stride r2.
+ *
+ * oriData         base of the full array (indexed with global offsets)
+ * realPrecision   quantization bins are 2*realPrecision wide
+ * valueRangeSize  used only to derive the exponent given to computeReqLength_float
+ * medianValue_f   initial median; computeReqLength_float may adjust the local copy
+ * r1, r2          dimensions of the full array (r1 is only forwarded to the interval optimizer)
+ * s1, s2, e1, e2  inclusive start/end indices of the sub-block
+ *
+ * Points are visited row by row and predicted from values already stored in
+ * the row buffers P1 (previous row) and P0 (current row):
+ *   - first point of the block: always sent through the exact-value path;
+ *   - rest of the first row: previous value, then 2*prev - prevprev;
+ *   - first column of later rows: the value directly above;
+ *   - all other points: left + above - above-left.
+ * When |value - prediction|/realPrecision + 1 is below exe_params->intvCapacity,
+ * type[] receives a signed bin index offset by exe_params->intvRadius; otherwise
+ * type[] is 0 and the value goes through compressSingleFloatValue/addExactData.
+ *
+ * Returns the storage object filled in by new_TightDataPointStorageF.
+ *
+ * NOTE(review): the second point of the first row is processed unconditionally
+ * (it writes P1[1] and type[1]), so the code assumes e2 > s2 -- confirm that
+ * callers guarantee this.
+ */
+TightDataPointStorageF* SZ_compress_float_2D_MDQ_subblock(float *oriData, double realPrecision, float valueRangeSize, float medianValue_f,
+size_t r1, size_t r2, size_t s1, size_t s2, size_t e1, size_t e2)
+{
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_float_2D_subblock(oriData, realPrecision, r1, r2, s1, s2, e1, e2);
+		updateQuantizationInfo(quantization_intervals);
+	}
+	else
+		quantization_intervals = exe_params->intvCapacity;
+
+	size_t i,j; 
+	int reqLength;
+	float pred1D, pred2D;
+	float diff = 0.0;
+	double itvNum = 0;
+	float *P0, *P1;
+
+	size_t R1 = e1 - s1 + 1;
+	size_t R2 = e2 - s2 + 1;
+	size_t dataLength = R1*R2;
+
+	/* Two zero-filled row buffers of R2 values each: P1 = previous row, P0 = current row. */
+	P0 = (float*)malloc(R2*sizeof(float));
+	memset(P0, 0, R2*sizeof(float));
+	P1 = (float*)malloc(R2*sizeof(float));
+	memset(P1, 0, R2*sizeof(float));
+
+	float medianValue = medianValue_f;
+	short radExpo = getExponent_float(valueRangeSize/2);
+	computeReqLength_float(realPrecision, radExpo, &reqLength, &medianValue);
+
+	/* One quantization code per point of the sub-block, addressed by the local index. */
+	int* type = (int*) malloc(dataLength*sizeof(int));
+
+	float* spaceFillingValue = oriData; // full array; always indexed with gIndex
+
+	DynamicIntArray *exactLeadNumArray;
+	new_DIA(&exactLeadNumArray, DynArrayInitLen);
+
+	DynamicByteArray *exactMidByteArray;
+	new_DBA(&exactMidByteArray, DynArrayInitLen);
+
+	DynamicIntArray *resiBitArray;
+	new_DIA(&resiBitArray, DynArrayInitLen);
+
+	unsigned char preDataBytes[4];
+	intToBytes_bigEndian(preDataBytes, 0);
+
+	int reqBytesLength = reqLength/8;
+	int resiBitsLength = reqLength%8;
+
+	FloatValueCompressElement *vce = (FloatValueCompressElement*)malloc(sizeof(FloatValueCompressElement));
+	LossyCompressionElement *lce = (LossyCompressionElement*)malloc(sizeof(LossyCompressionElement));
+
+	/* Process Row-s1 data s2*/
+	size_t gIndex; /* offset into the full array */
+	size_t lIndex; /* offset into the sub-block (index into type[]) */
+
+	gIndex = s1*r2+s2;
+	lIndex = 0;
+
+	/* The very first point has no neighbours, so it is always stored exactly. */
+	type[lIndex] = 0;
+	compressSingleFloatValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+	updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+	memcpy(preDataBytes,vce->curBytes,4);
+	addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+	P1[0] = vce->data;
+
+	/* Process Row-s1 data s2+1*/
+	gIndex = s1*r2+(s2+1);
+	lIndex = 1;
+
+	pred1D = P1[0];
+	diff = spaceFillingValue[gIndex] - pred1D;
+
+	itvNum =  fabs(diff)/realPrecision + 1;
+
+	if (itvNum < exe_params->intvCapacity)
+	{
+		/* Predictable: store the signed bin index and keep the quantized value for later predictions. */
+		if (diff < 0) itvNum = -itvNum;
+		type[lIndex] = (int) (itvNum/2) + exe_params->intvRadius;
+		P1[1] = pred1D + 2 * (type[lIndex] - exe_params->intvRadius) * realPrecision;
+	}
+	else
+	{
+		/* Unpredictable: code 0 plus an entry in the exact-data arrays. */
+		type[lIndex] = 0;
+		compressSingleFloatValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+		updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+		memcpy(preDataBytes,vce->curBytes,4);
+		addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+		P1[1] = vce->data;
+	}
+
+    /* Process Row-s1 data s2+2 --> data e2 */
+	for (j = 2; j < R2; j++)
+	{
+		gIndex = s1*r2+(s2+j);
+		lIndex = j;
+
+		/* Linear extrapolation from the two previous values of this row. */
+		pred1D = 2*P1[j-1] - P1[j-2];
+		diff = spaceFillingValue[gIndex] - pred1D;
+
+		itvNum = fabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[lIndex] = (int) (itvNum/2) + exe_params->intvRadius;
+			P1[j] = pred1D + 2 * (type[lIndex] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[lIndex] = 0;
+			compressSingleFloatValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,4);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P1[j] = vce->data;
+		}
+	}
+
+	/* Process Row-s1+1 --> Row-e1 */
+	for (i = 1; i < R1; i++)
+	{
+		/* Process row-s1+i data s2 */
+		gIndex = (s1+i)*r2+s2;
+		lIndex = i*R2;
+
+		/* First column: predict from the value directly above. */
+		pred1D = P1[0];
+		diff = spaceFillingValue[gIndex] - pred1D;
+
+		itvNum = fabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[lIndex] = (int) (itvNum/2) + exe_params->intvRadius;
+			P0[0] = pred1D + 2 * (type[lIndex] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[lIndex] = 0;
+			compressSingleFloatValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,4);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P0[0] = vce->data;
+		}
+
+		/* Process row-s1+i data s2+1 --> e2 */
+		for (j = 1; j < R2; j++)
+		{
+			gIndex = (s1+i)*r2+(s2+j);
+			lIndex = i*R2+j;
+
+//			printf ("global index = %d, local index = %d\n", gIndex, lIndex);
+
+			/* left + above - above-left */
+			pred2D = P0[j-1] + P1[j] - P1[j-1];
+
+			diff = spaceFillingValue[gIndex] - pred2D;
+
+			itvNum = fabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[lIndex] = (int) (itvNum/2) + exe_params->intvRadius;
+				P0[j] = pred2D + 2 * (type[lIndex] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[lIndex] = 0;
+				compressSingleFloatValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,4);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P0[j] = vce->data;
+			}
+		}
+
+		/* Swap buffers: the row just written becomes the "previous row" for the next iteration. */
+		float *Pt;
+		Pt = P1;
+		P1 = P0;
+		P0 = Pt;
+	}
+
+	free(P0);
+	free(P1);
+	size_t exactDataNum = exactLeadNumArray->size;
+
+	TightDataPointStorageF* tdps;
+
+	new_TightDataPointStorageF(&tdps, dataLength, exactDataNum,
+			type, exactMidByteArray->array, exactMidByteArray->size,
+			exactLeadNumArray->array,
+			resiBitArray->array, resiBitArray->size,
+			resiBitsLength,
+			realPrecision, medianValue, (char)reqLength, quantization_intervals, NULL, 0, 0);
+
+	//free memory
+	free_DIA(exactLeadNumArray);
+	free_DIA(resiBitArray);
+	free(type);
+	free(vce);
+	free(lce);
+	free(exactMidByteArray); //only the wrapper is freed here; exactMidByteArray->array is released in free_TightDataPointStorageF(tdps)
+
+	return tdps;
+}
+
+/*
+ * Compress the box-shaped sub-block [s1..e1] x [s2..e2] x [s3..e3] (bounds
+ * inclusive) of a 3-D float array whose element (a,b,c) lives at offset
+ * a*r2*r3 + b*r3 + c.
+ *
+ * oriData         base of the full array (indexed with global offsets)
+ * realPrecision   quantization bins are 2*realPrecision wide
+ * valueRangeSize  used only to derive the exponent given to computeReqLength_float
+ * medianValue_f   initial median; computeReqLength_float may adjust the local copy
+ * r1, r2, r3      dimensions of the full array (r1 is only forwarded to the interval optimizer)
+ * s1..s3, e1..e3  inclusive start/end indices of the sub-block
+ *
+ * The block is processed plane by plane (index 1), row by row (index 2).
+ * P1 holds the stored values of the previous plane and P0 those of the plane
+ * being written; each buffer has R2*R3 entries. The first plane is written
+ * directly into P1 using 1-D/2-D predictors; later planes use 1-D, 2-D or the
+ * seven-neighbour 3-D predictor depending on how many neighbours exist.
+ * When |value - prediction|/realPrecision + 1 is below exe_params->intvCapacity,
+ * type[] receives a signed bin index offset by exe_params->intvRadius; otherwise
+ * type[] is 0 and the value goes through compressSingleFloatValue/addExactData.
+ *
+ * Returns the storage object filled in by new_TightDataPointStorageF.
+ *
+ * NOTE(review): the second point of the first row is processed unconditionally
+ * (global offset ...+s3+1, type[1], P1[1]), so the code assumes e3 > s3 --
+ * confirm that callers guarantee this.
+ */
+TightDataPointStorageF* SZ_compress_float_3D_MDQ_subblock(float *oriData, double realPrecision, float valueRangeSize, float medianValue_f,
+size_t r1, size_t r2, size_t r3, size_t s1, size_t s2, size_t s3, size_t e1, size_t e2, size_t e3)
+{
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_float_3D_subblock(oriData, realPrecision, r1, r2, r3, s1, s2, s3, e1, e2, e3);
+		updateQuantizationInfo(quantization_intervals);
+	}
+	else
+		quantization_intervals = exe_params->intvCapacity;
+
+	size_t i,j,k; 
+	int reqLength;
+	float pred1D, pred2D, pred3D;
+	float diff = 0.0;
+	double itvNum = 0;
+	float *P0, *P1;
+
+	size_t R1 = e1 - s1 + 1;
+	size_t R2 = e2 - s2 + 1;
+	size_t R3 = e3 - s3 + 1;
+	size_t dataLength = R1*R2*R3;
+
+	size_t r23 = r2*r3; /* plane stride in the full array */
+	size_t R23 = R2*R3; /* plane size inside the sub-block */
+
+	/* Plane buffers (not zero-initialised): P1 = previous plane, P0 = current plane. */
+	P0 = (float*)malloc(R23*sizeof(float));
+	P1 = (float*)malloc(R23*sizeof(float));
+
+	float medianValue = medianValue_f;
+	short radExpo = getExponent_float(valueRangeSize/2);
+	computeReqLength_float(realPrecision, radExpo, &reqLength, &medianValue);
+
+	/* One quantization code per point of the sub-block, addressed by the local index. */
+	int* type = (int*) malloc(dataLength*sizeof(int));
+	//type[dataLength]=0;
+
+	float* spaceFillingValue = oriData; // full array; always indexed with gIndex
+
+	DynamicIntArray *exactLeadNumArray;
+	new_DIA(&exactLeadNumArray, DynArrayInitLen);
+
+	DynamicByteArray *exactMidByteArray;
+	new_DBA(&exactMidByteArray, DynArrayInitLen);
+
+	DynamicIntArray *resiBitArray;
+	new_DIA(&resiBitArray, DynArrayInitLen);
+
+	unsigned char preDataBytes[4];
+	intToBytes_bigEndian(preDataBytes, 0);
+
+	int reqBytesLength = reqLength/8;
+	int resiBitsLength = reqLength%8;
+
+	FloatValueCompressElement *vce = (FloatValueCompressElement*)malloc(sizeof(FloatValueCompressElement));
+	LossyCompressionElement *lce = (LossyCompressionElement*)malloc(sizeof(LossyCompressionElement));
+
+
+	///////////////////////////	Process layer-s1 ///////////////////////////
+	/* Process Row-s2 data s3*/
+	size_t gIndex; 	//global index
+	size_t lIndex; 	//local index
+	size_t index2D; 	//local 2D index
+
+	gIndex = s1*r23+s2*r3+s3;
+	lIndex = 0;
+	index2D = 0;
+
+	/* The very first point has no neighbours, so it is always stored exactly. */
+	type[lIndex] = 0;
+	compressSingleFloatValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+	updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+	memcpy(preDataBytes,vce->curBytes,4);
+	addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+	P1[index2D] = vce->data;
+
+	/* Process Row-s2 data s3+1*/
+	gIndex = s1*r23+s2*r3+s3+1;
+	lIndex = 1;
+	index2D = 1;
+
+	pred1D = P1[index2D-1];
+	diff = spaceFillingValue[gIndex] - pred1D;
+
+	itvNum = fabs(diff)/realPrecision + 1;
+
+	if (itvNum < exe_params->intvCapacity)
+	{
+		/* Predictable: store the signed bin index and keep the quantized value for later predictions. */
+		if (diff < 0) itvNum = -itvNum;
+		type[lIndex] = (int) (itvNum/2) + exe_params->intvRadius;
+		P1[index2D] = pred1D + 2 * (type[lIndex] - exe_params->intvRadius) * realPrecision;
+	}
+	else
+	{
+		/* Unpredictable: code 0 plus an entry in the exact-data arrays. */
+		type[lIndex] = 0;
+		compressSingleFloatValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+		updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+		memcpy(preDataBytes,vce->curBytes,4);
+		addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+		P1[index2D] = vce->data;
+	}
+
+    /* Process Row-s2 data s3+2 --> data e3 */
+	for (j = 2; j < R3; j++)
+	{
+		gIndex = s1*r23+s2*r3+s3+j;
+		lIndex = j;
+		index2D = j;
+
+		/* Linear extrapolation from the two previous values of this row. */
+		pred1D = 2*P1[index2D-1] - P1[index2D-2];
+		diff = spaceFillingValue[gIndex] - pred1D;
+
+		itvNum = fabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[lIndex] = (int) (itvNum/2) + exe_params->intvRadius;
+			P1[index2D] = pred1D + 2 * (type[lIndex] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[lIndex] = 0;
+			compressSingleFloatValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,4);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P1[index2D] = vce->data;
+		}
+	}
+
+	/* Process Row-s2+1 --> Row-e2 */
+	for (i = 1; i < R2; i++)
+	{
+		/* Process row-s2+i data s3 */
+		gIndex = s1*r23+(s2+i)*r3+s3;
+		lIndex = i*R3;
+		index2D = i*R3;
+
+		/* First column of the row: predict from the same column of the previous row. */
+		pred1D  = P1[index2D-R3];
+		diff = spaceFillingValue[gIndex] - pred1D;
+
+		itvNum = fabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[lIndex] = (int) (itvNum/2) + exe_params->intvRadius;
+			P1[index2D] = pred1D + 2 * (type[lIndex] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[lIndex] = 0;
+			compressSingleFloatValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,4);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P1[index2D] = vce->data;
+		}
+
+		/* Process row-s2+i data s3+1 --> data e3*/
+		for (j = 1; j < R3; j++)
+		{
+			gIndex = s1*r23+(s2+i)*r3+s3+j;
+			lIndex = i*R3+j;
+			index2D = i*R3+j;
+
+			/* In-plane 2-D predictor: left + previous row - previous row's left. */
+			pred2D  = P1[index2D-1] + P1[index2D-R3] - P1[index2D-R3-1];
+			diff = spaceFillingValue[gIndex] - pred2D;
+
+			itvNum = fabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[lIndex] = (int) (itvNum/2) + exe_params->intvRadius;
+				P1[index2D] = pred2D + 2 * (type[lIndex] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[lIndex] = 0;
+				compressSingleFloatValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,4);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P1[index2D] = vce->data;
+			}
+		}
+	}
+
+
+	///////////////////////////	Process layer-s1+1 --> layer-e1 ///////////////////////////
+
+	for (k = 1; k < R1; k++)
+	{
+		/* Process Row-s2 data s3*/
+		gIndex = (s1+k)*r23+s2*r3+s3;
+		lIndex = k*R23;
+		index2D = 0;
+
+		/* Plane corner: predict from the same position in the previous plane. */
+		pred1D = P1[index2D];
+		diff = spaceFillingValue[gIndex] - pred1D;
+
+		itvNum = fabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[lIndex] = (int) (itvNum/2) + exe_params->intvRadius;
+			P0[index2D] = pred1D + 2 * (type[lIndex] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[lIndex] = 0;
+			compressSingleFloatValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,4);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P0[index2D] = vce->data;
+		}
+
+	    /* Process Row-s2 data s3+1 --> data e3 */
+		for (j = 1; j < R3; j++)
+		{
+			gIndex = (s1+k)*r23+s2*r3+s3+j;
+			lIndex = k*R23+j;
+			index2D = j;
+
+			/* 2-D predictor across planes: left (this plane) + same position (previous plane) - its left. */
+			pred2D = P0[index2D-1] + P1[index2D] - P1[index2D-1];
+			diff = spaceFillingValue[gIndex] - pred2D;
+
+			itvNum = fabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[lIndex] = (int) (itvNum/2) + exe_params->intvRadius;
+				P0[index2D] = pred2D + 2 * (type[lIndex] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[lIndex] = 0;
+				compressSingleFloatValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,4);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P0[index2D] = vce->data;
+			}
+		}
+
+	    /* Process Row-s2+1 --> Row-e2 */
+		for (i = 1; i < R2; i++)
+		{
+			/* Process Row-s2+i data s3 */
+			gIndex = (s1+k)*r23+(s2+i)*r3+s3;
+			lIndex = k*R23+i*R3;
+			index2D = i*R3;
+
+			/* 2-D predictor across planes: previous row (this plane) + same position (previous plane) - its previous row. */
+			pred2D = P0[index2D-R3] + P1[index2D] - P1[index2D-R3];
+			diff = spaceFillingValue[gIndex] - pred2D;
+
+			itvNum = fabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[lIndex] = (int) (itvNum/2) + exe_params->intvRadius;
+				P0[index2D] = pred2D + 2 * (type[lIndex] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[lIndex] = 0;
+				compressSingleFloatValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,4);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P0[index2D] = vce->data;
+			}
+
+			/* Process Row-s2+i data s3+1 --> data e3 */
+			for (j = 1; j < R3; j++)
+			{
+				gIndex = (s1+k)*r23+(s2+i)*r3+s3+j;
+				lIndex = k*R23+i*R3+j;
+				index2D = i*R3+j;
+
+//				printf ("global index = %d, local index = %d\n", gIndex, lIndex);
+
+				/* 3-D predictor: signed sum of the seven neighbours sharing a corner of the unit cube behind this point. */
+				pred3D = P0[index2D-1] + P0[index2D-R3]+ P1[index2D] - P0[index2D-R3-1] - P1[index2D-R3] - P1[index2D-1] + P1[index2D-R3-1];
+				diff = spaceFillingValue[gIndex] - pred3D;
+
+				itvNum = fabs(diff)/realPrecision + 1;
+
+				if (itvNum < exe_params->intvCapacity)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[lIndex] = (int) (itvNum/2) + exe_params->intvRadius;
+					P0[index2D] = pred3D + 2 * (type[lIndex] - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					type[lIndex] = 0;
+					compressSingleFloatValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+					updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+					memcpy(preDataBytes,vce->curBytes,4);
+					addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+					P0[index2D] = vce->data;
+				}
+			}
+		}
+
+		/* Swap buffers: the plane just written becomes the "previous plane" for the next iteration. */
+		float *Pt;
+		Pt = P1;
+		P1 = P0;
+		P0 = Pt;
+	}
+
+	free(P0);
+	free(P1);
+	size_t exactDataNum = exactLeadNumArray->size;
+
+	TightDataPointStorageF* tdps;
+
+	new_TightDataPointStorageF(&tdps, dataLength, exactDataNum,
+			type, exactMidByteArray->array, exactMidByteArray->size,
+			exactLeadNumArray->array,
+			resiBitArray->array, resiBitArray->size,
+			resiBitsLength,
+			realPrecision, medianValue, (char)reqLength, quantization_intervals, NULL, 0, 0);
+
+	//free memory
+	free_DIA(exactLeadNumArray);
+	free_DIA(resiBitArray);
+	free(type);
+	free(vce);
+	free(lce);
+	free(exactMidByteArray); //only the wrapper is freed here; exactMidByteArray->array is released in free_TightDataPointStorageF(tdps)
+
+	return tdps;
+}
+
+/*
+ * Compress the sub-block [s1..e1] x [s2..e2] x [s3..e3] x [s4..e4] (bounds
+ * inclusive) of a 4-D float array whose element (a,b,c,d) lives at offset
+ * a*r2*r3*r4 + b*r3*r4 + c*r4 + d.
+ *
+ * oriData         base of the full array (indexed with global offsets)
+ * realPrecision   quantization bins are 2*realPrecision wide
+ * valueRangeSize  used only to derive the exponent given to computeReqLength_float
+ * medianValue_f   initial median; computeReqLength_float may adjust the local copy
+ * r1..r4          dimensions of the full array (r1 is only forwarded to the interval optimizer)
+ * s1..s4, e1..e4  inclusive start/end indices of the sub-block
+ *
+ * The outer loop walks index 1; for every value of it the remaining 3-D slab
+ * is processed with the plane-by-plane scheme (P1 = previous plane, P0 =
+ * current plane, each R3*R4 entries). No prediction crosses from one slab to
+ * the next: the first point of every slab is stored through the exact-value
+ * path and the first plane of the slab is rebuilt from it.
+ * When |value - prediction|/realPrecision + 1 is below exe_params->intvCapacity,
+ * type[] receives a signed bin index offset by exe_params->intvRadius; otherwise
+ * type[] is 0 and the value goes through compressSingleFloatValue/addExactData.
+ *
+ * Returns the storage object filled in by new_TightDataPointStorageF.
+ *
+ * NOTE(review): the second point of each slab's first row is processed
+ * unconditionally (global offset ...+s4+1, P1[1]), so the code assumes
+ * e4 > s4 -- confirm that callers guarantee this.
+ */
+TightDataPointStorageF* SZ_compress_float_4D_MDQ_subblock(float *oriData, double realPrecision, float valueRangeSize, float medianValue_f,
+size_t r1, size_t r2, size_t r3, size_t r4, size_t s1, size_t s2, size_t s3, size_t s4, size_t e1, size_t e2, size_t e3, size_t e4)
+{
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_float_4D_subblock(oriData, realPrecision, r1, r2, r3, r4, s1, s2, s3, s4, e1, e2, e3, e4);
+		updateQuantizationInfo(quantization_intervals);
+	}
+	else
+		quantization_intervals = exe_params->intvCapacity;
+
+	size_t i,j,k; 
+	int reqLength;
+	float pred1D, pred2D, pred3D;
+	float diff = 0.0;
+	double itvNum = 0;
+	float *P0, *P1;
+
+	size_t R1 = e1 - s1 + 1;
+	size_t R2 = e2 - s2 + 1;
+	size_t R3 = e3 - s3 + 1;
+	size_t R4 = e4 - s4 + 1;
+
+	size_t dataLength = R1*R2*R3*R4;
+
+	/* Lower-case strides address the full array, upper-case ones the sub-block. */
+	size_t r34 = r3*r4;
+	size_t r234 = r2*r3*r4;
+	size_t R34 = R3*R4;
+	size_t R234 = R2*R3*R4;
+
+	/* Plane buffers (not zero-initialised): P1 = previous plane, P0 = current plane. */
+	P0 = (float*)malloc(R34*sizeof(float));
+	P1 = (float*)malloc(R34*sizeof(float));
+
+	float medianValue = medianValue_f;
+	short radExpo = getExponent_float(valueRangeSize/2);
+	computeReqLength_float(realPrecision, radExpo, &reqLength, &medianValue);
+
+	/* One quantization code per point of the sub-block, addressed by the local index. */
+	int* type = (int*) malloc(dataLength*sizeof(int));
+
+	float* spaceFillingValue = oriData; // full array; always indexed with gIndex
+
+	DynamicIntArray *exactLeadNumArray;
+	new_DIA(&exactLeadNumArray, DynArrayInitLen);
+
+	DynamicByteArray *exactMidByteArray;
+	new_DBA(&exactMidByteArray, DynArrayInitLen);
+
+	DynamicIntArray *resiBitArray;
+	new_DIA(&resiBitArray, DynArrayInitLen);
+
+	unsigned char preDataBytes[4];
+	intToBytes_bigEndian(preDataBytes, 0);
+
+	int reqBytesLength = reqLength/8;
+	int resiBitsLength = reqLength%8;
+
+	FloatValueCompressElement *vce = (FloatValueCompressElement*)malloc(sizeof(FloatValueCompressElement));
+	LossyCompressionElement *lce = (LossyCompressionElement*)malloc(sizeof(LossyCompressionElement));
+
+
+	/* l walks index 1; each iteration compresses one 3-D slab on its own. */
+	size_t l;
+	for (l = 0; l < R1; l++)
+	{
+
+		///////////////////////////	Process layer-s2 ///////////////////////////
+		/* Process Row-s3 data s4*/
+		size_t gIndex; 	//global index
+		size_t lIndex; 	//local index
+		size_t index2D; 	//local 2D index
+
+		gIndex = (s1+l)*r234+s2*r34+s3*r4+s4;
+		lIndex = l*R234;
+		index2D = 0;
+
+		/* The first point of every slab is always stored exactly. */
+		type[lIndex] = 0;
+		compressSingleFloatValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+		updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+		memcpy(preDataBytes,vce->curBytes,4);
+		addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+		P1[index2D] = vce->data;
+
+		/* Process Row-s3 data s4+1*/
+		gIndex = (s1+l)*r234+s2*r34+s3*r4+s4+1;
+		lIndex = l*R234+1;
+		index2D = 1;
+
+		pred1D = P1[index2D-1];
+		diff = spaceFillingValue[gIndex] - pred1D;
+
+		itvNum = fabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			/* Predictable: store the signed bin index and keep the quantized value for later predictions. */
+			if (diff < 0) itvNum = -itvNum;
+			type[lIndex] = (int) (itvNum/2) + exe_params->intvRadius;
+			P1[index2D] = pred1D + 2 * (type[lIndex] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			/* Unpredictable: code 0 plus an entry in the exact-data arrays. */
+			type[lIndex] = 0;
+			compressSingleFloatValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,4);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P1[index2D] = vce->data;
+		}
+
+		/* Process Row-s3 data s4+2 --> data e4 */
+		for (j = 2; j < R4; j++)
+		{
+			gIndex = (s1+l)*r234+s2*r34+s3*r4+s4+j;
+			lIndex = l*R234+j;
+			index2D = j;
+
+			/* Linear extrapolation from the two previous values of this row. */
+			pred1D = 2*P1[index2D-1] - P1[index2D-2];
+			diff = spaceFillingValue[gIndex] - pred1D;
+
+			itvNum = fabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[lIndex] = (int) (itvNum/2) + exe_params->intvRadius;
+				P1[index2D] = pred1D + 2 * (type[lIndex] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[lIndex] = 0;
+				compressSingleFloatValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,4);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P1[index2D] = vce->data;
+			}
+		}
+
+		/* Process Row-s3+1 --> Row-e3 */
+		for (i = 1; i < R3; i++)
+		{
+			/* Process row-s3+i data s4 */
+			gIndex = (s1+l)*r234+s2*r34+(s3+i)*r4+s4;
+			lIndex = l*R234+i*R4;
+			index2D = i*R4;
+
+			/* First column of the row: predict from the same column of the previous row. */
+			pred1D  = P1[index2D-R4];
+			diff = spaceFillingValue[gIndex] - pred1D;
+
+			itvNum = fabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[lIndex] = (int) (itvNum/2) + exe_params->intvRadius;
+				P1[index2D] = pred1D + 2 * (type[lIndex] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[lIndex] = 0;
+				compressSingleFloatValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,4);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P1[index2D] = vce->data;
+			}
+
+			/* Process row-s3+i data s4+1 --> data e4*/
+			for (j = 1; j < R4; j++)
+			{
+				gIndex = (s1+l)*r234+s2*r34+(s3+i)*r4+s4+j;
+				lIndex = l*R234+i*R4+j;
+				index2D = i*R4+j;
+
+				/* In-plane 2-D predictor: left + previous row - previous row's left. */
+				pred2D  = P1[index2D-1] + P1[index2D-R4] - P1[index2D-R4-1];
+				diff = spaceFillingValue[gIndex] - pred2D;
+
+				itvNum = fabs(diff)/realPrecision + 1;
+
+				if (itvNum < exe_params->intvCapacity)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[lIndex] = (int) (itvNum/2) + exe_params->intvRadius;
+					P1[index2D] = pred2D + 2 * (type[lIndex] - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					type[lIndex] = 0;
+					compressSingleFloatValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+					updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+					memcpy(preDataBytes,vce->curBytes,4);
+					addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+					P1[index2D] = vce->data;
+				}
+			}
+		}
+
+
+		///////////////////////////	Process layer-s2+1 --> layer-e2 ///////////////////////////
+
+		for (k = 1; k < R2; k++)
+		{
+			/* Process Row-s3 data s4*/
+			gIndex = (s1+l)*r234+(s2+k)*r34+s3*r4+s4;
+			lIndex = l*R234+k*R34;
+			index2D = 0;
+
+			/* Plane corner: predict from the same position in the previous plane. */
+			pred1D = P1[index2D];
+			diff = spaceFillingValue[gIndex] - pred1D;
+
+			itvNum = fabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[lIndex] = (int) (itvNum/2) + exe_params->intvRadius;
+				P0[index2D] = pred1D + 2 * (type[lIndex] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[lIndex] = 0;
+				compressSingleFloatValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,4);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P0[index2D] = vce->data;
+			}
+
+			/* Process Row-s3 data s4+1 --> data e4 */
+			for (j = 1; j < R4; j++)
+			{
+				gIndex = (s1+l)*r234+(s2+k)*r34+s3*r4+s4+j;
+				lIndex = l*R234+k*R34+j;
+				index2D = j;
+
+				/* 2-D predictor across planes: left (this plane) + same position (previous plane) - its left. */
+				pred2D = P0[index2D-1] + P1[index2D] - P1[index2D-1];
+				diff = spaceFillingValue[gIndex] - pred2D;
+
+				itvNum = fabs(diff)/realPrecision + 1;
+
+				if (itvNum < exe_params->intvCapacity)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[lIndex] = (int) (itvNum/2) + exe_params->intvRadius;
+					P0[index2D] = pred2D + 2 * (type[lIndex] - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					type[lIndex] = 0;
+					compressSingleFloatValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+					updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+					memcpy(preDataBytes,vce->curBytes,4);
+					addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+					P0[index2D] = vce->data;
+				}
+			}
+
+			/* Process Row-s3+1 --> Row-e3 */
+			for (i = 1; i < R3; i++)
+			{
+				/* Process Row-s3+i data s4 */
+				gIndex = (s1+l)*r234+(s2+k)*r34+(s3+i)*r4+s4;
+				lIndex = l*R234+k*R34+i*R4;
+				index2D = i*R4;
+
+				/* 2-D predictor across planes: previous row (this plane) + same position (previous plane) - its previous row. */
+				pred2D = P0[index2D-R4] + P1[index2D] - P1[index2D-R4];
+				diff = spaceFillingValue[gIndex] - pred2D;
+
+				itvNum = fabs(diff)/realPrecision + 1;
+
+				if (itvNum < exe_params->intvCapacity)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[lIndex] = (int) (itvNum/2) + exe_params->intvRadius;
+					P0[index2D] = pred2D + 2 * (type[lIndex] - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					type[lIndex] = 0;
+					compressSingleFloatValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+					updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+					memcpy(preDataBytes,vce->curBytes,4);
+					addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+					P0[index2D] = vce->data;
+				}
+
+				/* Process Row-s3+i data s4+1 --> data e4 */
+				for (j = 1; j < R4; j++)
+				{
+					gIndex = (s1+l)*r234+(s2+k)*r34+(s3+i)*r4+s4+j;
+					lIndex = l*R234+k*R34+i*R4+j;
+					index2D = i*R4+j;
+
+//					printf ("global index = %d, local index = %d\n", gIndex, lIndex);
+
+					/* 3-D predictor: signed sum of the seven neighbours sharing a corner of the unit cube behind this point. */
+					pred3D = P0[index2D-1] + P0[index2D-R4]+ P1[index2D] - P0[index2D-R4-1] - P1[index2D-R4] - P1[index2D-1] + P1[index2D-R4-1];
+					diff = spaceFillingValue[gIndex] - pred3D;
+
+					itvNum = fabs(diff)/realPrecision + 1;
+
+					if (itvNum < exe_params->intvCapacity)
+					{
+						if (diff < 0) itvNum = -itvNum;
+						type[lIndex] = (int) (itvNum/2) + exe_params->intvRadius;
+						P0[index2D] = pred3D + 2 * (type[lIndex] - exe_params->intvRadius) * realPrecision;
+					}
+					else
+					{
+						type[lIndex] = 0;
+						compressSingleFloatValue(vce, spaceFillingValue[gIndex], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+						updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+						memcpy(preDataBytes,vce->curBytes,4);
+						addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+						P0[index2D] = vce->data;
+					}
+				}
+			}
+
+			/* Swap buffers: the plane just written becomes the "previous plane" for the next iteration. */
+			float *Pt;
+			Pt = P1;
+			P1 = P0;
+			P0 = Pt;
+		}
+
+	}
+
+	free(P0);
+	free(P1);
+	size_t exactDataNum = exactLeadNumArray->size;
+
+	TightDataPointStorageF* tdps;
+
+	new_TightDataPointStorageF(&tdps, dataLength, exactDataNum,
+			type, exactMidByteArray->array, exactMidByteArray->size,
+			exactLeadNumArray->array,
+			resiBitArray->array, resiBitArray->size,
+			resiBitsLength,
+			realPrecision, medianValue, (char)reqLength, quantization_intervals, NULL, 0, 0);
+
+	//free memory
+	free_DIA(exactLeadNumArray);
+	free_DIA(resiBitArray);
+	free(type);
+	free(vce);
+	free(lce);
+	free(exactMidByteArray); //only the wrapper is freed here; exactMidByteArray->array is released in free_TightDataPointStorageF(tdps)
+
+	return tdps;
+}
+
+/**
+ * Estimate how many quantization intervals a 3D float array needs.
+ *
+ * The array is sampled; each sample is predicted from seven neighbouring
+ * values of the ORIGINAL data, and the prediction error, expressed in units
+ * of 2*realPrecision, is counted in a histogram. The answer is derived from
+ * the first histogram bucket at which the cumulative count exceeds
+ * predThreshold of the samples, passed through roundUpToPowerOf2() and
+ * clamped to a minimum of 32.
+ *
+ * @param oriData        input values (r1*r2*r3 floats, r3 varying fastest)
+ * @param r1             extent of the slowest dimension
+ * @param r2             extent of the middle dimension
+ * @param r3             extent of the fastest dimension
+ * @param realPrecision  error bound used to scale the prediction errors
+ * @return interval count, never below 32
+ */
+unsigned int optimize_intervals_float_3D_opt(float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision)
+{
+	const size_t planeSize = r2*r3;
+	const size_t totalLen = r1 * r2 * r3;
+	size_t sampleCount = 0;
+	size_t bucket;
+	float predicted = 0, absErr;
+	/* zero-filled histogram of prediction-error radii */
+	size_t *histogram = (size_t*)calloc(confparams_cpr->maxRangeRadius, sizeof(size_t));
+
+	size_t colOffset = confparams_cpr->sampleDistance - 2; /* cursor offset along r3 */
+	size_t rowShift;
+	size_t layerNo = 1, rowNo = 1; /* cursor position along r1 and r2 */
+	float *cursor = oriData + planeSize + r3 + colOffset;
+
+	while (cursor - oriData < totalLen)
+	{
+		sampleCount++;
+		predicted = cursor[-1] + cursor[-r3] + cursor[-planeSize] - cursor[-1-planeSize] - cursor[-r3-1] - cursor[-r3-planeSize] + cursor[-r3-planeSize-1];
+		absErr = fabs(predicted - *cursor);
+		bucket = (absErr/realPrecision+1)/2;
+		if (bucket >= confparams_cpr->maxRangeRadius)
+			bucket = confparams_cpr->maxRangeRadius - 1; /* everything beyond the range lands in the last bucket */
+		histogram[bucket]++;
+
+		colOffset += confparams_cpr->sampleDistance;
+		if (colOffset < r3)
+		{
+			/* next sample is still inside the current row */
+			cursor += confparams_cpr->sampleDistance;
+			continue;
+		}
+
+		/* the row is exhausted: step to the following row */
+		rowNo++;
+		if (rowNo == r2)
+		{
+			/* last row of the layer done: start the next layer at its row 1 */
+			layerNo++;
+			rowNo = 1;
+			cursor += r3;
+		}
+		/* shift the starting column so that samples are staggered between rows */
+		rowShift = (layerNo + rowNo) % confparams_cpr->sampleDistance;
+		cursor += (r3 + confparams_cpr->sampleDistance - colOffset) + (confparams_cpr->sampleDistance - rowShift);
+		colOffset = (confparams_cpr->sampleDistance - rowShift);
+		if (colOffset == 0)
+			colOffset++;
+	}
+
+	/* find the first bucket at which the cumulative count exceeds the target */
+	size_t target = sampleCount*confparams_cpr->predThreshold;
+	size_t running = 0;
+	size_t idx = 0;
+	while (idx < confparams_cpr->maxRangeRadius)
+	{
+		running += histogram[idx];
+		if (running > target)
+			break;
+		idx++;
+	}
+	if (idx >= confparams_cpr->maxRangeRadius)
+		idx = confparams_cpr->maxRangeRadius-1;
+	free(histogram);
+
+	unsigned int accIntervals = 2*(idx+1);
+	unsigned int result = roundUpToPowerOf2(accIntervals);
+	return result < 32 ? 32 : result;
+}
+
+/*
+ * Quantize one value of a 3D block against its prediction.
+ *
+ * If the scaled error fits below exe_params->intvCapacity, the interval code
+ * is written to *typeSlot and the value rebuilt from that code is written to
+ * *reconSlot. If the value does not fit, or the rebuilt value misses curData
+ * by more than realPrecision (floating-point rounding), the code is set to 0,
+ * curData itself is stored in *reconSlot and appended to unpredictable_data.
+ */
+static void sz_ra3d_quantize_float(float curData, float pred, double realPrecision, int *typeSlot, float *reconSlot, float *unpredictable_data, size_t *unpredictable_count)
+{
+	double diff = curData - pred;
+	double itvNum = fabs(diff)/realPrecision + 1;
+
+	if (itvNum < exe_params->intvCapacity)
+	{
+		if (diff < 0) itvNum = -itvNum;
+		*typeSlot = (int) (itvNum/2) + exe_params->intvRadius;
+		*reconSlot = pred + 2 * (*typeSlot - exe_params->intvRadius) * realPrecision;
+		/* guarantee the error bound against machine-epsilon effects */
+		if (fabs(curData-*reconSlot) > realPrecision)
+		{
+			*typeSlot = 0;
+			*reconSlot = curData;
+			unpredictable_data[(*unpredictable_count)++] = curData;
+		}
+		return;
+	}
+
+	*typeSlot = 0;
+	*reconSlot = curData;
+	unpredictable_data[(*unpredictable_count)++] = curData;
+}
+
+/**
+ * Quantize one 3D block that lives inside a larger 3D float array.
+ *
+ * Values are visited layer by layer, row by row. Each one is predicted from
+ * previously rebuilt neighbours (1, 2, 3 or 7 of them depending on its
+ * position in the block) and then quantized. P1 holds the rebuilt values of
+ * the previous layer and P0 those of the layer being processed; the two
+ * pointers are swapped after every layer past the first.
+ *
+ * NOTE(review): element 1 of the first row is processed unconditionally, so
+ * block_dim_2 is presumably expected to be at least 2 -- confirm with callers.
+ *
+ * @param block_ori_data     first value of the block inside the full array
+ * @param mean               mean[0] receives block_ori_data[0]
+ * @param dim_0              not used by this function
+ * @param dim_1              middle extent of the full array (row count per layer)
+ * @param dim_2              fastest extent of the full array (row length)
+ * @param block_dim_0        number of layers in the block
+ * @param block_dim_1        number of rows per layer in the block
+ * @param block_dim_2        number of values per row in the block
+ * @param realPrecision      error bound
+ * @param P0                 scratch plane, indexed up to block_dim_1*block_dim_2
+ * @param P1                 scratch plane, indexed up to block_dim_1*block_dim_2
+ * @param type               output interval codes, one per block value (0 = stored exactly)
+ * @param unpredictable_data output list of the values stored exactly
+ * @return number of values appended to unpredictable_data
+ */
+size_t SZ_compress_float_3D_MDQ_RA_block(float * block_ori_data, float * mean, size_t dim_0, size_t dim_1, size_t dim_2, size_t block_dim_0, size_t block_dim_1, size_t block_dim_2, double realPrecision, float * P0, float * P1, int * type, float * unpredictable_data){
+
+	const size_t layerStride = dim_1 * dim_2;	/* distance between layers in the full array */
+	const size_t rowStride = dim_2;			/* distance between rows in the full array */
+	const size_t r1 = block_dim_0;
+	const size_t r2 = block_dim_1;
+	const size_t r3 = block_dim_2;
+	const size_t r23 = r2*r3;
+	size_t nUnpred = 0;
+	size_t layer, row, col;
+	size_t tIdx;	/* index into type[] (whole block) */
+	size_t pIdx;	/* index into P0/P1 (one layer) */
+	float *rowData = block_ori_data;
+	float *swapTmp;
+
+	mean[0] = block_ori_data[0];
+
+	/* ---- layer 0, row 0 ---- */
+	/* value 0 is predicted from the block's first value itself */
+	sz_ra3d_quantize_float(rowData[0], mean[0], realPrecision, &type[0], &P1[0], unpredictable_data, &nUnpred);
+	/* value 1 is predicted from the rebuilt value 0 */
+	sz_ra3d_quantize_float(rowData[1], P1[0], realPrecision, &type[1], &P1[1], unpredictable_data, &nUnpred);
+	/* values 2 .. r3-1: linear extrapolation from the two previous rebuilt values */
+	for (col = 2; col < r3; col++)
+		sz_ra3d_quantize_float(rowData[col], 2*P1[col-1] - P1[col-2], realPrecision, &type[col], &P1[col], unpredictable_data, &nUnpred);
+	rowData += rowStride;
+
+	/* ---- layer 0, rows 1 .. r2-1 ---- */
+	for (row = 1; row < r2; row++)
+	{
+		/* value 0: predicted from the value directly above */
+		pIdx = row*r3;
+		sz_ra3d_quantize_float(rowData[0], P1[pIdx-r3], realPrecision, &type[pIdx], &P1[pIdx], unpredictable_data, &nUnpred);
+
+		/* values 1 .. r3-1: predicted from left, above and above-left */
+		for (col = 1; col < r3; col++)
+		{
+			pIdx = row*r3+col;
+			sz_ra3d_quantize_float(rowData[col], P1[pIdx-1] + P1[pIdx-r3] - P1[pIdx-r3-1], realPrecision, &type[pIdx], &P1[pIdx], unpredictable_data, &nUnpred);
+		}
+		rowData += rowStride;
+	}
+	rowData += layerStride - r2 * rowStride;
+
+	/* ---- layers 1 .. r1-1 ---- */
+	for (layer = 1; layer < r1; layer++)
+	{
+		/* row 0, value 0: predicted from the same position in the previous layer */
+		tIdx = layer*r23;
+		sz_ra3d_quantize_float(rowData[0], P1[0], realPrecision, &type[tIdx], &P0[0], unpredictable_data, &nUnpred);
+
+		/* row 0, values 1 .. r3-1: predicted from left and previous layer */
+		for (col = 1; col < r3; col++)
+		{
+			tIdx = layer*r23 + col;
+			sz_ra3d_quantize_float(rowData[col], P0[col-1] + P1[col] - P1[col-1], realPrecision, &type[tIdx], &P0[col], unpredictable_data, &nUnpred);
+		}
+		rowData += rowStride;
+
+		/* rows 1 .. r2-1 */
+		for (row = 1; row < r2; row++)
+		{
+			/* value 0: predicted from above and previous layer */
+			tIdx = layer*r23 + row*r3;
+			pIdx = row*r3;
+			sz_ra3d_quantize_float(rowData[0], P0[pIdx-r3] + P1[pIdx] - P1[pIdx-r3], realPrecision, &type[tIdx], &P0[pIdx], unpredictable_data, &nUnpred);
+
+			/* values 1 .. r3-1: predicted from all seven rebuilt neighbours */
+			for (col = 1; col < r3; col++)
+			{
+				tIdx = layer*r23 + row*r3 + col;
+				pIdx = row*r3 + col;
+				sz_ra3d_quantize_float(rowData[col], P0[pIdx-1] + P0[pIdx-r3]+ P1[pIdx] - P0[pIdx-r3-1] - P1[pIdx-r3] - P1[pIdx-1] + P1[pIdx-r3-1], realPrecision, &type[tIdx], &P0[pIdx], unpredictable_data, &nUnpred);
+			}
+			rowData += rowStride;
+		}
+		rowData += layerStride - r2 * rowStride;
+
+		/* the layer just finished becomes the "previous layer" */
+		swapTmp = P1;
+		P1 = P0;
+		P0 = swapTmp;
+	}
+
+	return nUnpred;
+}
+
+/**
+ * Estimate how many quantization intervals a 2D float array needs.
+ *
+ * The array is sampled; each sample is predicted from its left, upper and
+ * upper-left neighbours in the ORIGINAL data, and the prediction error, in
+ * units of 2*realPrecision, is counted in a histogram. The answer is derived
+ * from the first bucket at which the cumulative count exceeds predThreshold
+ * of the samples, passed through roundUpToPowerOf2() and clamped to a
+ * minimum of 32.
+ *
+ * @param oriData        input values (r1*r2 floats, r2 varying fastest)
+ * @param r1             number of rows
+ * @param r2             row length
+ * @param realPrecision  error bound used to scale the prediction errors
+ * @return interval count, never below 32
+ */
+unsigned int optimize_intervals_float_2D_opt(float *oriData, size_t r1, size_t r2, double realPrecision)
+{
+	const size_t totalLen = r1 * r2;
+	size_t sampleCount = 0;
+	size_t bucket;
+	float predicted = 0, absErr;
+	/* zero-filled histogram of prediction-error radii */
+	size_t *histogram = (size_t*)calloc(confparams_cpr->maxRangeRadius, sizeof(size_t));
+
+	size_t colOffset = confparams_cpr->sampleDistance - 1; /* cursor offset along r2 */
+	size_t rowShift;
+	size_t rowNo = 1; /* cursor row */
+	float *cursor = oriData + r2 + colOffset;
+
+	while (cursor - oriData < totalLen)
+	{
+		sampleCount++;
+		predicted = cursor[-1] + cursor[-r2] - cursor[-r2-1];
+		absErr = fabs(predicted - *cursor);
+		bucket = (unsigned long)((absErr/realPrecision+1)/2);
+		if (bucket >= confparams_cpr->maxRangeRadius)
+			bucket = confparams_cpr->maxRangeRadius - 1; /* everything beyond the range lands in the last bucket */
+		histogram[bucket]++;
+
+		colOffset += confparams_cpr->sampleDistance;
+		if (colOffset < r2)
+		{
+			/* next sample is still inside the current row */
+			cursor += confparams_cpr->sampleDistance;
+			continue;
+		}
+
+		/* the row is exhausted: jump into the next row, staggering the start column */
+		rowNo++;
+		rowShift = rowNo % confparams_cpr->sampleDistance;
+		cursor += (r2 + confparams_cpr->sampleDistance - colOffset) + (confparams_cpr->sampleDistance - rowShift);
+		colOffset = (confparams_cpr->sampleDistance - rowShift);
+		if (colOffset == 0)
+			colOffset++;
+	}
+
+	/* find the first bucket at which the cumulative count exceeds the target */
+	size_t target = sampleCount*confparams_cpr->predThreshold;
+	size_t running = 0;
+	size_t idx = 0;
+	while (idx < confparams_cpr->maxRangeRadius)
+	{
+		running += histogram[idx];
+		if (running > target)
+			break;
+		idx++;
+	}
+	if (idx >= confparams_cpr->maxRangeRadius)
+		idx = confparams_cpr->maxRangeRadius-1;
+	free(histogram);
+
+	unsigned int accIntervals = 2*(idx+1);
+	unsigned int result = roundUpToPowerOf2(accIntervals);
+	return result < 32 ? 32 : result;
+}
+
+/**
+ * Estimate how many quantization intervals a 1D float array needs.
+ *
+ * Starting at element 2 and stepping by sampleDistance, each sample is
+ * predicted by its immediate predecessor in the ORIGINAL data and the error,
+ * in units of 2*realPrecision, is counted in a histogram. The answer is
+ * derived from the first bucket at which the cumulative count exceeds
+ * predThreshold of the samples, passed through roundUpToPowerOf2() and
+ * clamped to a minimum of 32.
+ *
+ * @param oriData        input values
+ * @param dataLength     number of values in oriData
+ * @param realPrecision  error bound used to scale the prediction errors
+ * @return interval count, never below 32
+ */
+unsigned int optimize_intervals_float_1D_opt(float *oriData, size_t dataLength, double realPrecision)
+{
+	size_t pos, bucket;
+	size_t sampleCount = 0;
+	float predicted = 0, absErr;
+	/* zero-filled histogram of prediction-error radii */
+	size_t *histogram = (size_t*)calloc(confparams_cpr->maxRangeRadius, sizeof(size_t));
+
+	for (pos = 2; pos < dataLength; pos += confparams_cpr->sampleDistance)
+	{
+		sampleCount++;
+		predicted = oriData[pos-1];
+		absErr = fabs(predicted - oriData[pos]);
+		bucket = (unsigned long)((absErr/realPrecision+1)/2);
+		if (bucket >= confparams_cpr->maxRangeRadius)
+			bucket = confparams_cpr->maxRangeRadius - 1; /* everything beyond the range lands in the last bucket */
+		histogram[bucket]++;
+	}
+
+	/* find the first bucket at which the cumulative count exceeds the target */
+	size_t target = sampleCount*confparams_cpr->predThreshold;
+	size_t running = 0;
+	size_t idx = 0;
+	while (idx < confparams_cpr->maxRangeRadius)
+	{
+		running += histogram[idx];
+		if (running > target)
+			break;
+		idx++;
+	}
+	if (idx >= confparams_cpr->maxRangeRadius)
+		idx = confparams_cpr->maxRangeRadius-1;
+	free(histogram);
+
+	unsigned int accIntervals = 2*(idx+1);
+	unsigned int result = roundUpToPowerOf2(accIntervals);
+	return result < 32 ? 32 : result;
+}
+
+/**
+ * Quantize one 1D block of floats.
+ *
+ * Every value is predicted by the previously rebuilt value (the first one by
+ * block_ori_data[0] itself). If the scaled error fits below
+ * exe_params->intvCapacity and the rebuilt value stays within realPrecision
+ * of the original, an interval code is stored in type[]; otherwise the code
+ * is 0 and the value is appended to unpredictable_data unchanged.
+ *
+ * @param block_ori_data     first value of the block
+ * @param mean               mean[0] receives block_ori_data[0]
+ * @param dim_0              not used by this function
+ * @param block_dim_0        number of values in the block
+ * @param realPrecision      error bound
+ * @param type               output interval codes, one per value (0 = stored exactly)
+ * @param unpredictable_data output list of the values stored exactly
+ * @return number of values appended to unpredictable_data
+ */
+size_t SZ_compress_float_1D_MDQ_RA_block(float * block_ori_data, float * mean, size_t dim_0, size_t block_dim_0, double realPrecision, int * type, float * unpredictable_data){
+
+	mean[0] = block_ori_data[0];
+	/* size_t, matching the return type and the 2D/3D variants: a 16-bit
+	 * counter would wrap after 65535 exact values and start overwriting
+	 * unpredictable_data from its beginning. */
+	size_t unpredictable_count = 0;
+
+	float curData;
+	double itvNum;
+	double diff;
+	float last_over_thres = mean[0];	/* previously rebuilt value */
+	float pred1D;
+	size_t i;
+
+	for(i=0; i<block_dim_0; i++){
+		curData = block_ori_data[i];
+
+		pred1D = last_over_thres;
+		diff = curData - pred1D;
+		itvNum = fabs(diff)/realPrecision + 1;
+		if (itvNum < exe_params->intvCapacity){
+			if (diff < 0) itvNum = -itvNum;
+			type[i] = (int) (itvNum/2) + exe_params->intvRadius;
+			last_over_thres = pred1D + 2 * (type[i] - exe_params->intvRadius) * realPrecision;
+			/* guarantee the error bound against machine-epsilon effects */
+			if(fabs(curData-last_over_thres)>realPrecision){
+				type[i] = 0;
+				last_over_thres = curData;
+				unpredictable_data[unpredictable_count ++] = curData;
+			}
+		}
+		else{
+			type[i] = 0;
+			unpredictable_data[unpredictable_count ++] = curData;
+			last_over_thres = curData;
+		}
+	}
+	return unpredictable_count;
+
+}
+
+/*
+ * Quantize one value of a 2D block against its prediction.
+ *
+ * If the scaled error fits below exe_params->intvCapacity, the interval code
+ * is written to *typeSlot and the value rebuilt from that same code is
+ * written to *reconSlot. If the value does not fit, or the rebuilt value
+ * misses curData by more than realPrecision (floating-point rounding), the
+ * code is set to 0, curData itself is stored in *reconSlot and appended to
+ * unpredictable_data.
+ */
+static void sz_ra2d_quantize_float(float curData, float pred, double realPrecision, int *typeSlot, float *reconSlot, float *unpredictable_data, size_t *unpredictable_count)
+{
+	double diff = curData - pred;
+	double itvNum = fabs(diff)/realPrecision + 1;
+
+	if (itvNum < exe_params->intvCapacity)
+	{
+		if (diff < 0) itvNum = -itvNum;
+		*typeSlot = (int) (itvNum/2) + exe_params->intvRadius;
+		*reconSlot = pred + 2 * (*typeSlot - exe_params->intvRadius) * realPrecision;
+		/* guarantee the error bound against machine-epsilon effects */
+		if (fabs(curData-*reconSlot) > realPrecision)
+		{
+			*typeSlot = 0;
+			*reconSlot = curData;
+			unpredictable_data[(*unpredictable_count)++] = curData;
+		}
+		return;
+	}
+
+	*typeSlot = 0;
+	*reconSlot = curData;
+	unpredictable_data[(*unpredictable_count)++] = curData;
+}
+
+/**
+ * Quantize one 2D block that lives inside a larger 2D float array.
+ *
+ * Values are visited row by row. Each one is predicted from previously
+ * rebuilt neighbours and then quantized. P1 holds the rebuilt previous row
+ * and P0 the row being processed; the pointers are swapped after every row
+ * past the first.
+ *
+ * Fix relative to the previous revision: for the first value of rows >= 1
+ * the rebuilt value P0[0] is now computed from the code just stored in
+ * type[index]. The old code read type[j], where j was a stale loop counter
+ * left over from the preceding loop, i.e. the code of a different element.
+ *
+ * NOTE(review): element 1 of the first row is processed unconditionally, so
+ * block_dim_1 is presumably expected to be at least 2 -- confirm with callers.
+ *
+ * @param block_ori_data     first value of the block inside the full array
+ * @param mean               mean[0] receives block_ori_data[0]
+ * @param dim_0              not used by this function
+ * @param dim_1              row length of the full array
+ * @param block_dim_0        number of rows in the block
+ * @param block_dim_1        number of values per row in the block
+ * @param realPrecision      error bound
+ * @param P0                 scratch row, indexed up to block_dim_1
+ * @param P1                 scratch row, indexed up to block_dim_1
+ * @param type               output interval codes, one per block value (0 = stored exactly)
+ * @param unpredictable_data output list of the values stored exactly
+ * @return number of values appended to unpredictable_data
+ */
+size_t SZ_compress_float_2D_MDQ_RA_block(float * block_ori_data, float * mean, size_t dim_0, size_t dim_1, size_t block_dim_0, size_t block_dim_1, double realPrecision, float * P0, float * P1, int * type, float * unpredictable_data){
+
+	const size_t rowStride = dim_1;	/* distance between rows in the full array */
+	const size_t r1 = block_dim_0;
+	const size_t r2 = block_dim_1;
+	size_t unpredictable_count = 0;
+	size_t i, j;
+	size_t index;
+	float *cur_data_pos = block_ori_data;
+	float *Pt;
+
+	mean[0] = block_ori_data[0];
+
+	/* ---- row 0 ---- */
+	/* value 0 is predicted from the block's first value itself */
+	sz_ra2d_quantize_float(cur_data_pos[0], mean[0], realPrecision, &type[0], &P1[0], unpredictable_data, &unpredictable_count);
+	/* value 1 is predicted from the rebuilt value 0 */
+	sz_ra2d_quantize_float(cur_data_pos[1], P1[0], realPrecision, &type[1], &P1[1], unpredictable_data, &unpredictable_count);
+	/* values 2 .. r2-1: linear extrapolation from the two previous rebuilt values */
+	for (j = 2; j < r2; j++)
+		sz_ra2d_quantize_float(cur_data_pos[j], 2*P1[j-1] - P1[j-2], realPrecision, &type[j], &P1[j], unpredictable_data, &unpredictable_count);
+	cur_data_pos += rowStride;
+
+	/* ---- rows 1 .. r1-1 ---- */
+	for (i = 1; i < r1; i++)
+	{
+		/* value 0: predicted from the value directly above */
+		index = i*r2;
+		sz_ra2d_quantize_float(cur_data_pos[0], P1[0], realPrecision, &type[index], &P0[0], unpredictable_data, &unpredictable_count);
+
+		/* values 1 .. r2-1: predicted from left, above and above-left */
+		for (j = 1; j < r2; j++)
+		{
+			index = i*r2+j;
+			sz_ra2d_quantize_float(cur_data_pos[j], P0[j-1] + P1[j] - P1[j-1], realPrecision, &type[index], &P0[j], unpredictable_data, &unpredictable_count);
+		}
+		cur_data_pos += rowStride;
+
+		/* the row just finished becomes the "previous row" */
+		Pt = P1;
+		P1 = P0;
+		P0 = Pt;
+	}
+	return unpredictable_count;
+}
+
diff --git a/thirdparty/SZ/sz/src/sz_float_pwr.c b/thirdparty/SZ/sz/src/sz_float_pwr.c
new file mode 100644
index 0000000000000000000000000000000000000000..644afddf46bf707abc6c500a8e1ba96b09020c5e
--- /dev/null
+++ b/thirdparty/SZ/sz/src/sz_float_pwr.c
@@ -0,0 +1,1782 @@
+/**
+ *  @file sz_float_pwr.c
+ *  @author Sheng Di
+ *  @date Aug, 2016
+ *  @brief SZ_Init, Compression and Decompression functions
+ * This file contains the compression/decompression functions related to point-wise relative errors
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <math.h>
+#include "sz.h"
+#include "CompressElement.h"
+#include "DynamicByteArray.h"
+#include "DynamicIntArray.h"
+#include "TightDataPointStorageF.h"
+#include "sz_float.h"
+#include "sz_float_pwr.h"
+#include "zlib.h"
+#include "rw.h"
+
+/*
+ * Combine a segment precision with the global precision according to
+ * confparams_cpr->errorBoundMode, truncate it to its first two bytes (as
+ * produced by floatToBytes) and store both the truncated float (*boundSlot)
+ * and those two bytes (byteSlot[0..1]).
+ */
+static void sz_pwr1d_store_precision_float(float realPrecision, double globalPrecision, float *boundSlot, unsigned char *byteSlot)
+{
+	unsigned char precBytes[4];
+
+	if (confparams_cpr->errorBoundMode==ABS_AND_PW_REL||confparams_cpr->errorBoundMode==REL_AND_PW_REL)
+	{
+		/* "AND" modes: keep the smaller of the two bounds */
+		if (!(realPrecision<globalPrecision))
+			realPrecision = globalPrecision;
+	}
+	else if (confparams_cpr->errorBoundMode==ABS_OR_PW_REL||confparams_cpr->errorBoundMode==REL_OR_PW_REL)
+	{
+		/* "OR" modes: keep the larger of the two bounds */
+		if (realPrecision<globalPrecision)
+			realPrecision = globalPrecision;
+	}
+
+	floatToBytes(precBytes, realPrecision);
+	precBytes[2] = precBytes[3] = 0;
+	*boundSlot = bytesToFloat(precBytes);
+	byteSlot[0] = precBytes[0];
+	byteSlot[1] = precBytes[1];
+}
+
+/**
+ * Compute one error bound per segment of a 1D float array.
+ *
+ * The array is cut into runs of confparams_cpr->segment_size values. For
+ * each run the minimum, average or maximum absolute value (selected by
+ * confparams_cpr->pwr_type; zeros are ignored) is tracked; at every segment
+ * boundary that statistic is multiplied by pw_relBoundRatio, combined with
+ * globalPrecision and stored.
+ *
+ * NOTE(review): the bound of the trailing segment, written after the loop,
+ * is not multiplied by pw_relBoundRatio, while the bounds written inside the
+ * loop are -- confirm whether this is intended.
+ *
+ * @param oriData           input values (oriData[0] is read unconditionally)
+ * @param dataLength        number of values
+ * @param pwrErrBound       output: one truncated bound per segment
+ * @param pwrErrBoundBytes  output: two bytes per segment
+ * @param globalPrecision   global bound used by the combined error-bound modes
+ */
+void compute_segment_precisions_float_1D(float *oriData, size_t dataLength, float* pwrErrBound, unsigned char* pwrErrBoundBytes, double globalPrecision)
+{
+	size_t pos;
+	size_t seg = 0;	/* index of the next segment to be written */
+	float segPrecision = oriData[0]!=0?fabs(confparams_cpr->pw_relBoundRatio*oriData[0]):confparams_cpr->pw_relBoundRatio;
+	float value, magnitude;
+	float sum = 0;
+
+	for (pos = 0; pos < dataLength; pos++)
+	{
+		value = oriData[pos];
+		if (pos%confparams_cpr->segment_size==0&&pos>0)
+		{
+			/* a segment just ended: finalise and store its bound */
+			if (confparams_cpr->pwr_type==SZ_PWR_AVG_TYPE)
+			{
+				segPrecision = sum/confparams_cpr->segment_size;
+				sum = 0;
+			}
+			segPrecision *= confparams_cpr->pw_relBoundRatio;
+			sz_pwr1d_store_precision_float(segPrecision, globalPrecision, &pwrErrBound[seg], &pwrErrBoundBytes[2*seg]);
+			seg++;
+
+			/* seed the statistic of the new segment with its first value */
+			segPrecision = fabs(value);
+		}
+
+		if (value == 0)
+			continue;	/* zeros do not contribute to the statistic */
+
+		magnitude = fabs(value);
+		if (confparams_cpr->pwr_type == SZ_PWR_MIN_TYPE)
+		{
+			if (segPrecision>magnitude)
+				segPrecision = magnitude;
+		}
+		else if (confparams_cpr->pwr_type == SZ_PWR_AVG_TYPE)
+		{
+			sum += magnitude;
+		}
+		else if (confparams_cpr->pwr_type == SZ_PWR_MAX_TYPE)
+		{
+			if (segPrecision<magnitude)
+				segPrecision = magnitude;
+		}
+	}
+
+	/* trailing (possibly shorter) segment */
+	if (confparams_cpr->pwr_type==SZ_PWR_AVG_TYPE)
+	{
+		int size = dataLength%confparams_cpr->segment_size==0?confparams_cpr->segment_size:dataLength%confparams_cpr->segment_size;
+		segPrecision = sum/size;
+	}
+	sz_pwr1d_store_precision_float(segPrecision, globalPrecision, &pwrErrBound[seg], &pwrErrBoundBytes[2*seg]);
+}
+
+/**
+ * Estimate how many quantization intervals a 1D float array needs when every
+ * segment has its own error bound.
+ *
+ * Every sampleDistance-th element (from index 2 on) is predicted by its
+ * predecessor in the ORIGINAL data; the error, in units of twice the bound
+ * of the segment the element belongs to, is counted in a histogram. The
+ * answer is derived from the first bucket at which the cumulative count
+ * exceeds predThreshold of the expected sample count, passed through
+ * roundUpToPowerOf2() and clamped to a minimum of 32.
+ *
+ * Changes relative to the previous revision:
+ *  - the sample count and the histogram counters are size_t instead of int,
+ *    so they no longer truncate or overflow on very large inputs (this also
+ *    matches the other optimize_intervals_* functions);
+ *  - the segment bound is looked up as pwrErrBound[i/segment_size] instead
+ *    of through a running cursor that started at element 2 and therefore
+ *    picked the wrong entry when segment_size is 1.
+ *
+ * @param oriData      input values
+ * @param dataLength   number of values in oriData
+ * @param pwrErrBound  one error bound per segment of segment_size values
+ * @return interval count, never below 32
+ */
+unsigned int optimize_intervals_float_1D_pwr(float *oriData, size_t dataLength, float* pwrErrBound)
+{
+	size_t i;
+	float realPrecision;
+	unsigned long radiusIndex;
+	float pred_value = 0, pred_err;
+	/* zero-filled histogram of prediction-error radii */
+	size_t *intervals = (size_t*)calloc(confparams_cpr->maxRangeRadius, sizeof(size_t));
+	size_t totalSampleSize = dataLength/confparams_cpr->sampleDistance;
+
+	for(i=2;i<dataLength;i++)
+	{
+		if(i%confparams_cpr->sampleDistance!=0)
+			continue;
+
+		realPrecision = pwrErrBound[i/confparams_cpr->segment_size];
+		//pred_value = 2*oriData[i-1] - oriData[i-2];
+		pred_value = oriData[i-1];
+		pred_err = fabs(pred_value - oriData[i]);
+		radiusIndex = (unsigned long)((pred_err/realPrecision+1)/2);
+		if(radiusIndex>=confparams_cpr->maxRangeRadius)
+			radiusIndex = confparams_cpr->maxRangeRadius - 1;
+		intervals[radiusIndex]++;
+	}
+
+	/* find the first bucket at which the cumulative count exceeds the target */
+	size_t targetCount = totalSampleSize*confparams_cpr->predThreshold;
+	size_t sum = 0;
+	for(i=0;i<confparams_cpr->maxRangeRadius;i++)
+	{
+		sum += intervals[i];
+		if(sum>targetCount)
+			break;
+	}
+	if(i>=confparams_cpr->maxRangeRadius)
+		i = confparams_cpr->maxRangeRadius-1;
+
+	unsigned int accIntervals = 2*(i+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+
+	if(powerOf2<32)
+		powerOf2 = 32;
+
+	free(intervals);
+	return powerOf2;
+}
+
+/*
+ * Starting value of a per-block statistic for the configured pwr_type:
+ * the largest magnitude when a minimum is tracked, the smallest magnitude
+ * when a maximum is tracked, and 0 when a sum (average) is accumulated.
+ */
+static float sz_pwr2d_stat_seed_float(float absLo, float absHi)
+{
+	if (confparams_cpr->pwr_type == SZ_PWR_MIN_TYPE)
+		return absHi;
+	if (confparams_cpr->pwr_type == SZ_PWR_MAX_TYPE)
+		return absLo;
+	return 0; //for SZ_PWR_AVG_TYPE
+}
+
+/* Extent of the last (possibly partial) block along a dimension of length r. */
+static int sz_pwr2d_tail_extent(size_t r, size_t edgeSize)
+{
+	return (int)(r%edgeSize==0 ? edgeSize : r%edgeSize);
+}
+
+/*
+ * Combine a block precision with the global precision according to
+ * confparams_cpr->errorBoundMode, truncate it to its first two bytes (as
+ * produced by floatToBytes) and store both the truncated float (*boundSlot)
+ * and those two bytes (byteSlot[0..1]).
+ */
+static void sz_pwr2d_store_precision_float(float realPrecision, double globalPrecision, float *boundSlot, unsigned char *byteSlot)
+{
+	unsigned char precBytes[4];
+
+	if (confparams_cpr->errorBoundMode==ABS_AND_PW_REL||confparams_cpr->errorBoundMode==REL_AND_PW_REL)
+	{
+		/* "AND" modes: keep the smaller of the two bounds */
+		if (!(realPrecision<globalPrecision))
+			realPrecision = globalPrecision;
+	}
+	else if (confparams_cpr->errorBoundMode==ABS_OR_PW_REL||confparams_cpr->errorBoundMode==REL_OR_PW_REL)
+	{
+		/* "OR" modes: keep the larger of the two bounds */
+		if (realPrecision<globalPrecision)
+			realPrecision = globalPrecision;
+	}
+
+	floatToBytes(precBytes, realPrecision);
+	precBytes[2] = precBytes[3] = 0;
+	*boundSlot = bytesToFloat(precBytes);
+	byteSlot[0] = precBytes[0];
+	byteSlot[1] = precBytes[1];
+}
+
+/**
+ * Compute one error bound per edgeSize x edgeSize block of a 2D float array.
+ *
+ * The array is scanned row by row while one statistic per block column
+ * (minimum, average or maximum absolute value, selected by
+ * confparams_cpr->pwr_type; zeros are ignored) is maintained. When the scan
+ * reaches the last row of a block and moves past its right edge, the
+ * statistic is multiplied by pw_relBoundRatio, combined with globalPrecision,
+ * stored, and reset. The bound of the very last block is written after the
+ * scan.
+ *
+ * @param oriData           input values (r1*r2 floats, r2 varying fastest)
+ * @param pwrErrBound       output: one truncated bound per block
+ * @param r1                number of rows
+ * @param r2                row length
+ * @param R2                number of per-column statistics to allocate
+ * @param edgeSize          block edge length
+ * @param pwrErrBoundBytes  output: two bytes per block
+ * @param Min               value whose magnitude seeds the statistics together with Max
+ * @param Max               value whose magnitude seeds the statistics together with Min
+ * @param globalPrecision   global bound used by the combined error-bound modes
+ */
+void compute_segment_precisions_float_2D(float *oriData, float* pwrErrBound, 
+size_t r1, size_t r2, size_t R2, size_t edgeSize, unsigned char* pwrErrBoundBytes, float Min, float Max, double globalPrecision)
+{
+	size_t row, col, slot;
+	size_t out = 0;		/* index of the next bound to be written */
+	size_t blockCol = 0;	/* block column of the value being scanned */
+	float blockPrecision;
+	float value, magnitude;
+	float *blockStats = (float*)malloc(R2*sizeof(float));
+
+	float absHi = fabs(Min)<fabs(Max)?fabs(Max):fabs(Min); //largest magnitude
+	float absLo = fabs(Min)<fabs(Max)?fabs(Min):fabs(Max); //smallest magnitude
+
+	for (slot = 0; slot < R2; slot++)
+		blockStats[slot] = sz_pwr2d_stat_seed_float(absLo, absHi);
+
+	for (row = 0; row < r1; row++)
+	{
+		for (col = 0; col < r2; col++)
+		{
+			value = oriData[row*r2+col];
+
+			/* blockCol still names the block that was just left at this point */
+			if (((row%edgeSize==edgeSize-1 || row==r1-1) &&col%edgeSize==0&&col>0) || (row%edgeSize==0&&col==0&&row>0))
+			{
+				if (confparams_cpr->pwr_type==SZ_PWR_AVG_TYPE)
+				{
+					/* divide the accumulated sum by the block's element count */
+					int a = edgeSize, b = edgeSize;
+					if (col==0)
+						b = sz_pwr2d_tail_extent(r2, edgeSize);
+					if (row==r1-1)
+						a = sz_pwr2d_tail_extent(r1, edgeSize);
+					blockPrecision = confparams_cpr->pw_relBoundRatio*blockStats[blockCol]/(a*b);
+				}
+				else
+					blockPrecision = confparams_cpr->pw_relBoundRatio*blockStats[blockCol];
+
+				sz_pwr2d_store_precision_float(blockPrecision, globalPrecision, &pwrErrBound[out], &pwrErrBoundBytes[2*out]);
+				out++;
+
+				blockStats[blockCol] = sz_pwr2d_stat_seed_float(absLo, absHi);
+			}
+
+			/* advance to the block column of the current value */
+			if (col==0)
+				blockCol = 0;
+			else if (col%edgeSize==0)
+				blockCol++;
+
+			if (value == 0)
+				continue;	/* zeros do not contribute to the statistic */
+
+			magnitude = fabs(value);
+			if (confparams_cpr->pwr_type == SZ_PWR_MIN_TYPE)
+			{
+				if (blockStats[blockCol]>magnitude)
+					blockStats[blockCol] = magnitude;
+			}
+			else if (confparams_cpr->pwr_type == SZ_PWR_AVG_TYPE)
+			{
+				blockStats[blockCol] += magnitude;
+			}
+			else if (confparams_cpr->pwr_type == SZ_PWR_MAX_TYPE)
+			{
+				if (blockStats[blockCol]<magnitude)
+					blockStats[blockCol] = magnitude;
+			}
+		}
+	}
+
+	/* bound of the last block (bottom-right corner) */
+	if (confparams_cpr->pwr_type==SZ_PWR_AVG_TYPE)
+	{
+		int b = sz_pwr2d_tail_extent(r2, edgeSize);
+		int a = sz_pwr2d_tail_extent(r1, edgeSize);
+		blockPrecision = confparams_cpr->pw_relBoundRatio*blockStats[blockCol]/(a*b);
+	}
+	else
+		blockPrecision = confparams_cpr->pw_relBoundRatio*blockStats[blockCol];
+
+	sz_pwr2d_store_precision_float(blockPrecision, globalPrecision, &pwrErrBound[out], &pwrErrBoundBytes[2*out]);
+
+	free(blockStats);
+}
+
+/*
+ * Estimate the number of quantization intervals to use for a 2D float array
+ * compressed with per-block (point-wise relative) error bounds.
+ *
+ * oriData     : row-major r1 x r2 input array.
+ * r1, r2      : array dimensions (slowest, fastest).
+ * R2          : number of blocks along the r2 dimension, i.e. the row stride
+ *               of pwrErrBound.
+ * edgeSize    : edge length of one square block, in elements.
+ * pwrErrBound : per-block error bounds, indexed as [blockRow*R2 + blockCol].
+ *
+ * A subset of the interior points (those with (i+j) divisible by
+ * confparams_cpr->sampleDistance) is predicted from its left, upper and
+ * upper-left neighbours; the prediction error, expressed in units of the
+ * block's error bound, is accumulated into a histogram. The result is the
+ * smallest power of two (at least 32) that covers the fraction
+ * confparams_cpr->predThreshold of the sampled points.
+ */
+unsigned int optimize_intervals_float_2D_pwr(float *oriData, size_t r1, size_t r2, size_t R2, size_t edgeSize, float* pwrErrBound)
+{
+	size_t i = 0, j = 0, index, I = 0, J = 0;
+	float realPrecision;
+	unsigned long radiusIndex;
+	float pred_value = 0, pred_err;
+	//zero-filled histogram of quantization radii
+	int *intervals = (int*)calloc(confparams_cpr->maxRangeRadius, sizeof(int));
+	size_t totalSampleSize = (r1-1)*(r2-1)/confparams_cpr->sampleDistance;
+	size_t ir2;
+	for(i=1;i<r1;i++)
+	{
+		ir2 = i*r2;
+		if(i%edgeSize==0)
+			I++;
+		//The block-column index has to restart on every row. Resetting it only
+		//when a new block row begins lets J keep growing from row to row, so
+		//pwrErrBound[I*R2+J] would address the wrong block and eventually run
+		//past the end of the table.
+		J = 0;
+		for(j=1;j<r2;j++)
+		{
+			index = ir2+j;
+			if(j%edgeSize==0)
+				J++;
+
+			if((i+j)%confparams_cpr->sampleDistance==0)
+			{
+				realPrecision = pwrErrBound[I*R2+J];
+				//predict from left + upper - upper-left neighbour
+				pred_value = oriData[index-1] + oriData[index-r2] - oriData[index-r2-1];
+				pred_err = fabs(pred_value - oriData[index]);
+				radiusIndex = (unsigned long)((pred_err/realPrecision+1)/2);
+				if(radiusIndex>=confparams_cpr->maxRangeRadius)
+					radiusIndex = confparams_cpr->maxRangeRadius - 1;
+				intervals[radiusIndex]++;
+			}
+		}
+	}
+	//find the smallest radius whose cumulative count exceeds the target
+	size_t targetCount = totalSampleSize*confparams_cpr->predThreshold;
+	size_t sum = 0;
+	for(i=0;i<confparams_cpr->maxRangeRadius;i++)
+	{
+		sum += intervals[i];
+		if(sum>targetCount)
+			break;
+	}
+	if(i>=confparams_cpr->maxRangeRadius)
+		i = confparams_cpr->maxRangeRadius-1;
+	unsigned int accIntervals = 2*(i+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+
+	//never go below 32 intervals
+	if(powerOf2<32)
+		powerOf2 = 32;
+
+	free(intervals);
+	return powerOf2;
+}
+
+/*
+ * Compute one error bound per block for a 3D float array and store each bound
+ * both as a float (pwrErrBound) and as its two leading bytes
+ * (pwrErrBoundBytes).
+ *
+ * oriData          : input array, indexed as i*r2*r3 + j*r3 + k.
+ * pwrErrBound      : output, one float per emitted block bound (written
+ *                    sequentially through the running index p).
+ * r1, r2, r3       : array dimensions (slowest to fastest).
+ * R2, R3           : dimensions of the statAbsValues scratch table
+ *                    (presumably the block counts along r2 and r3 - the
+ *                    caller is outside this function).
+ * edgeSize         : block edge length in elements.
+ * pwrErrBoundBytes : output, two bytes per emitted bound (running index q).
+ * Min, Max         : value range of the data; only their absolute values are
+ *                    used, as starting points for the per-block statistic.
+ * globalPrecision  : not referenced anywhere in this function.
+ *
+ * The per-block statistic is the minimum or maximum absolute non-zero value,
+ * selected by confparams_cpr->pwr_type. Each emitted bound is
+ * pw_relBoundRatio * statistic, truncated to the first two bytes produced by
+ * floatToBytes.
+ *
+ * NOTE(review): for any pwr_type other than MIN/MAX the statistic is
+ * initialised to 0 and never updated here, so the emitted bound is 0 -
+ * confirm whether the average type is meant to be supported in 3D.
+ */
+void compute_segment_precisions_float_3D(float *oriData, float* pwrErrBound, 
+size_t r1, size_t r2, size_t r3, size_t R2, size_t R3, size_t edgeSize, unsigned char* pwrErrBoundBytes, float Min, float Max, double globalPrecision)
+{
+	size_t i = 0, j = 0, k = 0, p = 0, q = 0, index = 0, J = 0, K = 0; //I=-1,J=-1 if they are needed
+	size_t r23 = r2*r3, ir, jr;
+	float realPrecision; 
+	float approxPrecision;
+	unsigned char realPrecBytes[4];
+	float curValue, curAbsValue;
+	
+	//scratch table holding the running statistic, indexed [J][K]
+	float** statAbsValues = create2DArray_float(R2, R3);
+	float max = fabs(Min)<fabs(Max)?fabs(Max):fabs(Min); //get the max abs value.	
+	float min = fabs(Min)<fabs(Max)?fabs(Min):fabs(Max);
+	
+	//seed every slot: a running minimum starts at the largest possible value,
+	//a running maximum at the smallest, anything else at 0
+	for(i=0;i<R2;i++)
+		for(j=0;j<R3;j++)
+		{
+			if(confparams_cpr->pwr_type == SZ_PWR_MIN_TYPE)
+				statAbsValues[i][j] = max;
+			else if(confparams_cpr->pwr_type == SZ_PWR_MAX_TYPE)
+				statAbsValues[i][j] = min;
+			else
+				statAbsValues[i][j] = 0;
+		}
+	for(i=0;i<r1;i++)
+	{
+		ir = i*r23;		
+		//entering a new block layer along r1: emit the bound of the slot that
+		//J and K still point at from the previous layer, then re-seed it
+		if(i%edgeSize==0&&i>0)
+		{
+			realPrecision = confparams_cpr->pw_relBoundRatio*statAbsValues[J][K];
+			floatToBytes(realPrecBytes, realPrecision);
+			//keep only the first two bytes of the float representation
+			memset(&realPrecBytes[2], 0, 2);
+			approxPrecision = bytesToFloat(realPrecBytes);
+			//put the realPrecision in float* pwrErBound
+			pwrErrBound[p++] = approxPrecision;
+			//put the two bytes in pwrErrBoundBytes
+			//printf("q=%d, i=%d, j=%d, k=%d\n",q,i,j,k);
+			pwrErrBoundBytes[q++] = realPrecBytes[0];
+			pwrErrBoundBytes[q++] = realPrecBytes[1];
+			if(confparams_cpr->pwr_type == SZ_PWR_MIN_TYPE)
+				statAbsValues[J][K] = max;
+			else if(confparams_cpr->pwr_type == SZ_PWR_MAX_TYPE)
+				statAbsValues[J][K] = min;
+			
+		}		
+		for(j=0;j<r2;j++)
+		{
+			jr = j*r3;
+			//on the last layer of a block (or of the array), crossing a block
+			//boundary along r2 emits the slot left over from the previous row block
+			if((i%edgeSize==edgeSize-1 || i == r1-1)&&j%edgeSize==0&&j>0)
+			{
+				realPrecision = confparams_cpr->pw_relBoundRatio*statAbsValues[J][K];
+				floatToBytes(realPrecBytes, realPrecision);
+				memset(&realPrecBytes[2], 0, 2);
+				approxPrecision = bytesToFloat(realPrecBytes);
+				//put the realPrecision in float* pwrErBound
+				pwrErrBound[p++] = approxPrecision;
+				//put the two bytes in pwrErrBoundBytes
+				//printf("q=%d, i=%d, j=%d, k=%d\n",q,i,j,k);
+				pwrErrBoundBytes[q++] = realPrecBytes[0];
+				pwrErrBoundBytes[q++] = realPrecBytes[1];
+				if(confparams_cpr->pwr_type == SZ_PWR_MIN_TYPE)
+					statAbsValues[J][K] = max;
+				else if(confparams_cpr->pwr_type == SZ_PWR_MAX_TYPE)
+					statAbsValues[J][K] = min;			
+			}
+			
+			//J is updated only after the emission above, so that emission
+			//still sees the previous block index
+			if(j==0)
+				J = 0;
+			else if(j%edgeSize==0)
+				J++;					
+			
+			for(k=0;k<r3;k++)
+			{
+				index = ir+jr+k;				
+				curValue = oriData[index];				
+				//on the last layer and last row of a block, crossing a block
+				//boundary along r3 emits the block that has just been completed
+				if((i%edgeSize==edgeSize-1 || i == r1-1)&&(j%edgeSize==edgeSize-1||j==r2-1)&&k%edgeSize==0&&k>0)
+				{
+					realPrecision = confparams_cpr->pw_relBoundRatio*statAbsValues[J][K];
+					floatToBytes(realPrecBytes, realPrecision);
+					memset(&realPrecBytes[2], 0, 2);
+					approxPrecision = bytesToFloat(realPrecBytes);
+					//put the realPrecision in float* pwrErBound
+					pwrErrBound[p++] = approxPrecision;
+					//put the two bytes in pwrErrBoundBytes
+					//printf("q=%d, i=%d, j=%d, k=%d\n",q,i,j,k);
+					pwrErrBoundBytes[q++] = realPrecBytes[0];
+					pwrErrBoundBytes[q++] = realPrecBytes[1];
+					
+					if(confparams_cpr->pwr_type == SZ_PWR_MIN_TYPE)
+						statAbsValues[J][K] = max;
+					else if(confparams_cpr->pwr_type == SZ_PWR_MAX_TYPE)
+						statAbsValues[J][K] = min;	
+				}	
+
+				//K is likewise advanced only after the emission check
+				if(k==0)
+					K = 0;
+				else if(k%edgeSize==0)
+					K++;
+					
+				//zeros are excluded from the statistic
+				if(curValue!=0)
+				{
+					curAbsValue = fabs(curValue);
+					if(confparams_cpr->pwr_type == SZ_PWR_MIN_TYPE)
+					{
+						if(statAbsValues[J][K]>curAbsValue)
+						{
+							statAbsValues[J][K] = curAbsValue;
+						}
+					}
+					else if(confparams_cpr->pwr_type == SZ_PWR_MAX_TYPE)
+					{
+						if(statAbsValues[J][K]<curAbsValue)
+						{
+							statAbsValues[J][K] = curAbsValue;
+						}
+					}
+				}
+			}			
+		}
+	}	
+	
+	//emit the bound of the very last block, which no boundary crossing triggers
+	realPrecision = confparams_cpr->pw_relBoundRatio*statAbsValues[J][K];
+	floatToBytes(realPrecBytes, realPrecision);
+	realPrecBytes[2] = realPrecBytes[3] = 0;
+	approxPrecision = bytesToFloat(realPrecBytes);
+	//put the realPrecision in float* pwrErBound
+	pwrErrBound[p++] = approxPrecision;
+	//put the two bytes in pwrErrBoundBytes
+	pwrErrBoundBytes[q++] = realPrecBytes[0];
+	pwrErrBoundBytes[q++] = realPrecBytes[1];
+	
+	free2DArray_float(statAbsValues, R2);
+}
+
+/*
+ * Estimate the number of quantization intervals to use for a 3D float array
+ * compressed with per-block (point-wise relative) error bounds.
+ *
+ * oriData     : input array, indexed as i*r2*r3 + j*r3 + k.
+ * r1, r2, r3  : array dimensions (slowest to fastest).
+ * R2, R3      : number of blocks along the r2 and r3 dimensions.
+ * edgeSize    : edge length of one cubic block, in elements.
+ * pwrErrBound : per-block error bounds, indexed as
+ *               [blockI*R2*R3 + blockJ*R3 + blockK].
+ *
+ * A subset of the interior points (those with (i+j+k) divisible by
+ * confparams_cpr->sampleDistance) is predicted from its seven preceding
+ * neighbours; the prediction error, expressed in units of the block's error
+ * bound, is accumulated into a histogram. The result is the smallest power
+ * of two (at least 32) that covers the fraction
+ * confparams_cpr->predThreshold of the sampled points.
+ */
+unsigned int optimize_intervals_float_3D_pwr(float *oriData, size_t r1, size_t r2, size_t r3, size_t R2, size_t R3, size_t edgeSize, float* pwrErrBound)
+{
+	size_t i,j,k, ir,jr,index, I = 0,J=0,K=0;
+	float realPrecision;
+	unsigned long radiusIndex;
+	size_t r23=r2*r3;
+	size_t R23 = R2*R3;
+	float pred_value = 0, pred_err;
+	//zero-filled histogram of quantization radii
+	int *intervals = (int*)calloc(confparams_cpr->maxRangeRadius, sizeof(int));
+	size_t totalSampleSize = (r1-1)*(r2-1)*(r3-1)/confparams_cpr->sampleDistance;
+	for(i=1;i<r1;i++)
+	{
+		ir = i*r23;
+		if(i%edgeSize==0)
+			I++;
+		//The block index along r2 restarts with every layer; otherwise it
+		//keeps growing across layers of the same block and indexes past the
+		//bound table.
+		J = 0;
+		for(j=1;j<r2;j++)
+		{
+			jr = j*r3;
+			if(j%edgeSize==0)
+				J++;
+			//Likewise the block index along r3 restarts with every row.
+			K = 0;
+			for(k=1;k<r3;k++)
+			{
+				index = ir+jr+k;
+				if(k%edgeSize==0)
+					K++;
+				if((i+j+k)%confparams_cpr->sampleDistance==0)
+				{
+					//r3 is the fastest dimension, so one step in J skips R3
+					//entries (not R2) of the bound table
+					realPrecision = pwrErrBound[I*R23+J*R3+K];
+					pred_value = oriData[index-1] + oriData[index-r3] + oriData[index-r23] 
+					- oriData[index-1-r23] - oriData[index-r3-1] - oriData[index-r3-r23] + oriData[index-r3-r23-1];
+					pred_err = fabs(pred_value - oriData[index]);
+					radiusIndex = (unsigned long)((pred_err/realPrecision+1)/2);
+					if(radiusIndex>=confparams_cpr->maxRangeRadius)
+						radiusIndex = confparams_cpr->maxRangeRadius - 1;
+					intervals[radiusIndex]++;
+				}
+			}
+		}
+	}
+	//find the smallest radius whose cumulative count exceeds the target
+	size_t targetCount = totalSampleSize*confparams_cpr->predThreshold;
+	size_t sum = 0;
+	for(i=0;i<confparams_cpr->maxRangeRadius;i++)
+	{
+		sum += intervals[i];
+		if(sum>targetCount)
+			break;
+	}
+	if(i>=confparams_cpr->maxRangeRadius)
+		i = confparams_cpr->maxRangeRadius-1;
+	unsigned int accIntervals = 2*(i+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+
+	//never go below 32 intervals
+	if(powerOf2<32)
+		powerOf2 = 32;
+
+	free(intervals);
+	return powerOf2;
+}
+
+/*
+ * Compress a 1D float array using one error bound per segment of
+ * confparams_cpr->segment_size elements.
+ *
+ * newByteData     : output, receives the compressed byte stream.
+ * oriData         : input values.
+ * globalPrecision : forwarded to compute_segment_precisions_float_1D.
+ * dataLength      : number of elements in oriData.
+ * outSize         : output, size in bytes of *newByteData.
+ * min, max        : value range of the data; the larger absolute value
+ *                   determines radExpo.
+ *
+ * Each value is predicted from the previously reconstructed value. If the
+ * prediction error fits inside the quantization range, only an interval
+ * code is stored in type[]; otherwise type[] is 0 and the value goes through
+ * the exact-storage path (compressSingleFloatValue / addExactData).
+ * If the encoded stream ends up larger than the raw data, the raw floats are
+ * stored behind a short header instead.
+ *
+ * NOTE(review): elements 0 and 1 are accessed unconditionally, so this
+ * appears to require dataLength >= 2 - confirm against callers.
+ */
+void SZ_compress_args_float_NoCkRngeNoGzip_1D_pwr(unsigned char** newByteData, float *oriData, double globalPrecision, 
+size_t dataLength, size_t *outSize, float min, float max)
+{
+	//number of segments, rounded up
+	size_t pwrLength = dataLength%confparams_cpr->segment_size==0?dataLength/confparams_cpr->segment_size:dataLength/confparams_cpr->segment_size+1;
+	float* pwrErrBound = (float*)malloc(sizeof(float)*pwrLength);
+	//two bytes are kept per segment bound
+	size_t pwrErrBoundBytes_size = sizeof(unsigned char)*pwrLength*2;
+	unsigned char* pwrErrBoundBytes = (unsigned char*)malloc(pwrErrBoundBytes_size);
+	
+	compute_segment_precisions_float_1D(oriData, dataLength, pwrErrBound, pwrErrBoundBytes, globalPrecision);
+	
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_float_1D_pwr(oriData, dataLength, pwrErrBound);	
+		updateQuantizationInfo(quantization_intervals);
+	}
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	size_t i = 0, j = 0;
+	int reqLength;
+	//j walks through the per-segment bounds; start with segment 0
+	float realPrecision = pwrErrBound[j++];	
+	float medianValue = 0;
+	float radius = fabs(max)<fabs(min)?fabs(min):fabs(max);
+	short radExpo = getExponent_float(radius);
+	
+	computeReqLength_float(realPrecision, radExpo, &reqLength, &medianValue);
+
+	//one quantization code per element; 0 marks an exactly stored value
+	int* type = (int*) malloc(dataLength*sizeof(int));
+	//type[dataLength]=0;
+		
+	float* spaceFillingValue = oriData; //
+	
+	DynamicByteArray *resiBitLengthArray;
+	new_DBA(&resiBitLengthArray, DynArrayInitLen);
+	
+	DynamicIntArray *exactLeadNumArray;
+	new_DIA(&exactLeadNumArray, DynArrayInitLen);
+	
+	DynamicByteArray *exactMidByteArray;
+	new_DBA(&exactMidByteArray, DynArrayInitLen);
+	
+	DynamicIntArray *resiBitArray;
+	new_DIA(&resiBitArray, DynArrayInitLen);
+	
+	type[0] = 0;
+	
+	unsigned char preDataBytes[4] = {0};
+	intToBytes_bigEndian(preDataBytes, 0);
+	
+	//split the required bit length into whole bytes and leftover bits
+	int reqBytesLength = reqLength/8;
+	int resiBitsLength = reqLength%8;
+	float last3CmprsData[3] = {0};
+
+	FloatValueCompressElement *vce = (FloatValueCompressElement*)malloc(sizeof(FloatValueCompressElement));
+	LossyCompressionElement *lce = (LossyCompressionElement*)malloc(sizeof(LossyCompressionElement));
+						
+	//add the first data	
+	addDBA_Data(resiBitLengthArray, (unsigned char)resiBitsLength);
+	compressSingleFloatValue(vce, spaceFillingValue[0], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+	updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+	memcpy(preDataBytes,vce->curBytes,4);
+	addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+	listAdd_float(last3CmprsData, vce->data);
+	//printf("%.30G\n",last3CmprsData[0]);	
+		
+	//add the second data
+	type[1] = 0;
+	addDBA_Data(resiBitLengthArray, (unsigned char)resiBitsLength);			
+	compressSingleFloatValue(vce, spaceFillingValue[1], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+	updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+	memcpy(preDataBytes,vce->curBytes,4);
+	addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+	listAdd_float(last3CmprsData, vce->data);
+	//printf("%.30G\n",last3CmprsData[0]);	
+	
+	int state;
+	double checkRadius;
+	float curData;
+	float pred;
+	double predAbsErr;
+	//largest prediction error that can still be expressed as an interval code
+	checkRadius = (exe_params->intvCapacity-1)*realPrecision;
+	double interval = 2*realPrecision;
+	int updateReqLength = 0; //a marker: 1 means already updated
+	
+	for(i=2;i<dataLength;i++)
+	{
+		curData = spaceFillingValue[i];
+		//segment boundary: switch to the next bound and its derived values
+		if(i%confparams_cpr->segment_size==0)
+		{
+			realPrecision = pwrErrBound[j++];
+			checkRadius = (exe_params->intvCapacity-1)*realPrecision;
+			interval = 2*realPrecision;
+			updateReqLength = 0;
+		}
+		//pred = 2*last3CmprsData[0] - last3CmprsData[1];
+		pred = last3CmprsData[0];
+		predAbsErr = fabs(curData - pred);	
+		if(predAbsErr<checkRadius)
+		{
+			//number of whole intervals between prediction and value
+			state = (predAbsErr/realPrecision+1)/2;
+			if(curData>=pred)
+			{
+				type[i] = exe_params->intvRadius+state;
+				pred = pred + state*interval;
+			}
+			else //curData<pred
+			{
+				type[i] = exe_params->intvRadius-state;
+				pred = pred - state*interval;
+			}
+			//remember the reconstructed value, not the original one
+			listAdd_float(last3CmprsData, pred);			
+			continue;
+		}
+		
+		//unpredictable data processing		
+		//reqLength is recomputed at most once per segment, on first need
+		if(updateReqLength==0)
+		{
+			computeReqLength_float(realPrecision, radExpo, &reqLength, &medianValue);
+			reqBytesLength = reqLength/8;
+			resiBitsLength = reqLength%8;
+			updateReqLength = 1;		
+		}
+		
+		type[i] = 0;
+		addDBA_Data(resiBitLengthArray, (unsigned char)resiBitsLength);
+		
+		compressSingleFloatValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+		updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+		memcpy(preDataBytes,vce->curBytes,4);
+		addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+
+		listAdd_float(last3CmprsData, vce->data);	
+	}//end of for
+		
+//	char* expSegmentsInBytes;
+//	int expSegmentsInBytes_size = convertESCToBytes(esc, &expSegmentsInBytes);
+	int exactDataNum = exactLeadNumArray->size;
+	
+	TightDataPointStorageF* tdps;
+			
+	//bundle everything collected above into the storage object
+	new_TightDataPointStorageF2(&tdps, dataLength, exactDataNum, 
+			type, exactMidByteArray->array, exactMidByteArray->size,  
+			exactLeadNumArray->array,  
+			resiBitArray->array, resiBitArray->size, 
+			resiBitLengthArray->array, resiBitLengthArray->size, 
+			realPrecision, medianValue, (char)reqLength, quantization_intervals, pwrErrBoundBytes, pwrErrBoundBytes_size, radExpo);
+
+//sdi:Debug
+/*	int sum =0;
+	for(i=0;i<dataLength;i++)
+		if(type[i]==0) sum++;
+	printf("opt_quantizations=%d, exactDataNum=%d, sum=%d\n",quantization_intervals, exactDataNum, sum);
+*/
+//	writeUShortData(type, dataLength, "compressStateBytes.sb");
+//	unsigned short type_[dataLength];
+//	SZ_Reset();
+//	decode_withTree(tdps->typeArray, tdps->typeArray_size, type_);	
+//	printf("tdps->typeArray_size=%d\n", tdps->typeArray_size);
+	
+	//free memory
+	free_DBA(resiBitLengthArray);
+	free_DIA(exactLeadNumArray);
+	free_DIA(resiBitArray);
+	free(type);
+	
+	convertTDPStoFlatBytes_float(tdps, newByteData, outSize);
+	
+	int floatSize=sizeof(float);
+	//fallback: the encoded stream is larger than the raw data, so store the
+	//raw floats behind a small header instead
+	if(*outSize>dataLength*floatSize)
+	{
+		size_t k = 0, i;
+		tdps->isLossless = 1;
+		size_t totalByteLength = 3 + exe_params->SZ_SIZE_TYPE + 1 + floatSize*dataLength;
+		//NOTE(review): *newByteData is overwritten here without releasing
+		//whatever convertTDPStoFlatBytes_float stored in it - confirm whether
+		//that buffer is leaked.
+		*newByteData = (unsigned char*)malloc(totalByteLength);
+		
+		//NOTE(review): the buffer holds SZ_SIZE_TYPE bytes but is filled by
+		//intToBytes_bigEndian (annotated "4" below); if SZ_SIZE_TYPE is 8 the
+		//remaining bytes may be left unset - confirm.
+		unsigned char dsLengthBytes[exe_params->SZ_SIZE_TYPE];
+		intToBytes_bigEndian(dsLengthBytes, dataLength);//4
+		for (i = 0; i < 3; i++)//3
+			(*newByteData)[k++] = versionNumber[i];
+		
+		//one flag byte whose value depends on the size type
+		if(exe_params->SZ_SIZE_TYPE==4)
+		{
+			(*newByteData)[k++] = 16;	//=00010000	
+		}
+		else 
+		{
+			(*newByteData)[k++] = 80;
+		}
+		for (i = 0; i < exe_params->SZ_SIZE_TYPE; i++)//4 or 8
+			(*newByteData)[k++] = dsLengthBytes[i];
+
+		
+		//payload starts after 3 version bytes + 1 flag byte + the length field
+		if(sysEndianType==BIG_ENDIAN_SYSTEM)
+			memcpy((*newByteData)+4+exe_params->SZ_SIZE_TYPE, oriData, dataLength*floatSize);
+		else
+		{
+			unsigned char* p = (*newByteData)+4+exe_params->SZ_SIZE_TYPE;
+			for(i=0;i<dataLength;i++,p+=floatSize)
+				floatToBytes(p, oriData[i]);
+		}
+		*outSize = totalByteLength;
+	}
+
+	free(pwrErrBound);
+	
+	free(vce);
+	free(lce);
+	free_TightDataPointStorageF(tdps);
+	//only the wrapper struct is released here; its ->array was handed to tdps
+	//above (presumably freed together with tdps - verify)
+	free(exactMidByteArray);
+}
+
+/*
+ * Compress a row-major r1 x r2 float array using one error bound per square
+ * block of blockEdgeSize x blockEdgeSize elements.
+ *
+ * newByteData     : output, receives the compressed byte stream.
+ * oriData         : input values, indexed as i*r2 + j.
+ * globalPrecision : forwarded to compute_segment_precisions_float_2D.
+ * r1, r2          : array dimensions (slowest, fastest).
+ * outSize         : output, size in bytes of *newByteData.
+ * min, max        : value range of the data; the larger absolute value
+ *                   determines radExpo.
+ *
+ * Two row buffers hold reconstructed values: P1 is the previous row, P0 the
+ * row being produced; they are swapped after every row. A value whose
+ * prediction error fits inside the quantization range is stored as an
+ * interval code in type[]; otherwise type[] is 0 and the value goes through
+ * the exact-storage path.
+ *
+ * NOTE(review): element 1 of row 0 is read and P1[1] is written
+ * unconditionally, so this appears to require r2 >= 2 - confirm against
+ * callers.
+ */
+void SZ_compress_args_float_NoCkRngeNoGzip_2D_pwr(unsigned char** newByteData, float *oriData, double globalPrecision, size_t r1, size_t r2, 
+size_t *outSize, float min, float max)
+{
+	size_t dataLength=r1*r2;
+	int blockEdgeSize = computeBlockEdgeSize_2D(confparams_cpr->segment_size);
+	//number of blocks along each dimension, rounded up
+	size_t R1 = 1+(r1-1)/blockEdgeSize;
+	size_t R2 = 1+(r2-1)/blockEdgeSize;
+	float* pwrErrBound = (float*)malloc(sizeof(float)*R1*R2);
+	//two bytes are kept per block bound
+	size_t pwrErrBoundBytes_size = sizeof(unsigned char)*R1*R2*2;
+	unsigned char* pwrErrBoundBytes = (unsigned char*)malloc(pwrErrBoundBytes_size);
+	
+	compute_segment_precisions_float_2D(oriData, pwrErrBound, r1, r2, R2, blockEdgeSize, pwrErrBoundBytes, min, max, globalPrecision);
+		
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{	
+		quantization_intervals = optimize_intervals_float_2D_pwr(oriData, r1, r2, R2, blockEdgeSize, pwrErrBound);
+		updateQuantizationInfo(quantization_intervals);
+	}	
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	//printf("quantization_intervals=%d\n",quantization_intervals);
+	
+	//i,j index elements; I,J index the block that contains them
+	size_t i=0,j=0,I=0,J=0; 
+	int reqLength;
+	float realPrecision = pwrErrBound[I*R2+J];	
+	float pred1D, pred2D;
+	float diff = 0.0;
+	double itvNum = 0;
+	float *P0, *P1;
+	
+	P0 = (float*)malloc(r2*sizeof(float));
+	memset(P0, 0, r2*sizeof(float));
+	P1 = (float*)malloc(r2*sizeof(float));
+	memset(P1, 0, r2*sizeof(float));
+		
+	float medianValue = 0;
+	float radius = fabs(max)<fabs(min)?fabs(min):fabs(max);	
+	short radExpo = getExponent_float(radius);
+	//1 means reqLength is already valid for the current bound
+	int updateReqLength = 1;
+	
+	computeReqLength_float(realPrecision, radExpo, &reqLength, &medianValue);
+
+	//one quantization code per element; 0 marks an exactly stored value
+	int* type = (int*) malloc(dataLength*sizeof(int));
+	//type[dataLength]=0;
+		
+	float* spaceFillingValue = oriData; //
+	
+	DynamicByteArray *resiBitLengthArray;
+	new_DBA(&resiBitLengthArray, DynArrayInitLen);
+	
+	DynamicIntArray *exactLeadNumArray;
+	new_DIA(&exactLeadNumArray, DynArrayInitLen);
+	
+	DynamicByteArray *exactMidByteArray;
+	new_DBA(&exactMidByteArray, DynArrayInitLen);
+	
+	DynamicIntArray *resiBitArray;
+	new_DIA(&resiBitArray, DynArrayInitLen);
+	
+	type[0] = 0;
+	
+	unsigned char preDataBytes[4];
+	intToBytes_bigEndian(preDataBytes, 0);
+	
+	//split the required bit length into whole bytes and leftover bits
+	int reqBytesLength = reqLength/8;
+	int resiBitsLength = reqLength%8;
+
+	FloatValueCompressElement *vce = (FloatValueCompressElement*)malloc(sizeof(FloatValueCompressElement));
+	LossyCompressionElement *lce = (LossyCompressionElement*)malloc(sizeof(LossyCompressionElement));
+			
+	/* Process Row-0 data 0*/
+	//the very first value has no predecessor and is always stored exactly
+	type[0] = 0;
+	addDBA_Data(resiBitLengthArray, (unsigned char)resiBitsLength);
+	compressSingleFloatValue(vce, spaceFillingValue[0], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+	updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+	memcpy(preDataBytes,vce->curBytes,4);
+	addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+	P1[0] = vce->data;
+
+	/* Process Row-0 data 1*/
+	//predicted by simply repeating the previous reconstructed value
+	pred1D = P1[0];
+	diff = spaceFillingValue[1] - pred1D;
+
+	itvNum =  fabs(diff)/realPrecision + 1;
+
+	if (itvNum < exe_params->intvCapacity)
+	{
+		if (diff < 0) itvNum = -itvNum;
+		type[1] = (int) (itvNum/2) + exe_params->intvRadius;
+		//reconstruct exactly as the decoder will, from the interval code
+		P1[1] = pred1D + 2 * (type[1] - exe_params->intvRadius) * realPrecision;
+	}
+	else
+	{		
+		type[1] = 0;
+
+		addDBA_Data(resiBitLengthArray, (unsigned char)resiBitsLength);
+		compressSingleFloatValue(vce, spaceFillingValue[1], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+		updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+		memcpy(preDataBytes,vce->curBytes,4);
+		addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+		P1[1] = vce->data;
+	}
+
+    /* Process Row-0 data 2 --> data r2-1 */
+	for (j = 2; j < r2; j++)
+	{
+		//entering the next block along the row: switch to its bound
+		if(j%blockEdgeSize==0)
+		{
+			J++;
+			realPrecision = pwrErrBound[I*R2+J];
+			updateReqLength = 0;
+		}
+
+		//linear extrapolation from the two previous reconstructed values
+		pred1D = 2*P1[j-1] - P1[j-2];
+		diff = spaceFillingValue[j] - pred1D;
+
+		itvNum = fabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[j] = (int) (itvNum/2) + exe_params->intvRadius;
+			P1[j] = pred1D + 2 * (type[j] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			//reqLength is recomputed lazily, once per bound change
+			if(updateReqLength==0)
+			{
+				computeReqLength_float(realPrecision, radExpo, &reqLength, &medianValue);
+				reqBytesLength = reqLength/8;
+				resiBitsLength = reqLength%8;
+				updateReqLength = 1;
+			}
+
+			type[j] = 0;
+
+			addDBA_Data(resiBitLengthArray, (unsigned char)resiBitsLength);
+			compressSingleFloatValue(vce, spaceFillingValue[j], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,4);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P1[j] = vce->data;
+		}
+	}
+
+	/* Process Row-1 --> Row-r1-1 */
+	size_t index;
+	for (i = 1; i < r1; i++)
+	{	
+		/* Process row-i data 0 */
+		index = i*r2;
+		//a new row always starts in block column 0
+		J = 0;
+		if(i%blockEdgeSize==0)
+			I++;
+		realPrecision = pwrErrBound[I*R2+J]; //J==0
+		updateReqLength = 0;
+		
+		//first element of a row is predicted from the element directly above
+		pred1D = P1[0];
+		diff = spaceFillingValue[index] - pred1D;
+
+		itvNum = fabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+			P0[0] = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			if(updateReqLength==0)
+			{
+				computeReqLength_float(realPrecision, radExpo, &reqLength, &medianValue);
+				reqBytesLength = reqLength/8;
+				resiBitsLength = reqLength%8;
+				updateReqLength = 1;
+			}
+			
+			type[index] = 0;
+
+			addDBA_Data(resiBitLengthArray, (unsigned char)resiBitsLength);
+			compressSingleFloatValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,4);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P0[0] = vce->data;
+		}
+									
+		/* Process row-i data 1 --> r2-1*/
+		for (j = 1; j < r2; j++)
+		{
+			index = i*r2+j;
+			
+			if(j%blockEdgeSize==0)
+			{
+				J++;
+				realPrecision = pwrErrBound[I*R2+J];
+				updateReqLength = 0;
+			}
+			//left + above - above-left, all taken from reconstructed values
+			pred2D = P0[j-1] + P1[j] - P1[j-1];
+
+			diff = spaceFillingValue[index] - pred2D;
+
+			itvNum = fabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P0[j] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				if(updateReqLength==0)
+				{
+					computeReqLength_float(realPrecision, radExpo, &reqLength, &medianValue);
+					reqBytesLength = reqLength/8;
+					resiBitsLength = reqLength%8;
+					updateReqLength = 1;
+				}
+
+				type[index] = 0;
+
+				addDBA_Data(resiBitLengthArray, (unsigned char)resiBitsLength);
+				compressSingleFloatValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,4);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P0[j] = vce->data;
+			}
+		}
+
+		//the row just produced becomes the "previous row" for the next pass
+		float *Pt;
+		Pt = P1;
+		P1 = P0;
+		P0 = Pt;
+	}
+	
+	//NOTE(review): P0 is allocated unconditionally above but is not freed
+	//when r2 == 1 - confirm whether this guard is intentional.
+	if(r2!=1)
+		free(P0);
+	free(P1);			
+	int exactDataNum = exactLeadNumArray->size;
+	
+	TightDataPointStorageF* tdps;
+			
+	//bundle everything collected above into the storage object
+	new_TightDataPointStorageF2(&tdps, dataLength, exactDataNum, 
+			type, exactMidByteArray->array, exactMidByteArray->size,  
+			exactLeadNumArray->array,  
+			resiBitArray->array, resiBitArray->size, 
+			resiBitLengthArray->array, resiBitLengthArray->size, 
+			realPrecision, medianValue, (char)reqLength, quantization_intervals, pwrErrBoundBytes, pwrErrBoundBytes_size, radExpo);
+	
+	//free memory
+	free_DBA(resiBitLengthArray);
+	free_DIA(exactLeadNumArray);
+	free_DIA(resiBitArray);
+	free(type);
+	
+	convertTDPStoFlatBytes_float(tdps, newByteData, outSize);
+	
+	free(pwrErrBound);
+
+	free(vce);
+	free(lce);
+	free_TightDataPointStorageF(tdps);	
+	//only the wrapper struct is released here; its ->array was handed to tdps
+	//above (presumably freed together with tdps - verify)
+	free(exactMidByteArray);
+}
+
+void SZ_compress_args_float_NoCkRngeNoGzip_3D_pwr(unsigned char** newByteData, float *oriData, double globalPrecision, 
+size_t r1, size_t r2, size_t r3, size_t *outSize, float min, float max)
+{
+	size_t dataLength=r1*r2*r3;
+	
+	int blockEdgeSize = computeBlockEdgeSize_3D(confparams_cpr->segment_size);
+	size_t R1 = 1+(r1-1)/blockEdgeSize;
+	size_t R2 = 1+(r2-1)/blockEdgeSize;
+	size_t R3 = 1+(r3-1)/blockEdgeSize;
+	float* pwrErrBound = (float*)malloc(sizeof(float)*R1*R2*R3);
+	size_t pwrErrBoundBytes_size = sizeof(unsigned char)*R1*R2*R3*2;
+	unsigned char* pwrErrBoundBytes = (unsigned char*)malloc(pwrErrBoundBytes_size);	
+	
+	compute_segment_precisions_float_3D(oriData, pwrErrBound, r1, r2, r3, R2, R3, blockEdgeSize, pwrErrBoundBytes, min, max, globalPrecision);	
+
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_float_3D_pwr(oriData, r1, r2, r3, R2, R3, blockEdgeSize, pwrErrBound);
+		updateQuantizationInfo(quantization_intervals);
+	}	
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	size_t i=0,j=0,k=0, I = 0, J = 0, K = 0;
+	int reqLength;
+	float realPrecision = pwrErrBound[0];		
+	float pred1D, pred2D, pred3D;
+	float diff = 0.0;
+	double itvNum = 0;
+	float *P0, *P1;
+
+	size_t r23 = r2*r3;
+	size_t R23 = R2*R3;
+	P0 = (float*)malloc(r23*sizeof(float));
+	P1 = (float*)malloc(r23*sizeof(float));
+	float radius = fabs(max)<fabs(min)?fabs(min):fabs(max);
+	float medianValue = 0;
+	short radExpo = getExponent_float(radius);
+	int updateReqLength = 0;
+	
+	computeReqLength_float(realPrecision, radExpo, &reqLength, &medianValue);
+
+	int* type = (int*) malloc(dataLength*sizeof(int));
+	//type[dataLength]=0;realPrecision
+
+	float* spaceFillingValue = oriData; //
+	
+	DynamicByteArray *resiBitLengthArray;
+	new_DBA(&resiBitLengthArray, DynArrayInitLen);
+
+	DynamicIntArray *exactLeadNumArray;
+	new_DIA(&exactLeadNumArray, DynArrayInitLen);
+
+	DynamicByteArray *exactMidByteArray;
+	new_DBA(&exactMidByteArray, DynArrayInitLen);
+
+	DynamicIntArray *resiBitArray;
+	new_DIA(&resiBitArray, DynArrayInitLen);
+
+	type[0] = 0;
+
+	unsigned char preDataBytes[4];
+	intToBytes_bigEndian(preDataBytes, 0);
+	
+	int reqBytesLength = reqLength/8;
+	int resiBitsLength = reqLength%8;
+
+	FloatValueCompressElement *vce = (FloatValueCompressElement*)malloc(sizeof(FloatValueCompressElement));
+	LossyCompressionElement *lce = (LossyCompressionElement*)malloc(sizeof(LossyCompressionElement));
+
+
+	///////////////////////////	Process layer-0 ///////////////////////////
+	/* Process Row-0 data 0*/
+	type[0] = 0;
+	addDBA_Data(resiBitLengthArray, (unsigned char)resiBitsLength);
+	compressSingleFloatValue(vce, spaceFillingValue[0], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+	updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+	memcpy(preDataBytes,vce->curBytes,4);
+	addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+	P1[0] = vce->data;
+
+	/* Process Row-0 data 1*/
+	pred1D = P1[0];
+	diff = spaceFillingValue[1] - pred1D;
+
+	itvNum = fabs(diff)/realPrecision + 1;
+
+	if (itvNum < exe_params->intvCapacity)
+	{
+		if (diff < 0) itvNum = -itvNum;
+		type[1] = (int) (itvNum/2) + exe_params->intvRadius;
+		P1[1] = pred1D + 2 * (type[1] - exe_params->intvRadius) * realPrecision;
+	}
+	else
+	{
+		if(updateReqLength==0)
+		{
+			computeReqLength_float(realPrecision, radExpo, &reqLength, &medianValue);
+			reqBytesLength = reqLength/8;
+			resiBitsLength = reqLength%8;
+			updateReqLength = 1;
+		}		
+		
+		type[1] = 0;
+
+		addDBA_Data(resiBitLengthArray, (unsigned char)resiBitsLength);
+		compressSingleFloatValue(vce, spaceFillingValue[1], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+		updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+		memcpy(preDataBytes,vce->curBytes,4);
+		addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+		P1[1] = vce->data;
+	}
+
+    /* Process Row-0 data 2 --> data r3-1 */
+	for (j = 2; j < r3; j++)
+	{
+		if(j%blockEdgeSize==0)
+		{
+			J++;
+			realPrecision = pwrErrBound[J];
+			updateReqLength = 0;
+		}		
+		pred1D = 2*P1[j-1] - P1[j-2];
+		diff = spaceFillingValue[j] - pred1D;
+
+		itvNum = fabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[j] = (int) (itvNum/2) + exe_params->intvRadius;
+			P1[j] = pred1D + 2 * (type[j] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			if(updateReqLength==0)
+			{
+				computeReqLength_float(realPrecision, radExpo, &reqLength, &medianValue);
+				reqBytesLength = reqLength/8;
+				resiBitsLength = reqLength%8;
+				updateReqLength = 1;
+			}			
+
+			type[j] = 0;
+
+			addDBA_Data(resiBitLengthArray, (unsigned char)resiBitsLength);
+			compressSingleFloatValue(vce, spaceFillingValue[j], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,4);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P1[j] = vce->data;
+		}
+	}
+
+	/* Process Row-1 --> Row-r2-1 */
+	size_t index;
+	K = 0;
+	for (i = 1; i < r2; i++)
+	{
+		/* Process row-i data 0 */
+		index = i*r3;	
+
+		J = 0;
+		if(i%blockEdgeSize==0)
+			I++;
+		realPrecision = pwrErrBound[I*R3+J]; //J==0
+		updateReqLength = 0;
+
+		pred1D = P1[index-r3];
+		diff = spaceFillingValue[index] - pred1D;
+
+		itvNum = fabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+			P1[index] = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			if(updateReqLength==0)
+			{
+				computeReqLength_float(realPrecision, radExpo, &reqLength, &medianValue);
+				reqBytesLength = reqLength/8;
+				resiBitsLength = reqLength%8;
+				updateReqLength = 1;
+			}		
+						
+			type[index] = 0;
+
+			addDBA_Data(resiBitLengthArray, (unsigned char)resiBitsLength);
+			compressSingleFloatValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,4);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P1[index] = vce->data;
+		}
+
+		/* Process row-i data 1 --> data r3-1*/
+		for (j = 1; j < r3; j++) //note that this j refers to fastest dimension (lowest order)
+		{
+			index = i*r3+j;		
+			if(j%blockEdgeSize==0)
+			{
+				J++;
+				realPrecision = pwrErrBound[I*R3+J];
+				updateReqLength = 0;
+			}			
+		
+			pred2D = P1[index-1] + P1[index-r3] - P1[index-r3-1];
+
+			diff = spaceFillingValue[index] - pred2D;
+
+			itvNum = fabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P1[index] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				if(updateReqLength==0)
+				{
+					computeReqLength_float(realPrecision, radExpo, &reqLength, &medianValue);
+					reqBytesLength = reqLength/8;
+					resiBitsLength = reqLength%8;
+					updateReqLength = 1;
+				}						
+				
+				type[index] = 0;
+
+				addDBA_Data(resiBitLengthArray, (unsigned char)resiBitsLength);
+				compressSingleFloatValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,4);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P1[index] = vce->data;
+			}
+		}
+	}
+
+	///////////////////////////	Process layer-1 --> layer-r1-1 ///////////////////////////
+
+	for (k = 1; k < r1; k++)
+	{
+		/* Process Row-0 data 0*/
+		index = k*r23;			
+		I = 0;
+		J = 0;
+		if(k%blockEdgeSize==0)
+			K++;
+		realPrecision = pwrErrBound[K*R23]; //J==0
+		updateReqLength = 0;
+		
+		pred1D = P1[0];
+		diff = spaceFillingValue[index] - pred1D;
+
+		itvNum = fabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+			P0[0] = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			if(updateReqLength==0)
+			{
+				computeReqLength_float(realPrecision, radExpo, &reqLength, &medianValue);
+				reqBytesLength = reqLength/8;
+				resiBitsLength = reqLength%8;
+				updateReqLength = 1;
+			}					
+			
+			type[index] = 0;
+
+			addDBA_Data(resiBitLengthArray, (unsigned char)resiBitsLength);
+			compressSingleFloatValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,4);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			P0[0] = vce->data;
+		}
+
+	    /* Process Row-0 data 1 --> data r3-1 */
+		for (j = 1; j < r3; j++)
+		{
+			index = k*r23+j;	
+
+			if(j%blockEdgeSize==0)
+			{
+				J++;
+				realPrecision = pwrErrBound[K*R23+J];
+				updateReqLength = 0;			
+			}					
+			pred2D = P0[j-1] + P1[j] - P1[j-1];
+			diff = spaceFillingValue[index] - pred2D;
+
+			itvNum = fabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P0[j] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+/*				if(type[index]==0)
+					printf("err:type[%d]=0, index4\n", index);					*/
+			}
+			else
+			{
+				if(updateReqLength==0)
+				{
+					computeReqLength_float(realPrecision, radExpo, &reqLength, &medianValue);
+					reqBytesLength = reqLength/8;
+					resiBitsLength = reqLength%8;
+					updateReqLength = 1;
+				}						
+				
+				type[index] = 0;
+
+				addDBA_Data(resiBitLengthArray, (unsigned char)resiBitsLength);
+				compressSingleFloatValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,4);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P0[j] = vce->data;
+			}
+		}
+
+	    /* Process Row-1 --> Row-r2-1 */
+		size_t index2D;
+		for (i = 1; i < r2; i++)
+		{
+			/* Process Row-i data 0 */
+			index = k*r23 + i*r3;
+			J = 0;
+			if(i%blockEdgeSize==0)
+				I++;
+			realPrecision = pwrErrBound[K*R23+I*R3+J]; //J==0
+			updateReqLength = 0;			
+			
+			index2D = i*r3;		
+			pred2D = P0[index2D-r3] + P1[index2D] - P1[index2D-r3];
+			diff = spaceFillingValue[index] - pred2D;
+
+			itvNum = fabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P0[index2D] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				if(updateReqLength==0)
+				{
+					computeReqLength_float(realPrecision, radExpo, &reqLength, &medianValue);
+					reqBytesLength = reqLength/8;
+					resiBitsLength = reqLength%8;
+					updateReqLength = 1;
+				}						
+				
+				type[index] = 0;
+
+				addDBA_Data(resiBitLengthArray, (unsigned char)resiBitsLength);
+				compressSingleFloatValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,4);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+				P0[index2D] = vce->data;
+			}
+
+			/* Process Row-i data 1 --> data r3-1 */
+			for (j = 1; j < r3; j++)
+			{
+				index = k*r23 + i*r3 + j;
+				if(j%blockEdgeSize==0)
+				{
+					J++;
+					realPrecision = pwrErrBound[K*R23+I*R3+J];
+					updateReqLength = 0;			
+				}							
+				index2D = i*r3 + j;
+				pred3D = P0[index2D-1] + P0[index2D-r3]+ P1[index2D] - P0[index2D-r3-1] - P1[index2D-r3] - P1[index2D-1] + P1[index2D-r3-1];
+				diff = spaceFillingValue[index] - pred3D;
+
+				itvNum = fabs(diff)/realPrecision + 1;
+
+				if (itvNum < exe_params->intvCapacity)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+					P0[index2D] = pred3D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					if(updateReqLength==0)
+					{
+						computeReqLength_float(realPrecision, radExpo, &reqLength, &medianValue);
+						reqBytesLength = reqLength/8;
+						resiBitsLength = reqLength%8;
+						updateReqLength = 1;
+					}							
+					
+					type[index] = 0;
+
+					addDBA_Data(resiBitLengthArray, (unsigned char)resiBitsLength);
+					compressSingleFloatValue(vce, spaceFillingValue[index], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+					updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+					memcpy(preDataBytes,vce->curBytes,4);
+					addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+					P0[index2D] = vce->data;
+				}
+			}
+		}
+
+		float *Pt;
+		Pt = P1;
+		P1 = P0;
+		P0 = Pt;
+	}
+	if(r23!=1)
+		free(P0);
+	free(P1);
+	int exactDataNum = exactLeadNumArray->size;
+
+	TightDataPointStorageF* tdps;
+
+	new_TightDataPointStorageF2(&tdps, dataLength, exactDataNum,
+			type, exactMidByteArray->array, exactMidByteArray->size,
+			exactLeadNumArray->array,
+			resiBitArray->array, resiBitArray->size,
+			resiBitLengthArray->array, resiBitLengthArray->size, 
+			realPrecision, medianValue, (char)reqLength, quantization_intervals, pwrErrBoundBytes, pwrErrBoundBytes_size, radExpo);
+
+//sdi:Debug
+/*	int sum =0;
+	for(i=0;i<dataLength;i++)
+		if(type[i]==0) sum++;
+	printf("opt_quantizations=%d, exactDataNum=%d, sum=%d\n",quantization_intervals, exactDataNum, sum);
+*/
+
+	convertTDPStoFlatBytes_float(tdps, newByteData, outSize);
+
+	//free memory
+	free_DBA(resiBitLengthArray);
+	free_DIA(exactLeadNumArray);
+	free_DIA(resiBitArray);
+	free(type);
+
+
+	free(pwrErrBound);
+
+	free(vce);
+	free(lce);
+	free_TightDataPointStorageF(tdps);
+	free(exactMidByteArray);
+}
+
+void createRangeGroups_float(float** posGroups, float** negGroups, int** posFlags, int** negFlags)
+{
+	// Allocates the four zero-filled tables (GROUP_COUNT entries each) used by the
+	// group-wise compressor: a float table and an int flag table for the positive
+	// and for the negative groups. The caller owns them and must free() all four.
+	// calloc replaces the former malloc+memset pair: that memset wrote through an
+	// unchecked malloc result, which is undefined behaviour when allocation fails.
+	// On failure the corresponding output pointer is now simply NULL.
+	*posGroups = (float*)calloc(GROUP_COUNT, sizeof(float));
+	*negGroups = (float*)calloc(GROUP_COUNT, sizeof(float));
+	*posFlags = (int*)calloc(GROUP_COUNT, sizeof(int));
+	*negFlags = (int*)calloc(GROUP_COUNT, sizeof(int));
+}
+
+void compressGroupIDArray_float(char* groupID, TightDataPointStorageF* tdps)
+{
+	// Turns the per-value group IDs into delta symbols, encodes them with
+	// encode_withTree(), and stores the result in tdps->pwrErrBoundBytes (+ _size).
+	const size_t count = tdps->dataSeriesLength;
+	const int deltaShift = 2*(GROUP_COUNT + 2);
+	int* symbols = (int*)malloc(count*sizeof(int));
+	unsigned char* encoded = NULL;
+	size_t encodedSize;
+	size_t pos = 1;
+
+	// The first symbol is the absolute ID, offset so that it is not negative.
+	symbols[0] = groupID[0]+GROUP_COUNT;
+
+	// Every later symbol is the difference to the previous ID plus a fixed shift.
+	while(pos < count)
+	{
+		symbols[pos] = (groupID[pos] - groupID[pos-1]) + deltaShift;
+		pos++;
+	}
+	HuffmanTree* tree = SZ_Reset();
+	encode_withTree(tree, symbols, count, &encoded, &encodedSize);
+	SZ_ReleaseHuffman(tree);
+	free(symbols);
+
+	tdps->pwrErrBoundBytes = encoded; //groupIDArray
+	tdps->pwrErrBoundBytes_size = encodedSize;
+}
+
+// Compresses a 1D float series with the group-based point-wise-relative ("pwrGroup") scheme.
+// Every value gets a group number from computeGroupNum_float() plus a sign; it is then
+// either stored "exactly" (type 0, via compressSingleFloatValue) or as a quantization
+// code relative to the entry held in that group's table.
+//   oriData, dataLength : input series; element 0 is accessed unconditionally, so
+//                         dataLength must be at least 1.
+//   errBoundMode, absErrBound, relBoundRatio, valueRangeSize : forwarded to
+//                         getRealPrecision_float() to obtain the base precision.
+//   pwrErrRatio         : determines nbBins and is passed to generateGroupErrBounds().
+//   medianValue_f       : starting median handed to computeReqLength_float().
+// Returns the TightDataPointStorageF built by new_TightDataPointStorageF(); its
+// pwrErrBoundBytes field receives the encoded group-ID stream.
+// Side effect: exe_params->intvRadius is overwritten.
+// NOTE(review): 'status' from getRealPrecision_float() is never checked - confirm intended.
+TightDataPointStorageF* SZ_compress_float_1D_MDQ_pwrGroup(float* oriData, size_t dataLength, int errBoundMode, 
+double absErrBound, double relBoundRatio, double pwrErrRatio, float valueRangeSize, float medianValue_f)
+{
+	size_t i;
+	float *posGroups, *negGroups, *groups;
+	float pos_01_group = 0, neg_01_group = 0; //[0,1] and [-1,0]
+	int *posFlags, *negFlags, *flags;
+	int pos_01_flag = 0, neg_01_flag = 0;
+	createRangeGroups_float(&posGroups, &negGroups, &posFlags, &negFlags);
+	size_t nbBins = (size_t)(1/pwrErrRatio);
+	if(nbBins%2==1)
+		nbBins++;
+	exe_params->intvRadius = nbBins;
+
+	int reqLength, status;
+	float medianValue = medianValue_f;
+	float realPrecision = (float)getRealPrecision_float(valueRangeSize, errBoundMode, absErrBound, relBoundRatio, &status);
+	if(realPrecision<0)
+		realPrecision = pwrErrRatio;
+	float realGroupPrecision; //precision (error) based on group ID
+	getPrecisionReqLength_float(realPrecision);
+	short radExpo = getExponent_float(valueRangeSize/2);
+	short lastGroupNum = 0, groupNum, grpNum = 0;
+	// Per-group error bounds; intvRadius is then reset from them, replacing the nbBins value assigned above.
+	double* groupErrorBounds = generateGroupErrBounds(errBoundMode, realPrecision, pwrErrRatio);
+	exe_params->intvRadius = generateGroupMaxIntervalCount(groupErrorBounds);
+	
+	computeReqLength_float(realPrecision, radExpo, &reqLength, &medianValue);
+	// type[]: 0 marks an exactly stored value, other entries are quantization codes; gp walks groupID[] one char per value.
+	int* type = (int*) malloc(dataLength*sizeof(int));
+	char *groupID = (char*) malloc(dataLength*sizeof(char));
+	char *gp = groupID;
+		
+	float* spaceFillingValue = oriData; 
+	
+	DynamicIntArray *exactLeadNumArray;
+	new_DIA(&exactLeadNumArray, DynArrayInitLen);
+	
+	DynamicByteArray *exactMidByteArray;
+	new_DBA(&exactMidByteArray, DynArrayInitLen);
+	
+	DynamicIntArray *resiBitArray;
+	new_DIA(&resiBitArray, DynArrayInitLen);
+	
+	unsigned char preDataBytes[4];
+	intToBytes_bigEndian(preDataBytes, 0);
+	
+	int reqBytesLength = reqLength/8;
+	int resiBitsLength = reqLength%8;
+
+	FloatValueCompressElement *vce = (FloatValueCompressElement*)malloc(sizeof(FloatValueCompressElement));
+	LossyCompressionElement *lce = (LossyCompressionElement*)malloc(sizeof(LossyCompressionElement));
+			
+	int state;
+	float curData, decValue;
+	float pred;
+	float predAbsErr;
+	double interval = 0;
+	
+	// First value: always stored exactly (type 0).
+	type[0] = 0;
+	compressSingleFloatValue(vce, spaceFillingValue[0], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+	updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+	memcpy(preDataBytes,vce->curBytes,4);
+	addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+	
+	curData = spaceFillingValue[0];
+	groupNum = computeGroupNum_float(vce->data);
+	// Select the tables for this value: the sign-specific group arrays, or the single-entry [0,1] / [-1,0] slots.
+	if(curData > 0 && groupNum >= 0)
+	{
+		groups = posGroups;
+		flags = posFlags;
+		grpNum = groupNum;
+	}
+	else if(curData < 0 && groupNum >= 0)
+	{
+		groups = negGroups;
+		flags = negFlags;
+		grpNum = groupNum;
+	}
+	else if(curData >= 0 && groupNum == -1)
+	{
+		groups = &pos_01_group;
+		flags = &pos_01_flag;
+		grpNum = 0;
+	}
+	else //curData < 0 && groupNum == -1
+	{
+		groups = &neg_01_group;
+		flags = &neg_01_flag;
+		grpNum = 0;
+	}
+
+	listAdd_float_group(groups, flags, groupNum, spaceFillingValue[0], vce->data, gp);
+	gp++;
+	
+	for(i=1;i<dataLength;i++)
+	{
+		curData = oriData[i];
+		// The group number is taken from the original value here; the first element used its decoded value.
+		
+		groupNum = computeGroupNum_float(curData);
+		
+		if(curData > 0 && groupNum >= 0)
+		{
+			groups = posGroups;
+			flags = posFlags;
+			grpNum = groupNum;
+		}
+		else if(curData < 0 && groupNum >= 0)
+		{
+			groups = negGroups;
+			flags = negFlags;
+			grpNum = groupNum;
+		}
+		else if(curData >= 0 && groupNum == -1)
+		{
+			groups = &pos_01_group;
+			flags = &pos_01_flag;
+			grpNum = 0;
+		}
+		else //curData < 0 && groupNum == -1
+		{
+			groups = &neg_01_group;
+			flags = &neg_01_flag;
+			grpNum = 0;
+		}
+		// Case 1: group number beyond the GROUP_COUNT tables - store the value exactly.
+		if(groupNum>=GROUP_COUNT)
+		{
+			type[i] = 0;
+			compressSingleFloatValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,4);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			listAdd_float_group(groups, flags, lastGroupNum, curData, vce->data, gp);	//set the group number to be last one in order to get the groupID array as smooth as possible.		
+		}
+		else if(flags[grpNum]==0) //the dec value may not be in the same group
+		{	
+			type[i] = 0;
+			compressSingleFloatValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+			updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+			memcpy(preDataBytes,vce->curBytes,4);
+			addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+			
+			// The decoded value is recorded unmodified (an older, disabled variant clamped it to the group's 2^groupNum boundary).
+			decValue = vce->data;	
+			listAdd_float_group(groups, flags, groupNum, curData, decValue, gp);
+			lastGroupNum = curData>0?groupNum + 2: -(groupNum+2);
+		}
+		else //if flags[groupNum]==1, the dec value must be in the same group
+		{
+			// Predict from this group's table entry and quantize the error in steps of 2*realGroupPrecision.
+			pred = groups[grpNum];
+			predAbsErr = fabs(curData - pred);
+			realGroupPrecision = groupErrorBounds[grpNum]; //compute real error bound
+			interval = realGroupPrecision*2;
+			state = (predAbsErr/realGroupPrecision+1)/2;
+			if(curData>=pred)
+			{
+				type[i] = exe_params->intvRadius+state;
+				decValue = pred + state*interval;
+			}
+			else //curData<pred
+			{
+				type[i] = exe_params->intvRadius-state;
+				decValue = pred - state*interval;
+			}
+			
+			// A reconstruction whose sign contradicts the input is replaced by 0.
+			if((decValue>0&&curData<0)||(decValue<0&&curData>=0))
+				decValue = 0;
+			
+			// If the reconstruction misses the group's error bound, fall back to exact storage.
+			if(fabs(curData-decValue)>realGroupPrecision)
+			{	
+				type[i] = 0;
+				compressSingleFloatValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,4);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+
+				decValue = vce->data;	
+			}
+			
+			listAdd_float_group(groups, flags, groupNum, curData, decValue, gp);			
+			lastGroupNum = curData>=0?groupNum + 2: -(groupNum+2);			
+		}
+		gp++;	
+
+	}
+	
+	int exactDataNum = exactLeadNumArray->size;
+	
+	TightDataPointStorageF* tdps;
+			
+	// Build the storage object; the encoded group IDs are attached by compressGroupIDArray_float() below.
+
+	new_TightDataPointStorageF(&tdps, dataLength, exactDataNum, 
+			type, exactMidByteArray->array, exactMidByteArray->size,  
+			exactLeadNumArray->array,  
+			resiBitArray->array, resiBitArray->size, 
+			resiBitsLength, 
+			realPrecision, medianValue, (char)reqLength, nbBins, NULL, 0, radExpo);	
+	
+	compressGroupIDArray_float(groupID, tdps);
+	
+	free(posGroups);
+	free(negGroups);
+	free(posFlags);
+	free(negFlags);
+	free(groupID);
+	free(groupErrorBounds);
+	
+	free_DIA(exactLeadNumArray);
+	free_DIA(resiBitArray);
+	free(type);	
+	free(vce);
+	free(lce);	
+	free(exactMidByteArray); //exactMidByteArray->array has been released in free_TightDataPointStorageF(tdps);	
+	
+	return tdps;
+}
+
+void SZ_compress_args_float_NoCkRngeNoGzip_1D_pwrgroup(unsigned char** newByteData, float *oriData,
+size_t dataLength, double absErrBound, double relBoundRatio, double pwrErrRatio, float valueRangeSize, float medianValue_f, size_t *outSize)
+{
+        // Runs the pwrGroup compressor, flattens the result into *newByteData, and calls
+        // the store-original path when the flat stream is larger than the raw float data.
+        TightDataPointStorageF* packed = SZ_compress_float_1D_MDQ_pwrGroup(
+                oriData, dataLength, confparams_cpr->errorBoundMode,
+                absErrBound, relBoundRatio, pwrErrRatio, valueRangeSize, medianValue_f);
+        convertTDPStoFlatBytes_float(packed, newByteData, outSize);
+
+        if(dataLength*sizeof(float) < *outSize)
+                SZ_compress_args_float_StoreOriData(oriData, dataLength+2, packed, newByteData, outSize);
+        free_TightDataPointStorageF(packed);
+}
diff --git a/thirdparty/SZ/sz/src/sz_float_ts.c b/thirdparty/SZ/sz/src/sz_float_ts.c
new file mode 100644
index 0000000000000000000000000000000000000000..2c485c84da67ae92c9a475a00ba7f65f94ca9006
--- /dev/null
+++ b/thirdparty/SZ/sz/src/sz_float_ts.c
@@ -0,0 +1,206 @@
+/**
+ *  @file sz_float_ts.c
+ *  @author Sheng Di and Dingwen Tao
+ *  @date Aug, 2016
+ *  @brief Time-step based compression functions for float data
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <math.h>
+#include "sz.h"
+#include "CompressElement.h"
+#include "DynamicByteArray.h"
+#include "DynamicIntArray.h"
+#include "TightDataPointStorageF.h"
+#include "zlib.h"
+#include "rw.h"
+#include "sz_float_ts.h"
+
+unsigned int optimize_intervals_float_1D_ts(float *oriData, size_t dataLength, float* preData, double realPrecision)
+{
+	// Estimates a quantization-interval count from sampled prediction errors, where
+	// preData[k] predicts oriData[k]; the result is rounded by roundUpToPowerOf2(), min 32.
+	const size_t histBytes = confparams_cpr->maxRangeRadius*sizeof(size_t);
+	size_t *histogram = (size_t*)malloc(histBytes);
+	memset(histogram, 0, histBytes);
+	size_t sampleTotal = dataLength/confparams_cpr->sampleDistance;
+	// Histogram the error radius of every sampled element, starting at index 2.
+	size_t pos;
+	for(pos=2; pos<dataLength; pos++)
+	{
+		if(pos%confparams_cpr->sampleDistance!=0)
+			continue;
+		float predicted = preData[pos];
+		float absErr = fabs(predicted - oriData[pos]);
+		size_t bucket = (unsigned long)((absErr/realPrecision+1)/2);
+		if(bucket>=confparams_cpr->maxRangeRadius)
+			bucket = confparams_cpr->maxRangeRadius - 1;
+		histogram[bucket]++;
+	}
+
+	// Find the first radius whose cumulative count exceeds the predThreshold share.
+	size_t wanted = sampleTotal*confparams_cpr->predThreshold;
+	size_t covered = 0;
+	size_t radius = 0;
+	while(radius<confparams_cpr->maxRangeRadius)
+	{
+		covered += histogram[radius];
+		if(covered>wanted)
+			break;
+		radius++;
+	}
+	if(radius>=confparams_cpr->maxRangeRadius)
+		radius = confparams_cpr->maxRangeRadius-1;
+	unsigned int doubled = 2*(radius+1);
+	unsigned int rounded = roundUpToPowerOf2(doubled);
+	free(histogram);
+	return rounded<32 ? 32 : rounded;
+}
+
+// Compresses one time step of a 1D float series, predicting each value from the
+// history buffer multisteps->hist_data (same index) and quantizing the difference.
+//   oriData, dataLength : current step; indices 0 and 1 are accessed unconditionally,
+//                         so dataLength must be at least 2.
+//   multisteps          : hist_data is read as a float array of dataLength entries
+//                         and is overwritten with this step's reconstructed values.
+//   realPrecision       : error bound used for quantization and for exact storage.
+//   valueRangeSize, medianValue_f : feed getExponent_float()/computeReqLength_float().
+// Returns the TightDataPointStorageF created by new_TightDataPointStorageF().
+// NOTE(review): no allocation result is checked in this function.
+TightDataPointStorageF* SZ_compress_float_1D_MDQ_ts(float *oriData, size_t dataLength, sz_multisteps* multisteps,
+double realPrecision, float valueRangeSize, float medianValue_f)
+{
+	float* preStepData = (float*)(multisteps->hist_data);
+	// Reconstructed values of this step; they are copied into the history buffer at the end.
+	float* decData = (float*)malloc(sizeof(float)*dataLength);
+	memset(decData, 0, sizeof(float)*dataLength);
+	// Choose the quantization interval count: sampled estimate or the configured capacity.
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+		quantization_intervals = optimize_intervals_float_1D_ts(oriData, dataLength, preStepData, realPrecision);
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	updateQuantizationInfo(quantization_intervals);	
+
+	size_t i;
+	int reqLength;
+	float medianValue = medianValue_f;
+	short radExpo = getExponent_float(valueRangeSize/2);
+	
+	computeReqLength_float(realPrecision, radExpo, &reqLength, &medianValue);	
+	int* type = (int*) malloc(dataLength*sizeof(int));
+	float* spaceFillingValue = oriData; // alias of the input array
+	DynamicIntArray *exactLeadNumArray;
+	new_DIA(&exactLeadNumArray, DynArrayInitLen);
+	
+	DynamicByteArray *exactMidByteArray;
+	new_DBA(&exactMidByteArray, DynArrayInitLen);
+	
+	DynamicIntArray *resiBitArray;
+	new_DIA(&resiBitArray, DynArrayInitLen);
+	
+	unsigned char preDataBytes[4];
+	intToBytes_bigEndian(preDataBytes, 0);
+	
+	int reqBytesLength = reqLength/8;
+	int resiBitsLength = reqLength%8;
+	FloatValueCompressElement *vce = (FloatValueCompressElement*)malloc(sizeof(FloatValueCompressElement));
+	LossyCompressionElement *lce = (LossyCompressionElement*)malloc(sizeof(LossyCompressionElement));
+				
+	// First value: always stored exactly (type 0).
+	type[0] = 0;
+	compressSingleFloatValue(vce, spaceFillingValue[0], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+	updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+	memcpy(preDataBytes,vce->curBytes,4);
+	addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+	decData[0] = vce->data;
+		
+	// Second value: also stored exactly.
+	type[1] = 0;
+	compressSingleFloatValue(vce, spaceFillingValue[1], realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+	updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+	memcpy(preDataBytes,vce->curBytes,4);
+	addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+	decData[1] = vce->data;	
+	
+	int state = 0;
+	double checkRadius = 0;
+	float curData = 0;
+	float pred = 0;
+	float predAbsErr = 0;
+	checkRadius = (exe_params->intvCapacity-1)*realPrecision;
+	double interval = 2*realPrecision;
+	// From index 2 on, the prediction is the history buffer's value at the same index.
+	for(i=2;i<dataLength;i++)
+	{
+		curData = spaceFillingValue[i];
+		pred = preStepData[i];
+		predAbsErr = fabs(curData - pred);	
+		if(predAbsErr<=checkRadius)
+		{
+			state = (predAbsErr/realPrecision+1)/2;
+			if(curData>=pred)
+			{
+				type[i] = exe_params->intvRadius+state;
+				pred = pred + state*interval;
+			}
+			else //curData<pred
+			{
+				type[i] = exe_params->intvRadius-state;
+				pred = pred - state*interval;
+			}
+			// Double-check the prediction error in case of machine-epsilon impact; fall back to exact storage if it exceeds the bound.
+			if(fabs(curData-pred)>realPrecision)
+			{	
+				type[i] = 0;				
+				compressSingleFloatValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+				updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+				memcpy(preDataBytes,vce->curBytes,4);
+				addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);		
+				decData[i] = vce->data;
+			}
+			else
+			{
+				decData[i] = pred;
+			}
+			continue;
+		}
+		
+		// Unpredictable value (error beyond checkRadius): store exactly.
+		type[i] = 0;		
+		compressSingleFloatValue(vce, curData, realPrecision, medianValue, reqLength, reqBytesLength, resiBitsLength);
+		updateLossyCompElement_Float(vce->curBytes, preDataBytes, reqBytesLength, resiBitsLength, lce);
+		memcpy(preDataBytes,vce->curBytes,4);
+		addExactData(exactMidByteArray, exactLeadNumArray, resiBitArray, lce);
+		decData[i] = vce->data;
+	}//end of for
+	size_t exactDataNum = exactLeadNumArray->size;
+	TightDataPointStorageF* tdps;
+	new_TightDataPointStorageF(&tdps, dataLength, exactDataNum, 
+			type, exactMidByteArray->array, exactMidByteArray->size,  
+			exactLeadNumArray->array,  
+			resiBitArray->array, resiBitArray->size, 
+			resiBitsLength,
+			realPrecision, medianValue, (char)reqLength, quantization_intervals, NULL, 0, 0);
+
+	//free memory
+	free_DIA(exactLeadNumArray);
+	free_DIA(resiBitArray);
+	free(type);	
+	free(vce);
+	free(lce);	
+	free(exactMidByteArray); //exactMidByteArray->array has been released in free_TightDataPointStorageF(tdps);
+	// Overwrite the history buffer with this step's reconstructed values.
+	memcpy(preStepData, decData, dataLength*sizeof(float)); //update the data
+	free(decData);
+	
+	return tdps;
+}
+
+
diff --git a/thirdparty/SZ/sz/src/sz_int16.c b/thirdparty/SZ/sz/src/sz_int16.c
new file mode 100644
index 0000000000000000000000000000000000000000..fc91dd1a6f64af0d19c14540be9ecf57bf4bb443
--- /dev/null
+++ b/thirdparty/SZ/sz/src/sz_int16.c
@@ -0,0 +1,1383 @@
+/**
+ *  @file sz_int16.c
+ *  @author Sheng Di
+ *  @date Aug, 2017
+ *  @brief sz_int16, Compression and Decompression functions
+ *  (C) 2017 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <math.h>
+#include "sz.h"
+#include "CompressElement.h"
+#include "DynamicByteArray.h"
+#include "DynamicIntArray.h"
+#include "zlib.h"
+#include "rw.h"
+#include "TightDataPointStorageI.h"
+#include "sz_int16.h"
+
+unsigned int optimize_intervals_int16_1D(int16_t *oriData, size_t dataLength, double realPrecision)
+{
+	// Estimates a quantization-interval count for 1D int16 data by sampling the
+	// error of the previous-neighbour prediction (value k-1 predicts value k).
+	// The result is rounded by roundUpToPowerOf2() and is never below 32.
+	const size_t histBytes = confparams_cpr->maxRangeRadius*sizeof(size_t);
+	size_t *histogram = (size_t*)malloc(histBytes);
+	memset(histogram, 0, histBytes);
+	size_t sampleTotal = dataLength/confparams_cpr->sampleDistance;
+	// Histogram the error radius of every sampled element, starting at index 2.
+	size_t pos;
+	for(pos=2; pos<dataLength; pos++)
+	{
+		if(pos%confparams_cpr->sampleDistance!=0)
+			continue;
+		int64_t predicted = oriData[pos-1];
+		int64_t absErr = llabs(predicted - oriData[pos]);
+		size_t bucket = (uint64_t)((absErr/realPrecision+1)/2);
+		if(bucket>=confparams_cpr->maxRangeRadius)
+			bucket = confparams_cpr->maxRangeRadius - 1;
+		histogram[bucket]++;
+	}
+
+	// Find the first radius whose cumulative count exceeds the predThreshold share.
+	size_t wanted = sampleTotal*confparams_cpr->predThreshold;
+	size_t covered = 0;
+	size_t radius = 0;
+	while(radius<confparams_cpr->maxRangeRadius)
+	{
+		covered += histogram[radius];
+		if(covered>wanted)
+			break;
+		radius++;
+	}
+	if(radius>=confparams_cpr->maxRangeRadius)
+		radius = confparams_cpr->maxRangeRadius-1;
+
+	unsigned int doubled = 2*(radius+1);
+	unsigned int rounded = roundUpToPowerOf2(doubled);
+	free(histogram);
+	return rounded<32 ? 32 : rounded;
+}
+
+unsigned int optimize_intervals_int16_2D(int16_t *oriData, size_t r1, size_t r2, double realPrecision)
+{
+	// Estimates a quantization-interval count for an r1 x r2 int16 array by sampling
+	// the error of the "left + upper - upper-left" neighbour prediction.
+	// The result is rounded by roundUpToPowerOf2() and is never below 32.
+	const size_t histBytes = confparams_cpr->maxRangeRadius*sizeof(size_t);
+	size_t *histogram = (size_t*)malloc(histBytes);
+	memset(histogram, 0, histBytes);
+	size_t sampleTotal = (r1-1)*(r2-1)/confparams_cpr->sampleDistance;
+	// Histogram the error radius of every sampled element with row >= 1 and col >= 1.
+	size_t row, col;
+	for(row=1; row<r1; row++)
+	{
+		for(col=1; col<r2; col++)
+		{
+			if((row+col)%confparams_cpr->sampleDistance!=0)
+				continue;
+			size_t at = row*r2+col;
+			int64_t predicted = oriData[at-1] + oriData[at-r2] - oriData[at-r2-1];
+			int64_t absErr = llabs(predicted - oriData[at]);
+			size_t bucket = (uint64_t)((absErr/realPrecision+1)/2);
+			if(bucket>=confparams_cpr->maxRangeRadius)
+				bucket = confparams_cpr->maxRangeRadius - 1;
+			histogram[bucket]++;
+		}
+	}
+
+	// Find the first radius whose cumulative count exceeds the predThreshold share.
+	size_t wanted = sampleTotal*confparams_cpr->predThreshold;
+	size_t covered = 0;
+	size_t radius = 0;
+	while(radius<confparams_cpr->maxRangeRadius)
+	{
+		covered += histogram[radius];
+		if(covered>wanted)
+			break;
+		radius++;
+	}
+	if(radius>=confparams_cpr->maxRangeRadius)
+		radius = confparams_cpr->maxRangeRadius-1;
+	unsigned int doubled = 2*(radius+1);
+	unsigned int rounded = roundUpToPowerOf2(doubled);
+	free(histogram);
+	return rounded<32 ? 32 : rounded;
+}
+
+unsigned int optimize_intervals_int16_3D(int16_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision)
+{
+	// Estimates a quantization-interval count for an r1 x r2 x r3 int16 array by
+	// sampling the error of a seven-neighbour prediction (strides 1, r3 and r2*r3,
+	// combined with alternating signs).
+	// The result is rounded by roundUpToPowerOf2() and is never below 32.
+	const size_t r23 = r2*r3;
+	const size_t histBytes = confparams_cpr->maxRangeRadius*sizeof(size_t);
+	size_t *histogram = (size_t*)malloc(histBytes);
+	memset(histogram, 0, histBytes);
+	size_t sampleTotal = (r1-1)*(r2-1)*(r3-1)/confparams_cpr->sampleDistance;
+
+	// Histogram the error radius of every sampled element whose three indices are >= 1.
+	size_t plane, row, col;
+	for(plane=1; plane<r1; plane++)
+	{
+		for(row=1; row<r2; row++)
+		{
+			for(col=1; col<r3; col++)
+			{
+				if((plane+row+col)%confparams_cpr->sampleDistance!=0)
+					continue;
+				size_t at = plane*r23+row*r3+col;
+				int64_t predicted = oriData[at-1] + oriData[at-r3] + oriData[at-r23]
+				- oriData[at-1-r23] - oriData[at-r3-1] - oriData[at-r3-r23] + oriData[at-r3-r23-1];
+				int64_t absErr = llabs(predicted - oriData[at]);
+				size_t bucket = (absErr/realPrecision+1)/2;
+				if(bucket>=confparams_cpr->maxRangeRadius)
+					bucket = confparams_cpr->maxRangeRadius - 1;
+				histogram[bucket]++;
+			}
+		}
+	}
+
+	// Find the first radius whose cumulative count exceeds the predThreshold share.
+	size_t wanted = sampleTotal*confparams_cpr->predThreshold;
+	size_t covered = 0;
+	size_t radius = 0;
+	while(radius<confparams_cpr->maxRangeRadius)
+	{
+		covered += histogram[radius];
+		if(covered>wanted)
+			break;
+		radius++;
+	}
+	if(radius>=confparams_cpr->maxRangeRadius)
+		radius = confparams_cpr->maxRangeRadius-1;
+
+	unsigned int doubled = 2*(radius+1);
+	unsigned int rounded = roundUpToPowerOf2(doubled);
+	free(histogram);
+	return rounded<32 ? 32 : rounded;
+}
+
+
+// Estimates a quantization-interval count for 4D int16 data from sampled prediction
+// errors; the result is rounded by roundUpToPowerOf2() and is never below 32.
+// Fix: the second stencil neighbour was read at index-r3, although the stride of
+// that dimension is r4 (as used by every other term); it now reads index-r4.
+unsigned int optimize_intervals_int16_4D(int16_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision)
+{
+	size_t i,j,k,l, index, radiusIndex;
+	size_t r234=r2*r3*r4;
+	size_t r34=r3*r4;
+	int64_t pred_value = 0, pred_err;
+	size_t *intervals = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t));
+	memset(intervals, 0, confparams_cpr->maxRangeRadius*sizeof(size_t));
+	size_t totalSampleSize = (r1-1)*(r2-1)*(r3-1)*(r4-1)/confparams_cpr->sampleDistance;
+	for(i=1;i<r1;i++)
+	{
+		for(j=1;j<r2;j++)
+		{
+			for(k=1;k<r3;k++)
+			{
+				for (l=1;l<r4;l++)
+				{
+					if((i+j+k+l)%confparams_cpr->sampleDistance==0)
+					{
+						index = i*r234+j*r34+k*r4+l;
+						// Seven-neighbour stencil over strides 1, r4 and r34.
+						pred_value = oriData[index-1] + oriData[index-r4] + oriData[index-r34]
+								- oriData[index-1-r34] - oriData[index-r4-1] - oriData[index-r4-r34] + oriData[index-r4-r34-1];
+						pred_err = llabs(pred_value - oriData[index]);
+						radiusIndex = (uint64_t)((pred_err/realPrecision+1)/2);
+						if(radiusIndex>=confparams_cpr->maxRangeRadius)
+							radiusIndex = confparams_cpr->maxRangeRadius - 1;
+						intervals[radiusIndex]++;
+					}
+				}
+			}
+		}
+	}
+	// Find the first radius whose cumulative count exceeds the predThreshold share of samples.
+	size_t targetCount = totalSampleSize*confparams_cpr->predThreshold;
+	size_t sum = 0;
+	for(i=0;i<confparams_cpr->maxRangeRadius;i++)
+	{
+		sum += intervals[i];
+		if(sum>targetCount)
+			break;
+	}
+	if(i>=confparams_cpr->maxRangeRadius)
+		i = confparams_cpr->maxRangeRadius-1;
+	unsigned int accIntervals = 2*(i+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+	if(powerOf2<32)
+		powerOf2 = 32;
+	free(intervals);
+	return powerOf2;
+}
+
+/**
+ * Quantization-based compression of a 1D int16_t series.
+ *
+ * The first two samples are always stored exactly. Each later sample is predicted
+ * as history[0] (presumably the value most recently pushed by listAdd_int - confirm).
+ * When the absolute prediction error is within checkRadius the sample is recorded as
+ * a quantization code in type[]; otherwise type[] is 0 and the sample is stored exactly.
+ *
+ * NOTE(review): samples 0 and 1 are accessed unconditionally, so dataLength is
+ * assumed to be at least 2 - confirm against callers.
+ *
+ * @param oriData        input samples
+ * @param dataLength     number of samples in oriData
+ * @param realPrecision  error bound; one quantization bin spans 2*realPrecision
+ * @param valueRangeSize value range, used to pick the byte width of exact values
+ * @param minValue       forwarded to compressInt16Value and the storage object
+ * @return storage object built by new_TightDataPointStorageI
+ */
+TightDataPointStorageI* SZ_compress_int16_1D_MDQ(int16_t *oriData, size_t dataLength, double realPrecision, int64_t valueRangeSize, int64_t minValue)
+{
+	unsigned char exactBuf[8] = {0,0,0,0,0,0,0,0};
+	int byteSize = computeByteSizePerIntValue(valueRangeSize);
+
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode!=1)
+		quantization_intervals = exe_params->intvCapacity;
+	else
+		quantization_intervals = optimize_intervals_int16_1D(oriData, dataLength, realPrecision);
+	updateQuantizationInfo(quantization_intervals);
+
+	int* type = (int*) malloc(dataLength*sizeof(int));
+
+	DynamicByteArray *exactDataByteArray;
+	new_DBA(&exactDataByteArray, DynArrayInitLen);
+
+	int64_t history[3] = {0,0,0};
+
+	/* the two leading samples are always kept exactly */
+	size_t n;
+	for(n=0;n<2;n++)
+	{
+		type[n] = 0;
+		compressInt16Value(oriData[n], minValue, byteSize, exactBuf);
+		memcpyDBA_Data(exactDataByteArray, exactBuf, byteSize);
+		listAdd_int(history, oriData[n]);
+	}
+
+	double checkRadius = (exe_params->intvCapacity-1)*realPrecision;
+	double binWidth = 2*realPrecision;
+
+	for(n=2;n<dataLength;n++)
+	{
+		int64_t actual = oriData[n];
+		int64_t predicted = history[0];
+		int64_t absErr = llabs(actual - predicted);
+
+		if(absErr<=checkRadius)
+		{
+			/* predictable: emit a code and track the value the decoder will rebuild */
+			int state = (absErr/realPrecision+1)/2;
+			if(actual<predicted)
+			{
+				type[n] = exe_params->intvRadius-state;
+				predicted = predicted - state*binWidth;
+			}
+			else
+			{
+				type[n] = exe_params->intvRadius+state;
+				predicted = predicted + state*binWidth;
+			}
+			if(predicted>SZ_INT16_MAX) predicted = SZ_INT16_MAX;
+			if(predicted<SZ_INT16_MIN) predicted = SZ_INT16_MIN;
+			listAdd_int(history, predicted);
+		}
+		else
+		{
+			/* unpredictable: keep the sample exactly */
+			type[n] = 0;
+			compressInt16Value(actual, minValue, byteSize, exactBuf);
+			memcpyDBA_Data(exactDataByteArray, exactBuf, byteSize);
+			listAdd_int(history, actual);
+		}
+	}
+
+	size_t exactDataNum = exactDataByteArray->size / byteSize;
+
+	TightDataPointStorageI* tdps;
+
+	new_TightDataPointStorageI(&tdps, dataLength, exactDataNum, byteSize,
+			type, exactDataByteArray->array, exactDataByteArray->size,
+			realPrecision, minValue, quantization_intervals, SZ_INT16);
+
+	/* only the wrapper struct is freed here; exactDataByteArray->array was handed to tdps */
+	free(type);
+	free(exactDataByteArray);
+
+	return tdps;
+}
+
+/**
+ * Serialize the input without quantization ("lossless" fallback).
+ *
+ * Layout written to the newly malloc'ed *newByteData:
+ *   3 version bytes | 1 flag byte | MetaDataByteLength parameter bytes |
+ *   SZ_SIZE_TYPE length bytes | dataLength int16_t values in big-endian order.
+ *
+ * @param oriData     input samples
+ * @param dataLength  number of samples to store
+ * @param tdps        storage object; only its isLossless flag is set here
+ * @param newByteData receives the allocated output buffer
+ * @param outSize     receives the output size in bytes
+ */
+void SZ_compress_args_int16_StoreOriData(int16_t* oriData, size_t dataLength, TightDataPointStorageI* tdps,
+unsigned char** newByteData, size_t *outSize)
+{
+	int intSize = sizeof(int16_t);
+	size_t pos = 0, n;
+	unsigned char dsLengthBytes[8];
+
+	tdps->isLossless = 1;
+
+	size_t totalByteLength = 3 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + 1 + intSize*dataLength;
+	unsigned char* out = (unsigned char*)malloc(totalByteLength);
+	*newByteData = out;
+
+	/* 3 version bytes */
+	for (n = 0; n < 3; n++)
+		out[pos++] = versionNumber[n];
+
+	/* flag byte: 00010000, plus 01000000 when SZ_SIZE_TYPE is 8 */
+	out[pos++] = (exe_params->SZ_SIZE_TYPE==4) ? 16 : 80;
+
+	convertSZParamsToBytes(confparams_cpr, &out[pos]);
+	pos = pos + MetaDataByteLength;
+
+	/* element count, SZ_SIZE_TYPE (4 or 8) bytes */
+	sizeToBytes(dsLengthBytes,dataLength);
+	for (n = 0; n < exe_params->SZ_SIZE_TYPE; n++)
+		out[pos++] = dsLengthBytes[n];
+
+	/* payload is big-endian: plain copy on big-endian hosts, per-value conversion otherwise */
+	unsigned char* payload = out+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE;
+	if(sysEndianType==BIG_ENDIAN_SYSTEM)
+	{
+		memcpy(payload, oriData, dataLength*intSize);
+	}
+	else
+	{
+		for(n=0;n<dataLength;n++,payload+=intSize)
+			int16ToBytes_bigEndian(payload, oriData[n]);
+	}
+
+	*outSize = totalByteLength;
+}
+
+/**
+ * Compress a 1D int16_t series without range check and without gzip.
+ *
+ * Runs the quantization compressor and flattens the result; if the flattened
+ * stream is larger than the raw data, the raw-data representation is emitted instead.
+ *
+ * The fallback now stores exactly dataLength samples. It previously passed
+ * dataLength+2, which read two elements past the end of oriData and wrote a
+ * wrong element count into the header (the 2D/3D/4D variants pass dataLength).
+ *
+ * NOTE(review): if convertTDPStoFlatBytes_int allocates *newByteData, that buffer is
+ * overwritten without being freed when the fallback runs - confirm and free it there.
+ *
+ * @param newByteData   receives the output buffer
+ * @param oriData       input samples
+ * @param dataLength    number of samples
+ * @param realPrecision error bound used for quantization
+ * @param outSize       receives the output size in bytes
+ * @param valueRangeSize value range of the input
+ * @param minValue      minimum input value
+ */
+void SZ_compress_args_int16_NoCkRngeNoGzip_1D(unsigned char** newByteData, int16_t *oriData, 
+size_t dataLength, double realPrecision, size_t *outSize, int64_t valueRangeSize, int16_t minValue)
+{
+	TightDataPointStorageI* tdps = SZ_compress_int16_1D_MDQ(oriData, dataLength, realPrecision, valueRangeSize, minValue);
+
+	convertTDPStoFlatBytes_int(tdps, newByteData, outSize);
+
+	/* quantization did not pay off: store the original samples instead */
+	if(*outSize > dataLength*sizeof(int16_t))
+		SZ_compress_args_int16_StoreOriData(oriData, dataLength, tdps, newByteData, outSize);
+
+	free_TightDataPointStorageI(tdps);
+}
+
+/*
+ * Quantize one sample against its prediction (helper of SZ_compress_int16_2D_MDQ).
+ *
+ * If the sample falls inside the quantization range, *typeCode receives the
+ * interval code and the return value is the reconstructed sample clamped to
+ * [SZ_INT16_MIN, SZ_INT16_MAX]. Otherwise *typeCode is set to 0, the sample is
+ * appended to exactDataByteArray via the caller's scratch buffer `bytes`, and the
+ * sample itself is returned.
+ */
+static int16_t quantizePoint_int16_2D(int64_t pred, int16_t curData, double realPrecision, int64_t minValue,
+		int byteSize, unsigned char *bytes, int *typeCode, DynamicByteArray *exactDataByteArray)
+{
+	int64_t tmp;
+	int diff = curData - pred;
+	double itvNum = llabs(diff)/realPrecision + 1;
+
+	if (itvNum < exe_params->intvCapacity)
+	{
+		if (diff < 0) itvNum = -itvNum;
+		*typeCode = (int) (itvNum/2) + exe_params->intvRadius;
+		tmp = pred + 2 * (*typeCode - exe_params->intvRadius) * realPrecision;
+		if(tmp >= SZ_INT16_MIN&&tmp<SZ_INT16_MAX)
+			return (int16_t)tmp;
+		else if(tmp < SZ_INT16_MIN)
+			return SZ_INT16_MIN;
+		else
+			return SZ_INT16_MAX;
+	}
+
+	/* unpredictable: keep the sample exactly */
+	*typeCode = 0;
+	compressInt16Value(curData, minValue, byteSize, bytes);
+	memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+	return curData;
+}
+
+/**
+ * Quantization-based compression of a 2D int16_t array (r1 rows of r2 samples).
+ *
+ * Two row buffers hold reconstructed values: P1 is the previous row, P0 the row
+ * being produced; they are swapped after every row. Row 0 uses 1D predictors
+ * (previous value, then 2*prev - prev2); later rows predict their first sample
+ * from P1[0] and the rest from P0[j-1] + P1[j] - P1[j-1].
+ *
+ * Both row buffers are now always freed; previously P0 was leaked when r2 == 1.
+ *
+ * NOTE(review): samples 0 and 1 and P1[1] are accessed unconditionally, so
+ * r2 >= 2 is assumed - confirm against callers.
+ * NOTE(review): exactDataNum is the byte count here, whereas the 1D variant
+ * divides by byteSize - confirm which one the storage format expects.
+ *
+ * @param oriData        input samples in row-major order
+ * @param r1             number of rows (high dimension)
+ * @param r2             samples per row (low dimension)
+ * @param realPrecision  error bound; one quantization bin spans 2*realPrecision
+ * @param valueRangeSize value range, used to pick the byte width of exact values
+ * @param minValue       forwarded to compressInt16Value and the storage object
+ * @return storage object built by new_TightDataPointStorageI
+ */
+TightDataPointStorageI* SZ_compress_int16_2D_MDQ(int16_t *oriData, size_t r1, size_t r2, double realPrecision, int64_t valueRangeSize, int64_t minValue)
+{
+	unsigned char bytes[8] = {0,0,0,0,0,0,0,0};
+	int byteSize = computeByteSizePerIntValue(valueRangeSize);
+
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_int16_2D(oriData, r1, r2, realPrecision);
+		updateQuantizationInfo(quantization_intervals);
+	}
+	else
+		quantization_intervals = exe_params->intvCapacity;
+
+	size_t i, j, index;
+	int64_t pred;
+	int16_t *P0, *P1, *Pt;
+
+	size_t dataLength = r1*r2;
+
+	P0 = (int16_t*)malloc(r2*sizeof(int16_t));
+	memset(P0, 0, r2*sizeof(int16_t));
+	P1 = (int16_t*)malloc(r2*sizeof(int16_t));
+	memset(P1, 0, r2*sizeof(int16_t));
+
+	int* type = (int*) malloc(dataLength*sizeof(int));
+
+	DynamicByteArray *exactDataByteArray;
+	new_DBA(&exactDataByteArray, DynArrayInitLen);
+
+	/* Row 0, sample 0: always stored exactly */
+	type[0] = 0;
+	P1[0] = oriData[0];
+	compressInt16Value(P1[0], minValue, byteSize, bytes);
+	memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+
+	/* Row 0, sample 1: predicted by the previous value */
+	pred = P1[0];
+	P1[1] = quantizePoint_int16_2D(pred, oriData[1], realPrecision, minValue,
+			byteSize, bytes, &type[1], exactDataByteArray);
+
+	/* Row 0, samples 2 .. r2-1: linear extrapolation */
+	for (j = 2; j < r2; j++)
+	{
+		pred = 2*P1[j-1] - P1[j-2];
+		P1[j] = quantizePoint_int16_2D(pred, oriData[j], realPrecision, minValue,
+				byteSize, bytes, &type[j], exactDataByteArray);
+	}
+
+	/* Rows 1 .. r1-1 */
+	for (i = 1; i < r1; i++)
+	{
+		/* first sample of the row */
+		index = i*r2;
+		pred = P1[0];
+		P0[0] = quantizePoint_int16_2D(pred, oriData[index], realPrecision, minValue,
+				byteSize, bytes, &type[index], exactDataByteArray);
+
+		/* remaining samples: 2D predictor */
+		for (j = 1; j < r2; j++)
+		{
+			index = i*r2+j;
+			pred = P0[j-1] + P1[j] - P1[j-1];
+			P0[j] = quantizePoint_int16_2D(pred, oriData[index], realPrecision, minValue,
+					byteSize, bytes, &type[index], exactDataByteArray);
+		}
+
+		/* the row just produced becomes the "previous row" */
+		Pt = P1;
+		P1 = P0;
+		P0 = Pt;
+	}
+
+	free(P0);
+	free(P1);
+
+	size_t exactDataNum = exactDataByteArray->size;
+
+	TightDataPointStorageI* tdps;
+
+	new_TightDataPointStorageI(&tdps, dataLength, exactDataNum, byteSize,
+			type, exactDataByteArray->array, exactDataByteArray->size,
+			realPrecision, minValue, quantization_intervals, SZ_INT16);
+
+	/* only the wrapper struct is freed here; exactDataByteArray->array was handed to tdps */
+	free(type);
+	free(exactDataByteArray);
+
+	return tdps;
+}
+
+/**
+ * Compress a 2D int16_t array without range check and without gzip.
+ *
+ * Note: @r1 is the high dimension, @r2 is the low dimension.
+ *
+ * The quantized stream is replaced by the raw-data representation whenever it
+ * is larger than the raw data itself.
+ *
+ * NOTE(review): if convertTDPStoFlatBytes_int allocates *newByteData, that buffer is
+ * overwritten without being freed when the fallback runs - confirm.
+ */
+void SZ_compress_args_int16_NoCkRngeNoGzip_2D(unsigned char** newByteData, int16_t *oriData, size_t r1, size_t r2, double realPrecision, size_t *outSize, 
+int64_t valueRangeSize, int16_t minValue)
+{
+	size_t dataLength = r1*r2;
+	size_t rawBytes = dataLength*sizeof(int16_t);
+
+	TightDataPointStorageI* tdps = SZ_compress_int16_2D_MDQ(oriData, r1, r2, realPrecision, valueRangeSize, minValue);
+	convertTDPStoFlatBytes_int(tdps, newByteData, outSize);
+
+	/* quantization did not pay off: store the original samples instead */
+	if(rawBytes < *outSize)
+		SZ_compress_args_int16_StoreOriData(oriData, dataLength, tdps, newByteData, outSize);
+
+	free_TightDataPointStorageI(tdps);
+}
+
+/*
+ * Quantize one sample against its prediction (helper of SZ_compress_int16_3D_MDQ).
+ *
+ * If the sample falls inside the quantization range, *typeCode receives the
+ * interval code and the return value is the reconstructed sample clamped to
+ * [SZ_INT16_MIN, SZ_INT16_MAX]. Otherwise *typeCode is set to 0, the sample is
+ * appended to exactDataByteArray via the caller's scratch buffer `bytes`, and the
+ * sample itself is returned.
+ */
+static int16_t quantizePoint_int16_3D(int64_t pred, int16_t curData, double realPrecision, int64_t minValue,
+		int byteSize, unsigned char *bytes, int *typeCode, DynamicByteArray *exactDataByteArray)
+{
+	int64_t tmp;
+	int diff = curData - pred;
+	double itvNum = llabs(diff)/realPrecision + 1;
+
+	if (itvNum < exe_params->intvCapacity)
+	{
+		if (diff < 0) itvNum = -itvNum;
+		*typeCode = (int) (itvNum/2) + exe_params->intvRadius;
+		tmp = pred + 2 * (*typeCode - exe_params->intvRadius) * realPrecision;
+		if(tmp >= SZ_INT16_MIN&&tmp<SZ_INT16_MAX)
+			return (int16_t)tmp;
+		else if(tmp < SZ_INT16_MIN)
+			return SZ_INT16_MIN;
+		else
+			return SZ_INT16_MAX;
+	}
+
+	/* unpredictable: keep the sample exactly */
+	*typeCode = 0;
+	compressInt16Value(curData, minValue, byteSize, bytes);
+	memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+	return curData;
+}
+
+/**
+ * Quantization-based compression of a 3D int16_t array (r1 layers of r2 rows of r3 samples).
+ *
+ * Two layer buffers of r2*r3 reconstructed values are used: P1 is the previous
+ * layer, P0 the layer being produced; they are swapped after every layer.
+ * Layer 0 is written into P1 with 1D/2D predictors; later layers use 1D, 2D and
+ * 3D predictors built from P0 and P1.
+ *
+ * Both layer buffers are now always freed; previously P0 was leaked when r2*r3 == 1.
+ *
+ * NOTE(review): samples 0 and 1 and P1[1] are accessed unconditionally, so
+ * r3 >= 2 is assumed - confirm against callers.
+ * NOTE(review): exactDataNum is the byte count here, whereas the 1D variant
+ * divides by byteSize - confirm which one the storage format expects.
+ *
+ * @param oriData        input samples, r3 fastest-varying
+ * @param r1             number of layers (highest dimension)
+ * @param r2             rows per layer
+ * @param r3             samples per row (lowest dimension)
+ * @param realPrecision  error bound; one quantization bin spans 2*realPrecision
+ * @param valueRangeSize value range, used to pick the byte width of exact values
+ * @param minValue       forwarded to compressInt16Value and the storage object
+ * @return storage object built by new_TightDataPointStorageI
+ */
+TightDataPointStorageI* SZ_compress_int16_3D_MDQ(int16_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, int64_t valueRangeSize, int64_t minValue)
+{
+	unsigned char bytes[8] = {0,0,0,0,0,0,0,0};
+	int byteSize = computeByteSizePerIntValue(valueRangeSize);
+
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_int16_3D(oriData, r1, r2, r3, realPrecision);
+		updateQuantizationInfo(quantization_intervals);
+	}
+	else
+		quantization_intervals = exe_params->intvCapacity;
+
+	size_t i, j, k, index, index2D;
+	int64_t pred;
+	int16_t *P0, *P1, *Pt;
+
+	size_t dataLength = r1*r2*r3;
+	size_t r23 = r2*r3;
+
+	P0 = (int16_t*)malloc(r23*sizeof(int16_t));
+	P1 = (int16_t*)malloc(r23*sizeof(int16_t));
+
+	int* type = (int*) malloc(dataLength*sizeof(int));
+
+	DynamicByteArray *exactDataByteArray;
+	new_DBA(&exactDataByteArray, DynArrayInitLen);
+
+	/////////////////////////// Process layer-0 ///////////////////////////
+
+	/* Row 0, sample 0: always stored exactly */
+	type[0] = 0;
+	P1[0] = oriData[0];
+	compressInt16Value(oriData[0], minValue, byteSize, bytes);
+	memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+
+	/* Row 0, sample 1: predicted by the previous value */
+	pred = P1[0];
+	P1[1] = quantizePoint_int16_3D(pred, oriData[1], realPrecision, minValue,
+			byteSize, bytes, &type[1], exactDataByteArray);
+
+	/* Row 0, samples 2 .. r3-1: linear extrapolation */
+	for (j = 2; j < r3; j++)
+	{
+		pred = 2*P1[j-1] - P1[j-2];
+		P1[j] = quantizePoint_int16_3D(pred, oriData[j], realPrecision, minValue,
+				byteSize, bytes, &type[j], exactDataByteArray);
+	}
+
+	/* Rows 1 .. r2-1 of layer 0 */
+	for (i = 1; i < r2; i++)
+	{
+		/* first sample of the row: predicted by the sample above */
+		index = i*r3;
+		pred = P1[index-r3];
+		P1[index] = quantizePoint_int16_3D(pred, oriData[index], realPrecision, minValue,
+				byteSize, bytes, &type[index], exactDataByteArray);
+
+		/* remaining samples: 2D predictor */
+		for (j = 1; j < r3; j++)
+		{
+			index = i*r3+j;
+			pred = P1[index-1] + P1[index-r3] - P1[index-r3-1];
+			P1[index] = quantizePoint_int16_3D(pred, oriData[index], realPrecision, minValue,
+					byteSize, bytes, &type[index], exactDataByteArray);
+		}
+	}
+
+	/////////////////////////// Process layer-1 --> layer-r1-1 ///////////////////////////
+
+	for (k = 1; k < r1; k++)
+	{
+		/* Row 0, sample 0: predicted by the same position in the previous layer */
+		index = k*r23;
+		pred = P1[0];
+		P0[0] = quantizePoint_int16_3D(pred, oriData[index], realPrecision, minValue,
+				byteSize, bytes, &type[index], exactDataByteArray);
+
+		/* Row 0, samples 1 .. r3-1: 2D predictor across the layer boundary */
+		for (j = 1; j < r3; j++)
+		{
+			index ++;
+			pred = P0[j-1] + P1[j] - P1[j-1];
+			P0[j] = quantizePoint_int16_3D(pred, oriData[index], realPrecision, minValue,
+					byteSize, bytes, &type[index], exactDataByteArray);
+		}
+
+		/* Rows 1 .. r2-1 */
+		for (i = 1; i < r2; i++)
+		{
+			/* first sample of the row: 2D predictor */
+			index = k*r23 + i*r3;
+			index2D = i*r3;
+			pred = P0[index2D-r3] + P1[index2D] - P1[index2D-r3];
+			P0[index2D] = quantizePoint_int16_3D(pred, oriData[index], realPrecision, minValue,
+					byteSize, bytes, &type[index], exactDataByteArray);
+
+			/* remaining samples: full 3D predictor */
+			for (j = 1; j < r3; j++)
+			{
+				index ++;
+				index2D = i*r3 + j;
+				pred = P0[index2D-1] + P0[index2D-r3]+ P1[index2D] - P0[index2D-r3-1] - P1[index2D-r3] - P1[index2D-1] + P1[index2D-r3-1];
+				P0[index2D] = quantizePoint_int16_3D(pred, oriData[index], realPrecision, minValue,
+						byteSize, bytes, &type[index], exactDataByteArray);
+			}
+		}
+
+		/* the layer just produced becomes the "previous layer" */
+		Pt = P1;
+		P1 = P0;
+		P0 = Pt;
+	}
+
+	free(P0);
+	free(P1);
+
+	size_t exactDataNum = exactDataByteArray->size;
+
+	TightDataPointStorageI* tdps;
+
+	new_TightDataPointStorageI(&tdps, dataLength, exactDataNum, byteSize,
+			type, exactDataByteArray->array, exactDataByteArray->size,
+			realPrecision, minValue, quantization_intervals, SZ_INT16);
+
+	/* only the wrapper struct is freed here; exactDataByteArray->array was handed to tdps */
+	free(type);
+	free(exactDataByteArray);
+
+	return tdps;
+}
+
+
+/**
+ * Compress a 3D int16_t array (r1 highest, r3 lowest dimension) without range
+ * check and without gzip.
+ *
+ * The quantized stream is replaced by the raw-data representation whenever it
+ * is larger than the raw data itself.
+ *
+ * NOTE(review): if convertTDPStoFlatBytes_int allocates *newByteData, that buffer is
+ * overwritten without being freed when the fallback runs - confirm.
+ */
+void SZ_compress_args_int16_NoCkRngeNoGzip_3D(unsigned char** newByteData, int16_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t *outSize, 
+int64_t valueRangeSize, int64_t minValue)
+{
+	size_t dataLength = r1*r2*r3;
+	size_t rawBytes = dataLength*sizeof(int16_t);
+
+	TightDataPointStorageI* tdps = SZ_compress_int16_3D_MDQ(oriData, r1, r2, r3, realPrecision, valueRangeSize, minValue);
+	convertTDPStoFlatBytes_int(tdps, newByteData, outSize);
+
+	/* quantization did not pay off: store the original samples instead */
+	if(rawBytes < *outSize)
+		SZ_compress_args_int16_StoreOriData(oriData, dataLength, tdps, newByteData, outSize);
+
+	free_TightDataPointStorageI(tdps);
+}
+
+
+/*
+ * Quantize one sample against its prediction (helper of SZ_compress_int16_4D_MDQ).
+ *
+ * If the sample falls inside the quantization range, *typeCode receives the
+ * interval code and the return value is the reconstructed sample clamped to
+ * [SZ_INT16_MIN, SZ_INT16_MAX]. Otherwise *typeCode is set to 0, the sample is
+ * appended to exactDataByteArray via the caller's scratch buffer `bytes`, and the
+ * sample itself is returned.
+ */
+static int16_t quantizePoint_int16_4D(int64_t pred, int16_t curData, double realPrecision, int64_t minValue,
+		int byteSize, unsigned char *bytes, int *typeCode, DynamicByteArray *exactDataByteArray)
+{
+	int64_t tmp;
+	int diff = curData - pred;
+	double itvNum = llabs(diff)/realPrecision + 1;
+
+	if (itvNum < exe_params->intvCapacity)
+	{
+		if (diff < 0) itvNum = -itvNum;
+		*typeCode = (int) (itvNum/2) + exe_params->intvRadius;
+		tmp = pred + 2 * (*typeCode - exe_params->intvRadius) * realPrecision;
+		if(tmp >= SZ_INT16_MIN&&tmp<SZ_INT16_MAX)
+			return (int16_t)tmp;
+		else if(tmp < SZ_INT16_MIN)
+			return SZ_INT16_MIN;
+		else
+			return SZ_INT16_MAX;
+	}
+
+	/* unpredictable: keep the sample exactly */
+	*typeCode = 0;
+	compressInt16Value(curData, minValue, byteSize, bytes);
+	memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+	return curData;
+}
+
+/**
+ * Quantization-based compression of a 4D int16_t array (r1 x r2 x r3 x r4, r4 fastest).
+ *
+ * Each of the r1 outermost slices is compressed independently as a 3D block of
+ * r2 layers of r3 rows of r4 samples. Two layer buffers of r3*r4 reconstructed
+ * values are used: P1 is the previous layer, P0 the layer being produced.
+ *
+ * Fixes relative to the previous implementation:
+ *  - "Row-0 data 1" compared the previous sample with its own prediction, so the
+ *    difference was always 0 and the real sample was never encoded; it now uses
+ *    the sample at the current index.
+ *  - every unpredictable sample was stored as oriData[0] instead of the sample at
+ *    the current index.
+ *
+ * NOTE(review): P1[1] and the sample at offset 1 of each slice are accessed
+ * unconditionally, so r3*r4 >= 2 is assumed - confirm against callers.
+ * NOTE(review): exactDataNum is the byte count here, whereas the 1D variant
+ * divides by byteSize - confirm which one the storage format expects.
+ *
+ * @param oriData        input samples
+ * @param r1             highest dimension (number of independent 3D slices)
+ * @param r2             layers per slice
+ * @param r3             rows per layer
+ * @param r4             samples per row (lowest dimension)
+ * @param realPrecision  error bound; one quantization bin spans 2*realPrecision
+ * @param valueRangeSize value range, used to pick the byte width of exact values
+ * @param minValue       forwarded to compressInt16Value and the storage object
+ * @return storage object built by new_TightDataPointStorageI
+ */
+TightDataPointStorageI* SZ_compress_int16_4D_MDQ(int16_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, int64_t valueRangeSize, int64_t minValue)
+{
+	unsigned char bytes[8] = {0,0,0,0,0,0,0,0};
+	int byteSize = computeByteSizePerIntValue(valueRangeSize);
+
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_int16_4D(oriData, r1, r2, r3, r4, realPrecision);
+		updateQuantizationInfo(quantization_intervals);
+	}
+	else
+		quantization_intervals = exe_params->intvCapacity;
+
+	size_t i, j, k, l, index, index2D;
+	int64_t pred;
+	int16_t *P0, *P1, *Pt;
+
+	size_t dataLength = r1*r2*r3*r4;
+
+	size_t r234 = r2*r3*r4;
+	size_t r34 = r3*r4;
+
+	P0 = (int16_t*)malloc(r34*sizeof(int16_t));
+	P1 = (int16_t*)malloc(r34*sizeof(int16_t));
+
+	int* type = (int*) malloc(dataLength*sizeof(int));
+
+	DynamicByteArray *exactDataByteArray;
+	new_DBA(&exactDataByteArray, DynArrayInitLen);
+
+	for (l = 0; l < r1; l++)
+	{
+		size_t base = l*r234; /* offset of this 3D slice inside oriData/type */
+
+		/////////////////////////// Process layer-0 ///////////////////////////
+
+		/* Row 0, sample 0: always stored exactly */
+		index = base;
+		type[index] = 0;
+		P1[0] = oriData[index];
+		compressInt16Value(P1[0], minValue, byteSize, bytes);
+		memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+
+		/* Row 0, sample 1: predicted by the previous value */
+		index = base+1;
+		pred = P1[0];
+		P1[1] = quantizePoint_int16_4D(pred, oriData[index], realPrecision, minValue,
+				byteSize, bytes, &type[index], exactDataByteArray);
+
+		/* Row 0, samples 2 .. r4-1: linear extrapolation */
+		for (j = 2; j < r4; j++)
+		{
+			index = base+j;
+			pred = 2*P1[j-1] - P1[j-2];
+			P1[j] = quantizePoint_int16_4D(pred, oriData[index], realPrecision, minValue,
+					byteSize, bytes, &type[index], exactDataByteArray);
+		}
+
+		/* Rows 1 .. r3-1 of layer 0 */
+		for (i = 1; i < r3; i++)
+		{
+			/* first sample of the row: predicted by the sample above */
+			index = base+i*r4;
+			index2D = i*r4;
+			pred = P1[index2D-r4];
+			P1[index2D] = quantizePoint_int16_4D(pred, oriData[index], realPrecision, minValue,
+					byteSize, bytes, &type[index], exactDataByteArray);
+
+			/* remaining samples: 2D predictor */
+			for (j = 1; j < r4; j++)
+			{
+				index = base+i*r4+j;
+				index2D = i*r4+j;
+				pred = P1[index2D-1] + P1[index2D-r4] - P1[index2D-r4-1];
+				P1[index2D] = quantizePoint_int16_4D(pred, oriData[index], realPrecision, minValue,
+						byteSize, bytes, &type[index], exactDataByteArray);
+			}
+		}
+
+		/////////////////////////// Process layer-1 --> layer-r2-1 ///////////////////////////
+
+		for (k = 1; k < r2; k++)
+		{
+			/* Row 0, sample 0: predicted by the same position in the previous layer */
+			index = base+k*r34;
+			pred = P1[0];
+			P0[0] = quantizePoint_int16_4D(pred, oriData[index], realPrecision, minValue,
+					byteSize, bytes, &type[index], exactDataByteArray);
+
+			/* Row 0, samples 1 .. r4-1: 2D predictor across the layer boundary */
+			for (j = 1; j < r4; j++)
+			{
+				index = base+k*r34+j;
+				pred = P0[j-1] + P1[j] - P1[j-1];
+				P0[j] = quantizePoint_int16_4D(pred, oriData[index], realPrecision, minValue,
+						byteSize, bytes, &type[index], exactDataByteArray);
+			}
+
+			/* Rows 1 .. r3-1 */
+			for (i = 1; i < r3; i++)
+			{
+				/* first sample of the row: 2D predictor */
+				index = base+k*r34+i*r4;
+				index2D = i*r4;
+				pred = P0[index2D-r4] + P1[index2D] - P1[index2D-r4];
+				P0[index2D] = quantizePoint_int16_4D(pred, oriData[index], realPrecision, minValue,
+						byteSize, bytes, &type[index], exactDataByteArray);
+
+				/* remaining samples: full 3D predictor */
+				for (j = 1; j < r4; j++)
+				{
+					index = base+k*r34+i*r4+j;
+					index2D = i*r4+j;
+					pred = P0[index2D-1] + P0[index2D-r4]+ P1[index2D] - P0[index2D-r4-1] - P1[index2D-r4] - P1[index2D-1] + P1[index2D-r4-1];
+					P0[index2D] = quantizePoint_int16_4D(pred, oriData[index], realPrecision, minValue,
+							byteSize, bytes, &type[index], exactDataByteArray);
+				}
+			}
+
+			/* the layer just produced becomes the "previous layer" */
+			Pt = P1;
+			P1 = P0;
+			P0 = Pt;
+		}
+	}
+
+	free(P0);
+	free(P1);
+
+	size_t exactDataNum = exactDataByteArray->size;
+
+	TightDataPointStorageI* tdps;
+
+	new_TightDataPointStorageI(&tdps, dataLength, exactDataNum, byteSize,
+			type, exactDataByteArray->array, exactDataByteArray->size,
+			realPrecision, minValue, quantization_intervals, SZ_INT16);
+
+	/* only the wrapper struct is freed here; exactDataByteArray->array was handed to tdps */
+	free(type);
+	free(exactDataByteArray);
+
+	return tdps;
+}
+
+/**
+ * Compress a 4D int16_t array (r1 highest, r4 lowest dimension) without range
+ * check and without gzip.
+ *
+ * The quantized stream is replaced by the raw-data representation whenever it
+ * is larger than the raw data itself.
+ *
+ * NOTE(review): if convertTDPStoFlatBytes_int allocates *newByteData, that buffer is
+ * overwritten without being freed when the fallback runs - confirm.
+ */
+void SZ_compress_args_int16_NoCkRngeNoGzip_4D(unsigned char** newByteData, int16_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, 
+size_t *outSize, int64_t valueRangeSize, int64_t minValue)
+{
+	size_t dataLength = r1*r2*r3*r4;
+	size_t rawBytes = dataLength*sizeof(int16_t);
+
+	TightDataPointStorageI* tdps = SZ_compress_int16_4D_MDQ(oriData, r1, r2, r3, r4, realPrecision, valueRangeSize, minValue);
+	convertTDPStoFlatBytes_int(tdps, newByteData, outSize);
+
+	/* quantization did not pay off: store the original samples instead */
+	if(rawBytes < *outSize)
+		SZ_compress_args_int16_StoreOriData(oriData, dataLength, tdps, newByteData, outSize);
+
+	free_TightDataPointStorageI(tdps);
+}
+
+/**
+ * Encode a series whose whole value range fits inside the error bound.
+ *
+ * Only oriData[0] is kept (2 bytes, big-endian) in a storage object flagged
+ * allSameData, which is then flattened into *newByteData.
+ *
+ * @param newByteData receives the flattened output buffer
+ * @param oriData     input samples; only element 0 is read
+ * @param dataLength  number of samples, recorded as dataSeriesLength
+ * @param outSize     receives the output size in bytes
+ */
+void SZ_compress_args_int16_withinRange(unsigned char** newByteData, int16_t *oriData, size_t dataLength, size_t *outSize)
+{
+	TightDataPointStorageI* tdps = (TightDataPointStorageI*) malloc(sizeof(TightDataPointStorageI));
+	unsigned char* exactBytes = (unsigned char*)malloc(sizeof(unsigned char)*2);
+
+	/* the single representative value */
+	int16ToBytes_bigEndian(exactBytes, oriData[0]);
+
+	tdps->typeArray = NULL;
+	tdps->allSameData = 1;
+	tdps->isLossless = 0;
+	tdps->dataSeriesLength = dataLength;
+	tdps->exactDataBytes = exactBytes;
+	tdps->exactDataBytes_size = 2;
+	tdps->exactDataNum = 1;
+
+	size_t flatSize;
+	convertTDPStoFlatBytes_int(tdps, newByteData, &flatSize);
+	*outSize = flatSize;
+
+	free_TightDataPointStorageI(tdps);
+}
+
+/**
+ * Compress int16_t data with a value-range check and without gzip.
+ *
+ * Unused dimensions are passed as 0 (r1 is the lowest dimension). If the value
+ * range is within the error bound the constant-data encoding is used; otherwise
+ * the 1D/2D/3D compressor is selected from the dimensions. 4D input is handled
+ * by the 3D compressor with r4*r3 as its highest dimension.
+ *
+ * NOTE(review): when r5 != 0 no compressor runs and the outputs are left
+ * untouched while the returned status is unchanged - confirm whether this
+ * should report a dimension error like SZ_compress_args_int16 does.
+ *
+ * @return status as initialised to SZ_SCES and possibly updated by getRealPrecision_int
+ */
+int SZ_compress_args_int16_wRngeNoGzip(unsigned char** newByteData, int16_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio)
+{
+	int status = SZ_SCES;
+	size_t dataLength = computeDataLength(r5,r4,r3,r2,r1);
+	int64_t valueRangeSize = 0;
+
+	int16_t minValue = computeRangeSize_int(oriData, SZ_INT16, dataLength, &valueRangeSize);
+	double realPrecision = getRealPrecision_int(valueRangeSize, errBoundMode, absErr_Bound, relBoundRatio, &status);
+
+	if(valueRangeSize <= realPrecision)
+	{
+		SZ_compress_args_int16_withinRange(newByteData, oriData, dataLength, outSize);
+		return status;
+	}
+
+	/* 5-D input: no branch applies */
+	if(r5!=0)
+		return status;
+
+	if(r4!=0)
+		SZ_compress_args_int16_NoCkRngeNoGzip_3D(newByteData, oriData, r4*r3, r2, r1, realPrecision, outSize, valueRangeSize, minValue);
+	else if(r3!=0)
+		SZ_compress_args_int16_NoCkRngeNoGzip_3D(newByteData, oriData, r3, r2, r1, realPrecision, outSize, valueRangeSize, minValue);
+	else if(r2!=0)
+		SZ_compress_args_int16_NoCkRngeNoGzip_2D(newByteData, oriData, r2, r1, realPrecision, outSize, valueRangeSize, minValue);
+	else
+		SZ_compress_args_int16_NoCkRngeNoGzip_1D(newByteData, oriData, r1, realPrecision, outSize, valueRangeSize, minValue);
+
+	return status;
+}
+
+/**
+ * Top-level int16_t compression entry point.
+ *
+ * Unused dimensions are passed as 0 (r1 is the lowest dimension). The error
+ * bound is derived from errBoundMode (PSNR mode is converted to an absolute
+ * bound), the matching 1D..4D compressor is run, and depending on
+ * confparams_cpr->szMode the result is returned as is or passed through zlib.
+ *
+ * Point-wise relative modes (errBoundMode >= PW_REL) are not supported: the
+ * function prints an error and terminates the process via exit(0).
+ *
+ * Error paths no longer touch uninitialized memory:
+ *  - 5-D input returns SZ_DERR with *newByteData = NULL and *outSize = 0
+ *    (previously an uninitialized pointer was returned or handed to zlib/free);
+ *  - an invalid szMode frees the intermediate buffer instead of leaking it.
+ *
+ * @param newByteData receives the compressed buffer
+ * @param oriData     input samples
+ * @param r5..r1      dimensions, highest to lowest; 0 marks an unused dimension
+ * @param outSize     receives the compressed size in bytes
+ * @param errBoundMode,absErr_Bound,relBoundRatio  error-bound specification
+ * @return SZ_SCES on success, SZ_DERR for 5-D input, SZ_MERR for an invalid
+ *         szMode, or the status reported by getRealPrecision_int
+ */
+int SZ_compress_args_int16(unsigned char** newByteData, int16_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio)
+{
+	confparams_cpr->errorBoundMode = errBoundMode;
+	
+	if(errBoundMode>=PW_REL)
+	{
+		printf("Error: Current SZ version doesn't support integer data compression with point-wise relative error bound being based on pwrType=AVG\n");
+		exit(0);
+		return SZ_NSCS;
+	}
+	int status = SZ_SCES;
+	size_t dataLength = computeDataLength(r5,r4,r3,r2,r1);
+	int64_t valueRangeSize = 0;
+
+	int16_t minValue = (int16_t)computeRangeSize_int(oriData, SZ_INT16, dataLength, &valueRangeSize);
+	double realPrecision = 0; 
+	
+	if(confparams_cpr->errorBoundMode==PSNR)
+	{
+		confparams_cpr->errorBoundMode = ABS;
+		realPrecision = confparams_cpr->absErrBound = computeABSErrBoundFromPSNR(confparams_cpr->psnr, (double)confparams_cpr->predThreshold, (double)valueRangeSize);
+	}
+	else
+		realPrecision = getRealPrecision_int(valueRangeSize, errBoundMode, absErr_Bound, relBoundRatio, &status);
+
+	if(valueRangeSize <= realPrecision)
+	{
+		SZ_compress_args_int16_withinRange(newByteData, oriData, dataLength, outSize);
+	}
+	else
+	{
+		size_t tmpOutSize = 0;
+		unsigned char* tmpByteData = NULL;
+		if (r2==0)
+		{
+			SZ_compress_args_int16_NoCkRngeNoGzip_1D(&tmpByteData, oriData, r1, realPrecision, &tmpOutSize, valueRangeSize, minValue);
+		}
+		else
+		if (r3==0)
+		{
+			SZ_compress_args_int16_NoCkRngeNoGzip_2D(&tmpByteData, oriData, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, minValue);
+		}
+		else
+		if (r4==0)
+		{
+			SZ_compress_args_int16_NoCkRngeNoGzip_3D(&tmpByteData, oriData, r3, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, minValue);
+		}
+		else
+		if (r5==0)
+		{
+			SZ_compress_args_int16_NoCkRngeNoGzip_4D(&tmpByteData, oriData, r4, r3, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, minValue);
+		}
+		else
+		{
+			/* nothing was compressed: report the error with well-defined outputs */
+			printf("Error: doesn't support 5 dimensions for now.\n");
+			status = SZ_DERR; //dimension error
+			*newByteData = NULL;
+			*outSize = 0;
+			return status;
+		}
+		//Call Gzip to do the further compression.
+		if(confparams_cpr->szMode==SZ_BEST_SPEED)
+		{
+			*outSize = tmpOutSize;
+			*newByteData = tmpByteData;
+		}
+		else if(confparams_cpr->szMode==SZ_BEST_COMPRESSION || confparams_cpr->szMode==SZ_DEFAULT_COMPRESSION)
+		{
+			*outSize = zlib_compress5(tmpByteData, tmpOutSize, newByteData, confparams_cpr->gzipMode);
+			free(tmpByteData);
+		}
+		else
+		{
+			printf("Error: Wrong setting of confparams_cpr->szMode in the int16_t compression.\n");
+			status = SZ_MERR; //mode error			
+			/* the intermediate stream is not handed to the caller, so release it */
+			free(tmpByteData);
+		}
+	}
+	
+	return status;
+}
diff --git a/thirdparty/SZ/sz/src/sz_int32.c b/thirdparty/SZ/sz/src/sz_int32.c
new file mode 100644
index 0000000000000000000000000000000000000000..bcd97834e18f1a9844a49549a2b10e73be63d67e
--- /dev/null
+++ b/thirdparty/SZ/sz/src/sz_int32.c
@@ -0,0 +1,1267 @@
+/**
+ *  @file sz_int32.c
+ *  @author Sheng Di
+ *  @date Aug, 2017
+ *  @brief sz_int32, Compression and Decompression functions
+ *  (C) 2017 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <math.h>
+#include "sz.h"
+#include "CompressElement.h"
+#include "DynamicByteArray.h"
+#include "DynamicIntArray.h"
+#include "zlib.h"
+#include "rw.h"
+#include "TightDataPointStorageI.h"
+#include "sz_int32.h"
+
+unsigned int optimize_intervals_int32_1D(int32_t *oriData, size_t dataLength, double realPrecision)
+{	/* Chooses a quantization interval count for 1D int32 data from sampled previous-value prediction errors. */
+	size_t pos, bucket;
+	int64_t absErr;
+	/* histogram[b] counts the sampled points whose prediction error maps to radius index b */
+	size_t *histogram = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t));
+	memset(histogram, 0, confparams_cpr->maxRangeRadius*sizeof(size_t));
+	for(pos=2;pos<dataLength;pos++)
+	{
+		if(pos%confparams_cpr->sampleDistance!=0)
+			continue; /* only positions that are multiples of sampleDistance are sampled */
+		/* each sampled value is predicted by its immediate predecessor */
+		absErr = llabs((int64_t)oriData[pos-1] - oriData[pos]);
+		bucket = (uint64_t)((absErr/realPrecision+1)/2);
+		if(bucket>=confparams_cpr->maxRangeRadius)
+			bucket = confparams_cpr->maxRangeRadius - 1; /* too-large errors share the last slot */
+		histogram[bucket]++;
+	}
+	/* find the first radius index at which the running count exceeds the target */
+	size_t nominalSamples = dataLength/confparams_cpr->sampleDistance;
+	size_t targetCount = nominalSamples*confparams_cpr->predThreshold;
+	size_t covered = 0;
+	bucket = 0;
+	while(bucket<confparams_cpr->maxRangeRadius)
+	{
+		covered += histogram[bucket];
+		if(covered>targetCount)
+			break;
+		bucket++;
+	}
+	if(bucket>=confparams_cpr->maxRangeRadius)
+		bucket = confparams_cpr->maxRangeRadius-1; /* target never exceeded */
+	free(histogram);
+
+	/* twice (index+1), passed through roundUpToPowerOf2(), with a floor of 32 */
+	unsigned int accIntervals = 2*(bucket+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+	if(powerOf2<32)
+		powerOf2 = 32;
+
+	return powerOf2;
+}
+
+unsigned int optimize_intervals_int32_2D(int32_t *oriData, size_t r1, size_t r2, double realPrecision)
+{	/* Estimates the quantization interval count for an r1 x r2 int32 array from sampled 2D prediction errors. */
+	size_t i,j, index;
+	size_t radiusIndex;
+	int64_t pred_value = 0, pred_err;
+	size_t *intervals = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t));
+	memset(intervals, 0, confparams_cpr->maxRangeRadius*sizeof(size_t));
+	size_t totalSampleSize = r1*r2/confparams_cpr->sampleDistance;
+	for(i=1;i<r1;i++)
+	{
+		for(j=1;j<r2;j++)
+		{
+			if((i+j)%confparams_cpr->sampleDistance==0)
+			{
+				index = i*r2+j;
+				//left + upper - upper-left neighbour; widened first so the sum is not evaluated in 32-bit int
+				pred_value = (int64_t)oriData[index-1] + oriData[index-r2] - oriData[index-r2-1];
+				pred_err = llabs(pred_value - oriData[index]);
+				radiusIndex = (uint64_t)((pred_err/realPrecision+1)/2);
+				if(radiusIndex>=confparams_cpr->maxRangeRadius)
+					radiusIndex = confparams_cpr->maxRangeRadius - 1;
+				intervals[radiusIndex]++;
+			}
+		}
+	}
+	//first radius index whose cumulative sample count exceeds predThreshold * totalSampleSize
+	size_t targetCount = totalSampleSize*confparams_cpr->predThreshold;
+	size_t sum = 0;
+	for(i=0;i<confparams_cpr->maxRangeRadius;i++)
+	{
+		sum += intervals[i];
+		if(sum>targetCount)
+			break;
+	}
+	if(i>=confparams_cpr->maxRangeRadius)
+		i = confparams_cpr->maxRangeRadius-1;
+	unsigned int accIntervals = 2*(i+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+
+	if(powerOf2<32)
+		powerOf2 = 32; //never report fewer than 32 intervals
+
+	free(intervals);
+	return powerOf2;
+}
+
+unsigned int optimize_intervals_int32_3D(int32_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision)
+{	/* Estimates the quantization interval count for an r1 x r2 x r3 int32 array from sampled 3D prediction errors. */
+	size_t i,j,k, index;
+	size_t radiusIndex;
+	size_t r23=r2*r3;
+	int64_t pred_value = 0, pred_err;
+	size_t *intervals = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t));
+	memset(intervals, 0, confparams_cpr->maxRangeRadius*sizeof(size_t));
+	size_t totalSampleSize = (r1-1)*(r2-1)*(r3-1)/confparams_cpr->sampleDistance;
+	for(i=1;i<r1;i++)
+	{
+		for(j=1;j<r2;j++)
+		{
+			for(k=1;k<r3;k++)
+			{
+				if((i+j+k)%confparams_cpr->sampleDistance==0)
+				{
+					index = i*r23+j*r3+k;
+					//7-term predictor over the neighbours one step back along i, j and k;
+					//the first operand is widened so the sum is not evaluated in 32-bit int
+					pred_value = (int64_t)oriData[index-1] + oriData[index-r3] + oriData[index-r23]
+					- oriData[index-1-r23] - oriData[index-r3-1] - oriData[index-r3-r23] + oriData[index-r3-r23-1];
+					pred_err = llabs(pred_value - oriData[index]);
+					radiusIndex = (pred_err/realPrecision+1)/2;
+					if(radiusIndex>=confparams_cpr->maxRangeRadius)
+						radiusIndex = confparams_cpr->maxRangeRadius - 1; //too-large errors share the last slot
+					intervals[radiusIndex]++;
+				}
+			}
+		}
+	}
+	//first radius index whose cumulative sample count exceeds
+	//predThreshold * totalSampleSize (clamped to the last index if never exceeded)
+	size_t targetCount = totalSampleSize*confparams_cpr->predThreshold;
+	size_t sum = 0;
+	for(i=0;i<confparams_cpr->maxRangeRadius;i++)
+	{
+		sum += intervals[i];
+		if(sum>targetCount)
+			break;
+	}
+	if(i>=confparams_cpr->maxRangeRadius)
+		i = confparams_cpr->maxRangeRadius-1;
+	//twice (index+1), passed through roundUpToPowerOf2(), with a floor of 32
+	unsigned int accIntervals = 2*(i+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+
+	if(powerOf2<32)
+		powerOf2 = 32;
+
+	free(intervals);
+	return powerOf2;
+}
+
+
+unsigned int optimize_intervals_int32_4D(int32_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision)
+{	/* Estimates the quantization interval count for an r1 x r2 x r3 x r4 int32 array from sampled prediction errors. */
+	size_t i,j,k,l, index;
+	size_t radiusIndex;
+	size_t r234=r2*r3*r4;
+	size_t r34=r3*r4;
+	int64_t pred_value = 0, pred_err;
+	size_t *intervals = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t));
+	memset(intervals, 0, confparams_cpr->maxRangeRadius*sizeof(size_t));
+	size_t totalSampleSize = (r1-1)*(r2-1)*(r3-1)*(r4-1)/confparams_cpr->sampleDistance;
+	for(i=1;i<r1;i++)
+	{
+		for(j=1;j<r2;j++)
+		{
+			for(k=1;k<r3;k++)
+			{
+				for (l=1;l<r4;l++)
+				{
+					if((i+j+k+l)%confparams_cpr->sampleDistance==0)
+					{
+						index = i*r234+j*r34+k*r4+l;
+						//7-term predictor over the three fastest dimensions: strides are 1, r4 and r34 (the second
+						//term used r3 before, which is not a stride of this layout); widened to 64-bit before summing
+						pred_value = (int64_t)oriData[index-1] + oriData[index-r4] + oriData[index-r34]
+								- oriData[index-1-r34] - oriData[index-r4-1] - oriData[index-r4-r34] + oriData[index-r4-r34-1];
+						pred_err = llabs(pred_value - oriData[index]);
+						radiusIndex = (uint64_t)((pred_err/realPrecision+1)/2);
+						if(radiusIndex>=confparams_cpr->maxRangeRadius)
+							radiusIndex = confparams_cpr->maxRangeRadius - 1;
+						intervals[radiusIndex]++;
+					}
+				}
+			}
+		}
+	}
+	//first radius index whose cumulative sample count exceeds predThreshold * totalSampleSize
+	size_t targetCount = totalSampleSize*confparams_cpr->predThreshold;
+	size_t sum = 0;
+	for(i=0;i<confparams_cpr->maxRangeRadius;i++)
+	{
+		sum += intervals[i];
+		if(sum>targetCount)
+			break;
+	}
+	if(i>=confparams_cpr->maxRangeRadius)
+		i = confparams_cpr->maxRangeRadius-1;
+	unsigned int accIntervals = 2*(i+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+	if(powerOf2<32)
+		powerOf2 = 32; //never report fewer than 32 intervals
+
+	free(intervals);
+	return powerOf2;
+}
+
+TightDataPointStorageI* SZ_compress_int32_1D_MDQ(int32_t *oriData, size_t dataLength, double realPrecision, int64_t valueRangeSize, int64_t minValue)
+{
+	/* Quantizes a 1D int32 series: every point gets a type code (0 = stored exactly,
+	 * otherwise intvRadius +/- the number of intervals between it and its prediction),
+	 * and the result is packed into a newly created TightDataPointStorageI.
+	 * NOTE(review): elements 0 and 1 are read unconditionally, so dataLength >= 2 is assumed. */
+	unsigned char bytes[8] = {0,0,0,0,0,0,0,0};
+	int byteSize = computeByteSizePerIntValue(valueRangeSize);
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+		quantization_intervals = optimize_intervals_int32_1D(oriData, dataLength, realPrecision);
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	updateQuantizationInfo(quantization_intervals);	
+	size_t i;
+
+	int* type = (int*) malloc(dataLength*sizeof(int));
+
+	int32_t* spaceFillingValue = oriData; //
+
+	DynamicByteArray *exactDataByteArray;
+	new_DBA(&exactDataByteArray, DynArrayInitLen);
+
+	int64_t last3CmprsData[3] = {0,0,0};
+
+	//the first two values are always stored exactly
+	type[0] = 0;
+	compressInt32Value(spaceFillingValue[0], minValue, byteSize, bytes);
+	memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+	listAdd_int(last3CmprsData, spaceFillingValue[0]);
+
+	type[1] = 0;
+	compressInt32Value(spaceFillingValue[1], minValue, byteSize, bytes);
+	memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+	listAdd_int(last3CmprsData, spaceFillingValue[1]);
+
+	int state;
+	//largest absolute prediction error that is still quantized instead of stored exactly
+	double checkRadius = (exe_params->intvCapacity-1)*realPrecision;
+	int64_t curData;
+	int32_t pred;
+	//64-bit: the distance between two int32 values can exceed INT32_MAX, and a
+	//truncated distance could pass the checkRadius test below by mistake
+	int64_t predAbsErr;
+	double interval = 2*realPrecision;
+
+	for(i=2;i<dataLength;i++)
+	{
+		curData = spaceFillingValue[i];
+		//prediction = last3CmprsData[0] (presumably the value most recently passed to listAdd_int - confirm)
+		pred = last3CmprsData[0];
+		predAbsErr = llabs(curData - pred);	
+		if(predAbsErr<=checkRadius)
+		{
+			//state = number of whole intervals (each 2*realPrecision wide) between pred and curData
+			state = (predAbsErr/realPrecision+1)/2;
+			if(curData>=pred)
+			{
+				type[i] = exe_params->intvRadius+state;
+				pred = pred + state*interval;
+			}
+			else //curData<pred
+			{
+				type[i] = exe_params->intvRadius-state;
+				pred = pred - state*interval;
+			}
+			//the quantized value, not the original, feeds the next prediction
+			listAdd_int(last3CmprsData, pred);					
+			continue;
+		}
+
+		//unpredictable data processing		
+		type[i] = 0;
+		compressInt32Value(curData, minValue, byteSize, bytes);
+		memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		listAdd_int(last3CmprsData, curData);
+	}//end of for
+
+	size_t exactDataNum = exactDataByteArray->size / byteSize;
+
+	TightDataPointStorageI* tdps;	
+
+	new_TightDataPointStorageI(&tdps, dataLength, exactDataNum, byteSize, 
+			type, exactDataByteArray->array, exactDataByteArray->size,  
+			realPrecision, minValue, quantization_intervals, SZ_INT32);
+
+	//type is released here; of exactDataByteArray only the wrapper struct is freed,
+	//its ->array was passed to new_TightDataPointStorageI above
+	free(type);	
+	free(exactDataByteArray);
+
+	return tdps;
+}
+
+void SZ_compress_args_int32_StoreOriData(int32_t* oriData, size_t dataLength, TightDataPointStorageI* tdps, 
+unsigned char** newByteData, size_t *outSize)
+{
+	/* Emits the input verbatim behind an SZ header and marks tdps as lossless.
+	 * Layout: 3 version bytes | 1 flag byte | MetaDataByteLength parameter bytes |
+	 * SZ_SIZE_TYPE length bytes | dataLength int32 values in big-endian byte order. */
+	const int elemBytes = sizeof(int32_t);
+	size_t pos = 0, n;
+	unsigned char lengthBytes[8];
+	size_t totalByteLength = 3 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + 1 + elemBytes*dataLength;
+	unsigned char *out = (unsigned char*)malloc(totalByteLength);
+
+	tdps->isLossless = 1;
+	*newByteData = out;
+	for (n = 0; n < 3; n++)
+		out[pos++] = versionNumber[n];
+
+	/* flag byte: 16 (00010000) for 4-byte sizes, otherwise 80 (01010000, SZ_SIZE_TYPE=8) */
+	out[pos++] = (exe_params->SZ_SIZE_TYPE==4) ? 16 : 80;
+
+	convertSZParamsToBytes(confparams_cpr, &out[pos]);
+	pos += MetaDataByteLength;
+
+	sizeToBytes(lengthBytes, dataLength); /* SZ_SIZE_TYPE: 4 or 8 */
+	for (n = 0; n < exe_params->SZ_SIZE_TYPE; n++)
+		out[pos++] = lengthBytes[n];
+
+	/* pos now equals 4+MetaDataByteLength+SZ_SIZE_TYPE, where the values start */
+	if(sysEndianType==BIG_ENDIAN_SYSTEM)
+		memcpy(out+pos, oriData, dataLength*elemBytes);
+	else
+		for(n=0;n<dataLength;n++)
+			int32ToBytes_bigEndian(out+pos+n*elemBytes, oriData[n]);
+	*outSize = totalByteLength;
+}
+
+void SZ_compress_args_int32_NoCkRngeNoGzip_1D(unsigned char** newByteData, int32_t *oriData, 
+size_t dataLength, double realPrecision, size_t *outSize, int64_t valueRangeSize, int32_t minValue)
+{	/* Compresses a 1D int32 array into *newByteData / *outSize; no gzip step is applied here. */
+	TightDataPointStorageI* tdps = SZ_compress_int32_1D_MDQ(oriData, dataLength, realPrecision, valueRangeSize, minValue);
+	convertTDPStoFlatBytes_int(tdps, newByteData, outSize);
+	/* Larger than the raw values: store exactly dataLength of them (dataLength+2 read past the end of oriData). */
+	if(*outSize > dataLength*sizeof(int32_t))
+		SZ_compress_args_int32_StoreOriData(oriData, dataLength, tdps, newByteData, outSize);
+	free_TightDataPointStorageI(tdps);
+}
+
+TightDataPointStorageI* SZ_compress_int32_2D_MDQ(int32_t *oriData, size_t r1, size_t r2, double realPrecision, int64_t valueRangeSize, int64_t minValue)
+{	//Quantizes an r1 x r2 int32 array (element (i,j) is read at i*r2+j) into type codes plus exactly stored values.
+	unsigned char bytes[8] = {0,0,0,0,0,0,0,0};
+	int byteSize = computeByteSizePerIntValue(valueRangeSize);
+	
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_int32_2D(oriData, r1, r2, realPrecision);
+		updateQuantizationInfo(quantization_intervals);
+	}	
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	size_t i,j; 
+	int32_t pred1D, pred2D, curValue;
+	int32_t diff = 0.0;
+	double itvNum = 0;
+	int32_t *P0, *P1;
+	//P0 and P1 are two row buffers of r2 reconstructed values; they swap roles after every row of the main loop
+	size_t dataLength = r1*r2;	
+	
+	P0 = (int32_t*)malloc(r2*sizeof(int32_t));
+	memset(P0, 0, r2*sizeof(int32_t));
+	P1 = (int32_t*)malloc(r2*sizeof(int32_t));
+	memset(P1, 0, r2*sizeof(int32_t));
+		
+	int* type = (int*) malloc(dataLength*sizeof(int));
+	//type[n]: 0 = value stored exactly, otherwise intvRadius plus a signed interval offset
+		
+	int32_t* spaceFillingValue = oriData; //
+	
+	DynamicByteArray *exactDataByteArray;
+	new_DBA(&exactDataByteArray, DynArrayInitLen);	
+	//element 0 is always stored exactly
+	type[0] = 0;
+	curValue = P1[0] = spaceFillingValue[0];
+	compressInt32Value(curValue, minValue, byteSize, bytes);
+	memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+
+	/* Process Row-0 data 1 (predicted by element 0). NOTE(review): index 1 of the data and of P1 is used regardless of r2, so r2 >= 2 looks assumed - confirm */
+	pred1D = P1[0];
+	diff = spaceFillingValue[1] - pred1D;
+
+	itvNum =  llabs(diff)/realPrecision + 1; //the point is quantized only if this stays below intvCapacity
+
+	if (itvNum < exe_params->intvCapacity)
+	{
+		if (diff < 0) itvNum = -itvNum;
+		type[1] = (int) (itvNum/2) + exe_params->intvRadius;
+		P1[1] = pred1D + 2 * (type[1] - exe_params->intvRadius) * realPrecision; //quantized value, used by later predictions
+	}
+	else
+	{
+		type[1] = 0;
+		curValue = P1[1] = spaceFillingValue[1];
+		compressInt32Value(curValue, minValue, byteSize, bytes);
+		memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+	}
+
+    /* Process Row-0 data 2 --> data r2-1: linear extrapolation from the two previous reconstructed values */
+	for (j = 2; j < r2; j++)
+	{
+		pred1D = 2*P1[j-1] - P1[j-2];
+		diff = spaceFillingValue[j] - pred1D;
+
+		itvNum = llabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[j] = (int) (itvNum/2) + exe_params->intvRadius;
+			P1[j] = pred1D + 2 * (type[j] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[j] = 0;
+			curValue = P1[j] = spaceFillingValue[j];
+			compressInt32Value(curValue, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		}
+	}
+
+	/* Process Row-1 --> Row-r1-1: P1 is the previous row, P0 receives the current row */
+	size_t index;
+	for (i = 1; i < r1; i++)
+	{	
+		/* Process row-i data 0: predicted by the first value of the previous row */
+		index = i*r2;
+		pred1D = P1[0];
+		diff = spaceFillingValue[index] - pred1D;
+
+		itvNum = llabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+			P0[0] = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[index] = 0;
+			curValue = P0[0] = spaceFillingValue[index];
+			compressInt32Value(curValue, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		}
+									
+		/* Process row-i data 1 --> r2-1: left + upper - upper-left reconstructed neighbours */
+		for (j = 1; j < r2; j++)
+		{
+			index = i*r2+j;
+			pred2D = P0[j-1] + P1[j] - P1[j-1];
+
+			diff = spaceFillingValue[index] - pred2D;
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P0[j] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[index] = 0;
+				curValue = P0[j] = spaceFillingValue[index];
+				compressInt32Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+		}
+		//swap the buffers: the row just written becomes the previous row
+		int32_t *Pt;
+		Pt = P1;
+		P1 = P0;
+		P0 = Pt;
+	}
+	//NOTE(review): when r2==1 P0 is not freed here - confirm whether that is intentional
+	if(r2!=1)
+		free(P0);
+	free(P1);			
+	
+	size_t exactDataNum = exactDataByteArray->size; //NOTE(review): this is a byte count, not size/byteSize - confirm what new_TightDataPointStorageI expects
+	
+	TightDataPointStorageI* tdps;	
+			
+	new_TightDataPointStorageI(&tdps, dataLength, exactDataNum, byteSize, 
+			type, exactDataByteArray->array, exactDataByteArray->size,  
+			realPrecision, minValue, quantization_intervals, SZ_INT32);
+			
+	//free memory
+	free(type);	
+	free(exactDataByteArray); //only the wrapper struct is freed; its ->array was passed to new_TightDataPointStorageI above
+	
+	return tdps;	
+}
+
+/**
+ * Compresses an r1 x r2 int32 array into *newByteData / *outSize; no gzip step is applied here.
+ * Note: @r1 is high dimension
+ * 		 @r2 is low dimension 
+ * */
+void SZ_compress_args_int32_NoCkRngeNoGzip_2D(unsigned char** newByteData, int32_t *oriData, size_t r1, size_t r2, double realPrecision, size_t *outSize, 
+int64_t valueRangeSize, int32_t minValue)
+{
+	const size_t elemCount = r1*r2;
+	TightDataPointStorageI* storage = SZ_compress_int32_2D_MDQ(oriData, r1, r2, realPrecision, valueRangeSize, minValue);
+	convertTDPStoFlatBytes_int(storage, newByteData, outSize);
+
+	/* if the result is larger than the raw int32 values, emit the values unmodified instead */
+	if(elemCount*sizeof(int32_t) < *outSize)
+		SZ_compress_args_int32_StoreOriData(oriData, elemCount, storage, newByteData, outSize);
+
+	free_TightDataPointStorageI(storage);
+}
+
+TightDataPointStorageI* SZ_compress_int32_3D_MDQ(int32_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, int64_t valueRangeSize, int64_t minValue)
+{	//Quantizes an r1 x r2 x r3 int32 array (element (k,i,j) is read at k*r2*r3+i*r3+j) into type codes plus exactly stored values.
+	unsigned char bytes[8] = {0,0,0,0,0,0,0,0};
+	int byteSize = computeByteSizePerIntValue(valueRangeSize);
+	
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_int32_3D(oriData, r1, r2, r3, realPrecision);
+		updateQuantizationInfo(quantization_intervals);
+	}	
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	size_t i,j,k; 
+	int32_t pred1D, pred2D, pred3D, curValue;
+	int32_t diff = 0.0;
+	double itvNum = 0;
+	int32_t *P0, *P1;
+	//P0 and P1 are two layer buffers of r2*r3 reconstructed values; they swap roles after every layer of the main loop
+	size_t dataLength = r1*r2*r3;		
+
+	size_t r23 = r2*r3;
+	P0 = (int32_t*)malloc(r23*sizeof(int32_t));
+	P1 = (int32_t*)malloc(r23*sizeof(int32_t));
+	//type[n]: 0 = value stored exactly, otherwise intvRadius plus a signed interval offset
+	int* type = (int*) malloc(dataLength*sizeof(int));
+
+	int32_t* spaceFillingValue = oriData; //
+	
+	DynamicByteArray *exactDataByteArray;
+	new_DBA(&exactDataByteArray, DynArrayInitLen);	
+	//layer 0 is written into P1; its element 0 is always stored exactly
+	type[0] = 0;
+	P1[0] = spaceFillingValue[0];
+	compressInt32Value(spaceFillingValue[0], minValue, byteSize, bytes);
+	memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+
+	/* Process Row-0 data 1 (predicted by element 0). NOTE(review): index 1 is used regardless of r3, so r3 >= 2 looks assumed - confirm */
+	pred1D = P1[0];
+	diff = spaceFillingValue[1] - pred1D;
+
+	itvNum = llabs(diff)/realPrecision + 1; //the point is quantized only if this stays below intvCapacity
+
+	if (itvNum < exe_params->intvCapacity)
+	{
+		if (diff < 0) itvNum = -itvNum;
+		type[1] = (int) (itvNum/2) + exe_params->intvRadius;
+		P1[1] = pred1D + 2 * (type[1] - exe_params->intvRadius) * realPrecision; //quantized value, used by later predictions
+	}
+	else
+	{
+		type[1] = 0;
+		curValue = P1[1] = spaceFillingValue[1];
+		compressInt32Value(curValue, minValue, byteSize, bytes);
+		memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+	}
+
+    /* Process Row-0 data 2 --> data r3-1: linear extrapolation from the two previous reconstructed values */
+	for (j = 2; j < r3; j++)
+	{
+		pred1D = 2*P1[j-1] - P1[j-2];
+		diff = spaceFillingValue[j] - pred1D;
+
+		itvNum = llabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[j] = (int) (itvNum/2) + exe_params->intvRadius;
+			P1[j] = pred1D + 2 * (type[j] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[j] = 0;
+			curValue = P1[j] = spaceFillingValue[j];
+			compressInt32Value(curValue, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		}
+	}
+
+	/* Process Row-1 --> Row-r2-1 of layer 0 (data index and P1 index coincide here) */
+	size_t index;
+	for (i = 1; i < r2; i++)
+	{
+		/* Process row-i data 0: predicted by the first value of the row above */
+		index = i*r3;	
+		pred1D = P1[index-r3];
+		diff = spaceFillingValue[index] - pred1D;
+
+		itvNum = llabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+			P1[index] = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[index] = 0;
+			curValue = P1[index] = spaceFillingValue[index];
+			compressInt32Value(curValue, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		}
+
+		/* Process row-i data 1 --> data r3-1: left + upper - upper-left reconstructed neighbours */
+		for (j = 1; j < r3; j++)
+		{
+			index = i*r3+j;
+			pred2D = P1[index-1] + P1[index-r3] - P1[index-r3-1];
+
+			diff = spaceFillingValue[index] - pred2D;
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P1[index] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[index] = 0;
+				curValue = P1[index] = spaceFillingValue[index];
+				compressInt32Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+		}
+	}
+
+
+	///////////////////////////	Process layer-1 --> layer-r1-1 ///////////////////////////
+	//from here on P1 is the previous layer and P0 receives the current layer
+	for (k = 1; k < r1; k++)
+	{
+		/* Process Row-0 data 0: predicted by the first value of the previous layer */
+		index = k*r23;
+		pred1D = P1[0];
+		diff = spaceFillingValue[index] - pred1D;
+
+		itvNum = llabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+			P0[0] = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[index] = 0;
+			curValue = P0[0] = spaceFillingValue[index];
+			compressInt32Value(curValue, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		}
+
+
+	    /* Process Row-0 data 1 --> data r3-1: 2D prediction across the current and the previous layer */
+		for (j = 1; j < r3; j++)
+		{
+			//index advances from k*r23 set above, i.e. it equals k*r2*r3+j
+			index ++;
+			pred2D = P0[j-1] + P1[j] - P1[j-1];
+			diff = spaceFillingValue[index] - pred2D;
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P0[j] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+/*				itvNum is at least 1 and below intvCapacity in this branch; whether the resulting
+					type code can ever be 0 depends on intvRadius, set elsewhere - not checked here */
+			}
+			else
+			{
+				type[index] = 0;
+				curValue = P0[j] = spaceFillingValue[index];
+				compressInt32Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+		}
+
+	    /* Process Row-1 --> Row-r2-1 (index addresses the data, index2D the layer buffers) */
+		size_t index2D;
+		for (i = 1; i < r2; i++)
+		{
+			/* Process Row-i data 0: 2D prediction from the row above and the previous layer */
+			index = k*r23 + i*r3;
+			index2D = i*r3;		
+			pred2D = P0[index2D-r3] + P1[index2D] - P1[index2D-r3];
+			diff = spaceFillingValue[index] - pred2D;
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P0[index2D] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[index] = 0;
+				curValue = P0[index2D] = spaceFillingValue[index];
+				compressInt32Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+
+			/* Process Row-i data 1 --> data r3-1 */
+			for (j = 1; j < r3; j++)
+			{
+//				7-term prediction from the reconstructed neighbours one step back along j, i and k
+//				(P0 = current layer, P1 = previous layer)
+				//index advances from k*r23 + i*r3 set above, i.e. it equals k*r2*r3 + i*r3 + j
+				index ++;
+				index2D = i*r3 + j;
+				pred3D = P0[index2D-1] + P0[index2D-r3]+ P1[index2D] - P0[index2D-r3-1] - P1[index2D-r3] - P1[index2D-1] + P1[index2D-r3-1];
+				diff = spaceFillingValue[index] - pred3D;
+
+				itvNum = llabs(diff)/realPrecision + 1;
+
+				if (itvNum < exe_params->intvCapacity)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+					P0[index2D] = pred3D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					type[index] = 0;
+					curValue = P0[index2D] = spaceFillingValue[index];
+					compressInt32Value(curValue, minValue, byteSize, bytes);
+					memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+				}
+			}
+		}
+		//swap the buffers: the layer just written becomes the previous layer
+		int32_t *Pt;
+		Pt = P1;
+		P1 = P0;
+		P0 = Pt;
+	}
+	if(r23!=1) //NOTE(review): when r2*r3==1 P0 is not freed here - confirm whether that is intentional
+		free(P0);
+	free(P1);
+
+	size_t exactDataNum = exactDataByteArray->size; //NOTE(review): this is a byte count, not size/byteSize - confirm what new_TightDataPointStorageI expects
+	
+	TightDataPointStorageI* tdps;	
+			
+	new_TightDataPointStorageI(&tdps, dataLength, exactDataNum, byteSize, 
+			type, exactDataByteArray->array, exactDataByteArray->size,  
+			realPrecision, minValue, quantization_intervals, SZ_INT32);
+			
+	//free memory
+	free(type);	
+	free(exactDataByteArray); //only the wrapper struct is freed; its ->array was passed to new_TightDataPointStorageI above
+	
+	return tdps;	
+}
+
+
+void SZ_compress_args_int32_NoCkRngeNoGzip_3D(unsigned char** newByteData, int32_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t *outSize, 
+int64_t valueRangeSize, int64_t minValue)
+{	/* Compresses an r1 x r2 x r3 int32 array into *newByteData / *outSize; no gzip step is applied here. */
+	const size_t elemCount = r1*r2*r3;
+	TightDataPointStorageI* storage = SZ_compress_int32_3D_MDQ(oriData, r1, r2, r3, realPrecision, valueRangeSize, minValue);
+
+	convertTDPStoFlatBytes_int(storage, newByteData, outSize);
+
+	/* if the result is larger than the raw int32 values, emit the values unmodified instead */
+	if(elemCount*sizeof(int32_t) < *outSize)
+		SZ_compress_args_int32_StoreOriData(oriData, elemCount, storage, newByteData, outSize);
+	free_TightDataPointStorageI(storage);
+}
+
+
+TightDataPointStorageI* SZ_compress_int32_4D_MDQ(int32_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, int64_t valueRangeSize, int64_t minValue)
+{	//Quantizes an r1 x r2 x r3 x r4 int32 array as r1 independent r2 x r3 x r4 blocks into type codes plus exactly stored values.
+	unsigned char bytes[8] = {0,0,0,0,0,0,0,0};
+	int byteSize = computeByteSizePerIntValue(valueRangeSize);
+	
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_int32_4D(oriData, r1, r2, r3, r4, realPrecision);
+		updateQuantizationInfo(quantization_intervals);
+	}	
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	size_t i,j,k; 
+	int32_t pred1D, pred2D, pred3D, curValue;
+	int32_t diff = 0.0;
+	double itvNum = 0;
+	int32_t *P0, *P1;
+	//P0 and P1 are two layer buffers of r3*r4 reconstructed values; they swap roles after every layer
+	size_t dataLength = r1*r2*r3*r4;		
+
+	size_t r234 = r2*r3*r4;
+	size_t r34 = r3*r4;
+
+	P0 = (int32_t*)malloc(r34*sizeof(int32_t));
+	P1 = (int32_t*)malloc(r34*sizeof(int32_t));
+	//type[n]: 0 = value stored exactly, otherwise intvRadius plus a signed interval offset
+	int* type = (int*) malloc(dataLength*sizeof(int));
+
+	int32_t* spaceFillingValue = oriData; //
+	
+	DynamicByteArray *exactDataByteArray;
+	new_DBA(&exactDataByteArray, DynArrayInitLen);	
+	//index addresses the data, index2D the layer buffers; each block l restarts with an exactly stored value
+	size_t l;
+	for (l = 0; l < r1; l++)
+	{
+
+		///////////////////////////	Process layer-0 ///////////////////////////
+		/* Process Row-0 data 0*/
+		size_t index = l*r234;
+		size_t index2D = 0;
+
+		type[index] = 0;
+		curValue = P1[index2D] = spaceFillingValue[index];
+		compressInt32Value(curValue, minValue, byteSize, bytes);
+		memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+
+		/* Process Row-0 data 1 (predicted by the first value of the block) */
+		index = l*r234+1;
+		index2D = 1;
+
+		pred1D = P1[index2D-1];
+		diff = spaceFillingValue[index] - pred1D; //was curValue - pred1D, which is always 0 at this point
+
+		itvNum = llabs(diff)/realPrecision + 1; //the point is quantized only if this stays below intvCapacity
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+			P1[index2D] = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[index] = 0;
+			//store the value at index itself; this and the branches below used spaceFillingValue[0] before
+			curValue = P1[index2D] = spaceFillingValue[index];
+			compressInt32Value(curValue, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		}
+
+		/* Process Row-0 data 2 --> data r4-1: linear extrapolation from the two previous reconstructed values */
+		for (j = 2; j < r4; j++)
+		{
+			index = l*r234+j;
+			index2D = j;
+
+			pred1D = 2*P1[index2D-1] - P1[index2D-2];
+			diff = spaceFillingValue[index] - pred1D;
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P1[index2D] = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[index] = 0;
+
+				curValue = P1[index2D] = spaceFillingValue[index];
+				compressInt32Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+		}
+
+		/* Process Row-1 --> Row-r3-1 */
+		for (i = 1; i < r3; i++)
+		{
+			/* Process row-i data 0: predicted by the first value of the row above */
+			index = l*r234+i*r4;
+			index2D = i*r4;
+
+			pred1D = P1[index2D-r4];
+			diff = spaceFillingValue[index] - pred1D;
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P1[index2D] = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[index] = 0;
+
+				curValue = P1[index2D] = spaceFillingValue[index];
+				compressInt32Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+
+			/* Process row-i data 1 --> data r4-1: left + upper - upper-left reconstructed neighbours */
+			for (j = 1; j < r4; j++)
+			{
+				index = l*r234+i*r4+j;
+				index2D = i*r4+j;
+
+				pred2D = P1[index2D-1] + P1[index2D-r4] - P1[index2D-r4-1];
+
+				diff = spaceFillingValue[index] - pred2D;
+
+				itvNum = llabs(diff)/realPrecision + 1;
+
+				if (itvNum < exe_params->intvCapacity)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+					P1[index2D] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					type[index] = 0;
+
+					curValue = P1[index2D] = spaceFillingValue[index];
+					compressInt32Value(curValue, minValue, byteSize, bytes);
+					memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+				}
+			}
+		}
+
+
+		///////////////////////////	Process layer-1 --> layer-r2-1 ///////////////////////////
+		//from here on P1 is the previous layer and P0 receives the current layer
+		for (k = 1; k < r2; k++)
+		{
+			/* Process Row-0 data 0: predicted by the first value of the previous layer */
+			index = l*r234+k*r34;
+			index2D = 0;
+
+			pred1D = P1[index2D];
+			diff = spaceFillingValue[index] - pred1D;
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P0[index2D] = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[index] = 0;
+
+				curValue = P0[index2D] = spaceFillingValue[index];
+				compressInt32Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+
+			/* Process Row-0 data 1 --> data r4-1: 2D prediction across the current and the previous layer */
+			for (j = 1; j < r4; j++)
+			{
+				index = l*r234+k*r34+j;
+				index2D = j;
+
+				pred2D = P0[index2D-1] + P1[index2D] - P1[index2D-1];
+				diff = spaceFillingValue[index] - pred2D;
+
+				itvNum = llabs(diff)/realPrecision + 1;
+
+				if (itvNum < exe_params->intvCapacity)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+					P0[index2D] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					type[index] = 0;
+
+					curValue = P0[index2D] = spaceFillingValue[index];
+					compressInt32Value(curValue, minValue, byteSize, bytes);
+					memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+				}
+			}
+
+			/* Process Row-1 --> Row-r3-1 */
+			for (i = 1; i < r3; i++)
+			{
+				/* Process Row-i data 0: 2D prediction from the row above and the previous layer */
+				index = l*r234+k*r34+i*r4;
+				index2D = i*r4;
+
+				pred2D = P0[index2D-r4] + P1[index2D] - P1[index2D-r4];
+				diff = spaceFillingValue[index] - pred2D;
+
+				itvNum = llabs(diff)/realPrecision + 1;
+
+				if (itvNum < exe_params->intvCapacity)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+					P0[index2D] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					type[index] = 0;
+
+					curValue = P0[index2D] = spaceFillingValue[index];
+					compressInt32Value(curValue, minValue, byteSize, bytes);
+					memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+				}
+
+				/* Process Row-i data 1 --> data r4-1: 7-term prediction (P0 = current layer, P1 = previous layer) */
+				for (j = 1; j < r4; j++)
+				{
+					index = l*r234+k*r34+i*r4+j;
+					index2D = i*r4+j;
+
+					pred3D = P0[index2D-1] + P0[index2D-r4]+ P1[index2D] - P0[index2D-r4-1] - P1[index2D-r4] - P1[index2D-1] + P1[index2D-r4-1];
+					diff = spaceFillingValue[index] - pred3D;
+
+
+					itvNum = llabs(diff)/realPrecision + 1;
+
+					if (itvNum < exe_params->intvCapacity)
+					{
+						if (diff < 0) itvNum = -itvNum;
+						type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+						P0[index2D] = pred3D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+					}
+					else
+					{
+						type[index] = 0;
+
+						curValue = P0[index2D] = spaceFillingValue[index];
+						compressInt32Value(curValue, minValue, byteSize, bytes);
+						memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+					}
+				}
+			}
+			//swap the buffers: the layer just written becomes the previous layer
+			int32_t *Pt;
+			Pt = P1;
+			P1 = P0;
+			P0 = Pt;
+		}
+	}
+
+	free(P0);
+	free(P1);
+
+	size_t exactDataNum = exactDataByteArray->size; //NOTE(review): this is a byte count, not size/byteSize - confirm what new_TightDataPointStorageI expects
+	
+	TightDataPointStorageI* tdps;	
+			
+	new_TightDataPointStorageI(&tdps, dataLength, exactDataNum, byteSize, 
+			type, exactDataByteArray->array, exactDataByteArray->size,  
+			realPrecision, minValue, quantization_intervals, SZ_INT32);
+			
+	//free memory
+	free(type);	
+	free(exactDataByteArray); //only the wrapper struct is freed; its ->array was passed to new_TightDataPointStorageI above
+	
+	return tdps;	
+}
+
+/**
+ * Compress a 4D int32 array with the 4D MDQ quantizer and flatten the result
+ * into a byte stream (no range check, no gzip stage).
+ *
+ * If the flattened stream is larger than the raw data (r1*r2*r3*r4 int32 values),
+ * SZ_compress_args_int32_StoreOriData is called to produce the output instead.
+ *
+ * @param newByteData    receives the output byte buffer
+ * @param oriData        input values
+ * @param r1,r2,r3,r4    extents handed to SZ_compress_int32_4D_MDQ in this order
+ * @param realPrecision  error bound forwarded to the quantizer
+ * @param outSize        receives the output size in bytes
+ * @param valueRangeSize forwarded to the quantizer
+ * @param minValue       forwarded to the quantizer
+ */
+void SZ_compress_args_int32_NoCkRngeNoGzip_4D(unsigned char** newByteData, int32_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, 
+size_t *outSize, int64_t valueRangeSize, int64_t minValue)
+{
+	const size_t elementCount = r1*r2*r3*r4;
+	const size_t rawByteCount = elementCount*sizeof(int32_t);
+	TightDataPointStorageI* storage = SZ_compress_int32_4D_MDQ(oriData, r1, r2, r3, r4, realPrecision, valueRangeSize, minValue);
+
+	convertTDPStoFlatBytes_int(storage, newByteData, outSize);
+
+	/* The quantized stream did not pay off: fall back to the original values. */
+	if(rawByteCount < *outSize)
+	{
+		SZ_compress_args_int32_StoreOriData(oriData, elementCount, storage, newByteData, outSize);
+	}
+
+	free_TightDataPointStorageI(storage);
+}
+
+/**
+ * Emit the compact representation used when the whole value range fits inside
+ * the error bound: only oriData[0] is stored (4 bytes, big-endian) and the
+ * storage object is flagged with allSameData.
+ *
+ * @param newByteData receives the flattened byte buffer
+ * @param oriData     input values; only element 0 is read
+ * @param dataLength  number of elements, recorded as dataSeriesLength
+ * @param outSize     receives the size reported by convertTDPStoFlatBytes_int
+ */
+void SZ_compress_args_int32_withinRange(unsigned char** newByteData, int32_t *oriData, size_t dataLength, size_t *outSize)
+{
+	size_t flatSize;
+	TightDataPointStorageI* storage = (TightDataPointStorageI*) malloc(sizeof(TightDataPointStorageI));
+
+	/* Describe the series as a single exact value shared by every element. */
+	storage->typeArray = NULL;
+	storage->allSameData = 1;
+	storage->isLossless = 0;
+	storage->dataSeriesLength = dataLength;
+	storage->exactDataNum = 1;
+	storage->exactDataBytes_size = 4;
+	storage->exactDataBytes = (unsigned char*)malloc(sizeof(unsigned char)*4);
+	int32ToBytes_bigEndian(storage->exactDataBytes, oriData[0]);
+
+	convertTDPStoFlatBytes_int(storage, newByteData, &flatSize);
+	*outSize = flatSize;
+
+	free_TightDataPointStorageI(storage);
+}
+
+/**
+ * Compress an int32 array after computing its value range and effective error
+ * bound, without the gzip stage.
+ *
+ * The dimensionality is derived from which of r5..r2 are zero. When the value
+ * range does not exceed the error bound the "within range" single-value form is
+ * emitted. Four-dimensional input is passed to the 3D routine with r4*r3 as its
+ * first extent.
+ *
+ * @param newByteData   receives the output byte buffer
+ * @param oriData       input values
+ * @param r5,r4,r3,r2,r1 extents; unused leading extents are 0
+ * @param outSize       receives the output size in bytes
+ * @param errBoundMode  error-bound mode forwarded to getRealPrecision_int
+ * @param absErr_Bound  absolute bound forwarded to getRealPrecision_int
+ * @param relBoundRatio relative ratio forwarded to getRealPrecision_int
+ * @return status as set by getRealPrecision_int (initially SZ_SCES), or SZ_DERR
+ *         when r5 != 0; in that case no output is produced.
+ */
+int SZ_compress_args_int32_wRngeNoGzip(unsigned char** newByteData, int32_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio)
+{
+	int status = SZ_SCES;
+	size_t dataLength = computeDataLength(r5,r4,r3,r2,r1);
+	int64_t valueRangeSize = 0;
+	
+	int32_t minValue = computeRangeSize_int(oriData, SZ_INT32, dataLength, &valueRangeSize);
+	double realPrecision = getRealPrecision_int(valueRangeSize, errBoundMode, absErr_Bound, relBoundRatio, &status);
+		
+	if(valueRangeSize <= realPrecision)
+	{
+		SZ_compress_args_int32_withinRange(newByteData, oriData, dataLength, outSize);
+	}
+	else
+	{
+		if(r5==0&&r4==0&&r3==0&&r2==0)
+		{
+			SZ_compress_args_int32_NoCkRngeNoGzip_1D(newByteData, oriData, r1, realPrecision, outSize, valueRangeSize, minValue);
+		}
+		else if(r5==0&&r4==0&&r3==0)
+		{
+			SZ_compress_args_int32_NoCkRngeNoGzip_2D(newByteData, oriData, r2, r1, realPrecision, outSize, valueRangeSize, minValue);
+		}
+		else if(r5==0&&r4==0)
+		{
+			SZ_compress_args_int32_NoCkRngeNoGzip_3D(newByteData, oriData, r3, r2, r1, realPrecision, outSize, valueRangeSize, minValue);
+		}
+		else if(r5==0)
+		{
+			/* 4D input: r4*r3 is passed as the first extent of the 3D routine. */
+			SZ_compress_args_int32_NoCkRngeNoGzip_3D(newByteData, oriData, r4*r3, r2, r1, realPrecision, outSize, valueRangeSize, minValue);
+		}
+		else
+		{
+			/* Previously this case fell through and reported success without producing any output. */
+			printf("Error: doesn't support 5 dimensions for now.\n");
+			status = SZ_DERR; //dimension error
+		}
+	}
+	return status;
+}
+
+/**
+ * Top-level int32 compression: quantize with the routine matching the
+ * dimensionality, then optionally run the zlib stage selected by
+ * confparams_cpr->szMode.
+ *
+ * Side effects on confparams_cpr: errorBoundMode is set to errBoundMode; in PSNR
+ * mode it is then switched to ABS and absErrBound is overwritten with the bound
+ * derived from the PSNR setting.
+ *
+ * @param newByteData   receives the output byte buffer
+ * @param oriData       input values
+ * @param r5,r4,r3,r2,r1 extents; unused leading extents are 0
+ * @param outSize       receives the output size in bytes
+ * @param errBoundMode  error-bound mode; values >= PW_REL terminate the process
+ * @param absErr_Bound  absolute bound forwarded to getRealPrecision_int
+ * @param relBoundRatio relative ratio forwarded to getRealPrecision_int
+ * @return SZ_SCES (or the status set by getRealPrecision_int) on success;
+ *         SZ_DERR for 5-D input (*outSize is set to 0 and *newByteData to NULL);
+ *         SZ_MERR for an unknown szMode (outputs left untouched).
+ */
+int SZ_compress_args_int32(unsigned char** newByteData, int32_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio)
+{
+	confparams_cpr->errorBoundMode = errBoundMode;
+	
+	if(errBoundMode>=PW_REL)
+	{
+		printf("Error: Current SZ version doesn't support integer data compression with point-wise relative error bound being based on pwrType=AVG\n");
+		/* NOTE(review): the process exits with status 0 here; the return below is never reached. */
+		exit(0);
+		return SZ_NSCS;
+	}
+	int status = SZ_SCES;
+	size_t dataLength = computeDataLength(r5,r4,r3,r2,r1);
+	int64_t valueRangeSize = 0;
+
+	int32_t minValue = (int32_t)computeRangeSize_int(oriData, SZ_INT32, dataLength, &valueRangeSize);
+	double realPrecision = 0; 
+	
+	if(confparams_cpr->errorBoundMode==PSNR)
+	{
+		confparams_cpr->errorBoundMode = ABS;
+		realPrecision = confparams_cpr->absErrBound = computeABSErrBoundFromPSNR(confparams_cpr->psnr, (double)confparams_cpr->predThreshold, (double)valueRangeSize);
+	}
+	else
+		realPrecision = getRealPrecision_int(valueRangeSize, errBoundMode, absErr_Bound, relBoundRatio, &status);
+
+	if(valueRangeSize <= realPrecision)
+	{
+		SZ_compress_args_int32_withinRange(newByteData, oriData, dataLength, outSize);
+	}
+	else
+	{
+		size_t tmpOutSize = 0;
+		unsigned char* tmpByteData = NULL;
+		if (r2==0)
+		{
+			SZ_compress_args_int32_NoCkRngeNoGzip_1D(&tmpByteData, oriData, r1, realPrecision, &tmpOutSize, valueRangeSize, minValue);
+		}
+		else
+		if (r3==0)
+		{
+			SZ_compress_args_int32_NoCkRngeNoGzip_2D(&tmpByteData, oriData, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, minValue);
+		}
+		else
+		if (r4==0)
+		{
+			SZ_compress_args_int32_NoCkRngeNoGzip_3D(&tmpByteData, oriData, r3, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, minValue);
+		}
+		else
+		if (r5==0)
+		{
+			SZ_compress_args_int32_NoCkRngeNoGzip_4D(&tmpByteData, oriData, r4, r3, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, minValue);
+		}
+		else
+		{
+			printf("Error: doesn't support 5 dimensions for now.\n");
+			/* No intermediate buffer exists on this path. Return now rather than
+			 * handing an uninitialized pointer to the caller, zlib or free(). */
+			*outSize = 0;
+			*newByteData = NULL;
+			return SZ_DERR; //dimension error
+		}
+		//Call Gzip to do the further compression.
+		if(confparams_cpr->szMode==SZ_BEST_SPEED)
+		{
+			/* Ownership of the intermediate buffer passes to the caller. */
+			*outSize = tmpOutSize;
+			*newByteData = tmpByteData;
+		}
+		else if(confparams_cpr->szMode==SZ_BEST_COMPRESSION || confparams_cpr->szMode==SZ_DEFAULT_COMPRESSION)
+		{
+			*outSize = zlib_compress5(tmpByteData, tmpOutSize, newByteData, confparams_cpr->gzipMode);
+			free(tmpByteData);
+		}
+		else
+		{
+			printf("Error: Wrong setting of confparams_cpr->szMode in the int32_t compression.\n");
+			status = SZ_MERR; //mode error			
+			/* The intermediate buffer is not returned on this path, so release it. */
+			free(tmpByteData);
+		}
+	}
+	
+	return status;
+}
diff --git a/thirdparty/SZ/sz/src/sz_int64.c b/thirdparty/SZ/sz/src/sz_int64.c
new file mode 100644
index 0000000000000000000000000000000000000000..eb973775aa9cf1565e92797217b96aa102a678ae
--- /dev/null
+++ b/thirdparty/SZ/sz/src/sz_int64.c
@@ -0,0 +1,1268 @@
+/**
+ *  @file sz_int64.c
+ *  @author Sheng Di
+ *  @date Aug, 2017
+ *  @brief sz_int64, Compression and Decompression functions
+ *  (C) 2017 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <math.h>
+#include "sz.h"
+#include "CompressElement.h"
+#include "DynamicByteArray.h"
+#include "DynamicIntArray.h"
+#include "zlib.h"
+#include "rw.h"
+#include "TightDataPointStorageI.h"
+#include "sz_int64.h"
+
+/**
+ * Estimate how many quantization intervals a 1D int64 series needs.
+ *
+ * Every element whose index (>= 2) is a multiple of confparams_cpr->sampleDistance
+ * is predicted from its predecessor; the prediction errors are binned by
+ * (err/realPrecision+1)/2, clamped to maxRangeRadius-1. The smallest radius whose
+ * cumulative count exceeds predThreshold of the nominal sample count is doubled
+ * and rounded up to a power of two.
+ *
+ * @param oriData       input values
+ * @param dataLength    number of elements
+ * @param realPrecision error bound used as the bin width divisor
+ * @return interval count, a power of two and at least 32
+ */
+unsigned int optimize_intervals_int64_1D(int64_t *oriData, size_t dataLength, double realPrecision)
+{	
+	size_t pos = 0, bucket;
+	int64_t predicted = 0, absErr;
+	size_t *histogram = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t));
+	memset(histogram, 0, confparams_cpr->maxRangeRadius*sizeof(size_t));
+	size_t sampleCount = dataLength/confparams_cpr->sampleDistance;
+
+	/* Histogram of prediction-error radii over the sampled positions. */
+	for(pos=2;pos<dataLength;pos++)
+	{
+		if(pos%confparams_cpr->sampleDistance!=0)
+			continue;
+
+		predicted = oriData[pos-1];
+		absErr = llabs(predicted - oriData[pos]);
+		bucket = (uint64_t)((absErr/realPrecision+1)/2);
+		if(bucket>=confparams_cpr->maxRangeRadius)
+			bucket = confparams_cpr->maxRangeRadius - 1;
+		histogram[bucket]++;
+	}
+
+	/* Find the first radius whose cumulative count passes the threshold. */
+	size_t wanted = sampleCount*confparams_cpr->predThreshold;
+	size_t covered = 0;
+	pos = 0;
+	while(pos<confparams_cpr->maxRangeRadius)
+	{
+		covered += histogram[pos];
+		if(covered>wanted)
+			break;
+		pos++;
+	}
+	if(pos>=confparams_cpr->maxRangeRadius)
+		pos = confparams_cpr->maxRangeRadius-1;
+
+	unsigned int needed = 2*(pos+1);
+	unsigned int result = roundUpToPowerOf2(needed);
+
+	free(histogram);
+	return (result<32) ? 32 : result;
+}
+
+/**
+ * Estimate how many quantization intervals a 2D int64 array needs.
+ *
+ * Interior points (row >= 1, column >= 1) with (row+column) a multiple of
+ * confparams_cpr->sampleDistance are predicted as left + up - up-left; the
+ * prediction errors are binned by (err/realPrecision+1)/2, clamped to
+ * maxRangeRadius-1. The smallest radius whose cumulative count exceeds
+ * predThreshold of the nominal sample count is doubled and rounded up to a
+ * power of two.
+ *
+ * @param oriData       input values, indexed as row*r2 + column
+ * @param r1            number of rows
+ * @param r2            row length
+ * @param realPrecision error bound used as the bin width divisor
+ * @return interval count, a power of two and at least 32
+ */
+unsigned int optimize_intervals_int64_2D(int64_t *oriData, size_t r1, size_t r2, double realPrecision)
+{	
+	size_t row, col, at;
+	size_t bucket;
+	int64_t predicted = 0, absErr;
+	size_t *histogram = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t));
+	memset(histogram, 0, confparams_cpr->maxRangeRadius*sizeof(size_t));
+	size_t sampleCount = (r1-1)*(r2-1)/confparams_cpr->sampleDistance;
+
+	/* Histogram of prediction-error radii over the sampled interior points. */
+	for(row=1;row<r1;row++)
+	{
+		for(col=1;col<r2;col++)
+		{
+			if((row+col)%confparams_cpr->sampleDistance!=0)
+				continue;
+
+			at = row*r2+col;
+			predicted = oriData[at-1] + oriData[at-r2] - oriData[at-r2-1];
+			absErr = llabs(predicted - oriData[at]);
+			bucket = (uint64_t)((absErr/realPrecision+1)/2);
+			if(bucket>=confparams_cpr->maxRangeRadius)
+				bucket = confparams_cpr->maxRangeRadius - 1;
+			histogram[bucket]++;
+		}
+	}
+
+	/* Find the first radius whose cumulative count passes the threshold. */
+	size_t wanted = sampleCount*confparams_cpr->predThreshold;
+	size_t covered = 0;
+	size_t radius = 0;
+	while(radius<confparams_cpr->maxRangeRadius)
+	{
+		covered += histogram[radius];
+		if(covered>wanted)
+			break;
+		radius++;
+	}
+	if(radius>=confparams_cpr->maxRangeRadius)
+		radius = confparams_cpr->maxRangeRadius-1;
+
+	unsigned int needed = 2*(radius+1);
+	unsigned int result = roundUpToPowerOf2(needed);
+
+	free(histogram);
+	return (result<32) ? 32 : result;
+}
+
+/**
+ * Estimate how many quantization intervals a 3D int64 array needs.
+ *
+ * Interior points (all three indices >= 1) whose index sum is a multiple of
+ * confparams_cpr->sampleDistance are predicted from their seven already-visited
+ * neighbours; the prediction errors are binned by (err/realPrecision+1)/2,
+ * clamped to maxRangeRadius-1. The smallest radius whose cumulative count exceeds
+ * predThreshold of the nominal sample count is doubled and rounded up to a
+ * power of two.
+ *
+ * @param oriData       input values, indexed as i*r2*r3 + j*r3 + k
+ * @param r1,r2,r3      extents (r3 is the innermost stride-1 extent)
+ * @param realPrecision error bound used as the bin width divisor
+ * @return interval count, a power of two and at least 32
+ */
+unsigned int optimize_intervals_int64_3D(int64_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision)
+{	
+	size_t plane, row, col, at;
+	size_t bucket;
+	size_t planeSize=r2*r3;
+	int64_t predicted = 0, absErr;
+	size_t *histogram = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t));
+	memset(histogram, 0, confparams_cpr->maxRangeRadius*sizeof(size_t));
+	size_t sampleCount = (r1-1)*(r2-1)*(r3-1)/confparams_cpr->sampleDistance;
+
+	/* Histogram of prediction-error radii over the sampled interior points. */
+	for(plane=1;plane<r1;plane++)
+	{
+		for(row=1;row<r2;row++)
+		{
+			for(col=1;col<r3;col++)
+			{
+				if((plane+row+col)%confparams_cpr->sampleDistance!=0)
+					continue;
+
+				at = plane*planeSize+row*r3+col;
+				predicted = oriData[at-1] + oriData[at-r3] + oriData[at-planeSize] 
+				- oriData[at-1-planeSize] - oriData[at-r3-1] - oriData[at-r3-planeSize] + oriData[at-r3-planeSize-1];
+				absErr = llabs(predicted - oriData[at]);
+				bucket = (absErr/realPrecision+1)/2;
+				if(bucket>=confparams_cpr->maxRangeRadius)
+					bucket = confparams_cpr->maxRangeRadius - 1;
+				histogram[bucket]++;
+			}
+		}
+	}
+
+	/* Find the first radius whose cumulative count passes the threshold. */
+	size_t wanted = sampleCount*confparams_cpr->predThreshold;
+	size_t covered = 0;
+	size_t radius = 0;
+	while(radius<confparams_cpr->maxRangeRadius)
+	{
+		covered += histogram[radius];
+		if(covered>wanted)
+			break;
+		radius++;
+	}
+	if(radius>=confparams_cpr->maxRangeRadius)
+		radius = confparams_cpr->maxRangeRadius-1;
+
+	unsigned int needed = 2*(radius+1);
+	unsigned int result = roundUpToPowerOf2(needed);
+
+	free(histogram);
+	return (result<32) ? 32 : result;
+}
+
+
+/**
+ * Estimate how many quantization intervals a 4D int64 array needs.
+ *
+ * Interior points (all four indices >= 1) whose index sum is a multiple of
+ * confparams_cpr->sampleDistance are predicted from seven neighbours within the
+ * three innermost dimensions (strides 1, r4 and r3*r4). The prediction errors
+ * are binned by (err/realPrecision+1)/2, clamped to maxRangeRadius-1. The
+ * smallest radius whose cumulative count exceeds predThreshold of the nominal
+ * sample count is doubled and rounded up to a power of two.
+ *
+ * @param oriData       input values, indexed as i*r2*r3*r4 + j*r3*r4 + k*r4 + l
+ * @param r1,r2,r3,r4   extents (r4 is the innermost stride-1 extent)
+ * @param realPrecision error bound used as the bin width divisor
+ * @return interval count, a power of two and at least 32
+ */
+unsigned int optimize_intervals_int64_4D(int64_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision)
+{
+	size_t i,j,k,l, index;
+	size_t radiusIndex;
+	size_t r234=r2*r3*r4;
+	size_t r34=r3*r4;
+	int64_t pred_value = 0, pred_err;
+	size_t *intervals = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t));
+	memset(intervals, 0, confparams_cpr->maxRangeRadius*sizeof(size_t));
+	size_t totalSampleSize = (r1-1)*(r2-1)*(r3-1)*(r4-1)/confparams_cpr->sampleDistance;
+	for(i=1;i<r1;i++)
+	{
+		for(j=1;j<r2;j++)
+		{
+			for(k=1;k<r3;k++)
+			{
+				for (l=1;l<r4;l++)
+				{
+					if((i+j+k+l)%confparams_cpr->sampleDistance==0)
+					{
+						index = i*r234+j*r34+k*r4+l;
+						/* The neighbour one step back along k sits at stride r4
+						 * (this term previously used r3, which addressed an unrelated element). */
+						pred_value = oriData[index-1] + oriData[index-r4] + oriData[index-r34]
+								- oriData[index-1-r34] - oriData[index-r4-1] - oriData[index-r4-r34] + oriData[index-r4-r34-1];
+						pred_err = llabs(pred_value - oriData[index]);
+						radiusIndex = (uint64_t)((pred_err/realPrecision+1)/2);
+						if(radiusIndex>=confparams_cpr->maxRangeRadius)
+							radiusIndex = confparams_cpr->maxRangeRadius - 1;
+						intervals[radiusIndex]++;
+					}
+				}
+			}
+		}
+	}
+	//compute the appropriate number
+	size_t targetCount = totalSampleSize*confparams_cpr->predThreshold;
+	size_t sum = 0;
+	for(i=0;i<confparams_cpr->maxRangeRadius;i++)
+	{
+		sum += intervals[i];
+		if(sum>targetCount)
+			break;
+	}
+	/* The threshold was never exceeded: fall back to the largest radius. */
+	if(i>=confparams_cpr->maxRangeRadius)
+		i = confparams_cpr->maxRangeRadius-1;
+
+	unsigned int accIntervals = 2*(i+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+
+	if(powerOf2<32)
+		powerOf2 = 32;
+
+	free(intervals);
+	return powerOf2;
+}
+
+/**
+ * Quantize a 1D int64 series and package the result in a TightDataPointStorageI.
+ *
+ * Elements 0 and 1 are always stored exactly. Each later element is predicted
+ * from last3CmprsData[0] (presumably the most recently added value -- confirm
+ * against listAdd_int). If the absolute prediction error is at most
+ * (intvCapacity-1)*realPrecision the element is encoded as a quantization code
+ * in type[]; otherwise type[] is 0 and the value is appended to the exact-data
+ * byte array.
+ *
+ * @param oriData        input values; elements 0 and 1 are read unconditionally
+ * @param dataLength     number of elements
+ * @param realPrecision  error bound; the quantization step is 2*realPrecision
+ * @param valueRangeSize used to choose the number of bytes per exact value
+ * @param minValue       forwarded to compressInt64Value for each exact value
+ * @return storage object built by new_TightDataPointStorageI
+ */
+TightDataPointStorageI* SZ_compress_int64_1D_MDQ(int64_t *oriData, size_t dataLength, double realPrecision, int64_t valueRangeSize, int64_t minValue)
+{
+	unsigned char bytes[8] = {0,0,0,0,0,0,0,0};
+	int byteSize = computeByteSizePerIntValue(valueRangeSize);
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+		quantization_intervals = optimize_intervals_int64_1D(oriData, dataLength, realPrecision);
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	updateQuantizationInfo(quantization_intervals);	
+	size_t i;
+
+	// one quantization code per element; 0 marks an exactly stored value
+	int* type = (int*) malloc(dataLength*sizeof(int));
+		
+	int64_t* spaceFillingValue = oriData; //
+	
+	DynamicByteArray *exactDataByteArray;
+	new_DBA(&exactDataByteArray, DynArrayInitLen);
+		
+	int64_t last3CmprsData[3] = {0,0,0};
+				
+	//add the first data	
+	type[0] = 0;
+	compressInt64Value(spaceFillingValue[0], minValue, byteSize, bytes);
+	memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+	listAdd_int(last3CmprsData, spaceFillingValue[0]);
+		
+	// the second element is stored exactly as well
+	// NOTE(review): index 1 is accessed without checking that dataLength >= 2
+	type[1] = 0;
+	compressInt64Value(spaceFillingValue[1], minValue, byteSize, bytes);
+	memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+	listAdd_int(last3CmprsData, spaceFillingValue[1]);
+	//printf("%.30G\n",last3CmprsData[0]);	
+	
+	int state;
+	// largest absolute prediction error that is still quantized
+	double checkRadius = (exe_params->intvCapacity-1)*realPrecision;
+	int64_t curData;
+	int64_t pred;
+	int64_t predAbsErr;
+	// width of one quantization bin
+	double interval = 2*realPrecision;
+	
+	for(i=2;i<dataLength;i++)
+	{
+//		if(i==2869438)
+//			printf("i=%d\n", i);
+		curData = spaceFillingValue[i];
+		//pred = 2*last3CmprsData[0] - last3CmprsData[1];
+		pred = last3CmprsData[0];
+		predAbsErr = llabs(curData - pred);	
+		if(predAbsErr<=checkRadius)
+		{
+			// bin offset from the prediction; the sign is folded into type[i] below
+			state = (predAbsErr/realPrecision+1)/2;
+			if(curData>=pred)
+			{
+				type[i] = exe_params->intvRadius+state;
+				pred = pred + state*interval;
+			}
+			else //curData<pred
+			{
+				type[i] = exe_params->intvRadius-state;
+				pred = pred - state*interval;
+			}
+/*			if(type[i]==0)
+				printf("err:type[%d]=0\n", i);*/
+			// record the reconstructed value (not the original) for later predictions
+			listAdd_int(last3CmprsData, pred);					
+			continue;
+		}
+		
+		//unpredictable data processing		
+		type[i] = 0;
+		compressInt64Value(curData, minValue, byteSize, bytes);
+		memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		listAdd_int(last3CmprsData, curData);
+	}//end of for
+		
+	// number of exactly stored values (byteSize bytes are appended per value)
+	size_t exactDataNum = exactDataByteArray->size / byteSize;
+	
+	TightDataPointStorageI* tdps;	
+			
+	new_TightDataPointStorageI(&tdps, dataLength, exactDataNum, byteSize, 
+			type, exactDataByteArray->array, exactDataByteArray->size,  
+			realPrecision, minValue, quantization_intervals, SZ_INT64);
+
+//sdi:Debug
+/*	int sum =0;
+	for(i=0;i<dataLength;i++)
+		if(type[i]==0) sum++;
+	printf("opt_quantizations=%d, exactDataNum=%d, sum=%d\n",quantization_intervals, exactDataNum, sum);*/
+	
+	//free memory
+	free(type);	
+	free(exactDataByteArray); //only the wrapper is freed; ->array was handed to tdps above (presumably released by free_TightDataPointStorageI -- confirm)
+	
+	return tdps;
+}
+
+/**
+ * Build an output stream that carries the original int64 values verbatim.
+ *
+ * Layout written here: 3 version bytes, 1 flag byte, MetaDataByteLength bytes
+ * from convertSZParamsToBytes, SZ_SIZE_TYPE bytes holding dataLength, then the
+ * values in big-endian byte order. One extra byte beyond that is allocated and
+ * counted in *outSize.
+ *
+ * @param oriData     input values; dataLength elements are read
+ * @param dataLength  number of elements to store
+ * @param tdps        storage object; only its isLossless flag is set (to 1)
+ * @param newByteData receives a newly malloc'ed buffer
+ * @param outSize     receives the total buffer size in bytes
+ */
+void SZ_compress_args_int64_StoreOriData(int64_t* oriData, size_t dataLength, TightDataPointStorageI* tdps, 
+unsigned char** newByteData, size_t *outSize)
+{
+	int elemBytes=sizeof(int64_t);	
+	size_t n;
+	unsigned char lengthField[8];
+	size_t streamBytes = 3 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + 1 + elemBytes*dataLength;
+	unsigned char* cursor;
+
+	tdps->isLossless = 1;
+	*newByteData = (unsigned char*)malloc(streamBytes);
+	cursor = *newByteData;
+
+	/* version bytes */
+	for (n = 0; n < 3; n++)
+		*cursor++ = versionNumber[n];
+
+	/* flag byte: 00010000, or 01010000 where 01000000 indicates SZ_SIZE_TYPE=8 */
+	*cursor++ = (exe_params->SZ_SIZE_TYPE==4) ? 16 : 80;
+
+	convertSZParamsToBytes(confparams_cpr, cursor);
+	cursor += MetaDataByteLength;
+
+	/* element count, SZ_SIZE_TYPE (4 or 8) bytes */
+	sizeToBytes(lengthField,dataLength);
+	for (n = 0; n < exe_params->SZ_SIZE_TYPE; n++)
+		*cursor++ = lengthField[n];
+
+	/* payload: raw copy when the host is already big-endian, per-value conversion otherwise */
+	if(sysEndianType==BIG_ENDIAN_SYSTEM)
+		memcpy(cursor, oriData, dataLength*elemBytes);
+	else
+	{
+		for(n=0;n<dataLength;n++,cursor+=elemBytes)
+			int64ToBytes_bigEndian(cursor, oriData[n]);
+	}
+	*outSize = streamBytes;
+}
+
+/**
+ * Compress a 1D int64 series with the 1D MDQ quantizer and flatten the result
+ * into a byte stream (no range check, no gzip stage).
+ *
+ * If the flattened stream is larger than the raw data, the original values are
+ * stored instead via SZ_compress_args_int64_StoreOriData.
+ *
+ * @param newByteData    receives the output byte buffer
+ * @param oriData        input values (dataLength elements)
+ * @param dataLength     number of elements
+ * @param realPrecision  error bound forwarded to the quantizer
+ * @param outSize        receives the output size in bytes
+ * @param valueRangeSize forwarded to the quantizer
+ * @param minValue       forwarded to the quantizer
+ */
+void SZ_compress_args_int64_NoCkRngeNoGzip_1D(unsigned char** newByteData, int64_t *oriData, 
+size_t dataLength, double realPrecision, size_t *outSize, int64_t valueRangeSize, int64_t minValue)
+{
+	TightDataPointStorageI* tdps = SZ_compress_int64_1D_MDQ(oriData, dataLength, realPrecision, valueRangeSize, minValue);
+	convertTDPStoFlatBytes_int(tdps, newByteData, outSize);
+	/* Pass exactly dataLength: StoreOriData reads that many elements from oriData,
+	 * so the previous dataLength+2 read two elements past the end of the input. */
+	if(*outSize > dataLength*sizeof(int64_t))
+		SZ_compress_args_int64_StoreOriData(oriData, dataLength, tdps, newByteData, outSize);
+	free_TightDataPointStorageI(tdps);
+}
+
+/**
+ * Quantize a 2D int64 array (r1 rows of r2 values) and package the result in a
+ * TightDataPointStorageI.
+ *
+ * Two row buffers hold reconstructed values: P1 is the previously finished row
+ * and P0 the row being built; they are swapped after each row. Each element is
+ * predicted from already reconstructed neighbours. If |diff|/realPrecision + 1
+ * is below exe_params->intvCapacity the element is encoded as a quantization
+ * code in type[]; otherwise type[] is 0 and the value is appended to the
+ * exact-data byte array.
+ *
+ * @param oriData        input values, indexed as row*r2 + column
+ * @param r1             number of rows
+ * @param r2             row length
+ * @param realPrecision  error bound; reconstruction steps are 2*realPrecision
+ * @param valueRangeSize used to choose the number of bytes per exact value
+ * @param minValue       forwarded to compressInt64Value for each exact value
+ * @return storage object built by new_TightDataPointStorageI
+ */
+TightDataPointStorageI* SZ_compress_int64_2D_MDQ(int64_t *oriData, size_t r1, size_t r2, double realPrecision, int64_t valueRangeSize, int64_t minValue)
+{
+	unsigned char bytes[8] = {0,0,0,0,0,0,0,0};
+	int byteSize = computeByteSizePerIntValue(valueRangeSize);
+	
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_int64_2D(oriData, r1, r2, realPrecision);
+		updateQuantizationInfo(quantization_intervals);
+	}	
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	size_t i,j; 
+	int64_t pred1D, pred2D, curValue;
+	int64_t diff = 0.0;
+	double itvNum = 0;
+	int64_t *P0, *P1;
+		
+	size_t dataLength = r1*r2;	
+	
+	P0 = (int64_t*)malloc(r2*sizeof(int64_t));
+	memset(P0, 0, r2*sizeof(int64_t));
+	P1 = (int64_t*)malloc(r2*sizeof(int64_t));
+	memset(P1, 0, r2*sizeof(int64_t));
+		
+	// one quantization code per element; 0 marks an exactly stored value
+	int* type = (int*) malloc(dataLength*sizeof(int));
+	//type[dataLength]=0;
+		
+	int64_t* spaceFillingValue = oriData; //
+	
+	DynamicByteArray *exactDataByteArray;
+	new_DBA(&exactDataByteArray, DynArrayInitLen);	
+
+	// the very first element is always stored exactly
+	type[0] = 0;
+	curValue = P1[0] = spaceFillingValue[0];
+	compressInt64Value(curValue, minValue, byteSize, bytes);
+	memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+
+	/* Process Row-0 data 1*/
+	// NOTE(review): index 1 of oriData, type and P1 is accessed unconditionally; P1 only has r2 elements
+	pred1D = P1[0];
+	diff = spaceFillingValue[1] - pred1D;
+
+	itvNum =  llabs(diff)/realPrecision + 1;
+
+	if (itvNum < exe_params->intvCapacity)
+	{
+		if (diff < 0) itvNum = -itvNum;
+		// signed bin offset, shifted by intvRadius
+		type[1] = (int) (itvNum/2) + exe_params->intvRadius;
+		// reconstructed value = prediction + offset * 2*realPrecision
+		P1[1] = pred1D + 2 * (type[1] - exe_params->intvRadius) * realPrecision;
+	}
+	else
+	{
+		type[1] = 0;
+		curValue = P1[1] = spaceFillingValue[1];
+		compressInt64Value(curValue, minValue, byteSize, bytes);
+		memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+	}
+
+    /* Process Row-0 data 2 --> data r2-1 */
+	for (j = 2; j < r2; j++)
+	{
+		// linear extrapolation from the two previous reconstructed values
+		pred1D = 2*P1[j-1] - P1[j-2];
+		diff = spaceFillingValue[j] - pred1D;
+
+		itvNum = llabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[j] = (int) (itvNum/2) + exe_params->intvRadius;
+			P1[j] = pred1D + 2 * (type[j] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[j] = 0;
+			curValue = P1[j] = spaceFillingValue[j];
+			compressInt64Value(curValue, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		}
+	}
+
+	/* Process Row-1 --> Row-r1-1 */
+	size_t index;
+	for (i = 1; i < r1; i++)
+	{	
+		/* Process row-i data 0 */
+		index = i*r2;
+		// first column: predicted from the reconstructed value directly above
+		pred1D = P1[0];
+		diff = spaceFillingValue[index] - pred1D;
+
+		itvNum = llabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+			P0[0] = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[index] = 0;
+			curValue = P0[0] = spaceFillingValue[index];
+			compressInt64Value(curValue, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		}
+									
+		/* Process row-i data 1 --> r2-1*/
+		for (j = 1; j < r2; j++)
+		{
+			index = i*r2+j;
+			// left + above - above-left, all taken from reconstructed values
+			pred2D = P0[j-1] + P1[j] - P1[j-1];
+
+			diff = spaceFillingValue[index] - pred2D;
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P0[j] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[index] = 0;
+				curValue = P0[j] = spaceFillingValue[index];
+				compressInt64Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+		}
+
+		// the row just built becomes the "previous row" for the next iteration
+		int64_t *Pt;
+		Pt = P1;
+		P1 = P0;
+		P0 = Pt;
+	}
+	
+	// NOTE(review): P0 is allocated unconditionally above, so it is not released when r2==1
+	if(r2!=1)
+		free(P0);
+	free(P1);			
+	
+	// NOTE(review): this is the byte count of the exact-data array; it is not divided by byteSize here
+	size_t exactDataNum = exactDataByteArray->size;
+	
+	TightDataPointStorageI* tdps;	
+			
+	new_TightDataPointStorageI(&tdps, dataLength, exactDataNum, byteSize, 
+			type, exactDataByteArray->array, exactDataByteArray->size,  
+			realPrecision, minValue, quantization_intervals, SZ_INT64);
+			
+	//free memory
+	free(type);	
+	free(exactDataByteArray); //only the wrapper is freed; ->array was handed to tdps above (presumably released by free_TightDataPointStorageI -- confirm)
+	
+	return tdps;	
+}
+
+/**
+ * Compress a 2D int64 array with the 2D MDQ quantizer and flatten the result
+ * into a byte stream (no range check, no gzip stage).
+ *
+ * If the flattened stream is larger than the raw data (r1*r2 int64 values), the
+ * original values are stored instead via SZ_compress_args_int64_StoreOriData.
+ *
+ * Note: @r1 is the high dimension
+ *       @r2 is the low dimension
+ */
+void SZ_compress_args_int64_NoCkRngeNoGzip_2D(unsigned char** newByteData, int64_t *oriData, size_t r1, size_t r2, double realPrecision, size_t *outSize, 
+int64_t valueRangeSize, int64_t minValue)
+{
+	const size_t elementCount = r1*r2;
+	const size_t rawByteCount = elementCount*sizeof(int64_t);
+	TightDataPointStorageI* storage = SZ_compress_int64_2D_MDQ(oriData, r1, r2, realPrecision, valueRangeSize, minValue);
+
+	convertTDPStoFlatBytes_int(storage, newByteData, outSize);
+
+	/* The quantized stream did not pay off: fall back to the original values. */
+	if(rawByteCount < *outSize)
+	{
+		SZ_compress_args_int64_StoreOriData(oriData, elementCount, storage, newByteData, outSize);
+	}
+
+	free_TightDataPointStorageI(storage);
+}
+
+/**
+ * Quantize a 3D int64 array (r1 layers of r2 rows of r3 values) and package the
+ * result in a TightDataPointStorageI.
+ *
+ * Two layer buffers of r2*r3 reconstructed values are used: P1 is the previously
+ * finished layer and P0 the layer being built; they are swapped after each
+ * layer. Layer 0 is written straight into P1. Each element is predicted from
+ * already reconstructed neighbours. If |diff|/realPrecision + 1 is below
+ * exe_params->intvCapacity the element is encoded as a quantization code in
+ * type[]; otherwise type[] is 0 and the value is appended to the exact-data
+ * byte array.
+ *
+ * @param oriData        input values, indexed as layer*r2*r3 + row*r3 + column
+ * @param r1,r2,r3       extents (r3 is the innermost stride-1 extent)
+ * @param realPrecision  error bound; reconstruction steps are 2*realPrecision
+ * @param valueRangeSize used to choose the number of bytes per exact value
+ * @param minValue       forwarded to compressInt64Value for each exact value
+ * @return storage object built by new_TightDataPointStorageI
+ */
+TightDataPointStorageI* SZ_compress_int64_3D_MDQ(int64_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, int64_t valueRangeSize, int64_t minValue)
+{
+	unsigned char bytes[8] = {0,0,0,0,0,0,0,0};
+	int byteSize = computeByteSizePerIntValue(valueRangeSize);
+	
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_int64_3D(oriData, r1, r2, r3, realPrecision);
+		updateQuantizationInfo(quantization_intervals);
+	}	
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	size_t i,j,k; 
+	int64_t pred1D, pred2D, pred3D, curValue;
+	int64_t diff = 0.0;
+	double itvNum = 0;
+	int64_t *P0, *P1;
+		
+	size_t dataLength = r1*r2*r3;		
+
+	size_t r23 = r2*r3;
+	// layer buffers; not zero-initialised here
+	P0 = (int64_t*)malloc(r23*sizeof(int64_t));
+	P1 = (int64_t*)malloc(r23*sizeof(int64_t));
+
+	// one quantization code per element; 0 marks an exactly stored value
+	int* type = (int*) malloc(dataLength*sizeof(int));
+
+	int64_t* spaceFillingValue = oriData; //
+	
+	DynamicByteArray *exactDataByteArray;
+	new_DBA(&exactDataByteArray, DynArrayInitLen);	
+
+	// the very first element is always stored exactly
+	type[0] = 0;
+	P1[0] = spaceFillingValue[0];
+	compressInt64Value(spaceFillingValue[0], minValue, byteSize, bytes);
+	memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+
+	/* Process Row-0 data 1*/
+	// NOTE(review): index 1 of oriData, type and P1 is accessed unconditionally
+	pred1D = P1[0];
+	diff = spaceFillingValue[1] - pred1D;
+
+	itvNum = llabs(diff)/realPrecision + 1;
+
+	if (itvNum < exe_params->intvCapacity)
+	{
+		if (diff < 0) itvNum = -itvNum;
+		// signed bin offset, shifted by intvRadius
+		type[1] = (int) (itvNum/2) + exe_params->intvRadius;
+		// reconstructed value = prediction + offset * 2*realPrecision
+		P1[1] = pred1D + 2 * (type[1] - exe_params->intvRadius) * realPrecision;
+	}
+	else
+	{
+		type[1] = 0;
+		curValue = P1[1] = spaceFillingValue[1];
+		compressInt64Value(curValue, minValue, byteSize, bytes);
+		memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+	}
+
+    /* Process Row-0 data 2 --> data r3-1 */
+	for (j = 2; j < r3; j++)
+	{
+		// linear extrapolation from the two previous reconstructed values
+		pred1D = 2*P1[j-1] - P1[j-2];
+		diff = spaceFillingValue[j] - pred1D;
+
+		itvNum = llabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[j] = (int) (itvNum/2) + exe_params->intvRadius;
+			P1[j] = pred1D + 2 * (type[j] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[j] = 0;
+			curValue = P1[j] = spaceFillingValue[j];
+			compressInt64Value(curValue, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		}
+	}
+
+	/* Process Row-1 --> Row-r2-1 */
+	size_t index;
+	for (i = 1; i < r2; i++)
+	{
+		/* Process row-i data 0 */
+		index = i*r3;	
+		// first column: predicted from the reconstructed value one row up
+		pred1D = P1[index-r3];
+		diff = spaceFillingValue[index] - pred1D;
+
+		itvNum = llabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+			P1[index] = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[index] = 0;
+			curValue = P1[index] = spaceFillingValue[index];
+			compressInt64Value(curValue, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		}
+
+		/* Process row-i data 1 --> data r3-1*/
+		for (j = 1; j < r3; j++)
+		{
+			index = i*r3+j;
+			// left + up - up-left within layer 0
+			pred2D = P1[index-1] + P1[index-r3] - P1[index-r3-1];
+
+			diff = spaceFillingValue[index] - pred2D;
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P1[index] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[index] = 0;
+				curValue = P1[index] = spaceFillingValue[index];
+				compressInt64Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+		}
+	}
+
+
+	///////////////////////////	Process layer-1 --> layer-r1-1 ///////////////////////////
+
+	for (k = 1; k < r1; k++)
+	{
+		/* Process Row-0 data 0*/
+		index = k*r23;
+		// first element of the layer: predicted from the same position in the previous layer
+		pred1D = P1[0];
+		diff = spaceFillingValue[index] - pred1D;
+
+		itvNum = llabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+			P0[0] = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[index] = 0;
+			curValue = P0[0] = spaceFillingValue[index];
+			compressInt64Value(curValue, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		}
+
+
+	    /* Process Row-0 data 1 --> data r3-1 */
+		for (j = 1; j < r3; j++)
+		{
+			//index = k*r2*r3+j;
+			// the running index relies on elements being visited in storage order
+			index ++;
+			// left (current layer) + same position (previous layer) - left (previous layer)
+			pred2D = P0[j-1] + P1[j] - P1[j-1];
+			diff = spaceFillingValue[index] - pred2D;
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P0[j] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+/*				if(type[index]==0)
+					printf("err:type[%d]=0, index4\n", index);					*/
+			}
+			else
+			{
+				type[index] = 0;
+				curValue = P0[j] = spaceFillingValue[index];
+				compressInt64Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+		}
+
+	    /* Process Row-1 --> Row-r2-1 */
+		size_t index2D;
+		for (i = 1; i < r2; i++)
+		{
+			/* Process Row-i data 0 */
+			index = k*r23 + i*r3;
+			// index2D addresses the same position inside the layer buffers
+			index2D = i*r3;		
+			// up (current layer) + same position (previous layer) - up (previous layer)
+			pred2D = P0[index2D-r3] + P1[index2D] - P1[index2D-r3];
+			diff = spaceFillingValue[index] - pred2D;
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P0[index2D] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[index] = 0;
+				curValue = P0[index2D] = spaceFillingValue[index];
+				compressInt64Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+
+			/* Process Row-i data 1 --> data r3-1 */
+			for (j = 1; j < r3; j++)
+			{
+//				if(k==63&&i==43&&j==27)
+//					printf("i=%d\n", i);
+				//index = k*r2*r3 + i*r3 + j;			
+				index ++;
+				index2D = i*r3 + j;
+				// seven-neighbour prediction: three from the current layer (P0), four from the previous layer (P1)
+				pred3D = P0[index2D-1] + P0[index2D-r3]+ P1[index2D] - P0[index2D-r3-1] - P1[index2D-r3] - P1[index2D-1] + P1[index2D-r3-1];
+				diff = spaceFillingValue[index] - pred3D;
+
+				itvNum = llabs(diff)/realPrecision + 1;
+
+				if (itvNum < exe_params->intvCapacity)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+					P0[index2D] = pred3D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					type[index] = 0;
+					curValue = P0[index2D] = spaceFillingValue[index];
+					compressInt64Value(curValue, minValue, byteSize, bytes);
+					memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+				}
+			}
+		}
+
+		// the layer just built becomes the "previous layer" for the next iteration
+		int64_t *Pt;
+		Pt = P1;
+		P1 = P0;
+		P0 = Pt;
+	}
+	// NOTE(review): P0 is allocated unconditionally above, so it is not released when r23==1
+	if(r23!=1)
+		free(P0);
+	free(P1);
+
+	// NOTE(review): this is the byte count of the exact-data array; it is not divided by byteSize here
+	size_t exactDataNum = exactDataByteArray->size;
+	
+	TightDataPointStorageI* tdps;	
+			
+	new_TightDataPointStorageI(&tdps, dataLength, exactDataNum, byteSize, 
+			type, exactDataByteArray->array, exactDataByteArray->size,  
+			realPrecision, minValue, quantization_intervals, SZ_INT64);
+			
+	//free memory
+	free(type);	
+	free(exactDataByteArray); //only the wrapper is freed; ->array was handed to tdps above (presumably released by free_TightDataPointStorageI -- confirm)
+	
+	return tdps;	
+}
+
+
+/**
+ * Compress a 3D int64 array with the 3D MDQ quantizer and flatten the result
+ * into a byte stream (no range check, no gzip stage).
+ *
+ * If the flattened stream is larger than the raw data (r1*r2*r3 int64 values),
+ * the original values are stored instead via SZ_compress_args_int64_StoreOriData.
+ *
+ * @param newByteData    receives the output byte buffer
+ * @param oriData        input values
+ * @param r1,r2,r3       extents handed to SZ_compress_int64_3D_MDQ in this order
+ * @param realPrecision  error bound forwarded to the quantizer
+ * @param outSize        receives the output size in bytes
+ * @param valueRangeSize forwarded to the quantizer
+ * @param minValue       forwarded to the quantizer
+ */
+void SZ_compress_args_int64_NoCkRngeNoGzip_3D(unsigned char** newByteData, int64_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t *outSize, 
+int64_t valueRangeSize, int64_t minValue)
+{
+	const size_t elementCount = r1*r2*r3;
+	const size_t rawByteCount = elementCount*sizeof(int64_t);
+	TightDataPointStorageI* storage = SZ_compress_int64_3D_MDQ(oriData, r1, r2, r3, realPrecision, valueRangeSize, minValue);
+
+	convertTDPStoFlatBytes_int(storage, newByteData, outSize);
+
+	/* The quantized stream did not pay off: fall back to the original values. */
+	if(rawByteCount < *outSize)
+	{
+		SZ_compress_args_int64_StoreOriData(oriData, elementCount, storage, newByteData, outSize);
+	}
+
+	free_TightDataPointStorageI(storage);
+}
+
+
+/**
+ * Quantize one value against its prediction, or store it exactly.
+ *
+ * The interval number is llabs(value - pred)/realPrecision + 1. When it is
+ * below exe_params->intvCapacity the value is encoded as a type code around
+ * exe_params->intvRadius and *reconSlot receives the value rebuilt from that
+ * code. Otherwise the type code is 0, *reconSlot receives the original value
+ * and its bytes (compressInt64Value) are appended to exactDataByteArray.
+ *
+ * @param value       current element of the input
+ * @param pred        prediction for that element
+ * @param typeSlot    where the type code is written
+ * @param reconSlot   where the reconstructed (or exact) value is written
+ * @param bytes       scratch buffer handed to compressInt64Value
+ * */
+static void SZ_int64_4D_quantizeOrStore(int64_t value, int64_t pred, double realPrecision,
+		int *typeSlot, int64_t *reconSlot, int64_t minValue, int byteSize,
+		unsigned char *bytes, DynamicByteArray *exactDataByteArray)
+{
+	int64_t diff = value - pred;
+	double itvNum = llabs(diff)/realPrecision + 1;
+
+	if (itvNum < exe_params->intvCapacity)
+	{
+		if (diff < 0) itvNum = -itvNum;
+		*typeSlot = (int) (itvNum/2) + exe_params->intvRadius;
+		*reconSlot = pred + 2 * (*typeSlot - exe_params->intvRadius) * realPrecision;
+	}
+	else
+	{
+		*typeSlot = 0;
+		*reconSlot = value;
+		compressInt64Value(value, minValue, byteSize, bytes);
+		memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+	}
+}
+
+/**
+ * Prediction + quantization pass over a 4D int64 array (r1 x r2 x r3 x r4,
+ * r4 varying fastest as implied by the index arithmetic below).
+ *
+ * Each of the r1 outermost slices is processed independently as a 3D block:
+ * P1 holds the reconstructed values of the previous r3*r4 layer and P0 those
+ * of the layer being built; the two are swapped after every layer.
+ *
+ * Fixes relative to the previous revision:
+ *  - Row-0 element 1 now takes its difference from the current element; it
+ *    used the value of element 0, so the difference was always 0.
+ *  - Unpredictable elements now store spaceFillingValue[index]; they stored
+ *    spaceFillingValue[0], i.e. the very first element of the whole array.
+ *
+ * NOTE(review): as before, element 1 of every slice is processed
+ * unconditionally, so r3*r4 is assumed to be at least 2 -- confirm callers.
+ * NOTE(review): exactDataNum is passed as the byte count of the exact-data
+ * array, not divided by byteSize -- kept unchanged; confirm what
+ * new_TightDataPointStorageI expects.
+ *
+ * @return a newly created TightDataPointStorageI (built by
+ *         new_TightDataPointStorageI); the caller releases it.
+ * */
+TightDataPointStorageI* SZ_compress_int64_4D_MDQ(int64_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, int64_t valueRangeSize, int64_t minValue)
+{
+	unsigned char bytes[8] = {0,0,0,0,0,0,0,0};
+	int byteSize = computeByteSizePerIntValue(valueRangeSize);
+	
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_int64_4D(oriData, r1, r2, r3, r4, realPrecision);
+		updateQuantizationInfo(quantization_intervals);
+	}	
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	size_t i,j,k,l;
+	size_t index, index2D;
+	int64_t pred1D, pred2D, pred3D;
+	int64_t *P0, *P1, *Pt;
+		
+	size_t dataLength = r1*r2*r3*r4;
+
+	size_t r234 = r2*r3*r4;
+	size_t r34 = r3*r4;
+
+	P0 = (int64_t*)malloc(r34*sizeof(int64_t));
+	P1 = (int64_t*)malloc(r34*sizeof(int64_t));
+	
+	int* type = (int*) malloc(dataLength*sizeof(int));
+
+	int64_t* spaceFillingValue = oriData; //
+	
+	DynamicByteArray *exactDataByteArray;
+	new_DBA(&exactDataByteArray, DynArrayInitLen);
+
+	for (l = 0; l < r1; l++)
+	{
+		size_t base = l*r234; //offset of the first element of this slice
+
+		///////////////////////////	Process layer-0 ///////////////////////////
+		/* Process Row-0 data 0: always stored exactly */
+		type[base] = 0;
+		P1[0] = spaceFillingValue[base];
+		compressInt64Value(P1[0], minValue, byteSize, bytes);
+		memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+
+		/* Process Row-0 data 1: predicted by its left neighbour */
+		index = base+1;
+		pred1D = P1[0];
+		SZ_int64_4D_quantizeOrStore(spaceFillingValue[index], pred1D, realPrecision,
+				&type[index], &P1[1], minValue, byteSize, bytes, exactDataByteArray);
+
+		/* Process Row-0 data 2 --> data r4-1: linear extrapolation */
+		for (j = 2; j < r4; j++)
+		{
+			index = base+j;
+			pred1D = 2*P1[j-1] - P1[j-2];
+			SZ_int64_4D_quantizeOrStore(spaceFillingValue[index], pred1D, realPrecision,
+					&type[index], &P1[j], minValue, byteSize, bytes, exactDataByteArray);
+		}
+
+		/* Process Row-1 --> Row-r3-1 */
+		for (i = 1; i < r3; i++)
+		{
+			/* Process row-i data 0: predicted by the element above */
+			index = base+i*r4;
+			index2D = i*r4;
+			pred1D = P1[index2D-r4];
+			SZ_int64_4D_quantizeOrStore(spaceFillingValue[index], pred1D, realPrecision,
+					&type[index], &P1[index2D], minValue, byteSize, bytes, exactDataByteArray);
+
+			/* Process row-i data 1 --> data r4-1: 2D predictor */
+			for (j = 1; j < r4; j++)
+			{
+				index = base+i*r4+j;
+				index2D = i*r4+j;
+				pred2D = P1[index2D-1] + P1[index2D-r4] - P1[index2D-r4-1];
+				SZ_int64_4D_quantizeOrStore(spaceFillingValue[index], pred2D, realPrecision,
+						&type[index], &P1[index2D], minValue, byteSize, bytes, exactDataByteArray);
+			}
+		}
+
+		///////////////////////////	Process layer-1 --> layer-r2-1 ///////////////////////////
+
+		for (k = 1; k < r2; k++)
+		{
+			/* Process Row-0 data 0: predicted by the same position in the previous layer */
+			index = base+k*r34;
+			pred1D = P1[0];
+			SZ_int64_4D_quantizeOrStore(spaceFillingValue[index], pred1D, realPrecision,
+					&type[index], &P0[0], minValue, byteSize, bytes, exactDataByteArray);
+
+			/* Process Row-0 data 1 --> data r4-1 */
+			for (j = 1; j < r4; j++)
+			{
+				index = base+k*r34+j;
+				pred2D = P0[j-1] + P1[j] - P1[j-1];
+				SZ_int64_4D_quantizeOrStore(spaceFillingValue[index], pred2D, realPrecision,
+						&type[index], &P0[j], minValue, byteSize, bytes, exactDataByteArray);
+			}
+
+			/* Process Row-1 --> Row-r3-1 */
+			for (i = 1; i < r3; i++)
+			{
+				/* Process Row-i data 0 */
+				index = base+k*r34+i*r4;
+				index2D = i*r4;
+				pred2D = P0[index2D-r4] + P1[index2D] - P1[index2D-r4];
+				SZ_int64_4D_quantizeOrStore(spaceFillingValue[index], pred2D, realPrecision,
+						&type[index], &P0[index2D], minValue, byteSize, bytes, exactDataByteArray);
+
+				/* Process Row-i data 1 --> data r4-1: 3D predictor */
+				for (j = 1; j < r4; j++)
+				{
+					index = base+k*r34+i*r4+j;
+					index2D = i*r4+j;
+					pred3D = P0[index2D-1] + P0[index2D-r4]+ P1[index2D] - P0[index2D-r4-1] - P1[index2D-r4] - P1[index2D-1] + P1[index2D-r4-1];
+					SZ_int64_4D_quantizeOrStore(spaceFillingValue[index], pred3D, realPrecision,
+							&type[index], &P0[index2D], minValue, byteSize, bytes, exactDataByteArray);
+				}
+			}
+
+			//the layer just built becomes the reference layer for the next one
+			Pt = P1;
+			P1 = P0;
+			P0 = Pt;
+		}
+	}
+
+	free(P0);
+	free(P1);
+
+	size_t exactDataNum = exactDataByteArray->size;
+	
+	TightDataPointStorageI* tdps;	
+			
+	new_TightDataPointStorageI(&tdps, dataLength, exactDataNum, byteSize, 
+			type, exactDataByteArray->array, exactDataByteArray->size,  
+			realPrecision, minValue, quantization_intervals, SZ_INT64);
+			
+	//free memory
+	free(type);	
+	free(exactDataByteArray); //only the wrapper is freed here; exactDataByteArray->array was handed to new_TightDataPointStorageI above
+	
+	return tdps;	
+}
+
+/**
+ * Compress a 4D int64 array with SZ_compress_int64_4D_MDQ and flatten the
+ * result into *newByteData / *outSize through convertTDPStoFlatBytes_int.
+ * If the flattened stream is larger than the raw input
+ * (r1*r2*r3*r4*sizeof(int64_t) bytes), SZ_compress_args_int64_StoreOriData
+ * is asked to produce the output instead.
+ *
+ * NOTE(review): if SZ_compress_args_int64_StoreOriData allocates a fresh
+ * buffer into *newByteData, the flattened buffer is leaked on that path --
+ * confirm against its definition.
+ * */
+void SZ_compress_args_int64_NoCkRngeNoGzip_4D(unsigned char** newByteData, int64_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, 
+size_t *outSize, int64_t valueRangeSize, int64_t minValue)
+{
+	const size_t elementCount = r1*r2*r3*r4;
+	const size_t rawByteCount = elementCount*sizeof(int64_t);
+
+	TightDataPointStorageI* storage = SZ_compress_int64_4D_MDQ(oriData, r1, r2, r3, r4, realPrecision, valueRangeSize, minValue);
+	convertTDPStoFlatBytes_int(storage, newByteData, outSize);
+
+	//the compressed form did not pay off: hand over to the store-original routine
+	if(rawByteCount < *outSize)
+	{
+		SZ_compress_args_int64_StoreOriData(oriData, elementCount, storage, newByteData, outSize);
+	}
+
+	free_TightDataPointStorageI(storage);
+}
+
+/**
+ * Emit the "all values are the same" form of an int64 series: a storage
+ * record flagged allSameData whose only exact datum is oriData[0], written
+ * as 8 big-endian bytes, then flattened with convertTDPStoFlatBytes_int.
+ *
+ * @param newByteData  receives the flattened stream
+ * @param oriData      input series; only element 0 is read
+ * @param dataLength   recorded as the series length
+ * @param outSize      receives the size reported by the flattening step
+ * */
+void SZ_compress_args_int64_withinRange(unsigned char** newByteData, int64_t *oriData, size_t dataLength, size_t *outSize)
+{
+	const size_t valueBytes = 8; //one int64 value
+	size_t flatSize;
+
+	TightDataPointStorageI* constant = (TightDataPointStorageI*) malloc(sizeof(TightDataPointStorageI));
+
+	//no quantization codes: every point equals the stored value
+	constant->typeArray = NULL;
+	constant->allSameData = 1;
+	constant->isLossless = 0;
+	constant->dataSeriesLength = dataLength;
+
+	//single exact value, big-endian
+	constant->exactDataNum = 1;
+	constant->exactDataBytes_size = valueBytes;
+	constant->exactDataBytes = (unsigned char*)malloc(sizeof(unsigned char)*valueBytes);
+	int64ToBytes_bigEndian(constant->exactDataBytes, oriData[0]);
+
+	convertTDPStoFlatBytes_int(constant, newByteData, &flatSize);
+	*outSize = flatSize;
+
+	free_TightDataPointStorageI(constant);
+}
+
+/**
+ * Compress an int64 array without the final gzip stage, after checking the
+ * value range.
+ *
+ * The value range and minimum come from computeRangeSize_int and the
+ * effective error bound from getRealPrecision_int. When the whole range fits
+ * inside the error bound the constant-series encoder is used; otherwise the
+ * call is dispatched on the number of non-zero dimensions (an unused
+ * dimension is passed as 0). 4D input is forwarded to the 3D routine with
+ * r4*r3 as its first dimension, as in the previous revision.
+ *
+ * 5D input (r5 != 0) is not supported: it used to fall through silently and
+ * return SZ_SCES with *newByteData / *outSize untouched; it is now reported
+ * with SZ_DERR (outputs are still left untouched).
+ *
+ * @return the status left by getRealPrecision_int (initially SZ_SCES), or
+ *         SZ_DERR for 5D input.
+ * */
+int SZ_compress_args_int64_wRngeNoGzip(unsigned char** newByteData, int64_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio)
+{
+	int status = SZ_SCES;
+	size_t dataLength = computeDataLength(r5,r4,r3,r2,r1);
+	int64_t valueRangeSize = 0;
+	
+	int64_t minValue = computeRangeSize_int(oriData, SZ_INT64, dataLength, &valueRangeSize);
+	double realPrecision = getRealPrecision_int(valueRangeSize, errBoundMode, absErr_Bound, relBoundRatio, &status);
+		
+	if(valueRangeSize <= realPrecision)
+	{
+		SZ_compress_args_int64_withinRange(newByteData, oriData, dataLength, outSize);
+	}
+	else if(r5==0&&r4==0&&r3==0&&r2==0)
+	{
+		SZ_compress_args_int64_NoCkRngeNoGzip_1D(newByteData, oriData, r1, realPrecision, outSize, valueRangeSize, minValue);
+	}
+	else if(r5==0&&r4==0&&r3==0)
+	{
+		SZ_compress_args_int64_NoCkRngeNoGzip_2D(newByteData, oriData, r2, r1, realPrecision, outSize, valueRangeSize, minValue);
+	}
+	else if(r5==0&&r4==0)
+	{
+		SZ_compress_args_int64_NoCkRngeNoGzip_3D(newByteData, oriData, r3, r2, r1, realPrecision, outSize, valueRangeSize, minValue);
+	}
+	else if(r5==0)
+	{
+		//4D data is compressed by the 3D routine with r4 and r3 merged
+		SZ_compress_args_int64_NoCkRngeNoGzip_3D(newByteData, oriData, r4*r3, r2, r1, realPrecision, outSize, valueRangeSize, minValue);
+	}
+	else
+	{
+		printf("Error: doesn't support 5 dimensions for now.\n");
+		status = SZ_DERR; //dimension error
+	}
+	return status;
+}
+
+/**
+ * Top-level int64 compression driver.
+ *
+ * Records errBoundMode in confparams_cpr, derives the value range and the
+ * effective error bound (from the PSNR setting when the mode is PSNR,
+ * otherwise through getRealPrecision_int), and then either
+ *  - emits the constant-series form when the range fits inside the bound, or
+ *  - runs the 1D/2D/3D/4D compressor selected by the first zero dimension
+ *    and post-processes its output according to confparams_cpr->szMode:
+ *    SZ_BEST_SPEED publishes it as is, SZ_BEST_COMPRESSION and
+ *    SZ_DEFAULT_COMPRESSION pass it through zlib_compress5.
+ *
+ * Fixes relative to the previous revision:
+ *  - 5D input left the intermediate buffer pointer uninitialised and then
+ *    published it, or compressed and freed it; the function now returns
+ *    SZ_DERR right away.
+ *  - an invalid szMode leaked the intermediate buffer; it is now freed.
+ *
+ * NOTE(review): a point-wise relative error bound still terminates the
+ * process through exit(0), as before; the return after it is unreachable.
+ *
+ * @return SZ_SCES (or the status left by getRealPrecision_int), SZ_DERR for
+ *         5D input, SZ_MERR for an invalid szMode. On SZ_DERR and SZ_MERR
+ *         *newByteData and *outSize are not written.
+ * */
+int SZ_compress_args_int64(unsigned char** newByteData, int64_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio)
+{
+	confparams_cpr->errorBoundMode = errBoundMode;
+	
+	if(errBoundMode>=PW_REL)
+	{
+		printf("Error: Current SZ version doesn't support integer data compression with point-wise relative error bound being based on pwrType=AVG\n");
+		exit(0);
+		return SZ_NSCS;
+	}
+	int status = SZ_SCES;
+	size_t dataLength = computeDataLength(r5,r4,r3,r2,r1);
+	int64_t valueRangeSize = 0;
+
+	int64_t minValue = (int64_t)computeRangeSize_int(oriData, SZ_INT64, dataLength, &valueRangeSize);
+	double realPrecision = 0; 
+	
+	if(confparams_cpr->errorBoundMode==PSNR)
+	{
+		//a PSNR target is translated into an absolute bound and handled as ABS from here on
+		confparams_cpr->errorBoundMode = ABS;
+		realPrecision = confparams_cpr->absErrBound = computeABSErrBoundFromPSNR(confparams_cpr->psnr, (double)confparams_cpr->predThreshold, (double)valueRangeSize);
+	}
+	else
+		realPrecision = getRealPrecision_int(valueRangeSize, errBoundMode, absErr_Bound, relBoundRatio, &status);
+
+	if(valueRangeSize <= realPrecision)
+	{
+		SZ_compress_args_int64_withinRange(newByteData, oriData, dataLength, outSize);
+	}
+	else
+	{
+		size_t tmpOutSize = 0;
+		unsigned char* tmpByteData = NULL;
+		if (r2==0)
+		{
+			SZ_compress_args_int64_NoCkRngeNoGzip_1D(&tmpByteData, oriData, r1, realPrecision, &tmpOutSize, valueRangeSize, minValue);
+		}
+		else
+		if (r3==0)
+		{
+			SZ_compress_args_int64_NoCkRngeNoGzip_2D(&tmpByteData, oriData, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, minValue);
+		}
+		else
+		if (r4==0)
+		{
+			SZ_compress_args_int64_NoCkRngeNoGzip_3D(&tmpByteData, oriData, r3, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, minValue);
+		}
+		else
+		if (r5==0)
+		{
+			SZ_compress_args_int64_NoCkRngeNoGzip_4D(&tmpByteData, oriData, r4, r3, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, minValue);
+		}
+		else
+		{
+			printf("Error: doesn't support 5 dimensions for now.\n");
+			//nothing was compressed, so there is no buffer to publish, gzip or free
+			return SZ_DERR; //dimension error
+		}
+		//Call Gzip to do the further compression.
+		if(confparams_cpr->szMode==SZ_BEST_SPEED)
+		{
+			//ownership of the intermediate buffer moves to the caller
+			*outSize = tmpOutSize;
+			*newByteData = tmpByteData;
+		}
+		else if(confparams_cpr->szMode==SZ_BEST_COMPRESSION || confparams_cpr->szMode==SZ_DEFAULT_COMPRESSION)
+		{
+			*outSize = zlib_compress5(tmpByteData, tmpOutSize, newByteData, confparams_cpr->gzipMode);
+			free(tmpByteData);
+		}
+		else
+		{
+			printf("Error: Wrong setting of confparams_cpr->szMode in the int64_t compression.\n");
+			status = SZ_MERR; //mode error			
+			free(tmpByteData); //not handed to the caller on this path
+		}
+	}
+	
+	return status;
+}
diff --git a/thirdparty/SZ/sz/src/sz_int8.c b/thirdparty/SZ/sz/src/sz_int8.c
new file mode 100644
index 0000000000000000000000000000000000000000..c869681022f0bd8872b4dbfb632df616c77aa4b9
--- /dev/null
+++ b/thirdparty/SZ/sz/src/sz_int8.c
@@ -0,0 +1,1384 @@
+/**
+ *  @file sz_int8.c
+ *  @author Sheng Di
+ *  @date Aug, 2017
+ *  @brief sz_int8, Compression and Decompression functions
+ *  (C) 2017 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <math.h>
+#include "sz.h"
+#include "CompressElement.h"
+#include "DynamicByteArray.h"
+#include "DynamicIntArray.h"
+#include "zlib.h"
+#include "rw.h"
+#include "TightDataPointStorageI.h"
+#include "sz_int8.h"
+
+/**
+ * Estimate a quantization interval count for a 1D int8 series.
+ *
+ * Every confparams_cpr->sampleDistance-th element (from index 2 on) is
+ * predicted by its predecessor; the prediction error is mapped to a radius
+ * bucket, (err/realPrecision+1)/2, capped at maxRangeRadius-1, and counted.
+ * The smallest radius whose cumulative count exceeds
+ * (dataLength/sampleDistance)*predThreshold decides the result.
+ *
+ * @return 2*(radius+1) rounded up by roundUpToPowerOf2, never less than 32.
+ * */
+unsigned int optimize_intervals_int8_1D(int8_t *oriData, size_t dataLength, double realPrecision)
+{
+	size_t pos = 0, bucket;
+	int64_t predicted = 0, absErr;
+	size_t *histogram = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t));
+	memset(histogram, 0, confparams_cpr->maxRangeRadius*sizeof(size_t));
+	size_t sampleCount = dataLength/confparams_cpr->sampleDistance;
+
+	//build the histogram of prediction-error radii over the sampled points
+	for(pos=2;pos<dataLength;pos++)
+	{
+		if(pos%confparams_cpr->sampleDistance!=0)
+			continue;
+
+		predicted = oriData[pos-1];
+		absErr = llabs(predicted - oriData[pos]);
+		bucket = (uint64_t)((absErr/realPrecision+1)/2);
+		if(bucket>=confparams_cpr->maxRangeRadius)
+			bucket = confparams_cpr->maxRangeRadius - 1;
+		histogram[bucket]++;
+	}
+
+	//find the first radius that covers more than the requested share of samples
+	size_t targetCount = sampleCount*confparams_cpr->predThreshold;
+	size_t covered = 0;
+	pos = 0;
+	while(pos<confparams_cpr->maxRangeRadius)
+	{
+		covered += histogram[pos];
+		if(covered>targetCount)
+			break;
+		pos++;
+	}
+	if(pos>=confparams_cpr->maxRangeRadius)
+		pos = confparams_cpr->maxRangeRadius-1;
+
+	free(histogram);
+
+	unsigned int rawIntervals = 2*(pos+1);
+	unsigned int result = roundUpToPowerOf2(rawIntervals);
+	return (result<32) ? 32 : result;
+}
+
+/**
+ * Estimate a quantization interval count for a 2D int8 array (r1 rows of
+ * r2 elements).
+ *
+ * Interior points with (row+col) divisible by confparams_cpr->sampleDistance
+ * are predicted as left + up - upper-left; the prediction error is mapped to
+ * a radius bucket, (err/realPrecision+1)/2, capped at maxRangeRadius-1, and
+ * counted. The smallest radius whose cumulative count exceeds
+ * ((r1-1)*(r2-1)/sampleDistance)*predThreshold decides the result.
+ *
+ * @return 2*(radius+1) rounded up by roundUpToPowerOf2, never less than 32.
+ * */
+unsigned int optimize_intervals_int8_2D(int8_t *oriData, size_t r1, size_t r2, double realPrecision)
+{
+	size_t row, col, at;
+	size_t bucket;
+	int64_t predicted = 0, absErr;
+	size_t *histogram = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t));
+	memset(histogram, 0, confparams_cpr->maxRangeRadius*sizeof(size_t));
+	size_t sampleCount = (r1-1)*(r2-1)/confparams_cpr->sampleDistance;
+
+	//build the histogram of prediction-error radii over the sampled points
+	for(row=1;row<r1;row++)
+	{
+		for(col=1;col<r2;col++)
+		{
+			if((row+col)%confparams_cpr->sampleDistance!=0)
+				continue;
+
+			at = row*r2+col;
+			predicted = oriData[at-1] + oriData[at-r2] - oriData[at-r2-1];
+			absErr = llabs(predicted - oriData[at]);
+			bucket = (uint64_t)((absErr/realPrecision+1)/2);
+			if(bucket>=confparams_cpr->maxRangeRadius)
+				bucket = confparams_cpr->maxRangeRadius - 1;
+			histogram[bucket]++;
+		}
+	}
+
+	//find the first radius that covers more than the requested share of samples
+	size_t targetCount = sampleCount*confparams_cpr->predThreshold;
+	size_t covered = 0;
+	size_t radius = 0;
+	while(radius<confparams_cpr->maxRangeRadius)
+	{
+		covered += histogram[radius];
+		if(covered>targetCount)
+			break;
+		radius++;
+	}
+	if(radius>=confparams_cpr->maxRangeRadius)
+		radius = confparams_cpr->maxRangeRadius-1;
+
+	free(histogram);
+
+	unsigned int rawIntervals = 2*(radius+1);
+	unsigned int result = roundUpToPowerOf2(rawIntervals);
+	return (result<32) ? 32 : result;
+}
+
+/**
+ * Estimate a quantization interval count for a 3D int8 array
+ * (r1 x r2 x r3, r3 varying fastest).
+ *
+ * Interior points whose index sum is divisible by
+ * confparams_cpr->sampleDistance are predicted from their seven already
+ * visited neighbours; the prediction error is mapped to a radius bucket,
+ * (err/realPrecision+1)/2, capped at maxRangeRadius-1, and counted. The
+ * smallest radius whose cumulative count exceeds
+ * ((r1-1)*(r2-1)*(r3-1)/sampleDistance)*predThreshold decides the result.
+ *
+ * @return 2*(radius+1) rounded up by roundUpToPowerOf2, never less than 32.
+ * */
+unsigned int optimize_intervals_int8_3D(int8_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision)
+{
+	size_t a, b, c, at;
+	size_t bucket;
+	size_t plane = r2*r3;
+	int64_t predicted = 0, absErr;
+	size_t *histogram = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t));
+	memset(histogram, 0, confparams_cpr->maxRangeRadius*sizeof(size_t));
+	size_t sampleCount = (r1-1)*(r2-1)*(r3-1)/confparams_cpr->sampleDistance;
+
+	//build the histogram of prediction-error radii over the sampled points
+	for(a=1;a<r1;a++)
+	{
+		for(b=1;b<r2;b++)
+		{
+			for(c=1;c<r3;c++)
+			{
+				if((a+b+c)%confparams_cpr->sampleDistance!=0)
+					continue;
+
+				at = a*plane+b*r3+c;
+				predicted = oriData[at-1] + oriData[at-r3] + oriData[at-plane] 
+				- oriData[at-1-plane] - oriData[at-r3-1] - oriData[at-r3-plane] + oriData[at-r3-plane-1];
+				absErr = llabs(predicted - oriData[at]);
+				bucket = (absErr/realPrecision+1)/2;
+				if(bucket>=confparams_cpr->maxRangeRadius)
+					bucket = confparams_cpr->maxRangeRadius - 1;
+				histogram[bucket]++;
+			}
+		}
+	}
+
+	//find the first radius that covers more than the requested share of samples
+	size_t targetCount = sampleCount*confparams_cpr->predThreshold;
+	size_t covered = 0;
+	size_t radius = 0;
+	while(radius<confparams_cpr->maxRangeRadius)
+	{
+		covered += histogram[radius];
+		if(covered>targetCount)
+			break;
+		radius++;
+	}
+	if(radius>=confparams_cpr->maxRangeRadius)
+		radius = confparams_cpr->maxRangeRadius-1;
+
+	free(histogram);
+
+	unsigned int rawIntervals = 2*(radius+1);
+	unsigned int result = roundUpToPowerOf2(rawIntervals);
+	return (result<32) ? 32 : result;
+}
+
+
+/**
+ * Estimate a quantization interval count for a 4D int8 array
+ * (r1 x r2 x r3 x r4, r4 varying fastest).
+ *
+ * Interior points whose index sum is divisible by
+ * confparams_cpr->sampleDistance are predicted with the 3D predictor applied
+ * to the three fastest dimensions (strides 1, r4 and r3*r4); the prediction
+ * error is mapped to a radius bucket, (err/realPrecision+1)/2, capped at
+ * maxRangeRadius-1, and counted. The smallest radius whose cumulative count
+ * exceeds the sampled total times predThreshold decides the result.
+ *
+ * Fix relative to the previous revision: the "previous row" neighbour was
+ * read at index-r3; the row stride is r4, as used by every other term of
+ * the predictor.
+ *
+ * @return 2*(radius+1) rounded up by roundUpToPowerOf2, never less than 32.
+ * */
+unsigned int optimize_intervals_int8_4D(int8_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision)
+{
+	size_t i,j,k,l, index;
+	size_t radiusIndex;
+	size_t r234=r2*r3*r4;
+	size_t r34=r3*r4;
+	int64_t pred_value = 0, pred_err;
+	size_t *intervals = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t));
+	memset(intervals, 0, confparams_cpr->maxRangeRadius*sizeof(size_t));
+	size_t totalSampleSize = (r1-1)*(r2-1)*(r3-1)*(r4-1)/confparams_cpr->sampleDistance;
+	for(i=1;i<r1;i++)
+	{
+		for(j=1;j<r2;j++)
+		{
+			for(k=1;k<r3;k++)
+			{
+				for (l=1;l<r4;l++)
+				{
+					if((i+j+k+l)%confparams_cpr->sampleDistance==0)
+					{
+						index = i*r234+j*r34+k*r4+l;
+						//neighbours: left (1), previous row (r4), previous layer (r34)
+						pred_value = oriData[index-1] + oriData[index-r4] + oriData[index-r34]
+								- oriData[index-1-r34] - oriData[index-r4-1] - oriData[index-r4-r34] + oriData[index-r4-r34-1];
+						pred_err = llabs(pred_value - oriData[index]);
+						radiusIndex = (uint64_t)((pred_err/realPrecision+1)/2);
+						if(radiusIndex>=confparams_cpr->maxRangeRadius)
+							radiusIndex = confparams_cpr->maxRangeRadius - 1;
+						intervals[radiusIndex]++;
+					}
+				}
+			}
+		}
+	}
+	//compute the appropriate number
+	size_t targetCount = totalSampleSize*confparams_cpr->predThreshold;
+	size_t sum = 0;
+	for(i=0;i<confparams_cpr->maxRangeRadius;i++)
+	{
+		sum += intervals[i];
+		if(sum>targetCount)
+			break;
+	}
+	if(i>=confparams_cpr->maxRangeRadius)
+		i = confparams_cpr->maxRangeRadius-1;
+
+	unsigned int accIntervals = 2*(i+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+
+	if(powerOf2<32)
+		powerOf2 = 32;
+
+	free(intervals);
+	return powerOf2;
+}
+
+/**
+ * Prediction + quantization pass over a 1D int8 series.
+ *
+ * The first two elements are always stored exactly. Every later element is
+ * predicted by the most recent entry of last3CmprsData (the previously
+ * reconstructed value). If the prediction error is within
+ * (intvCapacity-1)*realPrecision the element is encoded as a type code
+ * around exe_params->intvRadius and the reconstructed value, clamped to
+ * [SZ_INT8_MIN, SZ_INT8_MAX], is pushed to the history; otherwise its type
+ * code is 0 and the value itself is stored and pushed.
+ *
+ * NOTE(review): elements 0 and 1 are accessed unconditionally, so
+ * dataLength is assumed to be at least 2 -- confirm callers.
+ *
+ * @return a newly created TightDataPointStorageI (built by
+ *         new_TightDataPointStorageI); the caller releases it.
+ * */
+TightDataPointStorageI* SZ_compress_int8_1D_MDQ(int8_t *oriData, size_t dataLength, double realPrecision, int64_t valueRangeSize, int64_t minValue)
+{
+	unsigned char bytes[8] = {0,0,0,0,0,0,0,0};
+	int byteSize = computeByteSizePerIntValue(valueRangeSize);
+
+	unsigned int quantization_intervals = (exe_params->optQuantMode==1)
+		? optimize_intervals_int8_1D(oriData, dataLength, realPrecision)
+		: exe_params->intvCapacity;
+	updateQuantizationInfo(quantization_intervals);
+
+	int* type = (int*) malloc(dataLength*sizeof(int));
+	int8_t* input = oriData;
+
+	DynamicByteArray *exactDataByteArray;
+	new_DBA(&exactDataByteArray, DynArrayInitLen);
+
+	int64_t history[3] = {0,0,0};
+	size_t pos;
+
+	//the two leading elements are stored as exact values
+	for(pos=0;pos<2;pos++)
+	{
+		type[pos] = 0;
+		compressInt8Value(input[pos], minValue, byteSize, bytes);
+		memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		listAdd_int(history, input[pos]);
+	}
+
+	int steps;
+	double checkRadius = (exe_params->intvCapacity-1)*realPrecision;
+	double interval = 2*realPrecision;
+	int64_t current, predicted, absErr;
+
+	for(pos=2;pos<dataLength;pos++)
+	{
+		current = input[pos];
+		predicted = history[0];
+		absErr = llabs(current - predicted);
+
+		if(absErr<=checkRadius)
+		{
+			//predictable: encode the number of interval steps away from the prediction
+			steps = (absErr/realPrecision+1)/2;
+			if(current<predicted)
+			{
+				type[pos] = exe_params->intvRadius-steps;
+				predicted = predicted - steps*interval;
+			}
+			else
+			{
+				type[pos] = exe_params->intvRadius+steps;
+				predicted = predicted + steps*interval;
+			}
+			if(predicted>SZ_INT8_MAX)
+				predicted = SZ_INT8_MAX;
+			else if(predicted<SZ_INT8_MIN)
+				predicted = SZ_INT8_MIN;
+			listAdd_int(history, predicted);
+		}
+		else
+		{
+			//unpredictable: keep the value itself
+			type[pos] = 0;
+			compressInt8Value(current, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			listAdd_int(history, current);
+		}
+	}
+
+	size_t exactDataNum = exactDataByteArray->size / byteSize;
+
+	TightDataPointStorageI* result;
+	new_TightDataPointStorageI(&result, dataLength, exactDataNum, byteSize, 
+			type, exactDataByteArray->array, exactDataByteArray->size,  
+			realPrecision, minValue, quantization_intervals, SZ_INT8);
+
+	//only the wrapper is freed here; exactDataByteArray->array was handed to new_TightDataPointStorageI above
+	free(type);
+	free(exactDataByteArray);
+
+	return result;
+}
+
+/**
+ * Build the "store original data" stream for an int8 series.
+ *
+ * Layout written into a freshly malloc'ed *newByteData:
+ *   3 bytes  versionNumber
+ *   1 byte   flag: 16 when exe_params->SZ_SIZE_TYPE is 4, otherwise 80
+ *   MetaDataByteLength bytes from convertSZParamsToBytes
+ *   SZ_SIZE_TYPE bytes holding dataLength (sizeToBytes)
+ *   dataLength raw int8 values
+ * tdps->isLossless is set to 1 and *outSize receives the total length.
+ * Any buffer *newByteData pointed to before the call is not released here.
+ * */
+void SZ_compress_args_int8_StoreOriData(int8_t* oriData, size_t dataLength, TightDataPointStorageI* tdps, 
+unsigned char** newByteData, size_t *outSize)
+{
+	int intSize=sizeof(int8_t);
+	size_t n;
+	unsigned char lengthBytes[8];
+
+	tdps->isLossless = 1;
+
+	size_t totalByteLength = 3 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + 1 + intSize*dataLength;
+	*newByteData = (unsigned char*)malloc(totalByteLength);
+	unsigned char* cursor = *newByteData;
+
+	//version
+	for (n = 0; n < 3; n++)
+		*cursor++ = versionNumber[n];
+
+	//flag byte: 00010000, plus 01000000 when SZ_SIZE_TYPE is 8
+	*cursor++ = (exe_params->SZ_SIZE_TYPE==4) ? 16 : 80;
+
+	//compression parameters
+	convertSZParamsToBytes(confparams_cpr, cursor);
+	cursor += MetaDataByteLength;
+
+	//series length, SZ_SIZE_TYPE (4 or 8) bytes
+	sizeToBytes(lengthBytes,dataLength);
+	for (n = 0; n < exe_params->SZ_SIZE_TYPE; n++)
+		*cursor++ = lengthBytes[n];
+
+	//payload: single-byte elements have no byte order, so one plain copy
+	//produces the same bytes on big- and little-endian systems
+	memcpy((*newByteData)+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE, oriData, dataLength*intSize);
+
+	*outSize = totalByteLength;
+}
+
+/**
+ * Compress a 1D int8 series with SZ_compress_int8_1D_MDQ and flatten the
+ * result into *newByteData / *outSize through convertTDPStoFlatBytes_int.
+ * If the flattened stream is larger than the raw input (dataLength bytes),
+ * the output is replaced by the store-original stream built by
+ * SZ_compress_args_int8_StoreOriData.
+ *
+ * Fixes relative to the previous revision:
+ *  - the fallback passed dataLength+2, which made the store-original routine
+ *    copy two bytes past the end of oriData; it now passes dataLength, like
+ *    the 2D variant.
+ *  - the flattened buffer is released before the fallback overwrites
+ *    *newByteData with its own allocation.
+ * */
+void SZ_compress_args_int8_NoCkRngeNoGzip_1D(unsigned char** newByteData, int8_t *oriData, 
+size_t dataLength, double realPrecision, size_t *outSize, int64_t valueRangeSize, int8_t minValue)
+{
+	TightDataPointStorageI* tdps = SZ_compress_int8_1D_MDQ(oriData, dataLength, realPrecision, valueRangeSize, minValue);
+	convertTDPStoFlatBytes_int(tdps, newByteData, outSize);
+	if(*outSize > dataLength*sizeof(int8_t))
+	{
+		//the compressed stream is discarded; StoreOriData allocates a new one
+		free(*newByteData);
+		*newByteData = NULL;
+		SZ_compress_args_int8_StoreOriData(oriData, dataLength, tdps, newByteData, outSize);
+	}
+	free_TightDataPointStorageI(tdps);
+}
+
+/**
+ * Prediction + quantization pass over a 2D int8 array (r1 rows of r2
+ * elements, see the index arithmetic below).
+ *
+ * P1 holds the reconstructed values of the previous row and P0 those of the
+ * row being built; the two are swapped after every row. Each element is
+ * predicted from already reconstructed neighbours. When the interval number
+ * llabs(diff)/realPrecision + 1 is below exe_params->intvCapacity the
+ * element gets a type code around exe_params->intvRadius and its
+ * reconstruction, clamped to [SZ_INT8_MIN, SZ_INT8_MAX], is stored in the
+ * row buffer; otherwise the type code is 0 and the value itself is appended
+ * to exactDataByteArray.
+ *
+ * NOTE(review): element 1 of row 0 is processed unconditionally (P1[1] and
+ * spaceFillingValue[1]), so r2 is assumed to be at least 2 -- confirm callers.
+ *
+ * @return a newly created TightDataPointStorageI (built by
+ *         new_TightDataPointStorageI); the caller releases it.
+ * */
+TightDataPointStorageI* SZ_compress_int8_2D_MDQ(int8_t *oriData, size_t r1, size_t r2, double realPrecision, int64_t valueRangeSize, int64_t minValue)
+{
+	unsigned char bytes[8] = {0,0,0,0,0,0,0,0};
+	int byteSize = computeByteSizePerIntValue(valueRangeSize);
+	
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_int8_2D(oriData, r1, r2, realPrecision);
+		updateQuantizationInfo(quantization_intervals);
+	}	
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	size_t i,j; 
+	int64_t pred1D, pred2D, curValue, tmp;
+	int diff = 0.0;
+	double itvNum = 0;
+	int8_t *P0, *P1;
+		
+	size_t dataLength = r1*r2;	
+	
+	/* two zero-initialised row buffers of reconstructed values */
+	P0 = (int8_t*)malloc(r2*sizeof(int8_t));
+	memset(P0, 0, r2*sizeof(int8_t));
+	P1 = (int8_t*)malloc(r2*sizeof(int8_t));
+	memset(P1, 0, r2*sizeof(int8_t));
+		
+	int* type = (int*) malloc(dataLength*sizeof(int));
+	//type[dataLength]=0;
+		
+	int8_t* spaceFillingValue = oriData; //
+	
+	DynamicByteArray *exactDataByteArray;
+	new_DBA(&exactDataByteArray, DynArrayInitLen);	
+
+	/* Process Row-0 data 0: always stored exactly */
+	type[0] = 0;
+	curValue = P1[0] = spaceFillingValue[0];
+	compressInt8Value(curValue, minValue, byteSize, bytes);
+	memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+
+	/* Process Row-0 data 1*/
+	pred1D = P1[0];
+	diff = spaceFillingValue[1] - pred1D;
+
+	itvNum =  llabs(diff)/realPrecision + 1;
+
+	if (itvNum < exe_params->intvCapacity)
+	{
+		if (diff < 0) itvNum = -itvNum;
+		type[1] = (int) (itvNum/2) + exe_params->intvRadius;
+		/* reconstructed value, clamped to the int8 range before it is stored */
+		tmp = pred1D + 2 * (type[1] - exe_params->intvRadius) * realPrecision;
+		if(tmp >= SZ_INT8_MIN&&tmp<SZ_INT8_MAX)
+			P1[1] = tmp;
+		else if(tmp < SZ_INT8_MIN)
+			P1[1] = SZ_INT8_MIN;
+		else
+			P1[1] = SZ_INT8_MAX;
+	}
+	else
+	{
+		/* unpredictable: type code 0, value kept exactly */
+		type[1] = 0;
+		curValue = P1[1] = spaceFillingValue[1];
+		compressInt8Value(curValue, minValue, byteSize, bytes);
+		memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+	}
+
+    /* Process Row-0 data 2 --> data r2-1 */
+	for (j = 2; j < r2; j++)
+	{
+		/* linear extrapolation from the two previous reconstructed values */
+		pred1D = 2*P1[j-1] - P1[j-2];
+		diff = spaceFillingValue[j] - pred1D;
+
+		itvNum = llabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[j] = (int) (itvNum/2) + exe_params->intvRadius;
+			tmp = pred1D + 2 * (type[j] - exe_params->intvRadius) * realPrecision;
+			if(tmp >= SZ_INT8_MIN&&tmp<SZ_INT8_MAX)
+				P1[j] = tmp;
+			else if(tmp < SZ_INT8_MIN)
+				P1[j] = SZ_INT8_MIN;
+			else
+				P1[j] = SZ_INT8_MAX;			
+		}
+		else
+		{
+			type[j] = 0;
+			curValue = P1[j] = spaceFillingValue[j];
+			compressInt8Value(curValue, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		}
+	}
+
+	/* Process Row-1 --> Row-r1-1 */
+	size_t index;
+	for (i = 1; i < r1; i++)
+	{	
+		/* Process row-i data 0 */
+		index = i*r2;
+		/* predicted by the first element of the previous row */
+		pred1D = P1[0];
+		diff = spaceFillingValue[index] - pred1D;
+
+		itvNum = llabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+			tmp = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			if(tmp >= SZ_INT8_MIN&&tmp<SZ_INT8_MAX)
+				P0[0] = tmp;
+			else if(tmp < SZ_INT8_MIN)
+				P0[0] = SZ_INT8_MIN;
+			else
+				P0[0] = SZ_INT8_MAX;			
+		}
+		else
+		{
+			type[index] = 0;
+			curValue = P0[0] = spaceFillingValue[index];
+			compressInt8Value(curValue, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		}
+									
+		/* Process row-i data 1 --> r2-1*/
+		for (j = 1; j < r2; j++)
+		{
+			index = i*r2+j;
+			/* 2D predictor: left + up - upper-left */
+			pred2D = P0[j-1] + P1[j] - P1[j-1];
+
+			diff = spaceFillingValue[index] - pred2D;
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				tmp = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_INT8_MIN&&tmp<SZ_INT8_MAX)
+					P0[j] = tmp;
+				else if(tmp < SZ_INT8_MIN)
+					P0[j] = SZ_INT8_MIN;
+				else
+					P0[j] = SZ_INT8_MAX;						
+			}
+			else
+			{
+				type[index] = 0;
+				curValue = P0[j] = spaceFillingValue[index];
+				compressInt8Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+		}
+
+		/* the row just built becomes the reference row for the next one */
+		int8_t *Pt;
+		Pt = P1;
+		P1 = P0;
+		P0 = Pt;
+	}
+	
+	/* NOTE(review): P0 is not freed when r2 == 1, which looks like a leak of
+	 * that buffer -- the reason for the guard is not visible here; confirm. */
+	if(r2!=1)
+		free(P0);
+	free(P1);			
+	
+	/* NOTE(review): this is the byte size of the exact-data array, whereas the
+	 * 1D variant passes size / byteSize -- confirm which one is expected. */
+	size_t exactDataNum = exactDataByteArray->size;
+	
+	TightDataPointStorageI* tdps;	
+			
+	new_TightDataPointStorageI(&tdps, dataLength, exactDataNum, byteSize, 
+			type, exactDataByteArray->array, exactDataByteArray->size,  
+			realPrecision, minValue, quantization_intervals, SZ_INT8);
+			
+	//free memory
+	free(type);	
+	free(exactDataByteArray); //only the wrapper is freed here; exactDataByteArray->array was passed to new_TightDataPointStorageI above
+	
+	return tdps;	
+}
+
+/**
+ * Compress a 2D int8 array with SZ_compress_int8_2D_MDQ and flatten the
+ * result into *newByteData / *outSize through convertTDPStoFlatBytes_int.
+ * If the flattened stream is larger than the raw input (r1*r2 bytes), the
+ * output is replaced by the store-original stream built by
+ * SZ_compress_args_int8_StoreOriData.
+ *
+ * Fix relative to the previous revision: the flattened buffer is released
+ * before the fallback overwrites *newByteData with its own allocation.
+ *
+ * Note: @r1 is high dimension
+ * 		 @r2 is low dimension 
+ * */
+void SZ_compress_args_int8_NoCkRngeNoGzip_2D(unsigned char** newByteData, int8_t *oriData, size_t r1, size_t r2, double realPrecision, size_t *outSize, 
+int64_t valueRangeSize, int8_t minValue)
+{
+	TightDataPointStorageI* tdps = SZ_compress_int8_2D_MDQ(oriData, r1, r2, realPrecision, valueRangeSize, minValue);
+
+	convertTDPStoFlatBytes_int(tdps, newByteData, outSize);
+
+	size_t dataLength = r1*r2;
+	if(*outSize>dataLength*sizeof(int8_t))
+	{
+		//the compressed stream is discarded; StoreOriData allocates a new one
+		free(*newByteData);
+		*newByteData = NULL;
+		SZ_compress_args_int8_StoreOriData(oriData, dataLength, tdps, newByteData, outSize);
+	}
+	
+	free_TightDataPointStorageI(tdps);	
+}
+
+/**
+ * Encode one int8 sample against its prediction (helper of SZ_compress_int8_3D_MDQ).
+ *
+ * If the number of quantization steps between @actual and @pred is below
+ * exe_params->intvCapacity, the quantization code goes to *typeSlot and the
+ * value implied by that code, clamped to [SZ_INT8_MIN, SZ_INT8_MAX], goes to
+ * *reconSlot. Otherwise *typeSlot becomes 0, *reconSlot receives @actual as is
+ * and the sample is appended to @exactDataByteArray (using @bytes as scratch).
+ */
+static void SZ_int8_3D_encodePoint(int8_t actual, int64_t pred, double realPrecision, int64_t minValue,
+		int byteSize, unsigned char *bytes, DynamicByteArray *exactDataByteArray, int *typeSlot, int8_t *reconSlot)
+{
+	int delta = actual - pred;
+	double steps = llabs(delta)/realPrecision + 1;
+
+	if (!(steps < exe_params->intvCapacity))
+	{
+		/* outside the quantization range: keep the sample verbatim */
+		int64_t exact = *reconSlot = actual;
+		*typeSlot = 0;
+		compressInt8Value(exact, minValue, byteSize, bytes);
+		memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		return;
+	}
+
+	if (delta < 0)
+		steps = -steps;
+
+	int code = (int) (steps/2) + exe_params->intvRadius;
+	int64_t recon = pred + 2 * (code - exe_params->intvRadius) * realPrecision;
+
+	*typeSlot = code;
+	if (recon < SZ_INT8_MIN)
+		*reconSlot = SZ_INT8_MIN;
+	else if (recon < SZ_INT8_MAX)
+		*reconSlot = recon;
+	else
+		*reconSlot = SZ_INT8_MAX;
+}
+
+/**
+ * Quantization-based encoding of a 3D int8 array (r1 x r2 x r3, r3 varying fastest
+ * in the index arithmetic below).
+ *
+ * Each sample is predicted from already-encoded neighbours: the previous
+ * sample(s) of the row, the previous row of the plane and the previous plane.
+ * Two r2*r3 buffers hold the reconstructed values of the previous (P1) and
+ * current (P0) plane; they are swapped after every plane.
+ *
+ * Returns the storage object built by new_TightDataPointStorageI from the
+ * per-sample codes and the exactly-stored samples.
+ *
+ * NOTE(review): sample 1 of the first row is processed unconditionally, so the
+ * code appears to assume r3 >= 2 -- confirm against callers.
+ */
+TightDataPointStorageI* SZ_compress_int8_3D_MDQ(int8_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, int64_t valueRangeSize, int64_t minValue)
+{
+	unsigned char bytes[8] = {0,0,0,0,0,0,0,0};
+	int byteSize = computeByteSizePerIntValue(valueRangeSize);
+
+	/* interval count: tuned from the data, or the globally configured capacity */
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_int8_3D(oriData, r1, r2, r3, realPrecision);
+		updateQuantizationInfo(quantization_intervals);
+	}
+	else
+	{
+		quantization_intervals = exe_params->intvCapacity;
+	}
+
+	size_t dataLength = r1*r2*r3;
+	size_t planeSize = r2*r3;
+	size_t layer, row, col;
+
+	int8_t *P0 = (int8_t*)malloc(planeSize*sizeof(int8_t));
+	int8_t *P1 = (int8_t*)malloc(planeSize*sizeof(int8_t));
+	int* type = (int*) malloc(dataLength*sizeof(int));
+
+	DynamicByteArray *exactDataByteArray;
+	new_DBA(&exactDataByteArray, DynArrayInitLen);
+
+	/////////////////////////// plane 0 (reconstruction kept in P1) ///////////////////////////
+
+	/* very first sample: always stored exactly */
+	type[0] = 0;
+	P1[0] = oriData[0];
+	compressInt8Value(oriData[0], minValue, byteSize, bytes);
+	memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+
+	/* row 0, sample 1: predicted by sample 0 */
+	SZ_int8_3D_encodePoint(oriData[1], P1[0], realPrecision, minValue,
+			byteSize, bytes, exactDataByteArray, &type[1], &P1[1]);
+
+	/* row 0, samples 2 .. r3-1: linear extrapolation of the two previous samples */
+	for (col = 2; col < r3; col++)
+	{
+		SZ_int8_3D_encodePoint(oriData[col], 2*P1[col-1] - P1[col-2], realPrecision, minValue,
+				byteSize, bytes, exactDataByteArray, &type[col], &P1[col]);
+	}
+
+	/* rows 1 .. r2-1 */
+	for (row = 1; row < r2; row++)
+	{
+		size_t rowStart = row*r3;
+
+		/* first sample of the row: predicted by the sample above it */
+		SZ_int8_3D_encodePoint(oriData[rowStart], P1[rowStart-r3], realPrecision, minValue,
+				byteSize, bytes, exactDataByteArray, &type[rowStart], &P1[rowStart]);
+
+		/* remaining samples: left + above - above-left */
+		for (col = 1; col < r3; col++)
+		{
+			size_t pos = rowStart + col;
+			SZ_int8_3D_encodePoint(oriData[pos], P1[pos-1] + P1[pos-r3] - P1[pos-r3-1], realPrecision, minValue,
+					byteSize, bytes, exactDataByteArray, &type[pos], &P1[pos]);
+		}
+	}
+
+	/////////////////////////// planes 1 .. r1-1 (current plane in P0, previous in P1) ///////////////////////////
+
+	for (layer = 1; layer < r1; layer++)
+	{
+		size_t pos = layer*planeSize;	/* index into oriData / type */
+		size_t planePos;		/* index inside the plane buffers */
+
+		/* row 0, sample 0: predicted by the same position in the previous plane */
+		SZ_int8_3D_encodePoint(oriData[pos], P1[0], realPrecision, minValue,
+				byteSize, bytes, exactDataByteArray, &type[pos], &P0[0]);
+
+		/* row 0, samples 1 .. r3-1 */
+		for (col = 1; col < r3; col++)
+		{
+			pos++;
+			SZ_int8_3D_encodePoint(oriData[pos], P0[col-1] + P1[col] - P1[col-1], realPrecision, minValue,
+					byteSize, bytes, exactDataByteArray, &type[pos], &P0[col]);
+		}
+
+		/* rows 1 .. r2-1 */
+		for (row = 1; row < r2; row++)
+		{
+			/* first sample of the row */
+			planePos = row*r3;
+			pos = layer*planeSize + planePos;
+			SZ_int8_3D_encodePoint(oriData[pos], P0[planePos-r3] + P1[planePos] - P1[planePos-r3], realPrecision, minValue,
+					byteSize, bytes, exactDataByteArray, &type[pos], &P0[planePos]);
+
+			/* remaining samples: seven-neighbour prediction across both planes */
+			for (col = 1; col < r3; col++)
+			{
+				pos++;
+				planePos = row*r3 + col;
+				SZ_int8_3D_encodePoint(oriData[pos],
+						P0[planePos-1] + P0[planePos-r3]+ P1[planePos] - P0[planePos-r3-1] - P1[planePos-r3] - P1[planePos-1] + P1[planePos-r3-1],
+						realPrecision, minValue, byteSize, bytes, exactDataByteArray, &type[pos], &P0[planePos]);
+			}
+		}
+
+		/* the plane just written becomes the "previous" plane */
+		int8_t *swap = P1;
+		P1 = P0;
+		P0 = swap;
+	}
+
+	/* NOTE(review): P0 is deliberately not freed when r2*r3 == 1 (kept as in the original) */
+	if(planeSize!=1)
+		free(P0);
+	free(P1);
+
+	size_t exactDataNum = exactDataByteArray->size;
+
+	TightDataPointStorageI* tdps;
+	new_TightDataPointStorageI(&tdps, dataLength, exactDataNum, byteSize,
+			type, exactDataByteArray->array, exactDataByteArray->size,
+			realPrecision, minValue, quantization_intervals, SZ_INT8);
+
+	/* only the container is freed here; exactDataByteArray->array was handed to tdps above */
+	free(type);
+	free(exactDataByteArray);
+
+	return tdps;
+}
+
+
+/**
+ * 3D int8 compression without range checking or a gzip stage.
+ *
+ * Encodes the r1 x r2 x r3 input with SZ_compress_int8_3D_MDQ, flattens the
+ * result into *newByteData / *outSize and, if that is larger than the raw
+ * input (r1*r2*r3 bytes), passes the same output arguments on to
+ * SZ_compress_args_int8_StoreOriData (presumably to keep the data verbatim).
+ */
+void SZ_compress_args_int8_NoCkRngeNoGzip_3D(unsigned char** newByteData, int8_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t *outSize, 
+int64_t valueRangeSize, int64_t minValue)
+{
+	const size_t elementCount = r1*r2*r3;
+	TightDataPointStorageI* storage = SZ_compress_int8_3D_MDQ(oriData, r1, r2, r3, realPrecision, valueRangeSize, minValue);
+
+	convertTDPStoFlatBytes_int(storage, newByteData, outSize);
+
+	/* the encoded stream is larger than the raw input */
+	if(elementCount*sizeof(int8_t) < *outSize)
+	{
+		SZ_compress_args_int8_StoreOriData(oriData, elementCount, storage, newByteData, outSize);
+	}
+
+	free_TightDataPointStorageI(storage);
+}
+
+
+/**
+ * Encode one int8 sample against its prediction (helper of SZ_compress_int8_4D_MDQ).
+ *
+ * If the number of quantization steps between @actual and @pred is below
+ * exe_params->intvCapacity, the quantization code goes to *typeSlot and the
+ * value implied by that code, clamped to [SZ_INT8_MIN, SZ_INT8_MAX], goes to
+ * *reconSlot. Otherwise *typeSlot becomes 0, *reconSlot receives @actual as is
+ * and the sample is appended to @exactDataByteArray (using @bytes as scratch).
+ */
+static void SZ_int8_4D_encodePoint(int8_t actual, int64_t pred, double realPrecision, int64_t minValue,
+		int byteSize, unsigned char *bytes, DynamicByteArray *exactDataByteArray, int *typeSlot, int8_t *reconSlot)
+{
+	int diff = actual - pred;
+	double itvNum = llabs(diff)/realPrecision + 1;
+
+	if (itvNum < exe_params->intvCapacity)
+	{
+		int64_t tmp;
+		if (diff < 0) itvNum = -itvNum;
+		*typeSlot = (int) (itvNum/2) + exe_params->intvRadius;
+		tmp = pred + 2 * (*typeSlot - exe_params->intvRadius) * realPrecision;
+		if(tmp >= SZ_INT8_MIN&&tmp<SZ_INT8_MAX)
+			*reconSlot = tmp;
+		else if(tmp < SZ_INT8_MIN)
+			*reconSlot = SZ_INT8_MIN;
+		else
+			*reconSlot = SZ_INT8_MAX;
+	}
+	else
+	{
+		/* outside the quantization range: keep this very sample verbatim */
+		int64_t curValue = *reconSlot = actual;
+		*typeSlot = 0;
+		compressInt8Value(curValue, minValue, byteSize, bytes);
+		memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+	}
+}
+
+/**
+ * Quantization-based encoding of a 4D int8 array (r1 x r2 x r3 x r4, r4 varying
+ * fastest in the index arithmetic below).
+ *
+ * The outermost dimension is handled as r1 independent 3D blocks: every block
+ * starts with an exactly-stored sample and is then predicted from
+ * already-encoded neighbours inside the block. Two r3*r4 buffers hold the
+ * reconstructed values of the previous (P1) and current (P0) plane.
+ *
+ * Fixes relative to the previous revision:
+ *  - samples that cannot be quantized are stored as spaceFillingValue[index];
+ *    the old code stored spaceFillingValue[0] for every such sample;
+ *  - the second sample of each block is compared with its own value; the old
+ *    code compared the first sample with itself, so the difference was always 0.
+ *
+ * Returns the storage object built by new_TightDataPointStorageI.
+ *
+ * NOTE(review): sample 1 of the first row of every block is processed
+ * unconditionally, so the code appears to assume r4 >= 2 -- confirm against callers.
+ */
+TightDataPointStorageI* SZ_compress_int8_4D_MDQ(int8_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, int64_t valueRangeSize, int64_t minValue)
+{
+	unsigned char bytes[8] = {0,0,0,0,0,0,0,0};
+	int byteSize = computeByteSizePerIntValue(valueRangeSize);
+	
+	/* interval count: tuned from the data, or the globally configured capacity */
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_int8_4D(oriData, r1, r2, r3, r4, realPrecision);
+		updateQuantizationInfo(quantization_intervals);
+	}	
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	size_t i,j,k,l;
+	int64_t pred1D, pred2D, pred3D, curValue;
+	int8_t *P0, *P1;
+		
+	size_t dataLength = r1*r2*r3*r4;		
+
+	size_t r234 = r2*r3*r4;
+	size_t r34 = r3*r4;
+
+	P0 = (int8_t*)malloc(r34*sizeof(int8_t));
+	P1 = (int8_t*)malloc(r34*sizeof(int8_t));
+	
+	int* type = (int*) malloc(dataLength*sizeof(int));
+
+	int8_t* spaceFillingValue = oriData; //
+	
+	DynamicByteArray *exactDataByteArray;
+	new_DBA(&exactDataByteArray, DynArrayInitLen);	
+
+	for (l = 0; l < r1; l++)
+	{
+
+		///////////////////////////	Process layer-0 ///////////////////////////
+		/* Process Row-0 data 0: always stored exactly */
+		size_t index = l*r234;
+		size_t index2D = 0;
+
+		type[index] = 0;
+		curValue = P1[index2D] = spaceFillingValue[index];
+		compressInt8Value(curValue, minValue, byteSize, bytes);
+		memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+
+		/* Process Row-0 data 1: predicted by data 0 */
+		index = l*r234+1;
+		index2D = 1;
+
+		pred1D = P1[index2D-1];
+		SZ_int8_4D_encodePoint(spaceFillingValue[index], pred1D, realPrecision, minValue,
+				byteSize, bytes, exactDataByteArray, &type[index], &P1[index2D]);
+
+		/* Process Row-0 data 2 --> data r4-1 */
+		for (j = 2; j < r4; j++)
+		{
+			index = l*r234+j;
+			index2D = j;
+
+			pred1D = 2*P1[index2D-1] - P1[index2D-2];
+			SZ_int8_4D_encodePoint(spaceFillingValue[index], pred1D, realPrecision, minValue,
+					byteSize, bytes, exactDataByteArray, &type[index], &P1[index2D]);
+		}
+
+		/* Process Row-1 --> Row-r3-1 */
+		for (i = 1; i < r3; i++)
+		{
+			/* Process row-i data 0 */
+			index = l*r234+i*r4;
+			index2D = i*r4;
+
+			pred1D = P1[index2D-r4];
+			SZ_int8_4D_encodePoint(spaceFillingValue[index], pred1D, realPrecision, minValue,
+					byteSize, bytes, exactDataByteArray, &type[index], &P1[index2D]);
+
+			/* Process row-i data 1 --> data r4-1*/
+			for (j = 1; j < r4; j++)
+			{
+				index = l*r234+i*r4+j;
+				index2D = i*r4+j;
+
+				pred2D = P1[index2D-1] + P1[index2D-r4] - P1[index2D-r4-1];
+				SZ_int8_4D_encodePoint(spaceFillingValue[index], pred2D, realPrecision, minValue,
+						byteSize, bytes, exactDataByteArray, &type[index], &P1[index2D]);
+			}
+		}
+
+
+		///////////////////////////	Process layer-1 --> layer-r2-1 ///////////////////////////
+
+		for (k = 1; k < r2; k++)
+		{
+			/* Process Row-0 data 0 */
+			index = l*r234+k*r34;
+			index2D = 0;
+
+			pred1D = P1[index2D];
+			SZ_int8_4D_encodePoint(spaceFillingValue[index], pred1D, realPrecision, minValue,
+					byteSize, bytes, exactDataByteArray, &type[index], &P0[index2D]);
+
+			/* Process Row-0 data 1 --> data r4-1 */
+			for (j = 1; j < r4; j++)
+			{
+				index = l*r234+k*r34+j;
+				index2D = j;
+
+				pred2D = P0[index2D-1] + P1[index2D] - P1[index2D-1];
+				SZ_int8_4D_encodePoint(spaceFillingValue[index], pred2D, realPrecision, minValue,
+						byteSize, bytes, exactDataByteArray, &type[index], &P0[index2D]);
+			}
+
+			/* Process Row-1 --> Row-r3-1 */
+			for (i = 1; i < r3; i++)
+			{
+				/* Process Row-i data 0 */
+				index = l*r234+k*r34+i*r4;
+				index2D = i*r4;
+
+				pred2D = P0[index2D-r4] + P1[index2D] - P1[index2D-r4];
+				SZ_int8_4D_encodePoint(spaceFillingValue[index], pred2D, realPrecision, minValue,
+						byteSize, bytes, exactDataByteArray, &type[index], &P0[index2D]);
+
+				/* Process Row-i data 1 --> data r4-1 */
+				for (j = 1; j < r4; j++)
+				{
+					index = l*r234+k*r34+i*r4+j;
+					index2D = i*r4+j;
+
+					pred3D = P0[index2D-1] + P0[index2D-r4]+ P1[index2D] - P0[index2D-r4-1] - P1[index2D-r4] - P1[index2D-1] + P1[index2D-r4-1];
+					SZ_int8_4D_encodePoint(spaceFillingValue[index], pred3D, realPrecision, minValue,
+							byteSize, bytes, exactDataByteArray, &type[index], &P0[index2D]);
+				}
+			}
+
+			/* the plane just written becomes the "previous" plane */
+			int8_t *Pt;
+			Pt = P1;
+			P1 = P0;
+			P0 = Pt;
+		}
+	}
+
+	free(P0);
+	free(P1);
+
+	size_t exactDataNum = exactDataByteArray->size;
+	
+	TightDataPointStorageI* tdps;	
+			
+	new_TightDataPointStorageI(&tdps, dataLength, exactDataNum, byteSize, 
+			type, exactDataByteArray->array, exactDataByteArray->size,  
+			realPrecision, minValue, quantization_intervals, SZ_INT8);
+			
+	//free memory
+	free(type);	
+	free(exactDataByteArray); //only the container; exactDataByteArray->array was handed to tdps above
+	
+	return tdps;	
+}
+
+/**
+ * 4D int8 compression without range checking or a gzip stage.
+ *
+ * Encodes the r1 x r2 x r3 x r4 input with SZ_compress_int8_4D_MDQ, flattens
+ * the result into *newByteData / *outSize and, if that is larger than the raw
+ * input (r1*r2*r3*r4 bytes), passes the same output arguments on to
+ * SZ_compress_args_int8_StoreOriData (presumably to keep the data verbatim).
+ */
+void SZ_compress_args_int8_NoCkRngeNoGzip_4D(unsigned char** newByteData, int8_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, 
+size_t *outSize, int64_t valueRangeSize, int64_t minValue)
+{
+	const size_t elementCount = r1*r2*r3*r4;
+	TightDataPointStorageI* storage = SZ_compress_int8_4D_MDQ(oriData, r1, r2, r3, r4, realPrecision, valueRangeSize, minValue);
+
+	convertTDPStoFlatBytes_int(storage, newByteData, outSize);
+
+	/* the encoded stream is larger than the raw input */
+	if(elementCount*sizeof(int8_t) < *outSize)
+	{
+		SZ_compress_args_int8_StoreOriData(oriData, elementCount, storage, newByteData, outSize);
+	}
+
+	free_TightDataPointStorageI(storage);
+}
+
+/**
+ * Emit the "all samples identical" encoding of an int8 array.
+ *
+ * Builds a storage object flagged allSameData whose only exact value is
+ * oriData[0] (one byte), flattens it into *newByteData and reports the
+ * flattened size through *outSize. The temporary storage object is released
+ * before returning.
+ */
+void SZ_compress_args_int8_withinRange(unsigned char** newByteData, int8_t *oriData, size_t dataLength, size_t *outSize)
+{
+	size_t flatSize;
+	int8_t representative = oriData[0];
+	TightDataPointStorageI* storage = (TightDataPointStorageI*) malloc(sizeof(TightDataPointStorageI));
+
+	storage->typeArray = NULL;
+	storage->allSameData = 1;
+	storage->isLossless = 0;
+	storage->dataSeriesLength = dataLength;
+
+	/* exactly one stored value, one byte wide */
+	storage->exactDataNum = 1;
+	storage->exactDataBytes_size = 1;
+	storage->exactDataBytes = (unsigned char*)malloc(sizeof(unsigned char));
+	memcpy(storage->exactDataBytes, &representative, 1);
+
+	convertTDPStoFlatBytes_int(storage, newByteData, &flatSize);
+	*outSize = flatSize;
+
+	free_TightDataPointStorageI(storage);
+}
+
+/**
+ * Compress an int8 array after computing its value range, without a gzip stage.
+ *
+ * The dimensionality is derived from which of r5..r2 are zero (r1 is the
+ * fastest-varying size passed to the per-dimension routines). When the value
+ * range does not exceed the derived precision, the "all same" encoding is used.
+ *
+ * Returns the status set by getRealPrecision_int, or SZ_DERR when five
+ * dimensions are requested (previously that case silently produced no output
+ * while still reporting success).
+ */
+int SZ_compress_args_int8_wRngeNoGzip(unsigned char** newByteData, int8_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio)
+{
+	int status = SZ_SCES;
+	size_t dataLength = computeDataLength(r5,r4,r3,r2,r1);
+	int64_t valueRangeSize = 0;
+	
+	int8_t minValue = computeRangeSize_int(oriData, SZ_INT8, dataLength, &valueRangeSize);
+	double realPrecision = getRealPrecision_int(valueRangeSize, errBoundMode, absErr_Bound, relBoundRatio, &status);
+		
+	if(valueRangeSize <= realPrecision)
+	{
+		SZ_compress_args_int8_withinRange(newByteData, oriData, dataLength, outSize);
+	}
+	else
+	{
+		if(r5==0&&r4==0&&r3==0&&r2==0)
+		{
+			SZ_compress_args_int8_NoCkRngeNoGzip_1D(newByteData, oriData, r1, realPrecision, outSize, valueRangeSize, minValue);
+		}
+		else if(r5==0&&r4==0&&r3==0)
+		{
+			SZ_compress_args_int8_NoCkRngeNoGzip_2D(newByteData, oriData, r2, r1, realPrecision, outSize, valueRangeSize, minValue);
+		}
+		else if(r5==0&&r4==0)
+		{
+			SZ_compress_args_int8_NoCkRngeNoGzip_3D(newByteData, oriData, r3, r2, r1, realPrecision, outSize, valueRangeSize, minValue);
+		}
+		else if(r5==0)
+		{
+			/* NOTE(review): 4D input is folded into a 3D call (r4*r3, r2, r1) instead of using the 4D routine -- confirm intentional */
+			SZ_compress_args_int8_NoCkRngeNoGzip_3D(newByteData, oriData, r4*r3, r2, r1, realPrecision, outSize, valueRangeSize, minValue);
+		}
+		else
+		{
+			/* same diagnostics as SZ_compress_args_int8 for unsupported 5D input */
+			printf("Error: doesn't support 5 dimensions for now.\n");
+			status = SZ_DERR; //dimension error
+		}
+	}
+	return status;
+}
+
+/**
+ * Top-level int8 compression entry point.
+ *
+ * Records the error-bound mode in confparams_cpr, derives the real precision
+ * (from the PSNR setting or via getRealPrecision_int), dispatches to the
+ * 1D..4D routine according to which of r2..r5 are zero, and finally either
+ * returns that result directly (SZ_BEST_SPEED) or passes it through
+ * zlib_compress5 (SZ_BEST_COMPRESSION / SZ_DEFAULT_COMPRESSION).
+ *
+ * On success *newByteData / *outSize describe the compressed stream.
+ * Returns SZ_DERR for 5D input and SZ_MERR for an unknown szMode; in both
+ * cases *newByteData and *outSize are left untouched and no intermediate
+ * buffer is leaked. (Previously the 5D path went on to use an uninitialised
+ * buffer pointer and the SZ_MERR path leaked the intermediate buffer.)
+ *
+ * NOTE(review): point-wise relative modes (errBoundMode >= PW_REL) terminate
+ * the process via exit(0); kept as is because callers may rely on it.
+ */
+int SZ_compress_args_int8(unsigned char** newByteData, int8_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio)
+{
+	confparams_cpr->errorBoundMode = errBoundMode;
+	
+	if(errBoundMode>=PW_REL)
+	{
+		printf("Error: Current SZ version doesn't support integer data compression with point-wise relative error bound being based on pwrType=AVG\n");
+		exit(0);
+		return SZ_NSCS;
+	}
+	int status = SZ_SCES;
+	size_t dataLength = computeDataLength(r5,r4,r3,r2,r1);
+	int64_t valueRangeSize = 0;
+
+	int8_t minValue = (int8_t)computeRangeSize_int(oriData, SZ_INT8, dataLength, &valueRangeSize);
+	double realPrecision = 0; 
+	
+	if(confparams_cpr->errorBoundMode==PSNR)
+	{
+		/* PSNR mode is converted into an absolute error bound */
+		confparams_cpr->errorBoundMode = ABS;
+		realPrecision = confparams_cpr->absErrBound = computeABSErrBoundFromPSNR(confparams_cpr->psnr, (double)confparams_cpr->predThreshold, (double)valueRangeSize);
+	}
+	else
+		realPrecision = getRealPrecision_int(valueRangeSize, errBoundMode, absErr_Bound, relBoundRatio, &status);
+
+	if(valueRangeSize <= realPrecision)
+	{
+		SZ_compress_args_int8_withinRange(newByteData, oriData, dataLength, outSize);
+	}
+	else
+	{
+		size_t tmpOutSize = 0;
+		unsigned char* tmpByteData = NULL;
+		if (r2==0)
+		{
+			SZ_compress_args_int8_NoCkRngeNoGzip_1D(&tmpByteData, oriData, r1, realPrecision, &tmpOutSize, valueRangeSize, minValue);
+		}
+		else
+		if (r3==0)
+		{
+			SZ_compress_args_int8_NoCkRngeNoGzip_2D(&tmpByteData, oriData, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, minValue);
+		}
+		else
+		if (r4==0)
+		{
+			SZ_compress_args_int8_NoCkRngeNoGzip_3D(&tmpByteData, oriData, r3, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, minValue);
+		}
+		else
+		if (r5==0)
+		{
+			SZ_compress_args_int8_NoCkRngeNoGzip_4D(&tmpByteData, oriData, r4, r3, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, minValue);
+		}
+		else
+		{
+			/* nothing was produced, so there is nothing to post-process */
+			printf("Error: doesn't support 5 dimensions for now.\n");
+			return SZ_DERR; //dimension error
+		}
+		//Call Gzip to do the further compression.
+		if(confparams_cpr->szMode==SZ_BEST_SPEED)
+		{
+			/* ownership of the intermediate buffer moves to the caller */
+			*outSize = tmpOutSize;
+			*newByteData = tmpByteData;
+		}
+		else if(confparams_cpr->szMode==SZ_BEST_COMPRESSION || confparams_cpr->szMode==SZ_DEFAULT_COMPRESSION)
+		{
+			*outSize = zlib_compress5(tmpByteData, tmpOutSize, newByteData, confparams_cpr->gzipMode);
+			free(tmpByteData);
+		}
+		else
+		{
+			printf("Error: Wrong setting of confparams_cpr->szMode in the int8_t compression.\n");
+			free(tmpByteData); /* not handed out, so release it here */
+			status = SZ_MERR; //mode error			
+		}
+	}
+
+	return status;
+}
diff --git a/thirdparty/SZ/sz/src/sz_interface.F90 b/thirdparty/SZ/sz/src/sz_interface.F90
new file mode 100644
index 0000000000000000000000000000000000000000..300a1c59f39debcd31fce6ddac6479ed7a20b0ed
--- /dev/null
+++ b/thirdparty/SZ/sz/src/sz_interface.F90
@@ -0,0 +1,1195 @@
+!  @file   sz_interface.F90
+!  @author Sheng Di (disheng222@gmail.com or sdi1@anl.gov)
+!  @date   June, 2016
+!  @ Mathematics and Computer Science (MCS)
+!  @ Argonne National Laboratory, Lemont, USA.
+!  @brief  The key Fortran binding file to connect C language and Fortran (Fortran part)
+
+
+MODULE SZ
+	use :: ISO_C_BINDING
+	! Generic SZ_Compress. Specific procedures are grouped by element kind
+	! (REAL_K4 / REAL_K8), by whether error-bound arguments are passed (_ARGS)
+	! and by the _Rev variants; each group covers the d1..d5 versions.
+	INTERFACE SZ_Compress
+		! REAL_K4
+		MODULE PROCEDURE &
+			SZ_Compress_d1_Fortran_REAL_K4, &
+			SZ_Compress_d2_Fortran_REAL_K4, &
+			SZ_Compress_d3_Fortran_REAL_K4, &
+			SZ_Compress_d4_Fortran_REAL_K4, &
+			SZ_Compress_d5_Fortran_REAL_K4
+		! REAL_K4 with explicit error-bound arguments
+		MODULE PROCEDURE &
+			SZ_Compress_d1_Fortran_REAL_K4_ARGS, &
+			SZ_Compress_d2_Fortran_REAL_K4_ARGS, &
+			SZ_Compress_d3_Fortran_REAL_K4_ARGS, &
+			SZ_Compress_d4_Fortran_REAL_K4_ARGS, &
+			SZ_Compress_d5_Fortran_REAL_K4_ARGS
+		! REAL_K8
+		MODULE PROCEDURE &
+			SZ_Compress_d1_Fortran_REAL_K8, &
+			SZ_Compress_d2_Fortran_REAL_K8, &
+			SZ_Compress_d3_Fortran_REAL_K8, &
+			SZ_Compress_d4_Fortran_REAL_K8, &
+			SZ_Compress_d5_Fortran_REAL_K8
+		! REAL_K8 with explicit error-bound arguments
+		MODULE PROCEDURE &
+			SZ_Compress_d1_Fortran_REAL_K8_ARGS, &
+			SZ_Compress_d2_Fortran_REAL_K8_ARGS, &
+			SZ_Compress_d3_Fortran_REAL_K8_ARGS, &
+			SZ_Compress_d4_Fortran_REAL_K8_ARGS, &
+			SZ_Compress_d5_Fortran_REAL_K8_ARGS
+		! REAL_K4, _Rev variants
+		MODULE PROCEDURE &
+			SZ_Compress_d1_Fortran_REAL_K4_Rev, &
+			SZ_Compress_d2_Fortran_REAL_K4_Rev, &
+			SZ_Compress_d3_Fortran_REAL_K4_Rev, &
+			SZ_Compress_d4_Fortran_REAL_K4_Rev, &
+			SZ_Compress_d5_Fortran_REAL_K4_Rev
+		! REAL_K4 with explicit error-bound arguments, _Rev variants
+		MODULE PROCEDURE &
+			SZ_Compress_d1_Fortran_REAL_K4_ARGS_Rev, &
+			SZ_Compress_d2_Fortran_REAL_K4_ARGS_Rev, &
+			SZ_Compress_d3_Fortran_REAL_K4_ARGS_Rev, &
+			SZ_Compress_d4_Fortran_REAL_K4_ARGS_Rev, &
+			SZ_Compress_d5_Fortran_REAL_K4_ARGS_Rev
+		! REAL_K8, _Rev variants
+		MODULE PROCEDURE &
+			SZ_Compress_d1_Fortran_REAL_K8_Rev, &
+			SZ_Compress_d2_Fortran_REAL_K8_Rev, &
+			SZ_Compress_d3_Fortran_REAL_K8_Rev, &
+			SZ_Compress_d4_Fortran_REAL_K8_Rev, &
+			SZ_Compress_d5_Fortran_REAL_K8_Rev
+		! REAL_K8 with explicit error-bound arguments, _Rev variants
+		MODULE PROCEDURE &
+			SZ_Compress_d1_Fortran_REAL_K8_ARGS_Rev, &
+			SZ_Compress_d2_Fortran_REAL_K8_ARGS_Rev, &
+			SZ_Compress_d3_Fortran_REAL_K8_ARGS_Rev, &
+			SZ_Compress_d4_Fortran_REAL_K8_ARGS_Rev, &
+			SZ_Compress_d5_Fortran_REAL_K8_ARGS_Rev
+	END INTERFACE SZ_Compress
+
+	! Generic SZ_Decompress: the d1..d5 specific procedures for REAL_K4 and REAL_K8.
+	INTERFACE SZ_Decompress
+		! REAL_K4
+		MODULE PROCEDURE &
+			SZ_Decompress_d1_Fortran_REAL_K4, &
+			SZ_Decompress_d2_Fortran_REAL_K4, &
+			SZ_Decompress_d3_Fortran_REAL_K4, &
+			SZ_Decompress_d4_Fortran_REAL_K4, &
+			SZ_Decompress_d5_Fortran_REAL_K4
+		! REAL_K8
+		MODULE PROCEDURE &
+			SZ_Decompress_d1_Fortran_REAL_K8, &
+			SZ_Decompress_d2_Fortran_REAL_K8, &
+			SZ_Decompress_d3_Fortran_REAL_K8, &
+			SZ_Decompress_d4_Fortran_REAL_K8, &
+			SZ_Decompress_d5_Fortran_REAL_K8
+	END INTERFACE SZ_Decompress
+
+	! Generic SZ_BatchAddVar: the d1..d5 specific procedures for REAL_K4 and REAL_K8.
+	INTERFACE SZ_BatchAddVar
+		! REAL_K4
+		MODULE PROCEDURE &
+			SZ_BatchAddVar_d1_Fortran_REAL_K4, &
+			SZ_BatchAddVar_d2_Fortran_REAL_K4, &
+			SZ_BatchAddVar_d3_Fortran_REAL_K4, &
+			SZ_BatchAddVar_d4_Fortran_REAL_K4, &
+			SZ_BatchAddVar_d5_Fortran_REAL_K4
+		! REAL_K8
+		MODULE PROCEDURE &
+			SZ_BatchAddVar_d1_Fortran_REAL_K8, &
+			SZ_BatchAddVar_d2_Fortran_REAL_K8, &
+			SZ_BatchAddVar_d3_Fortran_REAL_K8, &
+			SZ_BatchAddVar_d4_Fortran_REAL_K8, &
+			SZ_BatchAddVar_d5_Fortran_REAL_K8
+	END INTERFACE SZ_BatchAddVar
+
+	! Generic SZ_GetVarData: the d1..d5 specific procedures for REAL_K4 and REAL_K8.
+	INTERFACE SZ_GetVarData
+		! REAL_K4
+		MODULE PROCEDURE &
+			SZ_GetVarData_d1_Fortran_REAL_K4, &
+			SZ_GetVarData_d2_Fortran_REAL_K4, &
+			SZ_GetVarData_d3_Fortran_REAL_K4, &
+			SZ_GetVarData_d4_Fortran_REAL_K4, &
+			SZ_GetVarData_d5_Fortran_REAL_K4
+		! REAL_K8
+		MODULE PROCEDURE &
+			SZ_GetVarData_d1_Fortran_REAL_K8, &
+			SZ_GetVarData_d2_Fortran_REAL_K8, &
+			SZ_GetVarData_d3_Fortran_REAL_K8, &
+			SZ_GetVarData_d4_Fortran_REAL_K8, &
+			SZ_GetVarData_d5_Fortran_REAL_K8
+	END INTERFACE SZ_GetVarData
+
+	CONTAINS
+
+!Init and Finalize
+
+	! Initialise SZ from a configuration file by calling SZ_Init_c.
+	!   config_File : configuration file name; its length without trailing
+	!                 blanks is passed along. Declared assumed-length so that
+	!                 names of any length are accepted (it used to be fixed
+	!                 at 32 characters).
+	!   ierr        : status argument forwarded to SZ_Init_c
+	SUBROUTINE SZ_Init(config_File,ierr)
+		implicit none
+		CHARACTER(len=*) :: config_File
+		INTEGER :: ierr
+		CALL SZ_Init_c(config_File,len(trim(config_File)),ierr)
+	END SUBROUTINE SZ_Init
+	
+	! Shut SZ down; simply delegates to SZ_Finalize_c.
+	SUBROUTINE SZ_Finalize()
+		implicit none
+		CALL SZ_Finalize_c()
+	END SUBROUTINE SZ_Finalize
+
+	! Release the variable set through SZ_Freevarset_c.
+	!   mode : one of 0, 1, 2 or 3; forwarded unchanged
+	SUBROUTINE SZ_FREE_VARSET(mode)
+		implicit none
+		INTEGER :: mode
+		CALL SZ_Freevarset_c(mode)
+	END SUBROUTINE SZ_FREE_VARSET
+
+!batch-mode functions
+
+	! Remove a variable from the batch set through SZ_BatchDelVar_c.
+	!   varName : variable name; its length without trailing blanks is passed along
+	!   ierr    : status argument forwarded to SZ_BatchDelVar_c
+	SUBROUTINE SZ_BatchDelVar(varName, ierr)
+		implicit none
+		CHARACTER(len=*) :: varName
+		INTEGER :: ierr
+		INTEGER :: nameLen
+		nameLen = len_trim(varName)
+		CALL SZ_BatchDelVar_c(varName, nameLen, ierr)
+	END SUBROUTINE SZ_BatchDelVar
+
+	! Compress the whole batch variable set.
+	!   Bytes   : output buffer; (re)allocated here to the size reported by
+	!             compute_total_batch_size_c
+	!   OutSize : set by SZ_Batch_Compress_c
+	SUBROUTINE SZ_Batch_Compress(Bytes, OutSize)
+		implicit none
+		INTEGER(kind=1), DIMENSION(:), allocatable :: Bytes
+		INTEGER(kind=C_SIZE_T) :: OutSize
+		INTEGER(kind=C_SIZE_T) :: alloSize
+		
+		CALL compute_total_batch_size_c(alloSize)
+		! ALLOCATE on an already allocated array is an error, so drop a
+		! buffer left over from an earlier call first
+		if (allocated(Bytes)) deallocate(Bytes)
+		allocate(Bytes(alloSize)) !allocate the largest possible memory
+
+		CALL SZ_Batch_Compress_c(Bytes, OutSize)
+		
+	END SUBROUTINE SZ_Batch_Compress
+
+	! Decompress a batch stream; all three arguments are forwarded unchanged
+	! to SZ_Batch_Decompress_c.
+	SUBROUTINE SZ_Batch_Decompress(Bytes, OutSize, ierr)
+		implicit none
+		INTEGER :: ierr
+		INTEGER(kind=C_SIZE_T) :: OutSize
+		INTEGER(kind=1), DIMENSION(:), allocatable :: Bytes
+		CALL SZ_Batch_Decompress_c(Bytes, OutSize, ierr)
+	END SUBROUTINE SZ_Batch_Decompress
+
+!Compress functions that extract the dimension sizes and call C translation interface (single-precision)
+
+	! Compress a rank-1 REAL(KIND=4) array.
+	!   VAR     : data to compress
+	!   Bytes   : output buffer; (re)allocated here to 8*SIZE(VAR) elements
+	!   OutSize : set by SZ_Compress_d1_Float
+	SUBROUTINE SZ_Compress_d1_Fortran_REAL_K4(VAR, Bytes, OutSize)
+		implicit none
+		REAL(KIND=4), DIMENSION(:) :: VAR
+		INTEGER(kind=C_SIZE_T) :: OutSize, R1
+		INTEGER(kind=1), DIMENSION(:), allocatable :: Bytes
+		R1 = SIZE(VAR,1)
+		
+		! ALLOCATE on an already allocated array is an error, so drop a
+		! buffer left over from an earlier call first
+		if (allocated(Bytes)) deallocate(Bytes)
+		allocate(Bytes(8*R1)) !allocate the largest possible memory
+			
+		CALL SZ_Compress_d1_Float(VAR, Bytes, OutSize, R1)
+	
+	END SUBROUTINE SZ_Compress_d1_Fortran_REAL_K4
+
+	! Compress a rank-1 REAL(KIND=4) array with explicit error-bound settings.
+	!   VAR     : data to compress
+	!   Bytes   : output buffer; (re)allocated here to 8*SIZE(VAR) elements
+	!   OutSize : set by SZ_Compress_d1_Float_Args
+	!   ErrBoundMode, AbsErrBound, RelBoundRatio : forwarded unchanged
+	SUBROUTINE SZ_Compress_d1_Fortran_REAL_K4_ARGS(VAR, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio)
+		implicit none
+		REAL(KIND=4), DIMENSION(:) :: VAR
+		INTEGER(kind=C_SIZE_T) :: OutSize, R1
+		INTEGER(kind=4) :: ErrBoundMode
+		REAL(kind=4) :: AbsErrBound, RelBoundRatio
+		INTEGER(kind=1), DIMENSION(:), allocatable :: Bytes
+		R1 = SIZE(VAR,1)
+		
+		! ALLOCATE on an already allocated array is an error, so drop a
+		! buffer left over from an earlier call first
+		if (allocated(Bytes)) deallocate(Bytes)
+		allocate(Bytes(8*R1)) !allocate the largest possible memory
+		
+		CALL SZ_Compress_d1_Float_Args(VAR, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio, R1)
+	
+	END SUBROUTINE SZ_Compress_d1_Fortran_REAL_K4_ARGS
+
+	! Compress a rank-2 REAL(KIND=4) array.
+	!   VAR     : data to compress
+	!   Bytes   : output buffer; (re)allocated here to 8*SIZE(VAR) elements
+	!   OutSize : set by SZ_Compress_d2_Float
+	SUBROUTINE SZ_Compress_d2_Fortran_REAL_K4(VAR, Bytes, OutSize)
+		implicit none
+		REAL(KIND=4), DIMENSION(:,:) :: VAR
+		INTEGER(kind=C_SIZE_T) :: OutSize, R1, R2, R
+		INTEGER(kind=1), DIMENSION(:), allocatable :: Bytes
+		R1 = SIZE(VAR,1)
+		R2 = SIZE(VAR,2)
+
+		R = R1*R2
+		
+		! ALLOCATE on an already allocated array is an error, so drop a
+		! buffer left over from an earlier call first
+		if (allocated(Bytes)) deallocate(Bytes)
+		allocate(Bytes(8*R)) !allocate the largest possible memory
+	
+		CALL SZ_Compress_d2_Float(VAR, Bytes, OutSize, R1, R2)
+	
+	END SUBROUTINE SZ_Compress_d2_Fortran_REAL_K4
+
+	! Compress a rank-2 REAL(KIND=4) array with explicit error-bound settings.
+	!   VAR     : data to compress
+	!   Bytes   : output buffer; (re)allocated here to 8*SIZE(VAR) elements
+	!   OutSize : set by SZ_Compress_d2_Float_Args
+	!   ErrBoundMode, AbsErrBound, RelBoundRatio : forwarded unchanged
+	SUBROUTINE SZ_Compress_d2_Fortran_REAL_K4_ARGS(VAR, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio)
+		implicit none
+		REAL(KIND=4), DIMENSION(:,:) :: VAR
+		INTEGER(kind=4) :: ErrBoundMode
+		INTEGER(kind=C_SIZE_T) :: OutSize, R1, R2, R
+		REAL(kind=4) :: AbsErrBound, RelBoundRatio
+		INTEGER(kind=1), DIMENSION(:), allocatable :: Bytes
+		R1 = SIZE(VAR,1)
+		R2 = SIZE(VAR,2)
+		
+		R = R1*R2
+		
+		! ALLOCATE on an already allocated array is an error, so drop a
+		! buffer left over from an earlier call first
+		if (allocated(Bytes)) deallocate(Bytes)
+		allocate(Bytes(8*R))  !allocate the largest possible memory
+		
+		CALL SZ_Compress_d2_Float_Args(VAR, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio, R1, R2)
+	END SUBROUTINE SZ_Compress_d2_Fortran_REAL_K4_ARGS
+
+	SUBROUTINE SZ_Compress_d3_Fortran_REAL_K4(VAR, Bytes, OutSize)
+		! Allocates Bytes (8 bytes/element) and forwards all arguments plus the extents of VAR to SZ_Compress_d3_Float.
+		implicit none
+		REAL(KIND=4), DIMENSION(:,:,:) :: VAR
+		INTEGER(kind=C_SIZE_T) :: OutSize, R1, R2, R3, R
+		INTEGER(kind=1), DIMENSION(:), allocatable :: Bytes
+		R1 = SIZE(VAR,1)
+		R2 = SIZE(VAR,2)
+		R3 = SIZE(VAR,3)
+		R = R1*R2*R3
+		! ALLOCATE on an already-allocated array is a run-time error, so release any previous buffer first.
+		if (allocated(Bytes)) deallocate(Bytes)
+		allocate(Bytes(8*R)) ! 8 bytes per element of VAR
+		CALL SZ_Compress_d3_Float(VAR, Bytes, OutSize, R1, R2, R3)
+	END SUBROUTINE SZ_Compress_d3_Fortran_REAL_K4
+
+	SUBROUTINE SZ_Compress_d3_Fortran_REAL_K4_ARGS(VAR, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio)
+		! Allocates Bytes (8 bytes/element) and forwards all arguments plus the extents of VAR to SZ_Compress_d3_Float_Args.
+		implicit none
+		REAL(KIND=4), DIMENSION(:,:,:) :: VAR
+		INTEGER(kind=4) :: ErrBoundMode
+		INTEGER(kind=C_SIZE_T) :: OutSize, R1, R2, R3, R
+		REAL(kind=4) :: AbsErrBound, RelBoundRatio
+		INTEGER(kind=1), DIMENSION(:), allocatable :: Bytes
+		R1 = SIZE(VAR,1)
+		R2 = SIZE(VAR,2)
+		R3 = SIZE(VAR,3)
+		R = R1*R2*R3
+		! ALLOCATE on an already-allocated array is a run-time error, so release any previous buffer first.
+		if (allocated(Bytes)) deallocate(Bytes)
+		allocate(Bytes(8*R)) ! 8 bytes per element of VAR
+		CALL SZ_Compress_d3_Float_Args(VAR, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio, R1, R2, R3)
+	END SUBROUTINE SZ_Compress_d3_Fortran_REAL_K4_ARGS
+
+	SUBROUTINE SZ_Compress_d4_Fortran_REAL_K4(VAR, Bytes, OutSize)
+		! Allocates Bytes (8 bytes/element) and forwards all arguments plus the extents of VAR to SZ_Compress_d4_Float.
+		! NOTE(review): OutSize is only passed through here; presumably the C routine sets it - confirm.
+		implicit none
+		REAL(KIND=4), DIMENSION(:,:,:,:) :: VAR
+		INTEGER(kind=C_SIZE_T) :: OutSize, R1, R2, R3, R4, R
+		INTEGER(kind=1), DIMENSION(:), allocatable :: Bytes
+		R1 = SIZE(VAR,1)
+		R2 = SIZE(VAR,2)
+		R3 = SIZE(VAR,3)
+		R4 = SIZE(VAR,4)
+		R = R1*R2*R3*R4
+		! ALLOCATE on an already-allocated array is a run-time error, so release any previous buffer first.
+		if (allocated(Bytes)) deallocate(Bytes)
+		allocate(Bytes(8*R)) ! 8 bytes per element of VAR
+		CALL SZ_Compress_d4_Float(VAR, Bytes, OutSize, R1, R2, R3, R4)
+	END SUBROUTINE SZ_Compress_d4_Fortran_REAL_K4
+
+	SUBROUTINE SZ_Compress_d4_Fortran_REAL_K4_ARGS(VAR, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio)
+		! Allocates Bytes (8 bytes/element) and forwards all arguments plus the extents of VAR to SZ_Compress_d4_Float_Args.
+		implicit none
+		REAL(KIND=4), DIMENSION(:,:,:,:) :: VAR
+		INTEGER(kind=4) :: ErrBoundMode
+		INTEGER(kind=C_SIZE_T) :: OutSize, R1, R2, R3, R4, R
+		REAL(kind=4) :: AbsErrBound, RelBoundRatio
+		INTEGER(kind=1), DIMENSION(:), allocatable :: Bytes
+		R1 = SIZE(VAR,1)
+		R2 = SIZE(VAR,2)
+		R3 = SIZE(VAR,3)
+		R4 = SIZE(VAR,4)
+		R = R1*R2*R3*R4
+		! ALLOCATE on an already-allocated array is a run-time error, so release any previous buffer first.
+		if (allocated(Bytes)) deallocate(Bytes)
+		allocate(Bytes(8*R)) ! 8 bytes per element of VAR
+		CALL SZ_Compress_d4_Float_Args(VAR, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio, R1, R2, R3, R4)
+	END SUBROUTINE SZ_Compress_d4_Fortran_REAL_K4_ARGS
+
+	SUBROUTINE SZ_Compress_d5_Fortran_REAL_K4(VAR, Bytes, OutSize)
+		! Allocates Bytes (8 bytes/element) and forwards all arguments plus the extents of VAR to SZ_Compress_d5_Float.
+		implicit none
+		REAL(KIND=4), DIMENSION(:,:,:,:,:) :: VAR
+		INTEGER(kind=C_SIZE_T) :: OutSize, R1, R2, R3, R4, R5, R
+		INTEGER(kind=1), DIMENSION(:), allocatable :: Bytes
+		R1 = SIZE(VAR,1)
+		R2 = SIZE(VAR,2)
+		R3 = SIZE(VAR,3)
+		R4 = SIZE(VAR,4)
+		R5 = SIZE(VAR,5)
+		R = R1*R2*R3*R4*R5
+		! ALLOCATE on an already-allocated array is a run-time error, so release any previous buffer first.
+		if (allocated(Bytes)) deallocate(Bytes)
+		allocate(Bytes(8*R)) ! 8 bytes per element of VAR
+		CALL SZ_Compress_d5_Float(VAR, Bytes, OutSize, R1, R2, R3, R4, R5)
+	END SUBROUTINE SZ_Compress_d5_Fortran_REAL_K4
+
+	SUBROUTINE SZ_Compress_d5_Fortran_REAL_K4_ARGS(VAR, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio)
+		! Allocates Bytes (8 bytes/element) and forwards all arguments plus the extents of VAR to SZ_Compress_d5_Float_Args.
+		implicit none
+		REAL(KIND=4), DIMENSION(:,:,:,:,:) :: VAR
+		INTEGER(kind=4) :: ErrBoundMode
+		INTEGER(kind=C_SIZE_T) :: OutSize, R1, R2, R3, R4, R5, R
+		REAL(kind=4) :: AbsErrBound, RelBoundRatio
+		INTEGER(kind=1), DIMENSION(:), allocatable :: Bytes
+		R1 = SIZE(VAR,1)
+		R2 = SIZE(VAR,2)
+		R3 = SIZE(VAR,3)
+		R4 = SIZE(VAR,4)
+		R5 = SIZE(VAR,5)
+		R = R1*R2*R3*R4*R5
+		! ALLOCATE on an already-allocated array is a run-time error, so release any previous buffer first.
+		if (allocated(Bytes)) deallocate(Bytes)
+		allocate(Bytes(8*R)) ! 8 bytes per element of VAR
+		CALL SZ_Compress_d5_Float_Args(VAR, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio, R1, R2, R3, R4, R5)
+	END SUBROUTINE SZ_Compress_d5_Fortran_REAL_K4_ARGS
+
+!Compress functions that extract the dimension sizes and call C translation interfaces (double-precision)
+
+	SUBROUTINE SZ_Compress_d1_Fortran_REAL_K8(VAR, Bytes, OutSize)
+		! Allocates Bytes (8 bytes/element) and forwards all arguments plus the extent of VAR to SZ_Compress_d1_Double.
+		implicit none
+		REAL(KIND=8), DIMENSION(:) :: VAR
+		INTEGER(kind=C_SIZE_T) :: OutSize, R1
+		INTEGER(kind=1), DIMENSION(:), allocatable :: Bytes
+		R1 = SIZE(VAR,1)
+		if (allocated(Bytes)) deallocate(Bytes) ! ALLOCATE fails on an already-allocated array
+		allocate(Bytes(8*R1)) ! 8 bytes per element of VAR
+		CALL SZ_Compress_d1_Double(VAR, Bytes, OutSize, R1)
+	END SUBROUTINE SZ_Compress_d1_Fortran_REAL_K8
+
+	SUBROUTINE SZ_Compress_d1_Fortran_REAL_K8_ARGS(VAR, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio)
+		implicit none
+		REAL(KIND=8), DIMENSION(:) :: VAR ! forwarded with its extent to SZ_Compress_d1_Double_Args
+		INTEGER(kind=4) :: ErrBoundMode
+		INTEGER(kind=C_SIZE_T) :: OutSize, R1
+		REAL(kind=8) :: AbsErrBound, RelBoundRatio
+		INTEGER(kind=1), DIMENSION(:), allocatable :: Bytes ! (re)allocated below
+		R1 = SIZE(VAR,1)
+		if (allocated(Bytes)) deallocate(Bytes) ! ALLOCATE fails on an already-allocated array
+		allocate(Bytes(8*R1)) ! 8 bytes per element of VAR
+		CALL SZ_Compress_d1_Double_Args(VAR, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio, R1)
+	END SUBROUTINE SZ_Compress_d1_Fortran_REAL_K8_ARGS
+
+	SUBROUTINE SZ_Compress_d2_Fortran_REAL_K8(VAR, Bytes, OutSize)
+		! Allocates Bytes (8 bytes/element) and forwards all arguments plus the extents of VAR to SZ_Compress_d2_Double.
+		implicit none
+		REAL(KIND=8), DIMENSION(:,:) :: VAR
+		INTEGER(kind=C_SIZE_T) :: OutSize, R1, R2, R
+		INTEGER(kind=1), DIMENSION(:), allocatable :: Bytes
+		R1 = SIZE(VAR,1)
+		R2 = SIZE(VAR,2)
+		R = R1*R2
+		if (allocated(Bytes)) deallocate(Bytes) ! ALLOCATE fails on an already-allocated array
+		allocate(Bytes(8*R)) ! 8 bytes per element of VAR
+		CALL SZ_Compress_d2_Double(VAR, Bytes, OutSize, R1, R2)
+	END SUBROUTINE SZ_Compress_d2_Fortran_REAL_K8
+
+	SUBROUTINE SZ_Compress_d2_Fortran_REAL_K8_ARGS(VAR, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio)
+		! Allocates Bytes (8 bytes/element) and forwards all arguments plus the extents of VAR to SZ_Compress_d2_Double_Args.
+		implicit none
+		REAL(KIND=8), DIMENSION(:,:) :: VAR
+		INTEGER(kind=4) :: ErrBoundMode
+		INTEGER(kind=C_SIZE_T) :: OutSize, R1, R2, R
+		REAL(kind=8) :: AbsErrBound, RelBoundRatio
+		INTEGER(kind=1), DIMENSION(:), allocatable :: Bytes
+		R1 = SIZE(VAR,1)
+		R2 = SIZE(VAR,2)
+		R = R1*R2
+		if (allocated(Bytes)) deallocate(Bytes) ! ALLOCATE fails on an already-allocated array
+		allocate(Bytes(8*R)) ! 8 bytes per element of VAR
+		CALL SZ_Compress_d2_Double_Args(VAR, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio, R1, R2)
+	END SUBROUTINE SZ_Compress_d2_Fortran_REAL_K8_ARGS
+
+	SUBROUTINE SZ_Compress_d3_Fortran_REAL_K8(VAR, Bytes, OutSize)
+		! Allocates Bytes (8 bytes/element) and forwards all arguments plus the extents of VAR to SZ_Compress_d3_Double.
+		! NOTE(review): OutSize is only passed through here; presumably the C routine sets it - confirm.
+		implicit none
+		REAL(KIND=8), DIMENSION(:,:,:) :: VAR
+		INTEGER(kind=C_SIZE_T) :: OutSize, R1, R2, R3, R
+		INTEGER(kind=1), DIMENSION(:), allocatable :: Bytes
+		R1 = SIZE(VAR,1)
+		R2 = SIZE(VAR,2)
+		R3 = SIZE(VAR,3)
+		R = R1*R2*R3
+		! ALLOCATE on an already-allocated array is a run-time error, so release any previous buffer first.
+		if (allocated(Bytes)) deallocate(Bytes)
+		allocate(Bytes(8*R)) ! 8 bytes per element of VAR
+		CALL SZ_Compress_d3_Double(VAR, Bytes, OutSize, R1, R2, R3)
+	END SUBROUTINE SZ_Compress_d3_Fortran_REAL_K8
+
+	SUBROUTINE SZ_Compress_d3_Fortran_REAL_K8_ARGS(VAR, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio)
+		! Allocates Bytes (8 bytes/element) and forwards all arguments plus the extents of VAR to SZ_Compress_d3_Double_Args.
+		implicit none
+		REAL(KIND=8), DIMENSION(:,:,:) :: VAR
+		INTEGER(kind=4) :: ErrBoundMode
+		INTEGER(kind=C_SIZE_T) :: OutSize, R1, R2, R3, R
+		REAL(kind=8) :: AbsErrBound, RelBoundRatio
+		INTEGER(kind=1), DIMENSION(:), allocatable :: Bytes
+		R1 = SIZE(VAR,1)
+		R2 = SIZE(VAR,2)
+		R3 = SIZE(VAR,3)
+		R = R1*R2*R3
+		! ALLOCATE on an already-allocated array is a run-time error, so release any previous buffer first.
+		if (allocated(Bytes)) deallocate(Bytes)
+		allocate(Bytes(8*R)) ! 8 bytes per element of VAR
+		CALL SZ_Compress_d3_Double_Args(VAR, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio, R1, R2, R3)
+	END SUBROUTINE SZ_Compress_d3_Fortran_REAL_K8_ARGS
+
+	SUBROUTINE SZ_Compress_d4_Fortran_REAL_K8(VAR, Bytes, OutSize)
+		! Allocates Bytes (8 bytes/element) and forwards all arguments plus the extents of VAR to SZ_Compress_d4_Double.
+		! NOTE(review): OutSize is only passed through here; presumably the C routine sets it - confirm.
+		implicit none
+		REAL(KIND=8), DIMENSION(:,:,:,:) :: VAR
+		INTEGER(kind=C_SIZE_T) :: OutSize, R1, R2, R3, R4, R
+		INTEGER(kind=1), DIMENSION(:), allocatable :: Bytes
+		R1 = SIZE(VAR,1)
+		R2 = SIZE(VAR,2)
+		R3 = SIZE(VAR,3)
+		R4 = SIZE(VAR,4)
+		R = R1*R2*R3*R4
+		! ALLOCATE on an already-allocated array is a run-time error, so release any previous buffer first.
+		if (allocated(Bytes)) deallocate(Bytes)
+		allocate(Bytes(8*R)) ! 8 bytes per element of VAR
+		CALL SZ_Compress_d4_Double(VAR, Bytes, OutSize, R1, R2, R3, R4)
+	END SUBROUTINE SZ_Compress_d4_Fortran_REAL_K8
+
+	SUBROUTINE SZ_Compress_d4_Fortran_REAL_K8_ARGS(VAR, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio)
+		! Allocates Bytes (8 bytes/element) and forwards all arguments plus the extents of VAR to SZ_Compress_d4_Double_Args.
+		implicit none
+		REAL(KIND=8), DIMENSION(:,:,:,:) :: VAR
+		INTEGER(kind=4) :: ErrBoundMode ! kind=4 like every sibling *_ARGS routine (was kind=C_SIZE_T)
+		INTEGER(kind=C_SIZE_T) :: OutSize, R1, R2, R3, R4, R
+		REAL(kind=8) :: AbsErrBound, RelBoundRatio
+		INTEGER(kind=1), DIMENSION(:), allocatable :: Bytes
+		R1 = SIZE(VAR,1)
+		R2 = SIZE(VAR,2)
+		R3 = SIZE(VAR,3)
+		R4 = SIZE(VAR,4)
+		R = R1*R2*R3*R4
+		! ALLOCATE on an already-allocated array is a run-time error, so release any previous buffer first.
+		if (allocated(Bytes)) deallocate(Bytes)
+		allocate(Bytes(8*R)) ! 8 bytes per element of VAR
+		CALL SZ_Compress_d4_Double_Args(VAR, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio, R1, R2, R3, R4)
+	END SUBROUTINE SZ_Compress_d4_Fortran_REAL_K8_ARGS
+
+	SUBROUTINE SZ_Compress_d5_Fortran_REAL_K8(VAR, Bytes, OutSize)
+		! Allocates Bytes (8 bytes/element) and forwards all arguments plus the extents of VAR to SZ_Compress_d5_Double.
+		implicit none
+		REAL(KIND=8), DIMENSION(:,:,:,:,:) :: VAR
+		INTEGER(kind=C_SIZE_T) :: OutSize, R1, R2, R3, R4, R5, R
+		INTEGER(kind=1), DIMENSION(:), allocatable :: Bytes
+		R1 = SIZE(VAR,1)
+		R2 = SIZE(VAR,2)
+		R3 = SIZE(VAR,3)
+		R4 = SIZE(VAR,4)
+		R5 = SIZE(VAR,5)
+		R = R1*R2*R3*R4*R5
+		! ALLOCATE on an already-allocated array is a run-time error, so release any previous buffer first.
+		if (allocated(Bytes)) deallocate(Bytes)
+		allocate(Bytes(8*R)) ! 8 bytes per element of VAR
+		CALL SZ_Compress_d5_Double(VAR, Bytes, OutSize, R1, R2, R3, R4, R5)
+	END SUBROUTINE SZ_Compress_d5_Fortran_REAL_K8
+
+	SUBROUTINE SZ_Compress_d5_Fortran_REAL_K8_ARGS(VAR, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio)
+		! Allocates Bytes (8 bytes/element) and forwards all arguments plus the extents of VAR to SZ_Compress_d5_Double_Args.
+		implicit none
+		REAL(KIND=8), DIMENSION(:,:,:,:,:) :: VAR
+		INTEGER(kind=4) :: ErrBoundMode
+		INTEGER(kind=C_SIZE_T) :: OutSize, R1, R2, R3, R4, R5, R
+		REAL(kind=8) :: AbsErrBound, RelBoundRatio
+		INTEGER(kind=1), DIMENSION(:), allocatable :: Bytes
+		R1 = SIZE(VAR,1)
+		R2 = SIZE(VAR,2)
+		R3 = SIZE(VAR,3)
+		R4 = SIZE(VAR,4)
+		R5 = SIZE(VAR,5)
+		R = R1*R2*R3*R4*R5
+		! ALLOCATE on an already-allocated array is a run-time error, so release any previous buffer first.
+		if (allocated(Bytes)) deallocate(Bytes)
+		allocate(Bytes(8*R)) ! 8 bytes per element of VAR
+		CALL SZ_Compress_d5_Double_Args(VAR, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio, R1, R2, R3, R4, R5)
+	END SUBROUTINE SZ_Compress_d5_Fortran_REAL_K8_ARGS
+
+!Compression functions with reserved value (single-precision)
+
+	SUBROUTINE SZ_Compress_d1_Fortran_REAL_K4_Rev(VAR, ReValue, Bytes, OutSize)
+		! Allocates Bytes (8 bytes/element) and forwards all arguments plus the extent of VAR to SZ_Compress_d1_Float_Rev.
+		implicit none
+		REAL(KIND=4), DIMENSION(:) :: VAR
+		REAL(KIND=4) :: ReValue
+		INTEGER(kind=C_SIZE_T) :: OutSize, R1
+		INTEGER(kind=1), DIMENSION(:), allocatable :: Bytes
+		R1 = SIZE(VAR,1)
+		if (allocated(Bytes)) deallocate(Bytes) ! ALLOCATE fails on an already-allocated array
+		allocate(Bytes(8*R1)) ! 8 bytes per element of VAR
+		CALL SZ_Compress_d1_Float_Rev(VAR, ReValue, Bytes, OutSize, R1)
+	END SUBROUTINE SZ_Compress_d1_Fortran_REAL_K4_Rev
+
+	SUBROUTINE SZ_Compress_d1_Fortran_REAL_K4_ARGS_Rev(VAR, ReValue, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio)
+		! Allocates Bytes (8 bytes/element) and forwards all arguments plus the extent of VAR to SZ_Compress_d1_Float_Rev_Args.
+		implicit none
+		REAL(KIND=4), DIMENSION(:) :: VAR
+		REAL(KIND=4) :: ReValue
+		INTEGER(kind=4) :: ErrBoundMode
+		INTEGER(kind=C_SIZE_T) :: OutSize, R1
+		REAL(kind=4) :: AbsErrBound, RelBoundRatio
+		INTEGER(kind=1), DIMENSION(:), allocatable :: Bytes
+		R1 = SIZE(VAR,1)
+		if (allocated(Bytes)) deallocate(Bytes) ! ALLOCATE fails on an already-allocated array
+		allocate(Bytes(8*R1)) ! 8 bytes per element of VAR
+		CALL SZ_Compress_d1_Float_Rev_Args(VAR, ReValue, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio, R1)
+	END SUBROUTINE SZ_Compress_d1_Fortran_REAL_K4_ARGS_Rev
+
+	SUBROUTINE SZ_Compress_d2_Fortran_REAL_K4_Rev(VAR, ReValue, Bytes, OutSize)
+		! Allocates Bytes (8 bytes/element) and forwards all arguments plus the extents of VAR to SZ_Compress_d2_Float_Rev.
+		implicit none
+		REAL(KIND=4), DIMENSION(:,:) :: VAR
+		REAL(KIND=4) :: ReValue
+		INTEGER(kind=C_SIZE_T) :: OutSize, R1, R2, R
+		INTEGER(kind=1), DIMENSION(:), allocatable :: Bytes
+		R1 = SIZE(VAR,1)
+		R2 = SIZE(VAR,2)
+		R = R1*R2
+		if (allocated(Bytes)) deallocate(Bytes) ! ALLOCATE fails on an already-allocated array
+		allocate(Bytes(8*R)) ! 8 bytes per element of VAR
+		CALL SZ_Compress_d2_Float_Rev(VAR, ReValue, Bytes, OutSize, R1, R2)
+	END SUBROUTINE SZ_Compress_d2_Fortran_REAL_K4_Rev
+
+	SUBROUTINE SZ_Compress_d2_Fortran_REAL_K4_ARGS_Rev(VAR, ReValue, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio)
+		! Allocates Bytes (8 bytes/element) and forwards all arguments plus the extents of VAR to SZ_Compress_d2_Float_Rev_Args.
+		implicit none
+		REAL(KIND=4), DIMENSION(:,:) :: VAR
+		REAL(KIND=4) :: ReValue
+		INTEGER(kind=4) :: ErrBoundMode
+		INTEGER(kind=C_SIZE_T) :: OutSize, R1, R2, R
+		REAL(kind=4) :: AbsErrBound, RelBoundRatio
+		INTEGER(kind=1), DIMENSION(:), allocatable :: Bytes
+		R1 = SIZE(VAR,1)
+		R2 = SIZE(VAR,2)
+		R = R1*R2
+		if (allocated(Bytes)) deallocate(Bytes) ! ALLOCATE fails on an already-allocated array
+		allocate(Bytes(8*R)) ! 8 bytes per element of VAR
+		CALL SZ_Compress_d2_Float_Rev_Args(VAR, ReValue, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio, R1, R2)
+	END SUBROUTINE SZ_Compress_d2_Fortran_REAL_K4_ARGS_Rev
+
+	SUBROUTINE SZ_Compress_d3_Fortran_REAL_K4_Rev(VAR, ReValue, Bytes, OutSize)
+		! Allocates Bytes (8 bytes/element) and forwards all arguments plus the extents of VAR to SZ_Compress_d3_Float_Rev.
+		! NOTE(review): OutSize is only passed through here; presumably the C routine sets it - confirm.
+		implicit none
+		REAL(KIND=4), DIMENSION(:,:,:) :: VAR
+		REAL(KIND=4) :: ReValue
+		INTEGER(kind=C_SIZE_T) :: OutSize, R1, R2, R3, R
+		INTEGER(kind=1), DIMENSION(:), allocatable :: Bytes
+		R1 = SIZE(VAR,1)
+		R2 = SIZE(VAR,2)
+		R3 = SIZE(VAR,3)
+		R = R1*R2*R3
+		! ALLOCATE on an already-allocated array is a run-time error, so release any previous buffer first.
+		if (allocated(Bytes)) deallocate(Bytes)
+		allocate(Bytes(8*R)) ! 8 bytes per element of VAR
+		CALL SZ_Compress_d3_Float_Rev(VAR, ReValue, Bytes, OutSize, R1, R2, R3)
+	END SUBROUTINE SZ_Compress_d3_Fortran_REAL_K4_Rev
+
+	SUBROUTINE SZ_Compress_d3_Fortran_REAL_K4_ARGS_Rev(VAR, ReValue, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio)
+		! Allocates Bytes (8 bytes/element) and forwards all arguments plus the extents of VAR to SZ_Compress_d3_Float_Rev_Args.
+		implicit none
+		REAL(KIND=4), DIMENSION(:,:,:) :: VAR
+		REAL(KIND=4) :: ReValue
+		INTEGER(kind=4) :: ErrBoundMode
+		INTEGER(kind=C_SIZE_T) :: OutSize, R1, R2, R3, R
+		REAL(kind=4) :: AbsErrBound, RelBoundRatio
+		INTEGER(kind=1), DIMENSION(:), allocatable :: Bytes
+		R1 = SIZE(VAR,1)
+		R2 = SIZE(VAR,2)
+		R3 = SIZE(VAR,3)
+		R = R1*R2*R3
+		! ALLOCATE on an already-allocated array is a run-time error, so release any previous buffer first.
+		if (allocated(Bytes)) deallocate(Bytes)
+		allocate(Bytes(8*R)) ! 8 bytes per element of VAR
+		CALL SZ_Compress_d3_Float_Rev_Args(VAR, ReValue, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio, R1, R2, R3)
+	END SUBROUTINE SZ_Compress_d3_Fortran_REAL_K4_ARGS_Rev
+
+	SUBROUTINE SZ_Compress_d4_Fortran_REAL_K4_Rev(VAR, ReValue, Bytes, OutSize)
+		! Allocates Bytes (8 bytes/element) and forwards all arguments plus the extents of VAR to SZ_Compress_d4_Float_Rev.
+		! NOTE(review): OutSize is only passed through here; presumably the C routine sets it - confirm.
+		implicit none
+		REAL(KIND=4), DIMENSION(:,:,:,:) :: VAR
+		REAL(KIND=4) :: ReValue
+		INTEGER(kind=C_SIZE_T) :: OutSize, R1, R2, R3, R4, R
+		INTEGER(kind=1), DIMENSION(:), allocatable :: Bytes
+		R1 = SIZE(VAR,1)
+		R2 = SIZE(VAR,2)
+		R3 = SIZE(VAR,3)
+		R4 = SIZE(VAR,4)
+		R = R1*R2*R3*R4
+		! ALLOCATE on an already-allocated array is a run-time error, so release any previous buffer first.
+		if (allocated(Bytes)) deallocate(Bytes)
+		allocate(Bytes(8*R)) ! 8 bytes per element of VAR
+		CALL SZ_Compress_d4_Float_Rev(VAR, ReValue, Bytes, OutSize, R1, R2, R3, R4)
+	END SUBROUTINE SZ_Compress_d4_Fortran_REAL_K4_Rev
+
+	SUBROUTINE SZ_Compress_d4_Fortran_REAL_K4_ARGS_Rev(VAR, ReValue, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio)
+		! Allocates Bytes (8 bytes/element) and forwards all arguments plus the extents of VAR to SZ_Compress_d4_Float_Rev_Args.
+		implicit none
+		REAL(KIND=4), DIMENSION(:,:,:,:) :: VAR
+		REAL(KIND=4) :: ReValue
+		INTEGER(kind=4) :: ErrBoundMode
+		INTEGER(kind=C_SIZE_T) :: OutSize, R1, R2, R3, R4, R
+		REAL(kind=4) :: AbsErrBound, RelBoundRatio
+		INTEGER(kind=1), DIMENSION(:), allocatable :: Bytes
+		R1 = SIZE(VAR,1)
+		R2 = SIZE(VAR,2)
+		R3 = SIZE(VAR,3)
+		R4 = SIZE(VAR,4)
+		R = R1*R2*R3*R4
+		! ALLOCATE on an already-allocated array is a run-time error, so release any previous buffer first.
+		if (allocated(Bytes)) deallocate(Bytes)
+		allocate(Bytes(8*R)) ! 8 bytes per element of VAR
+		CALL SZ_Compress_d4_Float_Rev_Args(VAR, ReValue, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio, R1, R2, R3, R4)
+	END SUBROUTINE SZ_Compress_d4_Fortran_REAL_K4_ARGS_Rev
+
+	SUBROUTINE SZ_Compress_d5_Fortran_REAL_K4_Rev(VAR, ReValue, Bytes, OutSize)
+		! Allocates Bytes (8 bytes/element) and forwards all arguments plus the extents of VAR to SZ_Compress_d5_Float_Rev.
+		implicit none
+		REAL(KIND=4), DIMENSION(:,:,:,:,:) :: VAR
+		REAL(KIND=4) :: ReValue
+		INTEGER(kind=C_SIZE_T) :: OutSize, R1, R2, R3, R4, R5, R
+		INTEGER(kind=1), DIMENSION(:), allocatable :: Bytes
+		R1 = SIZE(VAR,1)
+		R2 = SIZE(VAR,2)
+		R3 = SIZE(VAR,3)
+		R4 = SIZE(VAR,4)
+		R5 = SIZE(VAR,5)
+		R = R1*R2*R3*R4*R5
+		! ALLOCATE on an already-allocated array is a run-time error, so release any previous buffer first.
+		if (allocated(Bytes)) deallocate(Bytes)
+		allocate(Bytes(8*R)) ! 8 bytes per element of VAR
+		CALL SZ_Compress_d5_Float_Rev(VAR, ReValue, Bytes, OutSize, R1, R2, R3, R4, R5)
+	END SUBROUTINE SZ_Compress_d5_Fortran_REAL_K4_Rev
+
+	SUBROUTINE SZ_Compress_d5_Fortran_REAL_K4_ARGS_Rev(VAR, ReValue, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio)
+		! Allocates Bytes (8 bytes/element) and forwards all arguments plus the extents of VAR to SZ_Compress_d5_Float_Rev_Args.
+		implicit none
+		REAL(KIND=4), DIMENSION(:,:,:,:,:) :: VAR
+		REAL(KIND=4) :: ReValue
+		INTEGER(kind=4) :: ErrBoundMode
+		INTEGER(kind=C_SIZE_T) :: OutSize, R1, R2, R3, R4, R5, R
+		REAL(kind=4) :: AbsErrBound, RelBoundRatio
+		INTEGER(kind=1), DIMENSION(:), allocatable :: Bytes
+		R1 = SIZE(VAR,1)
+		R2 = SIZE(VAR,2)
+		R3 = SIZE(VAR,3)
+		R4 = SIZE(VAR,4)
+		R5 = SIZE(VAR,5)
+		R = R1*R2*R3*R4*R5
+		! ALLOCATE on an already-allocated array is a run-time error, so release any previous buffer first.
+		if (allocated(Bytes)) deallocate(Bytes)
+		allocate(Bytes(8*R)) ! 8 bytes per element of VAR
+		CALL SZ_Compress_d5_Float_Rev_Args(VAR, ReValue, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio, R1, R2, R3, R4, R5)
+	END SUBROUTINE SZ_Compress_d5_Fortran_REAL_K4_ARGS_Rev
+
+!Compression functions with reserved value that extract the dimension sizes and call C translation interfaces (double-precision)
+
+	SUBROUTINE SZ_Compress_d1_Fortran_REAL_K8_Rev(VAR, ReValue, Bytes, OutSize)
+		! Allocates Bytes (8 bytes/element) and forwards all arguments plus the extent of VAR to SZ_Compress_d1_Double_Rev.
+		implicit none
+		REAL(KIND=8), DIMENSION(:) :: VAR
+		REAL(KIND=8) :: ReValue
+		INTEGER(kind=C_SIZE_T) :: OutSize, R1
+		INTEGER(kind=1), DIMENSION(:), allocatable :: Bytes
+		R1 = SIZE(VAR,1)
+		if (allocated(Bytes)) deallocate(Bytes) ! ALLOCATE fails on an already-allocated array
+		allocate(Bytes(8*R1)) ! 8 bytes per element of VAR
+		CALL SZ_Compress_d1_Double_Rev(VAR, ReValue, Bytes, OutSize, R1)
+	END SUBROUTINE SZ_Compress_d1_Fortran_REAL_K8_Rev
+
+	SUBROUTINE SZ_Compress_d1_Fortran_REAL_K8_ARGS_Rev(VAR, ReValue, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio)
+		! Allocates Bytes (8 bytes/element) and forwards all arguments plus the extent of VAR to SZ_Compress_d1_Double_Rev_Args.
+		implicit none
+		REAL(KIND=8), DIMENSION(:) :: VAR
+		REAL(KIND=8) :: ReValue
+		INTEGER(kind=4) :: ErrBoundMode
+		INTEGER(kind=C_SIZE_T) :: OutSize, R1
+		REAL(kind=8) :: AbsErrBound, RelBoundRatio
+		INTEGER(kind=1), DIMENSION(:), allocatable :: Bytes
+		R1 = SIZE(VAR,1)
+		if (allocated(Bytes)) deallocate(Bytes) ! ALLOCATE fails on an already-allocated array
+		allocate(Bytes(8*R1)) ! 8 bytes per element of VAR
+		CALL SZ_Compress_d1_Double_Rev_Args(VAR, ReValue, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio, R1)
+	END SUBROUTINE SZ_Compress_d1_Fortran_REAL_K8_ARGS_Rev
+
+	SUBROUTINE SZ_Compress_d2_Fortran_REAL_K8_Rev(VAR, ReValue, Bytes, OutSize)
+		! Allocates Bytes (8 bytes/element) and forwards all arguments plus the extents of VAR to SZ_Compress_d2_Double_Rev.
+		implicit none
+		REAL(KIND=8), DIMENSION(:,:) :: VAR
+		REAL(KIND=8) :: ReValue
+		INTEGER(kind=C_SIZE_T) :: OutSize, R1, R2, R
+		INTEGER(kind=1), DIMENSION(:), allocatable :: Bytes
+		R1 = SIZE(VAR,1)
+		R2 = SIZE(VAR,2)
+		R = R1*R2
+		if (allocated(Bytes)) deallocate(Bytes) ! ALLOCATE fails on an already-allocated array
+		allocate(Bytes(8*R)) ! 8 bytes per element of VAR
+		CALL SZ_Compress_d2_Double_Rev(VAR, ReValue, Bytes, OutSize, R1, R2)
+	END SUBROUTINE SZ_Compress_d2_Fortran_REAL_K8_Rev
+
+	SUBROUTINE SZ_Compress_d2_Fortran_REAL_K8_ARGS_Rev(VAR, ReValue, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio)
+		! Allocates Bytes (8 bytes/element) and forwards all arguments plus the extents of VAR to SZ_Compress_d2_Double_Rev_Args.
+		implicit none
+		REAL(KIND=8), DIMENSION(:,:) :: VAR
+		REAL(KIND=8) :: ReValue
+		INTEGER(kind=4) :: ErrBoundMode
+		INTEGER(kind=C_SIZE_T) :: OutSize, R1, R2, R
+		REAL(kind=8) :: AbsErrBound, RelBoundRatio
+		INTEGER(kind=1), DIMENSION(:), allocatable :: Bytes
+		R1 = SIZE(VAR,1)
+		R2 = SIZE(VAR,2)
+		R = R1*R2
+		if (allocated(Bytes)) deallocate(Bytes) ! ALLOCATE fails on an already-allocated array
+		allocate(Bytes(8*R)) ! 8 bytes per element of VAR
+		CALL SZ_Compress_d2_Double_Rev_Args(VAR, ReValue, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio, R1, R2)
+	END SUBROUTINE SZ_Compress_d2_Fortran_REAL_K8_ARGS_Rev
+
+	SUBROUTINE SZ_Compress_d3_Fortran_REAL_K8_Rev(VAR, ReValue, Bytes, OutSize)
+		! Allocates Bytes (8 bytes/element) and forwards all arguments plus the extents of VAR to SZ_Compress_d3_Double_Rev.
+		implicit none
+		REAL(KIND=8), DIMENSION(:,:,:) :: VAR
+		REAL(KIND=8) :: ReValue
+		INTEGER(kind=C_SIZE_T) :: OutSize, R1, R2, R3, R
+		INTEGER(kind=1), DIMENSION(:), allocatable :: Bytes
+		R1 = SIZE(VAR,1)
+		R2 = SIZE(VAR,2)
+		R3 = SIZE(VAR,3)
+		R = R1*R2*R3
+		! ALLOCATE on an already-allocated array is a run-time error, so release any previous buffer first.
+		if (allocated(Bytes)) deallocate(Bytes)
+		allocate(Bytes(8*R)) ! 8 bytes per element of VAR
+		CALL SZ_Compress_d3_Double_Rev(VAR, ReValue, Bytes, OutSize, R1, R2, R3)
+	END SUBROUTINE SZ_Compress_d3_Fortran_REAL_K8_Rev
+
+	SUBROUTINE SZ_Compress_d3_Fortran_REAL_K8_ARGS_Rev(VAR, ReValue, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio)
+		! Allocates Bytes (8 bytes/element) and forwards all arguments plus the extents of VAR to SZ_Compress_d3_Double_Rev_Args.
+		implicit none
+		REAL(KIND=8), DIMENSION(:,:,:) :: VAR
+		REAL(KIND=8) :: ReValue
+		INTEGER(kind=4) :: ErrBoundMode
+		INTEGER(kind=C_SIZE_T) :: OutSize, R1, R2, R3, R
+		REAL(kind=8) :: AbsErrBound, RelBoundRatio
+		INTEGER(kind=1), DIMENSION(:), allocatable :: Bytes
+		R1 = SIZE(VAR,1)
+		R2 = SIZE(VAR,2)
+		R3 = SIZE(VAR,3)
+		R = R1*R2*R3
+		! ALLOCATE on an already-allocated array is a run-time error, so release any previous buffer first.
+		if (allocated(Bytes)) deallocate(Bytes)
+		allocate(Bytes(8*R)) ! 8 bytes per element of VAR
+		CALL SZ_Compress_d3_Double_Rev_Args(VAR, ReValue, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio, R1, R2, R3)
+	END SUBROUTINE SZ_Compress_d3_Fortran_REAL_K8_ARGS_Rev
+
+	SUBROUTINE SZ_Compress_d4_Fortran_REAL_K8_Rev(VAR, ReValue, Bytes, OutSize)
+		! Allocates Bytes (8 bytes/element) and forwards all arguments plus the extents of VAR to SZ_Compress_d4_Double_Rev.
+		! NOTE(review): OutSize is only passed through here; presumably the C routine sets it - confirm.
+		implicit none
+		REAL(KIND=8), DIMENSION(:,:,:,:) :: VAR
+		REAL(KIND=8) :: ReValue
+		INTEGER(kind=C_SIZE_T) :: OutSize, R1, R2, R3, R4, R
+		INTEGER(kind=1), DIMENSION(:), allocatable :: Bytes
+		R1 = SIZE(VAR,1)
+		R2 = SIZE(VAR,2)
+		R3 = SIZE(VAR,3)
+		R4 = SIZE(VAR,4)
+		R = R1*R2*R3*R4
+		! ALLOCATE on an already-allocated array is a run-time error, so release any previous buffer first.
+		if (allocated(Bytes)) deallocate(Bytes)
+		allocate(Bytes(8*R)) ! 8 bytes per element of VAR
+		CALL SZ_Compress_d4_Double_Rev(VAR, ReValue, Bytes, OutSize, R1, R2, R3, R4)
+	END SUBROUTINE SZ_Compress_d4_Fortran_REAL_K8_Rev
+
+	SUBROUTINE SZ_Compress_d4_Fortran_REAL_K8_ARGS_Rev(VAR, ReValue, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio)
+		! Allocates Bytes (8 bytes/element) and forwards all arguments plus the extents of VAR to SZ_Compress_d4_Double_Rev_Args.
+		implicit none
+		REAL(KIND=8), DIMENSION(:,:,:,:) :: VAR
+		REAL(KIND=8) :: ReValue
+		INTEGER(kind=4) :: ErrBoundMode
+		INTEGER(kind=C_SIZE_T) :: OutSize, R1, R2, R3, R4, R
+		REAL(kind=8) :: AbsErrBound, RelBoundRatio
+		INTEGER(kind=1), DIMENSION(:), allocatable :: Bytes
+		R1 = SIZE(VAR,1)
+		R2 = SIZE(VAR,2)
+		R3 = SIZE(VAR,3)
+		R4 = SIZE(VAR,4)
+		R = R1*R2*R3*R4
+		! ALLOCATE on an already-allocated array is a run-time error, so release any previous buffer first.
+		if (allocated(Bytes)) deallocate(Bytes)
+		allocate(Bytes(8*R)) ! 8 bytes per element of VAR
+		CALL SZ_Compress_d4_Double_Rev_Args(VAR, ReValue, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio, R1, R2, R3, R4)
+	END SUBROUTINE SZ_Compress_d4_Fortran_REAL_K8_ARGS_Rev
+
+	SUBROUTINE SZ_Compress_d5_Fortran_REAL_K8_Rev(VAR, ReValue, Bytes, OutSize)
+		! Allocates Bytes (8 bytes/element) and forwards all arguments plus the extents of VAR to SZ_Compress_d5_Double_Rev.
+		implicit none
+		REAL(KIND=8), DIMENSION(:,:,:,:,:) :: VAR
+		REAL(KIND=8) :: ReValue
+		INTEGER(kind=C_SIZE_T) :: OutSize, R1, R2, R3, R4, R5, R
+		INTEGER(kind=1), DIMENSION(:), allocatable :: Bytes
+		R1 = SIZE(VAR,1)
+		R2 = SIZE(VAR,2)
+		R3 = SIZE(VAR,3)
+		R4 = SIZE(VAR,4)
+		R5 = SIZE(VAR,5)
+		R = R1*R2*R3*R4*R5
+		! ALLOCATE on an already-allocated array is a run-time error, so release any previous buffer first.
+		if (allocated(Bytes)) deallocate(Bytes)
+		allocate(Bytes(8*R)) ! 8 bytes per element of VAR
+		CALL SZ_Compress_d5_Double_Rev(VAR, ReValue, Bytes, OutSize, R1, R2, R3, R4, R5)
+	END SUBROUTINE SZ_Compress_d5_Fortran_REAL_K8_Rev
+
+	SUBROUTINE SZ_Compress_d5_Fortran_REAL_K8_ARGS_Rev(VAR, ReValue, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio)
+		! Allocates Bytes (8 bytes/element) and forwards all arguments plus the extents of VAR to SZ_Compress_d5_Double_Rev_Args.
+		implicit none
+		REAL(KIND=8), DIMENSION(:,:,:,:,:) :: VAR
+		REAL(KIND=8) :: ReValue
+		INTEGER(kind=4) :: ErrBoundMode
+		INTEGER(kind=C_SIZE_T) :: OutSize, R1, R2, R3, R4, R5, R
+		REAL(kind=8) :: AbsErrBound, RelBoundRatio
+		INTEGER(kind=1), DIMENSION(:), allocatable :: Bytes
+		R1 = SIZE(VAR,1)
+		R2 = SIZE(VAR,2)
+		R3 = SIZE(VAR,3)
+		R4 = SIZE(VAR,4)
+		R5 = SIZE(VAR,5)
+		R = R1*R2*R3*R4*R5
+		! ALLOCATE on an already-allocated array is a run-time error, so release any previous buffer first.
+		if (allocated(Bytes)) deallocate(Bytes)
+		allocate(Bytes(8*R)) ! 8 bytes per element of VAR
+		CALL SZ_Compress_d5_Double_Rev_Args(VAR, ReValue, Bytes, OutSize, ErrBoundMode, AbsErrBound, RelBoundRatio, R1, R2, R3, R4, R5)
+	END SUBROUTINE SZ_Compress_d5_Fortran_REAL_K8_ARGS_Rev
+
+!Decompress functions
+
+	SUBROUTINE SZ_Decompress_d1_Fortran_REAL_K4(Bytes, VAR, R1)
+		implicit none
+		INTEGER(kind=1), DIMENSION(:) :: Bytes ! input bytes, forwarded together with their count (BLength)
+		REAL(KIND=4), DIMENSION(:), allocatable :: VAR ! (re)allocated below to R1 elements
+		INTEGER(kind=C_SIZE_T) :: R1, BLength
+		BLength = SIZE(Bytes)
+		if (allocated(VAR)) deallocate(VAR) ! ALLOCATE fails on an already-allocated array
+		allocate(VAR(R1))
+		CALL SZ_Decompress_d1_Float(Bytes, BLength, VAR, R1)
+	END SUBROUTINE SZ_Decompress_d1_Fortran_REAL_K4
+
+	SUBROUTINE SZ_Decompress_d2_Fortran_REAL_K4(Bytes, VAR, R1, R2)
+		implicit none
+		INTEGER(kind=1), DIMENSION(:) :: Bytes ! input bytes, forwarded together with their count (BLength)
+		REAL(KIND=4), DIMENSION(:,:), allocatable :: VAR ! (re)allocated below to shape (R1,R2)
+		INTEGER(kind=C_SIZE_T) :: R1, R2, BLength
+		BLength = SIZE(Bytes)
+		if (allocated(VAR)) deallocate(VAR) ! ALLOCATE fails on an already-allocated array
+		allocate(VAR(R1,R2))
+		CALL SZ_Decompress_d2_Float(Bytes, BLength, VAR, R1, R2)
+	END SUBROUTINE SZ_Decompress_d2_Fortran_REAL_K4
+
+	SUBROUTINE SZ_Decompress_d3_Fortran_REAL_K4(Bytes, VAR, R1, R2, R3)
+		implicit none
+		INTEGER(kind=1), DIMENSION(:) :: Bytes ! input bytes, forwarded together with their count (BLength)
+		REAL(KIND=4), DIMENSION(:,:,:), allocatable :: VAR ! (re)allocated below to shape (R1,R2,R3)
+		INTEGER(kind=C_SIZE_T) :: R1, R2, R3, BLength
+		BLength = SIZE(Bytes)
+		if (allocated(VAR)) deallocate(VAR) ! ALLOCATE fails on an already-allocated array
+		allocate(VAR(R1,R2,R3))
+		CALL SZ_Decompress_d3_Float(Bytes, BLength, VAR, R1, R2, R3)
+	END SUBROUTINE SZ_Decompress_d3_Fortran_REAL_K4
+
+	SUBROUTINE SZ_Decompress_d4_Fortran_REAL_K4(Bytes, VAR, R1, R2, R3, R4)
+		implicit none
+		INTEGER(kind=1), DIMENSION(:) :: Bytes ! input bytes, forwarded together with their count (BLength)
+		REAL(KIND=4), DIMENSION(:,:,:,:), allocatable :: VAR ! (re)allocated below to shape (R1,R2,R3,R4)
+		INTEGER(kind=C_SIZE_T) :: R1, R2, R3, R4, BLength
+		BLength = SIZE(Bytes)
+		if (allocated(VAR)) deallocate(VAR) ! ALLOCATE fails on an already-allocated array
+		allocate(VAR(R1,R2,R3,R4))
+		CALL SZ_Decompress_d4_Float(Bytes, BLength, VAR, R1, R2, R3, R4)
+	END SUBROUTINE SZ_Decompress_d4_Fortran_REAL_K4
+
+	SUBROUTINE SZ_Decompress_d5_Fortran_REAL_K4(Bytes, VAR, R1, R2, R3, R4, R5)
+		implicit none
+		INTEGER(kind=1), DIMENSION(:) :: Bytes ! input bytes, forwarded together with their count (BLength)
+		REAL(KIND=4), DIMENSION(:,:,:,:,:), allocatable :: VAR ! (re)allocated below to shape (R1,R2,R3,R4,R5)
+		INTEGER(kind=C_SIZE_T) :: R1, R2, R3, R4, R5, BLength
+		BLength = SIZE(Bytes)
+		if (allocated(VAR)) deallocate(VAR) ! ALLOCATE fails on an already-allocated array
+		allocate(VAR(R1,R2,R3,R4,R5))
+		CALL SZ_Decompress_d5_Float(Bytes, BLength, VAR, R1, R2, R3, R4, R5)
+	END SUBROUTINE SZ_Decompress_d5_Fortran_REAL_K4
+
+	SUBROUTINE SZ_Decompress_d1_Fortran_REAL_K8(Bytes, VAR, R1)
+		implicit none
+		INTEGER(kind=1), DIMENSION(:) :: Bytes ! input bytes, forwarded together with their count (BLength)
+		REAL(KIND=8), DIMENSION(:), allocatable :: VAR ! (re)allocated below to R1 elements
+		INTEGER(kind=C_SIZE_T) :: R1, BLength
+		BLength = SIZE(Bytes)
+		if (allocated(VAR)) deallocate(VAR) ! ALLOCATE fails on an already-allocated array
+		allocate(VAR(R1))
+		CALL SZ_Decompress_d1_Double(Bytes, BLength, VAR, R1)
+	END SUBROUTINE SZ_Decompress_d1_Fortran_REAL_K8
+
+	SUBROUTINE SZ_Decompress_d2_Fortran_REAL_K8(Bytes, VAR, R1, R2)
+		implicit none
+		INTEGER(kind=1), DIMENSION(:) :: Bytes ! input bytes, forwarded together with their count (BLength)
+		REAL(KIND=8), DIMENSION(:,:), allocatable :: VAR ! (re)allocated below to shape (R1,R2)
+		INTEGER(kind=C_SIZE_T) :: R1, R2, BLength
+		BLength = SIZE(Bytes)
+		if (allocated(VAR)) deallocate(VAR) ! ALLOCATE fails on an already-allocated array
+		allocate(VAR(R1,R2))
+		CALL SZ_Decompress_d2_Double(Bytes, BLength, VAR, R1, R2)
+	END SUBROUTINE SZ_Decompress_d2_Fortran_REAL_K8
+
+	SUBROUTINE SZ_Decompress_d3_Fortran_REAL_K8(Bytes, VAR, R1, R2, R3)
+		implicit none
+		INTEGER(kind=1), DIMENSION(:) :: Bytes ! input bytes, forwarded together with their count (BLength)
+		REAL(KIND=8), DIMENSION(:,:,:), allocatable :: VAR ! (re)allocated below to shape (R1,R2,R3)
+		INTEGER(kind=C_SIZE_T) :: R1, R2, R3, BLength
+		BLength = SIZE(Bytes)
+		if (allocated(VAR)) deallocate(VAR) ! ALLOCATE fails on an already-allocated array
+		allocate(VAR(R1,R2,R3))
+		CALL SZ_Decompress_d3_Double(Bytes, BLength, VAR, R1, R2, R3)
+	END SUBROUTINE SZ_Decompress_d3_Fortran_REAL_K8
+
+	SUBROUTINE SZ_Decompress_d4_Fortran_REAL_K8(Bytes, VAR, R1, R2, R3, R4)
+		implicit none
+		INTEGER(kind=1), DIMENSION(:) :: Bytes ! input bytes, forwarded together with their count (BLength)
+		REAL(KIND=8), DIMENSION(:,:,:,:), allocatable :: VAR ! (re)allocated below to shape (R1,R2,R3,R4)
+		INTEGER(kind=C_SIZE_T) :: R1, R2, R3, R4, BLength
+		BLength = SIZE(Bytes)
+		if (allocated(VAR)) deallocate(VAR) ! ALLOCATE fails on an already-allocated array
+		allocate(VAR(R1,R2,R3,R4))
+		CALL SZ_Decompress_d4_Double(Bytes, BLength, VAR, R1, R2, R3, R4)
+	END SUBROUTINE SZ_Decompress_d4_Fortran_REAL_K8
+
+	SUBROUTINE SZ_Decompress_d5_Fortran_REAL_K8(Bytes, VAR, R1, R2, R3, R4, R5)
+		implicit none
+		INTEGER(kind=1), DIMENSION(:) :: Bytes ! input bytes, forwarded together with their count (BLength)
+		REAL(KIND=8), DIMENSION(:,:,:,:,:), allocatable :: VAR ! (re)allocated below to shape (R1,R2,R3,R4,R5)
+		INTEGER(kind=C_SIZE_T) :: R1, R2, R3, R4, R5, BLength
+		BLength = SIZE(Bytes, 1)
+		if (allocated(VAR)) deallocate(VAR) ! ALLOCATE fails on an already-allocated array
+		allocate(VAR(R1,R2,R3,R4,R5))
+		CALL SZ_Decompress_d5_Double(Bytes, BLength, VAR, R1, R2, R3, R4, R5)
+	END SUBROUTINE SZ_Decompress_d5_Fortran_REAL_K8
+
+!--------batch add float
+
+	SUBROUTINE SZ_BatchAddVar_d1_Fortran_REAL_K4(varName, VAR, ErrBoundMode, AbsErrBound, RelBoundRatio)
+		! Hands VAR, the trimmed length of varName, the error-bound arguments and the extent of VAR to SZ_batchAddVar_d1_Float.
+		implicit none
+		character(len=*) :: varName
+		real(kind=4), dimension(:) :: VAR
+		real(kind=4) :: AbsErrBound, RelBoundRatio
+		integer(kind=4) :: ErrBoundMode
+		integer(kind=C_SIZE_T) :: nElems
+		nElems = size(VAR)
+		call SZ_batchAddVar_d1_Float(varName, len_trim(varName), VAR, ErrBoundMode, AbsErrBound, RelBoundRatio, nElems)
+	END SUBROUTINE SZ_BatchAddVar_d1_Fortran_REAL_K4
+
+	SUBROUTINE SZ_BatchAddVar_d2_Fortran_REAL_K4(varName, VAR, ErrBoundMode, AbsErrBound, RelBoundRatio)
+		! Hands the 2-D single-precision array VAR to SZ_batchAddVar_d2_Float, together with
+		! the trimmed length of varName, the error-bound arguments and both extents of VAR.
+		implicit none
+		character(len=*) :: varName
+		real(kind=4), dimension(:,:) :: VAR
+		real(kind=4) :: AbsErrBound, RelBoundRatio
+		integer(kind=4) :: ErrBoundMode
+		integer(kind=C_SIZE_T) :: ext(2)
+		ext = shape(VAR)
+		call SZ_batchAddVar_d2_Float(varName, len_trim(varName), VAR, ErrBoundMode, AbsErrBound, RelBoundRatio, ext(1), ext(2))
+	END SUBROUTINE SZ_BatchAddVar_d2_Fortran_REAL_K4
+
+	SUBROUTINE SZ_BatchAddVar_d3_Fortran_REAL_K4(varName, VAR, ErrBoundMode, AbsErrBound, RelBoundRatio)
+		! Hands the 3-D single-precision array VAR to SZ_batchAddVar_d3_Float, together with
+		! the trimmed length of varName, the error-bound arguments and the three extents of VAR.
+		implicit none
+		character(len=*) :: varName
+		real(kind=4), dimension(:,:,:) :: VAR
+		real(kind=4) :: AbsErrBound, RelBoundRatio
+		integer(kind=4) :: ErrBoundMode
+		integer(kind=C_SIZE_T) :: ext(3)
+		ext = shape(VAR)
+		call SZ_batchAddVar_d3_Float(varName, len_trim(varName), VAR, ErrBoundMode, AbsErrBound, RelBoundRatio, &
+			ext(1), ext(2), ext(3))
+	END SUBROUTINE SZ_BatchAddVar_d3_Fortran_REAL_K4
+
+	SUBROUTINE SZ_BatchAddVar_d4_Fortran_REAL_K4(varName, VAR, ErrBoundMode, AbsErrBound, RelBoundRatio)
+		! Hands the 4-D single-precision array VAR to SZ_batchAddVar_d4_Float, together with
+		! the trimmed length of varName, the error-bound arguments and the four extents of VAR.
+		implicit none
+		character(len=*) :: varName
+		real(kind=4), dimension(:,:,:,:) :: VAR
+		real(kind=4) :: AbsErrBound, RelBoundRatio
+		integer(kind=4) :: ErrBoundMode
+		integer(kind=C_SIZE_T) :: ext(4)
+		! shape() yields every extent of VAR in one assignment
+		ext = shape(VAR)
+		call SZ_batchAddVar_d4_Float(varName, len_trim(varName), VAR, ErrBoundMode, AbsErrBound, RelBoundRatio, &
+			ext(1), ext(2), ext(3), ext(4))
+	END SUBROUTINE SZ_BatchAddVar_d4_Fortran_REAL_K4
+
+	SUBROUTINE SZ_BatchAddVar_d5_Fortran_REAL_K4(varName, VAR, ErrBoundMode, AbsErrBound, RelBoundRatio)
+		! Hands the 5-D single-precision array VAR to SZ_batchAddVar_d5_Float, together with
+		! the trimmed length of varName, the error-bound arguments and the five extents of VAR.
+		implicit none
+		character(len=*) :: varName
+		real(kind=4), dimension(:,:,:,:,:) :: VAR
+		real(kind=4) :: AbsErrBound, RelBoundRatio
+		integer(kind=4) :: ErrBoundMode
+		integer(kind=C_SIZE_T) :: ext(5)
+
+		! shape() yields every extent of VAR in one assignment
+		ext = shape(VAR)
+		call SZ_batchAddVar_d5_Float(varName, len_trim(varName), VAR, ErrBoundMode, AbsErrBound, RelBoundRatio, &
+			ext(1), ext(2), ext(3), ext(4), ext(5))
+	END SUBROUTINE SZ_BatchAddVar_d5_Fortran_REAL_K4
+
+!------batch add double
+	SUBROUTINE SZ_BatchAddVar_d1_Fortran_REAL_K8(varName, VAR, ErrBoundMode, AbsErrBound, RelBoundRatio)
+		! Hands VAR, the trimmed length of varName, the error-bound arguments and the extent of VAR to SZ_batchAddVar_d1_Double.
+		implicit none
+		character(len=*) :: varName
+		real(kind=8), dimension(:) :: VAR
+		real(kind=8) :: AbsErrBound, RelBoundRatio
+		integer(kind=4) :: ErrBoundMode
+		integer(kind=C_SIZE_T) :: nElems
+		nElems = size(VAR)
+		call SZ_batchAddVar_d1_Double(varName, len_trim(varName), VAR, ErrBoundMode, AbsErrBound, RelBoundRatio, nElems)
+	END SUBROUTINE SZ_BatchAddVar_d1_Fortran_REAL_K8
+
+	SUBROUTINE SZ_BatchAddVar_d2_Fortran_REAL_K8(varName, VAR, ErrBoundMode, AbsErrBound, RelBoundRatio)
+		! Hands the 2-D double-precision array VAR to SZ_batchAddVar_d2_Double, together with
+		! the trimmed length of varName, the error-bound arguments and both extents of VAR.
+		implicit none
+		character(len=*) :: varName
+		real(kind=8), dimension(:,:) :: VAR
+		real(kind=8) :: AbsErrBound, RelBoundRatio
+		integer(kind=4) :: ErrBoundMode
+		integer(kind=C_SIZE_T) :: ext(2)
+		ext = shape(VAR)
+		call SZ_batchAddVar_d2_Double(varName, len_trim(varName), VAR, ErrBoundMode, AbsErrBound, RelBoundRatio, ext(1), ext(2))
+	END SUBROUTINE SZ_BatchAddVar_d2_Fortran_REAL_K8
+
+	SUBROUTINE SZ_BatchAddVar_d3_Fortran_REAL_K8(varName, VAR, ErrBoundMode, AbsErrBound, RelBoundRatio)
+		! Hands the 3-D double-precision array VAR to SZ_batchAddVar_d3_Double, together with
+		! the trimmed length of varName, the error-bound arguments and the three extents of VAR.
+		implicit none
+		character(len=*) :: varName
+		real(kind=8), dimension(:,:,:) :: VAR
+		real(kind=8) :: AbsErrBound, RelBoundRatio
+		integer(kind=4) :: ErrBoundMode
+		integer(kind=C_SIZE_T) :: ext(3)
+		ext = shape(VAR)
+		call SZ_batchAddVar_d3_Double(varName, len_trim(varName), VAR, ErrBoundMode, AbsErrBound, RelBoundRatio, &
+			ext(1), ext(2), ext(3))
+	END SUBROUTINE SZ_BatchAddVar_d3_Fortran_REAL_K8
+
+	SUBROUTINE SZ_BatchAddVar_d4_Fortran_REAL_K8(varName, VAR, ErrBoundMode, AbsErrBound, RelBoundRatio)
+		! Hands the 4-D double-precision array VAR to SZ_batchAddVar_d4_Double, together with
+		! the trimmed length of varName, the error-bound arguments and the four extents of VAR.
+		implicit none
+		character(len=*) :: varName
+		real(kind=8), dimension(:,:,:,:) :: VAR
+		real(kind=8) :: AbsErrBound, RelBoundRatio
+		integer(kind=4) :: ErrBoundMode
+		integer(kind=C_SIZE_T) :: ext(4)
+		! shape() yields every extent of VAR in one assignment
+		ext = shape(VAR)
+		call SZ_batchAddVar_d4_Double(varName, len_trim(varName), VAR, ErrBoundMode, AbsErrBound, RelBoundRatio, &
+			ext(1), ext(2), ext(3), ext(4))
+	END SUBROUTINE SZ_BatchAddVar_d4_Fortran_REAL_K8
+
+	SUBROUTINE SZ_BatchAddVar_d5_Fortran_REAL_K8(varName, VAR, ErrBoundMode, AbsErrBound, RelBoundRatio)
+		! Hands the 5-D double-precision array VAR to SZ_batchAddVar_d5_Double, together with
+		! the trimmed length of varName, the error-bound arguments and the five extents of VAR.
+		implicit none
+		character(len=*) :: varName
+		real(kind=8), dimension(:,:,:,:,:) :: VAR
+		real(kind=8) :: AbsErrBound, RelBoundRatio
+		integer(kind=4) :: ErrBoundMode
+		integer(kind=C_SIZE_T) :: ext(5)
+
+		! shape() yields every extent of VAR in one assignment
+		ext = shape(VAR)
+		call SZ_batchAddVar_d5_Double(varName, len_trim(varName), VAR, ErrBoundMode, AbsErrBound, RelBoundRatio, &
+			ext(1), ext(2), ext(3), ext(4), ext(5))
+	END SUBROUTINE SZ_BatchAddVar_d5_Fortran_REAL_K8
+
+	SUBROUTINE SZ_GetVarDim(varName, DIMEN, R1, R2, R3, R4, R5)
+		! Thin wrapper: passes varName, its trimmed length and the six output
+		! arguments straight through to SZ_GetVarDim_c.
+		implicit none
+		integer(kind=C_SIZE_T), intent(out) :: R1, R2, R3, R4, R5
+		integer(kind=4), intent(out) :: DIMEN
+		character(len=*), intent(in) :: varName
+		call SZ_GetVarDim_c(varName, len_trim(varName), DIMEN, R1, R2, R3, R4, R5)
+	END SUBROUTINE SZ_GetVarDim
+
+	SUBROUTINE SZ_GetVarData_d1_Fortran_REAL_K4(varName, VAR)
+		! Passes varName, its trimmed length and the 1-D array VAR on to SZ_getVarData_Float.
+		! NOTE(review): VAR is not allocated here - presumably the caller sizes it first; confirm.
+		implicit none
+		real(kind=4), dimension(:), allocatable :: VAR
+		character(len=*), intent(in) :: varName
+		call SZ_getVarData_Float(varName, len_trim(varName), VAR)
+	END SUBROUTINE SZ_GetVarData_d1_Fortran_REAL_K4
+	
+	SUBROUTINE SZ_GetVarData_d2_Fortran_REAL_K4(varName, VAR)
+		! Passes varName, its trimmed length and the 2-D array VAR on to SZ_getVarData_Float.
+		! NOTE(review): VAR is not allocated here - presumably the caller sizes it first; confirm.
+		implicit none
+		real(kind=4), dimension(:,:), allocatable :: VAR
+		character(len=*), intent(in) :: varName
+		call SZ_getVarData_Float(varName, len_trim(varName), VAR)
+	END SUBROUTINE SZ_GetVarData_d2_Fortran_REAL_K4
+
+	SUBROUTINE SZ_GetVarData_d3_Fortran_REAL_K4(varName, VAR)
+		! Passes varName, its trimmed length and the 3-D array VAR on to SZ_getVarData_Float.
+		! NOTE(review): VAR is not allocated here - presumably the caller sizes it first; confirm.
+		implicit none
+		real(kind=4), dimension(:,:,:), allocatable :: VAR
+		character(len=*), intent(in) :: varName
+		call SZ_getVarData_Float(varName, len_trim(varName), VAR)
+	END SUBROUTINE SZ_GetVarData_d3_Fortran_REAL_K4
+
+	SUBROUTINE SZ_GetVarData_d4_Fortran_REAL_K4(varName, VAR)
+		! Passes varName, its trimmed length and the 4-D array VAR on to SZ_getVarData_Float.
+		! NOTE(review): VAR is not allocated here - presumably the caller sizes it first; confirm.
+		implicit none
+		real(kind=4), dimension(:,:,:,:), allocatable :: VAR
+		character(len=*), intent(in) :: varName
+		call SZ_getVarData_Float(varName, len_trim(varName), VAR)
+	END SUBROUTINE SZ_GetVarData_d4_Fortran_REAL_K4
+
+	SUBROUTINE SZ_GetVarData_d5_Fortran_REAL_K4(varName, VAR)
+		! Passes varName, its trimmed length and the 5-D array VAR on to SZ_getVarData_Float.
+		! NOTE(review): VAR is not allocated here - presumably the caller sizes it first; confirm.
+		implicit none
+		real(kind=4), dimension(:,:,:,:,:), allocatable :: VAR
+		character(len=*), intent(in) :: varName
+		call SZ_getVarData_Float(varName, len_trim(varName), VAR)
+	END SUBROUTINE SZ_GetVarData_d5_Fortran_REAL_K4
+
+	SUBROUTINE SZ_GetVarData_d1_Fortran_REAL_K8(varName, VAR)
+		! Passes varName, its trimmed length and the 1-D array VAR on to SZ_getVarData_Double.
+		! NOTE(review): VAR is not allocated here - presumably the caller sizes it first; confirm.
+		implicit none
+		real(kind=8), dimension(:), allocatable :: VAR
+		character(len=*), intent(in) :: varName
+		call SZ_getVarData_Double(varName, len_trim(varName), VAR)
+	END SUBROUTINE SZ_GetVarData_d1_Fortran_REAL_K8
+	
+	SUBROUTINE SZ_GetVarData_d2_Fortran_REAL_K8(varName, VAR)
+		! Passes varName, its trimmed length and the 2-D array VAR on to SZ_getVarData_Double.
+		! NOTE(review): VAR is not allocated here - presumably the caller sizes it first; confirm.
+		implicit none
+		real(kind=8), dimension(:,:), allocatable :: VAR
+		character(len=*), intent(in) :: varName
+		call SZ_getVarData_Double(varName, len_trim(varName), VAR)
+	END SUBROUTINE SZ_GetVarData_d2_Fortran_REAL_K8
+
+	SUBROUTINE SZ_GetVarData_d3_Fortran_REAL_K8(varName, VAR)
+		! Passes varName, its trimmed length and the 3-D array VAR on to SZ_getVarData_Double.
+		! NOTE(review): VAR is not allocated here - presumably the caller sizes it first; confirm.
+		implicit none
+		real(kind=8), dimension(:,:,:), allocatable :: VAR
+		character(len=*), intent(in) :: varName
+		call SZ_getVarData_Double(varName, len_trim(varName), VAR)
+	END SUBROUTINE SZ_GetVarData_d3_Fortran_REAL_K8
+
+	SUBROUTINE SZ_GetVarData_d4_Fortran_REAL_K8(varName, VAR)
+		! Passes varName, its trimmed length and the 4-D array VAR on to SZ_getVarData_Double.
+		! NOTE(review): VAR is not allocated here - presumably the caller sizes it first; confirm.
+		implicit none
+		real(kind=8), dimension(:,:,:,:), allocatable :: VAR
+		character(len=*), intent(in) :: varName
+		call SZ_getVarData_Double(varName, len_trim(varName), VAR)
+	END SUBROUTINE SZ_GetVarData_d4_Fortran_REAL_K8
+
+	SUBROUTINE SZ_GetVarData_d5_Fortran_REAL_K8(varName, VAR)
+		! Passes varName, its trimmed length and the 5-D array VAR on to SZ_getVarData_Double.
+		! NOTE(review): VAR is not allocated here - presumably the caller sizes it first; confirm.
+		implicit none
+		real(kind=8), dimension(:,:,:,:,:), allocatable :: VAR
+		character(len=*), intent(in) :: varName
+		call SZ_getVarData_Double(varName, len_trim(varName), VAR)
+	END SUBROUTINE SZ_GetVarData_d5_Fortran_REAL_K8
+
+END MODULE SZ
diff --git a/thirdparty/SZ/sz/src/sz_omp.c b/thirdparty/SZ/sz/src/sz_omp.c
new file mode 100644
index 0000000000000000000000000000000000000000..147245bab21aae1509a5eb712fa533f38ab35de3
--- /dev/null
+++ b/thirdparty/SZ/sz/src/sz_omp.c
@@ -0,0 +1,520 @@
+/**
+ *  @file sz_omp.c
+ *  @author Xin Liang
+ *  @date July, 2017
+ *  @brief the implementation of openMP version
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include "sz_omp.h"
+#include <math.h>
+#include <time.h>
+
+/* Placeholder for the 1D OpenMP compressor: does no work, leaves *comp_size
+ * untouched and always returns NULL. */
+unsigned char * SZ_compress_float_1D_MDQ_openmp(float *oriData, size_t r1, double realPrecision, size_t * comp_size){
+	unsigned char * compressed = NULL;
+	return compressed;
+}
+/* Placeholder for the 2D OpenMP compressor: does no work, leaves *comp_size
+ * untouched and always returns NULL. */
+unsigned char * SZ_compress_float_2D_MDQ_openmp(float *oriData, size_t r1, size_t r2, double realPrecision, size_t * comp_size){
+	unsigned char * compressed = NULL;
+	return compressed;
+}
+
+/**
+ * Compress a 3D float array with OpenMP, splitting it into a grid of blocks.
+ *
+ * The maximum thread count is rounded down to a power of two and factored
+ * into num_x * num_y * num_z blocks; each block is quantized with
+ * SZ_compress_float_3D_MDQ_RA_block and Huffman-encoded independently.
+ *
+ * Output stream, in the order written below:
+ *   header of (3 + 1 + MetaDataByteLength) bytes (initRandomAccessBytes),
+ *   thread_num (4), realPrecision (8), quantization_intervals (4),
+ *   treeByteSize (4), nodeCount (4), Huffman tree bytes,
+ *   per-block unpredictable counts, per-block means,
+ *   unpredictable values of all blocks, per-block encoded sizes (size_t),
+ *   encoded type bytes of all blocks.
+ *
+ * Blocks are indexed by the loop variable, not by omp_get_thread_num(), so
+ * every block is processed exactly once even if the runtime supplies fewer
+ * threads than requested or uses a non-static schedule.
+ *
+ * @param oriData        input array of r1*r2*r3 floats
+ * @param r1,r2,r3       dimensions (r3 varies fastest, see dim0/dim1 offsets)
+ * @param realPrecision  error bound forwarded to the block compressor
+ * @param comp_size      receives the number of bytes written
+ * @return malloc'ed buffer holding the stream; not freed here
+ *
+ * NOTE(review): allocation results are not checked, and per-block sizes are
+ * stored through a size_t pointer into the byte stream without alignment
+ * handling; the decompressor reads them the same way.
+ */
+unsigned char * SZ_compress_float_3D_MDQ_openmp(float *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t * comp_size){
+
+	double elapsed_time = 0.0;
+
+	elapsed_time = -omp_get_wtime();
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_float_3D_opt(oriData, r1, r2, r3, realPrecision);
+		printf("3D number of bins: %d\nerror bound %.20f\n", quantization_intervals, realPrecision);
+		updateQuantizationInfo(quantization_intervals);
+	}
+	else{
+		quantization_intervals = exe_params->intvCapacity;
+	}
+	elapsed_time += omp_get_wtime();
+	printf("opt interval time: %.4f\n", elapsed_time);
+
+	elapsed_time = -omp_get_wtime();
+	int thread_num = omp_get_max_threads();
+	int thread_order = (int)log2(thread_num);
+	size_t num_x = 0, num_y = 0, num_z = 0;
+	{
+		// distribute the power-of-two exponent over the three axes, x first
+		int block_thread_order = thread_order / 3;
+		switch(thread_order % 3){
+			case 0:{
+				num_x = 1 << block_thread_order;
+				num_y = 1 << block_thread_order;
+				num_z = 1 << block_thread_order;
+				break;
+			}
+			case 1:{
+				num_x = 1 << (block_thread_order + 1);
+				num_y = 1 << block_thread_order;
+				num_z = 1 << block_thread_order;
+				break;
+			}
+			case 2:{
+				num_x = 1 << (block_thread_order + 1);
+				num_y = 1 << (block_thread_order + 1);
+				num_z = 1 << block_thread_order;
+				break;
+			}
+		}
+		thread_num = num_x * num_y * num_z;
+	}
+	omp_set_num_threads(thread_num);
+	// calculate block dims
+	printf("number of blocks: %zu %zu %zu\n", num_x, num_y, num_z);
+
+	size_t split_index_x, split_index_y, split_index_z;
+	size_t early_blockcount_x, early_blockcount_y, early_blockcount_z;
+	size_t late_blockcount_x, late_blockcount_y, late_blockcount_z;
+	SZ_COMPUTE_BLOCKCOUNT(r1, num_x, split_index_x, early_blockcount_x, late_blockcount_x);
+	SZ_COMPUTE_BLOCKCOUNT(r2, num_y, split_index_y, early_blockcount_y, late_blockcount_y);
+	SZ_COMPUTE_BLOCKCOUNT(r3, num_z, split_index_z, early_blockcount_z, late_blockcount_z);
+
+	size_t max_num_block_elements = early_blockcount_x * early_blockcount_y * early_blockcount_z;
+	size_t num_blocks = num_x * num_y * num_z;
+	size_t num_elements = r1 * r2 * r3;
+
+	size_t dim0_offset = r2 * r3;
+	size_t dim1_offset = r3;
+
+	size_t buffer_size = early_blockcount_y * early_blockcount_z * sizeof(float);
+	int * result_type = (int *) malloc(num_elements * sizeof(int));
+	size_t unpred_data_max_size = max_num_block_elements;
+	float * result_unpredictable_data = (float *) malloc(unpred_data_max_size * sizeof(float) * num_blocks);
+	unsigned int * unpredictable_count = (unsigned int *) malloc(num_blocks * sizeof(unsigned int));
+	float * mean = malloc(num_blocks * sizeof(float));
+	float * buffer0, * buffer1;
+	buffer0 = (float *) malloc(buffer_size * thread_num);
+	buffer1 = (float *) malloc(buffer_size * thread_num);
+	unsigned char * result = (unsigned char *) malloc(num_elements * (sizeof(int) + sizeof(float)));
+	size_t * unpred_offset = (size_t *) malloc(num_blocks * sizeof(size_t));
+	unsigned char * encoding_buffer = (unsigned char *) malloc(max_num_block_elements * sizeof(int) * num_blocks);
+	size_t * block_offset = (size_t *) malloc(num_blocks * sizeof(size_t));
+	// one histogram of quantization_intervals*4 counters per block
+	size_t *freq = (size_t *)malloc(thread_num*quantization_intervals*4*sizeof(size_t));
+	memset(freq, 0, thread_num*quantization_intervals*4*sizeof(size_t));
+
+	size_t stateNum = quantization_intervals*2;
+	HuffmanTree* huffmanTree = createHuffmanTree(stateNum);
+
+	int num_yz = num_y * num_z;
+	// Stage 1: predict + quantize every block into result_type / unpredictable data.
+	#pragma omp parallel for
+	for(int t=0; t<thread_num; t++){
+		// decompose the linear block index into grid coordinates
+		int i = t/(num_yz);
+		int j = (t % num_yz) / num_z;
+		int k = t % num_z;
+		size_t offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+		size_t offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+		size_t offset_z = (k < split_index_z) ? k * early_blockcount_z : k * late_blockcount_z + split_index_z;
+		float * data_pos = oriData + offset_x * dim0_offset + offset_y * dim1_offset + offset_z;
+
+		size_t current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+		size_t current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+		size_t current_blockcount_z = (k < split_index_z) ? early_blockcount_z : late_blockcount_z;
+		size_t type_offset = offset_x * dim0_offset +  offset_y * current_blockcount_x * dim1_offset + offset_z * current_blockcount_x * current_blockcount_y;
+		int * type = result_type + type_offset;
+
+		float * unpredictable_data = result_unpredictable_data + t * unpred_data_max_size;
+		// per-block scratch planes carved out of the shared buffers
+		float *P0 = buffer0 + t * early_blockcount_y * early_blockcount_z;
+		float *P1 = buffer1 + t * early_blockcount_y * early_blockcount_z;
+		unpredictable_count[t] = SZ_compress_float_3D_MDQ_RA_block(data_pos, mean + t, r1, r2, r3, current_blockcount_x, current_blockcount_y, current_blockcount_z, realPrecision, P0, P1, type, unpredictable_data);
+	}
+	elapsed_time += omp_get_wtime();
+	printf("compression and quantization time: %.4f\n", elapsed_time);
+	elapsed_time = -omp_get_wtime();
+
+	// Stage 2: build one Huffman tree over all quantization codes.
+	size_t nodeCount = 0;
+	Huffman_init_openmp(huffmanTree, result_type, num_elements, thread_num, freq);
+	elapsed_time += omp_get_wtime();
+	printf("Build Huffman: %.4f\n", elapsed_time);
+	elapsed_time = -omp_get_wtime();
+	for (size_t i = 0; i < stateNum; i++)
+		if (huffmanTree->code[i]) nodeCount++;
+	nodeCount = nodeCount*2-1;
+	unsigned char *treeBytes;
+	unsigned int treeByteSize = convert_HuffTree_to_bytes_anyStates(huffmanTree, nodeCount, &treeBytes);
+
+	unsigned int meta_data_offset = 3 + 1 + MetaDataByteLength;
+	size_t total_unpred = 0;
+	for(size_t b=0; b<num_blocks; b++){
+		total_unpred += unpredictable_count[b];
+	}
+	unsigned char * result_pos = result;
+	initRandomAccessBytes(result_pos);
+	result_pos += meta_data_offset;
+
+	// Stage 3: fixed-size fields, tree, per-block counts and means.
+	intToBytes_bigEndian(result_pos, thread_num);
+	result_pos += 4;
+	doubleToBytes(result_pos, realPrecision);
+	result_pos += 8;
+	intToBytes_bigEndian(result_pos, quantization_intervals);
+	result_pos += 4;
+	intToBytes_bigEndian(result_pos, treeByteSize);
+	result_pos += 4;
+	intToBytes_bigEndian(result_pos, nodeCount);
+	result_pos += 4;
+	memcpy(result_pos, treeBytes, treeByteSize);
+	result_pos += treeByteSize;
+
+	memcpy(result_pos, unpredictable_count, num_blocks * sizeof(unsigned int));
+	result_pos += num_blocks * sizeof(unsigned int);
+	memcpy(result_pos, mean, num_blocks * sizeof(float));
+	result_pos += num_blocks * sizeof(float);
+
+	// Stage 4: pack each block's unpredictable values back to back.
+	unpred_offset[0] = 0;
+	for(int t=1; t<thread_num; t++){
+		unpred_offset[t] = unpredictable_count[t-1] + unpred_offset[t-1];
+	}
+	#pragma omp parallel for
+	for(int t=0; t<thread_num; t++){
+		float * unpredictable_data = result_unpredictable_data + t * unpred_data_max_size;
+		memcpy(result_pos + unpred_offset[t] * sizeof(float), unpredictable_data, unpredictable_count[t] * sizeof(float));
+	}
+	result_pos += total_unpred * sizeof(float);
+
+	elapsed_time += omp_get_wtime();
+	printf("write misc time: %.4f\n", elapsed_time);
+	elapsed_time = -omp_get_wtime();
+
+	// Stage 5: Huffman-encode each block into its own slice of encoding_buffer;
+	// the encoded size of block t is stored at block_pos[t] inside the stream.
+	size_t * block_pos = (size_t *) result_pos;
+	result_pos += num_blocks * sizeof(size_t);
+	#pragma omp parallel for
+	for(int t=0; t<thread_num; t++){
+		int i = t/(num_yz);
+		int j = (t % num_yz) / num_z;
+		int k = t % num_z;
+		unsigned char * encoding_buffer_pos = encoding_buffer + t * max_num_block_elements * sizeof(int);
+		size_t encoded_size = 0;
+		size_t offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+		size_t offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+		size_t offset_z = (k < split_index_z) ? k * early_blockcount_z : k * late_blockcount_z + split_index_z;
+		size_t current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+		size_t current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+		size_t current_blockcount_z = (k < split_index_z) ? early_blockcount_z : late_blockcount_z;
+		size_t current_block_elements = current_blockcount_x * current_blockcount_y * current_blockcount_z;
+		size_t type_offset = offset_x * dim0_offset +  offset_y * current_blockcount_x * dim1_offset + offset_z * current_blockcount_x * current_blockcount_y;
+		int * type = result_type + type_offset;
+		encode(huffmanTree, type, current_block_elements, encoding_buffer_pos, &encoded_size);
+		block_pos[t] = encoded_size;
+	}
+	elapsed_time += omp_get_wtime();
+	printf("Parallel Huffman encoding elapsed time: %.4f\n", elapsed_time);
+	elapsed_time = -omp_get_wtime();
+
+	// Stage 6: concatenate the encoded blocks after the size table.
+	block_offset[0] = 0;
+	for(int t=1; t<thread_num; t++){
+		block_offset[t] = block_pos[t-1] + block_offset[t-1];
+	}
+	#pragma omp parallel for
+	for(int t=0; t<thread_num; t++){
+		memcpy(result_pos + block_offset[t], encoding_buffer + t * max_num_block_elements * sizeof(int), block_pos[t]);
+	}
+	result_pos += block_offset[thread_num - 1] + block_pos[thread_num - 1];
+
+	elapsed_time += omp_get_wtime();
+	printf("Final copy elapsed time: %.4f\n", elapsed_time);
+
+	size_t totalEncodeSize = result_pos - result;
+	free(freq);
+	free(buffer0);
+	free(buffer1);
+	free(treeBytes);
+	free(unpred_offset);
+	free(block_offset);
+	free(encoding_buffer);
+	free(mean);
+	free(result_unpredictable_data);
+	free(unpredictable_count);
+	free(result_type);
+	SZ_ReleaseHuffman(huffmanTree);
+
+	*comp_size = totalEncodeSize;
+	return result;
+}
+
+/* Placeholder for the 1D OpenMP decompressor: intentionally does nothing
+ * and leaves *data untouched. */
+void decompressDataSeries_float_1D_openmp(float** data, size_t r1, unsigned char* comp_data){
+	return;
+}
+/* Placeholder for the 2D OpenMP decompressor: intentionally does nothing
+ * and leaves *data untouched. */
+void decompressDataSeries_float_2D_openmp(float** data, size_t r1, size_t r2, unsigned char* comp_data){
+	return;
+}
+
+
+/**
+ * Decompress a stream produced by SZ_compress_float_3D_MDQ_openmp.
+ *
+ * comp_data must point at the 4-byte thread count: the skip over the
+ * (3 + 1 + MetaDataByteLength)-byte header is commented out here.
+ * The block grid is rebuilt from that thread count exactly as in the
+ * compressor, then each block is Huffman-decoded and reconstructed with
+ * decompressDataSeries_float_3D_RA_block.
+ *
+ * Blocks are indexed by the loop variable rather than omp_get_thread_num(),
+ * and the loops run over num_blocks (the size of every per-block array), so
+ * all blocks are handled exactly once regardless of how many threads the
+ * runtime actually provides.
+ *
+ * Side effects: allocates confparams_dec / exe_params when they are NULL and
+ * zero-fills both in every call.
+ *
+ * @param data       receives a malloc'ed array of r1*r2*r3 floats (not freed here)
+ * @param r1,r2,r3   dimensions of the original array
+ * @param comp_data  compressed bytes, starting at the thread-count field
+ *
+ * NOTE(review): allocation results are not checked, and counts/sizes are read
+ * through typed pointers into the byte stream without alignment handling.
+ */
+void decompressDataSeries_float_3D_openmp(float** data, size_t r1, size_t r2, size_t r3, unsigned char* comp_data){
+
+	if(confparams_dec==NULL)
+		confparams_dec = (sz_params*)malloc(sizeof(sz_params));
+	memset(confparams_dec, 0, sizeof(sz_params));
+	if(exe_params==NULL)
+		exe_params = (sz_exedata*)malloc(sizeof(sz_exedata));
+	memset(exe_params, 0, sizeof(sz_exedata));
+
+	double elapsed_time = 0.0;
+	elapsed_time = -omp_get_wtime();
+
+	size_t dim0_offset = r2 * r3;
+	size_t dim1_offset = r3;
+	size_t num_elements = r1 * r2 * r3;
+
+	unsigned char * comp_data_pos = comp_data;
+
+	int thread_num = bytesToInt_bigEndian(comp_data_pos);
+	comp_data_pos += 4;
+	int thread_order = (int)log2(thread_num);
+	size_t num_x = 0, num_y = 0, num_z = 0;
+	{
+		// same exponent split as the compressor, x axis first
+		int block_thread_order = thread_order / 3;
+		switch(thread_order % 3){
+			case 0:{
+				num_x = 1 << block_thread_order;
+				num_y = 1 << block_thread_order;
+				num_z = 1 << block_thread_order;
+				break;
+			}
+			case 1:{
+				num_x = 1 << (block_thread_order + 1);
+				num_y = 1 << block_thread_order;
+				num_z = 1 << block_thread_order;
+				break;
+			}
+			case 2:{
+				num_x = 1 << (block_thread_order + 1);
+				num_y = 1 << (block_thread_order + 1);
+				num_z = 1 << block_thread_order;
+				break;
+			}
+		}
+	}
+	printf("number of blocks: %zu %zu %zu, thread_num %d\n", num_x, num_y, num_z, thread_num);
+	omp_set_num_threads(thread_num);
+	size_t split_index_x, split_index_y, split_index_z;
+	size_t early_blockcount_x, early_blockcount_y, early_blockcount_z;
+	size_t late_blockcount_x, late_blockcount_y, late_blockcount_z;
+	SZ_COMPUTE_BLOCKCOUNT(r1, num_x, split_index_x, early_blockcount_x, late_blockcount_x);
+	SZ_COMPUTE_BLOCKCOUNT(r2, num_y, split_index_y, early_blockcount_y, late_blockcount_y);
+	SZ_COMPUTE_BLOCKCOUNT(r3, num_z, split_index_z, early_blockcount_z, late_blockcount_z);
+
+	size_t num_blocks = num_x * num_y * num_z;
+	// loop bound for all per-block loops; equals thread_num for streams
+	// written by the compressor, and never exceeds the per-block array sizes
+	int block_total = (int)num_blocks;
+	size_t * unpred_offset = (size_t *) malloc(num_blocks * sizeof(size_t));
+	*data = (float*)malloc(sizeof(float)*num_elements);
+	int * result_type = (int *) malloc(num_elements * sizeof(int));
+	size_t * block_offset = (size_t *) malloc(num_blocks * sizeof(size_t));
+
+	double realPrecision = bytesToDouble(comp_data_pos);
+	comp_data_pos += 8;
+	unsigned int intervals = bytesToInt_bigEndian(comp_data_pos);
+	comp_data_pos += 4;
+
+	size_t stateNum = intervals*2;
+	HuffmanTree* huffmanTree = createHuffmanTree(stateNum);
+
+	updateQuantizationInfo(intervals);
+
+	unsigned int tree_size = bytesToInt_bigEndian(comp_data_pos);
+	comp_data_pos += 4;
+	size_t huffman_nodes = bytesToInt_bigEndian(comp_data_pos);
+	huffmanTree->allNodes = huffman_nodes;
+	// the tree bytes start right after the 4-byte node count
+	node root = reconstruct_HuffTree_from_bytes_anyStates(huffmanTree, comp_data_pos+4, huffmanTree->allNodes);
+
+	comp_data_pos += 4 + tree_size;
+	unsigned int * unpred_count = (unsigned int *) comp_data_pos;
+	comp_data_pos += num_blocks * sizeof(unsigned int);
+	float * mean_pos = (float *) comp_data_pos;
+	comp_data_pos += num_blocks * sizeof(float);
+	float * result_unpredictable_data = (float *) comp_data_pos;
+	// prefix sums locate each block's unpredictable values
+	size_t total_unpred = 0;
+	for(size_t b=0; b<num_blocks; b++){
+		unpred_offset[b] = total_unpred;
+		total_unpred += unpred_count[b];
+	}
+	comp_data_pos += total_unpred * sizeof(float);
+
+	// per-block encoded sizes, followed by the encoded bytes themselves
+	size_t * block_pos = (size_t *) comp_data_pos;
+	comp_data_pos += num_blocks * sizeof(size_t);
+	block_offset[0] = 0;
+	for(int t=1; t<block_total; t++){
+		block_offset[t] = block_pos[t-1] + block_offset[t-1];
+	}
+	int num_yz = num_y * num_z;
+	elapsed_time += omp_get_wtime();
+	printf("Read data info elapsed time: %.4f\n", elapsed_time);
+	elapsed_time = -omp_get_wtime();
+	// Stage 1: Huffman-decode the quantization codes of every block.
+	#pragma omp parallel for
+	for(int t=0; t<block_total; t++){
+		int i = t/(num_yz);
+		int j = (t % num_yz) / num_z;
+		int k = t % num_z;
+		size_t offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+		size_t offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+		size_t offset_z = (k < split_index_z) ? k * early_blockcount_z : k * late_blockcount_z + split_index_z;
+		size_t current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+		size_t current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+		size_t current_blockcount_z = (k < split_index_z) ? early_blockcount_z : late_blockcount_z;
+		size_t type_offset = offset_x * dim0_offset +  offset_y * current_blockcount_x * dim1_offset + offset_z * current_blockcount_x * current_blockcount_y;
+		int * type = result_type + type_offset;
+		decode(comp_data_pos + block_offset[t], current_blockcount_x*current_blockcount_y*current_blockcount_z, root, type);
+	}
+	elapsed_time += omp_get_wtime();
+	printf("Parallel Huffman decoding elapsed time: %.4f\n", elapsed_time);
+	elapsed_time = -omp_get_wtime();
+
+	// Stage 2: reconstruct the float values of every block.
+	#pragma omp parallel for
+	for(int t=0; t<block_total; t++){
+		int i = t/(num_yz);
+		int j = (t % num_yz) / num_z;
+		int k = t % num_z;
+		size_t offset_x = (i < split_index_x) ? i * early_blockcount_x : i * late_blockcount_x + split_index_x;
+		size_t offset_y = (j < split_index_y) ? j * early_blockcount_y : j * late_blockcount_y + split_index_y;
+		size_t offset_z = (k < split_index_z) ? k * early_blockcount_z : k * late_blockcount_z + split_index_z;
+		float * data_pos = *data + offset_x * dim0_offset + offset_y * dim1_offset + offset_z;
+
+		size_t current_blockcount_x = (i < split_index_x) ? early_blockcount_x : late_blockcount_x;
+		size_t current_blockcount_y = (j < split_index_y) ? early_blockcount_y : late_blockcount_y;
+		size_t current_blockcount_z = (k < split_index_z) ? early_blockcount_z : late_blockcount_z;
+		size_t type_offset = offset_x * dim0_offset +  offset_y * current_blockcount_x * dim1_offset + offset_z * current_blockcount_x * current_blockcount_y;
+		int * type = result_type + type_offset;
+
+		float * unpredictable_data = result_unpredictable_data + unpred_offset[t];
+		float mean = mean_pos[t];
+		decompressDataSeries_float_3D_RA_block(data_pos, mean, r1, r2, r3, current_blockcount_x, current_blockcount_y, current_blockcount_z, realPrecision, type, unpredictable_data);
+	}
+	elapsed_time += omp_get_wtime();
+	printf("Parallel decompress elapsed time: %.4f\n", elapsed_time);
+
+	free(block_offset);
+	free(result_type);
+	free(unpred_offset);
+	SZ_ReleaseHuffman(huffmanTree);
+}
+
+/**
+ * Build the Huffman tree for symbol array s using per-chunk histograms
+ * computed in parallel.
+ *
+ * The input is cut into thread_num chunks of ceil(length / thread_num)
+ * symbols; chunk t counts into freq[t * allNodes .. (t+1) * allNodes).
+ * The histograms are then summed into the first one and fed to the
+ * priority queue, exactly like the serial tree construction that follows.
+ *
+ * Chunk bounds are clamped to length, so short inputs (fewer symbols than
+ * (thread_num-1) chunks) and length == 0 no longer read past the end of s.
+ * Chunks are indexed by the loop variable, so every chunk is counted once
+ * regardless of how many threads actually run.
+ *
+ * @param huffmanTree tree to populate; its allNodes field is the histogram stride
+ * @param s           symbols, each used as an index into a histogram
+ * @param length      number of symbols in s
+ * @param thread_num  number of chunks / histograms
+ * @param freq        caller-provided, zero-initialised buffer of
+ *                    thread_num * allNodes counters (not freed here)
+ */
+void Huffman_init_openmp(HuffmanTree* huffmanTree, int *s, size_t length, int thread_num, size_t * freq){
+
+	size_t i;
+	size_t stride = huffmanTree->allNodes;
+	// ceil(length / thread_num); 0 for an empty input to avoid (0 - 1) wrap
+	size_t block_size = (length == 0) ? 0 : (length - 1)/ thread_num + 1;
+	#pragma omp parallel for
+	for(int t=0; t<thread_num; t++){
+		size_t begin = (size_t)t * block_size;
+		if(begin > length)
+			begin = length;
+		size_t end = begin + block_size;
+		if(end > length)
+			end = length;
+		size_t * freq_pos = freq + (size_t)t * stride;
+		for(size_t e=begin; e<end; e++){
+			freq_pos[s[e]] ++;
+		}
+	}
+	// fold histograms 1..thread_num-1 into histogram 0
+	size_t * freq_pos = freq + huffmanTree->allNodes;
+	for(int t=1; t<thread_num; t++){
+		for(i = 0; i<huffmanTree->allNodes; i++){
+			freq[i] += freq_pos[i];
+		}
+		freq_pos += huffmanTree->allNodes;
+	}
+
+	for (i = 0; i < huffmanTree->allNodes; i++)
+		if (freq[i]) 
+			qinsert(huffmanTree, new_node(huffmanTree, freq[i], i, 0, 0));
+ 
+	while (huffmanTree->qend > 2) 
+		qinsert(huffmanTree, new_node(huffmanTree, 0, 0, qremove(huffmanTree), qremove(huffmanTree)));
+ 
+	build_code(huffmanTree, huffmanTree->qq[1], 0, 0, 0);
+}
+
+
+
diff --git a/thirdparty/SZ/sz/src/sz_uint16.c b/thirdparty/SZ/sz/src/sz_uint16.c
new file mode 100644
index 0000000000000000000000000000000000000000..4200b31651216119d09dee8d81b385d70d498ef5
--- /dev/null
+++ b/thirdparty/SZ/sz/src/sz_uint16.c
@@ -0,0 +1,1383 @@
+/**
+ *  @file sz_uint16.c
+ *  @author Sheng Di
+ *  @date Aug, 2017
+ *  @brief sz_uint16, Compression and Decompression functions
+ *  (C) 2017 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <math.h>
+#include "sz.h"
+#include "CompressElement.h"
+#include "DynamicByteArray.h"
+#include "DynamicIntArray.h"
+#include "zlib.h"
+#include "rw.h"
+#include "TightDataPointStorageI.h"
+#include "sz_uint16.h"
+
+/**
+ * Estimate the number of quantization intervals for a 1D uint16 series.
+ *
+ * Every sampleDistance-th element (from index 2 on) is predicted by its
+ * predecessor; the error is binned by (err/realPrecision+1)/2, saturating at
+ * maxRangeRadius-1. The first bin index whose cumulative count exceeds
+ * totalSampleSize*predThreshold gives 2*(index+1), which is rounded up to a
+ * power of two and floored at 32.
+ */
+unsigned int optimize_intervals_uint16_1D(uint16_t *oriData, size_t dataLength, double realPrecision)
+{
+	const size_t radiusCap = confparams_cpr->maxRangeRadius;
+	const size_t stride = confparams_cpr->sampleDistance;
+	size_t *histogram = (size_t*)malloc(radiusCap*sizeof(size_t));
+	memset(histogram, 0, radiusCap*sizeof(size_t));
+	size_t sampleTotal = dataLength/stride;
+
+	size_t pos;
+	for(pos=2;pos<dataLength;pos++)
+	{
+		if(pos%stride!=0)
+			continue;
+		// previous-value predictor
+		int64_t predicted = oriData[pos-1];
+		int64_t absErr = llabs(predicted - oriData[pos]);
+		size_t bucket = (uint64_t)((absErr/realPrecision+1)/2);
+		if(bucket>=radiusCap)
+			bucket = radiusCap - 1;
+		histogram[bucket]++;
+	}
+
+	// smallest radius whose cumulative hit count passes the target
+	size_t threshold = sampleTotal*confparams_cpr->predThreshold;
+	size_t running = 0;
+	size_t cut = 0;
+	while(cut<radiusCap)
+	{
+		running += histogram[cut];
+		if(running>threshold)
+			break;
+		cut++;
+	}
+	if(cut>=radiusCap)
+		cut = radiusCap-1;
+
+	unsigned int accIntervals = 2*(cut+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+	if(powerOf2<32)
+		powerOf2 = 32;
+
+	free(histogram);
+	return powerOf2;
+}
+
+/**
+ * Estimate the number of quantization intervals for an r1 x r2 uint16 array.
+ *
+ * Interior points with (i+j) divisible by sampleDistance are predicted from
+ * their left, upper and upper-left neighbours; the error is binned by
+ * (err/realPrecision+1)/2, saturating at maxRangeRadius-1. The first bin
+ * index whose cumulative count exceeds totalSampleSize*predThreshold gives
+ * 2*(index+1), rounded up to a power of two and floored at 32.
+ */
+unsigned int optimize_intervals_uint16_2D(uint16_t *oriData, size_t r1, size_t r2, double realPrecision)
+{
+	const size_t radiusCap = confparams_cpr->maxRangeRadius;
+	const size_t stride = confparams_cpr->sampleDistance;
+	size_t *histogram = (size_t*)malloc(radiusCap*sizeof(size_t));
+	memset(histogram, 0, radiusCap*sizeof(size_t));
+	size_t sampleTotal = (r1-1)*(r2-1)/stride;
+
+	size_t row, col;
+	for(row=1;row<r1;row++)
+	{
+		for(col=1;col<r2;col++)
+		{
+			if((row+col)%stride!=0)
+				continue;
+			size_t at = row*r2+col;
+			int64_t predicted = oriData[at-1] + oriData[at-r2] - oriData[at-r2-1];
+			int64_t absErr = llabs(predicted - oriData[at]);
+			size_t bucket = (uint64_t)((absErr/realPrecision+1)/2);
+			if(bucket>=radiusCap)
+				bucket = radiusCap - 1;
+			histogram[bucket]++;
+		}
+	}
+
+	// smallest radius whose cumulative hit count passes the target
+	size_t threshold = sampleTotal*confparams_cpr->predThreshold;
+	size_t running = 0;
+	size_t cut = 0;
+	while(cut<radiusCap)
+	{
+		running += histogram[cut];
+		if(running>threshold)
+			break;
+		cut++;
+	}
+	if(cut>=radiusCap)
+		cut = radiusCap-1;
+
+	unsigned int accIntervals = 2*(cut+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+	if(powerOf2<32)
+		powerOf2 = 32;
+
+	free(histogram);
+	return powerOf2;
+}
+
+/**
+ * Estimate the number of quantization intervals for an r1 x r2 x r3 uint16
+ * array.
+ *
+ * Interior points with (i+j+k) divisible by sampleDistance are predicted from
+ * their seven preceding neighbours; the error is binned by
+ * (err/realPrecision+1)/2, saturating at maxRangeRadius-1. The first bin
+ * index whose cumulative count exceeds totalSampleSize*predThreshold gives
+ * 2*(index+1), rounded up to a power of two and floored at 32.
+ */
+unsigned int optimize_intervals_uint16_3D(uint16_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision)
+{
+	const size_t radiusCap = confparams_cpr->maxRangeRadius;
+	const size_t stride = confparams_cpr->sampleDistance;
+	const size_t plane = r2*r3;
+	size_t *histogram = (size_t*)malloc(radiusCap*sizeof(size_t));
+	memset(histogram, 0, radiusCap*sizeof(size_t));
+	size_t sampleTotal = (r1-1)*(r2-1)*(r3-1)/stride;
+
+	size_t a, b, c;
+	for(a=1;a<r1;a++)
+	{
+		for(b=1;b<r2;b++)
+		{
+			for(c=1;c<r3;c++)
+			{
+				if((a+b+c)%stride!=0)
+					continue;
+				size_t at = a*plane+b*r3+c;
+				int64_t predicted = oriData[at-1] + oriData[at-r3] + oriData[at-plane]
+					- oriData[at-1-plane] - oriData[at-r3-1] - oriData[at-r3-plane] + oriData[at-r3-plane-1];
+				int64_t absErr = llabs(predicted - oriData[at]);
+				size_t bucket = (uint64_t)((absErr/realPrecision+1)/2);
+				if(bucket>=radiusCap)
+					bucket = radiusCap - 1;
+				histogram[bucket]++;
+			}
+		}
+	}
+
+	// smallest radius whose cumulative hit count passes the target
+	size_t threshold = sampleTotal*confparams_cpr->predThreshold;
+	size_t running = 0;
+	size_t cut = 0;
+	while(cut<radiusCap)
+	{
+		running += histogram[cut];
+		if(running>threshold)
+			break;
+		cut++;
+	}
+	if(cut>=radiusCap)
+		cut = radiusCap-1;
+
+	unsigned int accIntervals = 2*(cut+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+	if(powerOf2<32)
+		powerOf2 = 32;
+
+	free(histogram);
+	return powerOf2;
+}
+
+
+/**
+ * Estimate the number of quantization intervals for an r1 x r2 x r3 x r4
+ * uint16 array.
+ *
+ * Interior points with (i+j+k+l) divisible by sampleDistance are predicted
+ * from their seven preceding neighbours along the last three dimensions
+ * (strides 1, r4 and r3*r4); the error is binned by (err/realPrecision+1)/2,
+ * saturating at maxRangeRadius-1. The first bin index whose cumulative count
+ * exceeds totalSampleSize*predThreshold gives 2*(index+1), rounded up to a
+ * power of two and floored at 32.
+ *
+ * The k-1 neighbour is read at index-r4; the previous code used index-r3,
+ * which is not a neighbour stride and disagreed with the other terms.
+ */
+unsigned int optimize_intervals_uint16_4D(uint16_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision)
+{
+	size_t i,j,k,l, index;
+	size_t radiusIndex;
+	size_t r234=r2*r3*r4;
+	size_t r34=r3*r4;
+	int64_t pred_value = 0, pred_err;
+	size_t *intervals = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t));
+	memset(intervals, 0, confparams_cpr->maxRangeRadius*sizeof(size_t));
+	size_t totalSampleSize = (r1-1)*(r2-1)*(r3-1)*(r4-1)/confparams_cpr->sampleDistance;
+	for(i=1;i<r1;i++)
+	{
+		for(j=1;j<r2;j++)
+		{
+			for(k=1;k<r3;k++)
+			{
+				for (l=1;l<r4;l++)
+				{
+					if((i+j+k+l)%confparams_cpr->sampleDistance==0)
+					{
+						index = i*r234+j*r34+k*r4+l;
+						// neighbours: l-1 -> index-1, k-1 -> index-r4, j-1 -> index-r34
+						pred_value = oriData[index-1] + oriData[index-r4] + oriData[index-r34]
+								- oriData[index-1-r34] - oriData[index-r4-1] - oriData[index-r4-r34] + oriData[index-r4-r34-1];
+						pred_err = llabs(pred_value - oriData[index]);
+						radiusIndex = (uint64_t)((pred_err/realPrecision+1)/2);
+						if(radiusIndex>=confparams_cpr->maxRangeRadius)
+							radiusIndex = confparams_cpr->maxRangeRadius - 1;
+						intervals[radiusIndex]++;
+					}
+				}
+			}
+		}
+	}
+	//compute the appropriate number
+	size_t targetCount = totalSampleSize*confparams_cpr->predThreshold;
+	size_t sum = 0;
+	for(i=0;i<confparams_cpr->maxRangeRadius;i++)
+	{
+		sum += intervals[i];
+		if(sum>targetCount)
+			break;
+	}
+	if(i>=confparams_cpr->maxRangeRadius)
+		i = confparams_cpr->maxRangeRadius-1;
+
+	unsigned int accIntervals = 2*(i+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+
+	if(powerOf2<32)
+		powerOf2 = 32;
+
+	free(intervals);
+	return powerOf2;
+}
+
+/**
+ * Quantize a 1D uint16 series against a previous-value predictor.
+ *
+ * The first (up to) two elements are always stored exactly. Every later
+ * element is predicted by the most recent reconstructed value: if the error
+ * is within (intvCapacity-1)*realPrecision it is encoded as a quantization
+ * code around intvRadius in type[], otherwise type[] is 0 and the value is
+ * stored exactly via compressUInt16Value.
+ *
+ * The seed elements are only touched when they exist, so dataLength of 0 or 1
+ * no longer reads or writes past the ends of oriData / type.
+ *
+ * @param oriData         input values
+ * @param dataLength      number of input values
+ * @param realPrecision   absolute error bound
+ * @param valueRangeSize  value range, used to pick the exact-value byte width
+ * @param minValue        offset passed to compressUInt16Value
+ * @return storage object built by new_TightDataPointStorageI; not freed here
+ */
+TightDataPointStorageI* SZ_compress_uint16_1D_MDQ(uint16_t *oriData, size_t dataLength, double realPrecision, int64_t valueRangeSize, int64_t minValue)
+{
+	unsigned char bytes[8] = {0,0,0,0,0,0,0,0};
+	int byteSize = computeByteSizePerIntValue(valueRangeSize);
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+		quantization_intervals = optimize_intervals_uint16_1D(oriData, dataLength, realPrecision);
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	updateQuantizationInfo(quantization_intervals);	
+	size_t i;
+
+	int* type = (int*) malloc(dataLength*sizeof(int));
+		
+	uint16_t* spaceFillingValue = oriData; //
+	
+	DynamicByteArray *exactDataByteArray;
+	new_DBA(&exactDataByteArray, DynArrayInitLen);
+		
+	int64_t last3CmprsData[3] = {0,0,0};
+
+	//store the leading elements exactly (two of them when available)
+	size_t seedCount = dataLength < 2 ? dataLength : 2;
+	for(i=0;i<seedCount;i++)
+	{
+		type[i] = 0;
+		compressUInt16Value(spaceFillingValue[i], minValue, byteSize, bytes);
+		memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		listAdd_int(last3CmprsData, spaceFillingValue[i]);
+	}
+	
+	int state;
+	double checkRadius = (exe_params->intvCapacity-1)*realPrecision;
+	int64_t curData;
+	int64_t pred, predAbsErr;
+	double interval = 2*realPrecision;
+	
+	for(i=2;i<dataLength;i++)
+	{
+		curData = spaceFillingValue[i];
+		pred = last3CmprsData[0];
+		predAbsErr = llabs(curData - pred);	
+		if(predAbsErr<=checkRadius)
+		{
+			state = (predAbsErr/realPrecision+1)/2;
+			if(curData>=pred)
+			{
+				type[i] = exe_params->intvRadius+state;
+				pred = pred + state*interval;
+			}
+			else //curData<pred
+			{
+				type[i] = exe_params->intvRadius-state;
+				pred = pred - state*interval;
+			}
+			//keep the reconstructed value inside the uint16 range
+			if(pred>SZ_UINT16_MAX) pred = SZ_UINT16_MAX;
+			if(pred<SZ_UINT16_MIN) pred = SZ_UINT16_MIN;			
+			listAdd_int(last3CmprsData, pred);					
+			continue;
+		}
+		
+		//unpredictable data processing		
+		type[i] = 0;
+		compressUInt16Value(curData, minValue, byteSize, bytes);
+		memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		listAdd_int(last3CmprsData, curData);
+	}//end of for
+		
+	size_t exactDataNum = exactDataByteArray->size / byteSize;
+	
+	TightDataPointStorageI* tdps;	
+			
+	new_TightDataPointStorageI(&tdps, dataLength, exactDataNum, byteSize, 
+			type, exactDataByteArray->array, exactDataByteArray->size,  
+			realPrecision, minValue, quantization_intervals, SZ_UINT16);
+
+	//free memory
+	free(type);	
+	//only the wrapper is freed here; exactDataByteArray->array was handed to tdps above
+	free(exactDataByteArray);
+	
+	return tdps;
+}
+
+void SZ_compress_args_uint16_StoreOriData(uint16_t* oriData, size_t dataLength, TightDataPointStorageI* tdps, 
+unsigned char** newByteData, size_t *outSize)
+{
+	/* Lossless fallback layout: 3 version bytes | 1 flag byte | SZ params | element count | raw values in big-endian order. */
+	const int elemBytes = sizeof(uint16_t);
+	const size_t streamLen = 3 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + 1 + elemBytes*dataLength;
+	unsigned char* out = (unsigned char*)malloc(streamLen);
+	unsigned char countBytes[8];
+	size_t pos = 0, n;
+
+	tdps->isLossless = 1;
+	*newByteData = out;
+
+	/* version triple */
+	while (pos < 3) { out[pos] = versionNumber[pos]; pos++; }
+	/* flag byte: 00010000 for 4-byte sizes, 01010000 when SZ_SIZE_TYPE is 8 */
+	out[pos++] = (exe_params->SZ_SIZE_TYPE==4) ? 16 : 80;
+
+	/* serialized compression parameters */
+	convertSZParamsToBytes(confparams_cpr, out + pos);
+	pos += MetaDataByteLength;
+
+	/* element count, SZ_SIZE_TYPE (4 or 8) bytes wide */
+	sizeToBytes(countBytes, dataLength);
+	for (n = 0; n < exe_params->SZ_SIZE_TYPE; n++)
+		out[pos++] = countBytes[n];
+
+	/* payload starts at pos == 4+MetaDataByteLength+SZ_SIZE_TYPE; values are written big-endian */
+	if (sysEndianType != BIG_ENDIAN_SYSTEM)
+		for (n = 0; n < dataLength; n++)
+			int16ToBytes_bigEndian(out + pos + n*elemBytes, oriData[n]);
+	else
+		memcpy(out + pos, oriData, dataLength*elemBytes);
+	*outSize = streamLen;
+}
+
+void SZ_compress_args_uint16_NoCkRngeNoGzip_1D(unsigned char** newByteData, uint16_t *oriData, 
+size_t dataLength, double realPrecision, size_t *outSize, int64_t valueRangeSize, uint16_t minValue)
+{
+	/* Quantize with the 1D MDQ compressor; if the serialized result exceeds the raw array size, store exactly dataLength raw values instead. */
+	TightDataPointStorageI* tdps = SZ_compress_uint16_1D_MDQ(oriData, dataLength, realPrecision, valueRangeSize, minValue);
+	convertTDPStoFlatBytes_int(tdps, newByteData, outSize);
+	if(*outSize > dataLength*sizeof(uint16_t))
+		SZ_compress_args_uint16_StoreOriData(oriData, dataLength, tdps, newByteData, outSize); /* never read past the end of oriData */
+	free_TightDataPointStorageI(tdps);
+}
+
+TightDataPointStorageI* SZ_compress_uint16_2D_MDQ(uint16_t *oriData, size_t r1, size_t r2, double realPrecision, int64_t valueRangeSize, int64_t minValue)
+{
+	unsigned char bytes[8] = {0,0,0,0,0,0,0,0};
+	int byteSize = computeByteSizePerIntValue(valueRangeSize);
+	// Purpose: quantize an r1 x r2 uint16 array (element (i,j) lives at i*r2+j) and return the result as a new TightDataPointStorageI.
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_uint16_2D(oriData, r1, r2, realPrecision);
+		updateQuantizationInfo(quantization_intervals);
+	}	
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	size_t i,j; 
+	int64_t pred1D, pred2D, curValue, tmp;
+	int diff = 0.0;
+	double itvNum = 0;
+	uint16_t *P0, *P1;
+	// P1 holds the reconstructed values of the previous row and P0 those of the row being processed; they are swapped after each row.
+	size_t dataLength = r1*r2;	
+	
+	P0 = (uint16_t*)malloc(r2*sizeof(uint16_t));
+	memset(P0, 0, r2*sizeof(uint16_t));
+	P1 = (uint16_t*)malloc(r2*sizeof(uint16_t));
+	memset(P1, 0, r2*sizeof(uint16_t));
+		
+	int* type = (int*) malloc(dataLength*sizeof(int));
+	// type[n]==0 marks a value stored exactly; any other code is the signed quantization step offset by intvRadius.
+		
+	uint16_t* spaceFillingValue = oriData; // alias of the input array; only read in this function
+	
+	DynamicByteArray *exactDataByteArray;
+	new_DBA(&exactDataByteArray, DynArrayInitLen);	
+	// The very first value has nothing to be predicted from, so it is always stored exactly.
+	type[0] = 0;
+	curValue = P1[0] = spaceFillingValue[0];
+	compressUInt16Value(curValue, minValue, byteSize, bytes);
+	memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+
+	/* Process Row-0 data 1: predicted by its left neighbour. NOTE(review): element 1 is accessed unconditionally, i.e. r2 >= 2 is assumed -- confirm with callers. */
+	pred1D = P1[0];
+	diff = spaceFillingValue[1] - pred1D;
+	// itvNum = distance between value and prediction measured in realPrecision steps, plus one.
+	itvNum =  llabs(diff)/realPrecision + 1;
+	// Below intvCapacity: emit a signed bin code and keep the clamped reconstruction; otherwise store the value exactly.
+	if (itvNum < exe_params->intvCapacity)
+	{
+		if (diff < 0) itvNum = -itvNum;
+		type[1] = (int) (itvNum/2) + exe_params->intvRadius;
+		tmp = pred1D + 2 * (type[1] - exe_params->intvRadius) * realPrecision;
+		if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+			P1[1] = tmp;
+		else if(tmp < SZ_UINT16_MIN)
+			P1[1] = SZ_UINT16_MIN;
+		else
+			P1[1] = SZ_UINT16_MAX;
+	}
+	else
+	{
+		type[1] = 0;
+		curValue = P1[1] = spaceFillingValue[1];
+		compressUInt16Value(curValue, minValue, byteSize, bytes);
+		memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+	}
+
+    /* Process Row-0 data 2 --> data r2-1: linear extrapolation from the two reconstructed left neighbours */
+	for (j = 2; j < r2; j++)
+	{
+		pred1D = 2*P1[j-1] - P1[j-2];
+		diff = spaceFillingValue[j] - pred1D;
+
+		itvNum = llabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[j] = (int) (itvNum/2) + exe_params->intvRadius;
+			tmp = pred1D + 2 * (type[j] - exe_params->intvRadius) * realPrecision;
+			if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+				P1[j] = tmp;
+			else if(tmp < SZ_UINT16_MIN)
+				P1[j] = SZ_UINT16_MIN;
+			else
+				P1[j] = SZ_UINT16_MAX;			
+		}
+		else
+		{
+			type[j] = 0;
+			curValue = P1[j] = spaceFillingValue[j];
+			compressUInt16Value(curValue, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		}
+	}
+
+	/* Process Row-1 --> Row-r1-1 */
+	size_t index;
+	for (i = 1; i < r1; i++)
+	{	
+		/* Process row-i data 0: predicted by the reconstructed value directly above it (previous row, column 0) */
+		index = i*r2;
+		pred1D = P1[0];
+		diff = spaceFillingValue[index] - pred1D;
+
+		itvNum = llabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+			tmp = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+				P0[0] = tmp;
+			else if(tmp < SZ_UINT16_MIN)
+				P0[0] = SZ_UINT16_MIN;
+			else
+				P0[0] = SZ_UINT16_MAX;			
+		}
+		else
+		{
+			type[index] = 0;
+			curValue = P0[0] = spaceFillingValue[index];
+			compressUInt16Value(curValue, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		}
+									
+		/* Process row-i data 1 --> r2-1: prediction = left + above - above-left (all reconstructed values) */
+		for (j = 1; j < r2; j++)
+		{
+			index = i*r2+j;
+			pred2D = P0[j-1] + P1[j] - P1[j-1];
+
+			diff = spaceFillingValue[index] - pred2D;
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				tmp = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+					P0[j] = tmp;
+				else if(tmp < SZ_UINT16_MIN)
+					P0[j] = SZ_UINT16_MIN;
+				else
+					P0[j] = SZ_UINT16_MAX;						
+			}
+			else
+			{
+				type[index] = 0;
+				curValue = P0[j] = spaceFillingValue[index];
+				compressUInt16Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+		}
+		// The row just finished becomes the "previous row" for the next iteration.
+		uint16_t *Pt;
+		Pt = P1;
+		P1 = P0;
+		P0 = Pt;
+	}
+	// NOTE(review): P0 is not freed when r2==1, which looks like a leak -- confirm the intent of this guard.
+	if(r2!=1)
+		free(P0);
+	free(P1);			
+	// NOTE(review): this is the byte size of the exact-data buffer, whereas SZ_compress_uint16_1D_MDQ divides by byteSize -- confirm what new_TightDataPointStorageI expects.
+	size_t exactDataNum = exactDataByteArray->size;
+	
+	TightDataPointStorageI* tdps;	
+			
+	new_TightDataPointStorageI(&tdps, dataLength, exactDataNum, byteSize, 
+			type, exactDataByteArray->array, exactDataByteArray->size,  
+			realPrecision, minValue, quantization_intervals, SZ_UINT16);
+			
+	//release the temporaries owned by this function
+	free(type);	
+	free(exactDataByteArray); //only the wrapper struct is freed; its ->array was passed to tdps above and is presumably released with tdps (TODO confirm)
+	
+	return tdps;	
+}
+
+/**
+ * 2D compression entry point without range check or gzip stage.
+ * Note: @r1 is the high dimension, @r2 is the low dimension.
+ * The quantized stream is replaced by a raw copy of the input when it is larger than the raw array.
+ * */
+void SZ_compress_args_uint16_NoCkRngeNoGzip_2D(unsigned char** newByteData, uint16_t *oriData, size_t r1, size_t r2, double realPrecision, size_t *outSize, 
+int64_t valueRangeSize, uint16_t minValue)
+{
+	const size_t rawBytes = r1*r2*sizeof(uint16_t);
+	TightDataPointStorageI* quantized = SZ_compress_uint16_2D_MDQ(oriData, r1, r2, realPrecision, valueRangeSize, minValue);
+
+	/* serialize the quantized representation */
+	convertTDPStoFlatBytes_int(quantized, newByteData, outSize);
+
+	if(rawBytes < *outSize) /* quantization did not pay off: keep the original values */
+		SZ_compress_args_uint16_StoreOriData(oriData, r1*r2, quantized, newByteData, outSize);
+	free_TightDataPointStorageI(quantized);
+}
+
+TightDataPointStorageI* SZ_compress_uint16_3D_MDQ(uint16_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, int64_t valueRangeSize, int64_t minValue)
+{
+	unsigned char bytes[8] = {0,0,0,0,0,0,0,0};
+	int byteSize = computeByteSizePerIntValue(valueRangeSize);
+	// Purpose: quantize an r1 x r2 x r3 uint16 array (element (k,i,j) lives at k*r2*r3+i*r3+j) and return the result as a new TightDataPointStorageI.
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_uint16_3D(oriData, r1, r2, r3, realPrecision);
+		updateQuantizationInfo(quantization_intervals);
+	}	
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	size_t i,j,k; 
+	int64_t pred1D, pred2D, pred3D, curValue, tmp;
+	int diff = 0.0;
+	double itvNum = 0;
+	uint16_t *P0, *P1;
+	// P1 holds the reconstructed previous r2 x r3 layer and P0 the layer being processed; they are swapped after each layer.
+	size_t dataLength = r1*r2*r3;		
+
+	size_t r23 = r2*r3;
+	P0 = (uint16_t*)malloc(r23*sizeof(uint16_t));
+	P1 = (uint16_t*)malloc(r23*sizeof(uint16_t));
+	// type[n]==0 marks a value stored exactly; any other code is the signed quantization step offset by intvRadius.
+	int* type = (int*) malloc(dataLength*sizeof(int));
+
+	uint16_t* spaceFillingValue = oriData; // alias of the input array; only read in this function
+	
+	DynamicByteArray *exactDataByteArray;
+	new_DBA(&exactDataByteArray, DynArrayInitLen);	
+	// Layer 0 is reconstructed directly into P1. Its first value has no neighbour and is always stored exactly.
+	type[0] = 0;
+	P1[0] = spaceFillingValue[0];
+	compressUInt16Value(spaceFillingValue[0], minValue, byteSize, bytes);
+	memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+
+	/* Process Row-0 data 1: predicted by its left neighbour. NOTE(review): element 1 is accessed unconditionally, i.e. r2*r3 >= 2 is assumed -- confirm with callers. */
+	pred1D = P1[0];
+	diff = spaceFillingValue[1] - pred1D;
+	// itvNum = distance between value and prediction measured in realPrecision steps, plus one.
+	itvNum = llabs(diff)/realPrecision + 1;
+	// Below intvCapacity: emit a signed bin code and keep the clamped reconstruction; otherwise store the value exactly.
+	if (itvNum < exe_params->intvCapacity)
+	{
+		if (diff < 0) itvNum = -itvNum;
+		type[1] = (int) (itvNum/2) + exe_params->intvRadius;
+		tmp = pred1D + 2 * (type[1] - exe_params->intvRadius) * realPrecision;
+		if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+			P1[1] = tmp;
+		else if(tmp < SZ_UINT16_MIN)
+			P1[1] = SZ_UINT16_MIN;
+		else
+			P1[1] = SZ_UINT16_MAX;		
+	}
+	else
+	{
+		type[1] = 0;
+		curValue = P1[1] = spaceFillingValue[1];
+		compressUInt16Value(curValue, minValue, byteSize, bytes);
+		memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+	}
+
+    /* Process Row-0 data 2 --> data r3-1: linear extrapolation from the two reconstructed left neighbours */
+	for (j = 2; j < r3; j++)
+	{
+		pred1D = 2*P1[j-1] - P1[j-2];
+		diff = spaceFillingValue[j] - pred1D;
+
+		itvNum = llabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[j] = (int) (itvNum/2) + exe_params->intvRadius;
+			tmp = pred1D + 2 * (type[j] - exe_params->intvRadius) * realPrecision;
+			if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+				P1[j] = tmp;
+			else if(tmp < SZ_UINT16_MIN)
+				P1[j] = SZ_UINT16_MIN;
+			else
+				P1[j] = SZ_UINT16_MAX;			
+		}
+		else
+		{
+			type[j] = 0;
+			curValue = P1[j] = spaceFillingValue[j];
+			compressUInt16Value(curValue, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		}
+	}
+
+	/* Process Row-1 --> Row-r2-1 (still layer 0, so data index and P1 index coincide) */
+	size_t index;
+	for (i = 1; i < r2; i++)
+	{
+		/* Process row-i data 0: predicted by the reconstructed value one row above */
+		index = i*r3;	
+		pred1D = P1[index-r3];
+		diff = spaceFillingValue[index] - pred1D;
+
+		itvNum = llabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+			tmp = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+				P1[index] = tmp;
+			else if(tmp < SZ_UINT16_MIN)
+				P1[index] = SZ_UINT16_MIN;
+			else
+				P1[index] = SZ_UINT16_MAX;			
+		}
+		else
+		{
+			type[index] = 0;
+			curValue = P1[index] = spaceFillingValue[index];
+			compressUInt16Value(curValue, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		}
+
+		/* Process row-i data 1 --> data r3-1: prediction = left + above - above-left */
+		for (j = 1; j < r3; j++)
+		{
+			index = i*r3+j;
+			pred2D = P1[index-1] + P1[index-r3] - P1[index-r3-1];
+
+			diff = spaceFillingValue[index] - pred2D;
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				tmp = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+					P1[index] = tmp;
+				else if(tmp < SZ_UINT16_MIN)
+					P1[index] = SZ_UINT16_MIN;
+				else
+					P1[index] = SZ_UINT16_MAX;				
+			}
+			else
+			{
+				type[index] = 0;
+				curValue = P1[index] = spaceFillingValue[index];
+				compressUInt16Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+		}
+	}
+
+	// From here on "index" addresses the input/type arrays and P0/P1 are addressed by the position inside one layer.
+	///////////////////////////	Process layer-1 --> layer-r1-1 ///////////////////////////
+
+	for (k = 1; k < r1; k++)
+	{
+		/* Process Row-0 data 0: predicted by the same position in the previous layer */
+		index = k*r23;
+		pred1D = P1[0];
+		diff = spaceFillingValue[index] - pred1D;
+
+		itvNum = llabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+			tmp = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+				P0[0] = tmp;
+			else if(tmp < SZ_UINT16_MIN)
+				P0[0] = SZ_UINT16_MIN;
+			else
+				P0[0] = SZ_UINT16_MAX;
+		}
+		else
+		{
+			type[index] = 0;
+			curValue = P0[0] = spaceFillingValue[index];
+			compressUInt16Value(curValue, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		}
+
+
+	    /* Process Row-0 data 1 --> data r3-1: prediction = left (this layer) + same position (previous layer) - its left neighbour (previous layer) */
+		for (j = 1; j < r3; j++)
+		{
+			//index = k*r2*r3+j;
+			index ++;
+			pred2D = P0[j-1] + P1[j] - P1[j-1];
+			diff = spaceFillingValue[index] - pred2D;
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				tmp = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+					P0[j] = tmp;
+				else if(tmp < SZ_UINT16_MIN)
+					P0[j] = SZ_UINT16_MIN;
+				else
+					P0[j] = SZ_UINT16_MAX;				
+			}
+			else
+			{
+				type[index] = 0;
+				curValue = P0[j] = spaceFillingValue[index];
+				compressUInt16Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+		}
+
+	    /* Process Row-1 --> Row-r2-1 */
+		size_t index2D;
+		for (i = 1; i < r2; i++)
+		{
+			/* Process Row-i data 0: prediction = above (this layer) + same position (previous layer) - above (previous layer) */
+			index = k*r23 + i*r3;
+			index2D = i*r3;		
+			pred2D = P0[index2D-r3] + P1[index2D] - P1[index2D-r3];
+			diff = spaceFillingValue[index] - pred2D;
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				tmp = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+					P0[index2D] = tmp;
+				else if(tmp < SZ_UINT16_MIN)
+					P0[index2D] = SZ_UINT16_MIN;
+				else
+					P0[index2D] = SZ_UINT16_MAX;
+			}
+			else
+			{
+				type[index] = 0;
+				curValue = P0[index2D] = spaceFillingValue[index];
+				compressUInt16Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+
+			/* Process Row-i data 1 --> data r3-1: prediction combines the seven reconstructed neighbours of the enclosing 2x2x2 cube */
+			for (j = 1; j < r3; j++)
+			{
+//				if(k==63&&i==43&&j==27)
+//					printf("i=%d\n", i);
+				//index = k*r2*r3 + i*r3 + j;			
+				index ++;
+				index2D = i*r3 + j;
+				pred3D = P0[index2D-1] + P0[index2D-r3]+ P1[index2D] - P0[index2D-r3-1] - P1[index2D-r3] - P1[index2D-1] + P1[index2D-r3-1];
+				diff = spaceFillingValue[index] - pred3D;
+
+				itvNum = llabs(diff)/realPrecision + 1;
+
+				if (itvNum < exe_params->intvCapacity)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+					tmp = pred3D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+					if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+						P0[index2D] = tmp;
+					else if(tmp < SZ_UINT16_MIN)
+						P0[index2D] = SZ_UINT16_MIN;
+					else
+						P0[index2D] = SZ_UINT16_MAX;
+				}
+				else
+				{
+					type[index] = 0;
+					curValue = P0[index2D] = spaceFillingValue[index];
+					compressUInt16Value(curValue, minValue, byteSize, bytes);
+					memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+				}
+			}
+		}
+		// The layer just finished becomes the "previous layer" for the next iteration.
+		uint16_t *Pt;
+		Pt = P1;
+		P1 = P0;
+		P0 = Pt;
+	}
+	if(r23!=1) // NOTE(review): P0 is not freed when r2*r3==1, which looks like a leak -- confirm the intent of this guard
+		free(P0);
+	free(P1);
+	// NOTE(review): this is the byte size of the exact-data buffer, whereas SZ_compress_uint16_1D_MDQ divides by byteSize -- confirm what new_TightDataPointStorageI expects.
+	size_t exactDataNum = exactDataByteArray->size;
+	
+	TightDataPointStorageI* tdps;	
+			
+	new_TightDataPointStorageI(&tdps, dataLength, exactDataNum, byteSize, 
+			type, exactDataByteArray->array, exactDataByteArray->size,  
+			realPrecision, minValue, quantization_intervals, SZ_UINT16);
+			
+	//release the temporaries owned by this function
+	free(type);	
+	free(exactDataByteArray); //only the wrapper struct is freed; its ->array was passed to tdps above and is presumably released with tdps (TODO confirm)
+	
+	return tdps;	
+}
+
+
+/* 3D entry point without range check or gzip stage; stores the raw values when the quantized stream is larger than the raw array. */
+void SZ_compress_args_uint16_NoCkRngeNoGzip_3D(unsigned char** newByteData, uint16_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t *outSize, 
+int64_t valueRangeSize, int64_t minValue)
+{	
+	const size_t elemCount = r1*r2*r3;
+	TightDataPointStorageI* quantized = SZ_compress_uint16_3D_MDQ(oriData, r1, r2, r3, realPrecision, valueRangeSize, minValue);
+
+	convertTDPStoFlatBytes_int(quantized, newByteData, outSize);
+	/* serialized form larger than the raw array: keep the original values instead */
+	if(elemCount*sizeof(uint16_t) < *outSize)
+		SZ_compress_args_uint16_StoreOriData(oriData, elemCount, quantized, newByteData, outSize);
+	free_TightDataPointStorageI(quantized);
+}
+
+
+TightDataPointStorageI* SZ_compress_uint16_4D_MDQ(uint16_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, int64_t valueRangeSize, int64_t minValue)
+{
+	unsigned char bytes[8] = {0,0,0,0,0,0,0,0};
+	int byteSize = computeByteSizePerIntValue(valueRangeSize);
+	// Purpose: quantize an r1 x r2 x r3 x r4 uint16 array as r1 independent 3D blocks and return the result as a new TightDataPointStorageI.
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_uint16_4D(oriData, r1, r2, r3, r4, realPrecision);
+		updateQuantizationInfo(quantization_intervals);
+	}	
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	size_t i,j,k; 
+	int64_t pred1D, pred2D, pred3D, curValue, tmp;
+	int diff = 0.0;
+	double itvNum = 0;
+	uint16_t *P0, *P1;
+	// P1 holds the reconstructed previous r3 x r4 layer and P0 the layer being processed; they are swapped after each layer.
+	size_t dataLength = r1*r2*r3*r4;		
+
+	size_t r234 = r2*r3*r4;
+	size_t r34 = r3*r4;
+
+	P0 = (uint16_t*)malloc(r34*sizeof(uint16_t));
+	P1 = (uint16_t*)malloc(r34*sizeof(uint16_t));
+	// type[n]==0 marks a value stored exactly; any other code is the signed quantization step offset by intvRadius.
+	int* type = (int*) malloc(dataLength*sizeof(int));
+
+	uint16_t* spaceFillingValue = oriData; // alias of the input array; only read in this function
+	
+	DynamicByteArray *exactDataByteArray;
+	new_DBA(&exactDataByteArray, DynArrayInitLen);	
+
+	size_t l;
+	for (l = 0; l < r1; l++)
+	{
+		// "index" addresses the input/type arrays, "index2D" the position inside one r3 x r4 layer (P0/P1).
+		///////////////////////////	Process layer-0 ///////////////////////////
+		/* Process Row-0 data 0: the first value of every 3D block is stored exactly */
+		size_t index = l*r234;
+		size_t index2D = 0;
+
+		type[index] = 0;
+		curValue = P1[index2D] = spaceFillingValue[index];
+		compressUInt16Value(curValue, minValue, byteSize, bytes);
+		memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+
+		/* Process Row-0 data 1: predicted by its left neighbour. NOTE(review): accessed unconditionally, i.e. r3*r4 >= 2 is assumed -- confirm with callers. */
+		index = l*r234+1;
+		index2D = 1;
+
+		pred1D = P1[index2D-1];
+		diff = spaceFillingValue[index] - pred1D; // compare the current element (not the previous exact value) with its prediction
+		// itvNum = distance between value and prediction measured in realPrecision steps, plus one.
+		itvNum = llabs(diff)/realPrecision + 1;
+		// Below intvCapacity: emit a signed bin code and keep the clamped reconstruction; otherwise store the current value exactly.
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+			tmp = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+				P1[index2D] = tmp;
+			else if(tmp < SZ_UINT16_MIN)
+				P1[index2D] = SZ_UINT16_MIN;
+			else
+				P1[index2D] = SZ_UINT16_MAX;			
+		}
+		else
+		{
+			type[index] = 0;
+
+			curValue = P1[index2D] = spaceFillingValue[index];
+			compressUInt16Value(curValue, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		}
+
+		/* Process Row-0 data 2 --> data r4-1: linear extrapolation from the two reconstructed left neighbours */
+		for (j = 2; j < r4; j++)
+		{
+			index = l*r234+j;
+			index2D = j;
+
+			pred1D = 2*P1[index2D-1] - P1[index2D-2];
+			diff = spaceFillingValue[index] - pred1D;
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				tmp = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+					P1[index2D] = tmp;
+				else if(tmp < SZ_UINT16_MIN)
+					P1[index2D] = SZ_UINT16_MIN;
+				else
+					P1[index2D] = SZ_UINT16_MAX;					
+			}
+			else
+			{
+				type[index] = 0;
+
+				curValue = P1[index2D] = spaceFillingValue[index];
+				compressUInt16Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+		}
+
+		/* Process Row-1 --> Row-r3-1 */
+		for (i = 1; i < r3; i++)
+		{
+			/* Process row-i data 0: predicted by the reconstructed value one row above */
+			index = l*r234+i*r4;
+			index2D = i*r4;
+
+			pred1D = P1[index2D-r4];
+			diff = spaceFillingValue[index] - pred1D;
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				tmp = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+					P1[index2D] = tmp;
+				else if(tmp < SZ_UINT16_MIN)
+					P1[index2D] = SZ_UINT16_MIN;
+				else
+					P1[index2D] = SZ_UINT16_MAX;					
+			}
+			else
+			{
+				type[index] = 0;
+
+				curValue = P1[index2D] = spaceFillingValue[index];
+				compressUInt16Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+
+			/* Process row-i data 1 --> data r4-1: prediction = left + above - above-left */
+			for (j = 1; j < r4; j++)
+			{
+				index = l*r234+i*r4+j;
+				index2D = i*r4+j;
+
+				pred2D = P1[index2D-1] + P1[index2D-r4] - P1[index2D-r4-1];
+
+				diff = spaceFillingValue[index] - pred2D;
+
+				itvNum = llabs(diff)/realPrecision + 1;
+
+				if (itvNum < exe_params->intvCapacity)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+					tmp = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+					if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+						P1[index2D] = tmp;
+					else if(tmp < SZ_UINT16_MIN)
+						P1[index2D] = SZ_UINT16_MIN;
+					else
+						P1[index2D] = SZ_UINT16_MAX;						
+				}
+				else
+				{
+					type[index] = 0;
+
+					curValue = P1[index2D] = spaceFillingValue[index];
+					compressUInt16Value(curValue, minValue, byteSize, bytes);
+					memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+				}
+			}
+		}
+
+
+		///////////////////////////	Process layer-1 --> layer-r2-1 ///////////////////////////
+
+		for (k = 1; k < r2; k++)
+		{
+			/* Process Row-0 data 0: predicted by the same position in the previous layer */
+			index = l*r234+k*r34;
+			index2D = 0;
+
+			pred1D = P1[index2D];
+			diff = spaceFillingValue[index] - pred1D;
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				tmp = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+					P0[index2D] = tmp;
+				else if(tmp < SZ_UINT16_MIN)
+					P0[index2D] = SZ_UINT16_MIN;
+				else
+					P0[index2D] = SZ_UINT16_MAX;					
+			}
+			else
+			{
+				type[index] = 0;
+
+				curValue = P0[index2D] = spaceFillingValue[index];
+				compressUInt16Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+
+			/* Process Row-0 data 1 --> data r4-1: prediction = left (this layer) + same position (previous layer) - its left neighbour (previous layer) */
+			for (j = 1; j < r4; j++)
+			{
+				index = l*r234+k*r34+j;
+				index2D = j;
+
+				pred2D = P0[index2D-1] + P1[index2D] - P1[index2D-1];
+				diff = spaceFillingValue[index] - pred2D;
+
+				itvNum = llabs(diff)/realPrecision + 1;
+
+				if (itvNum < exe_params->intvCapacity)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+					tmp = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+					if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+						P0[index2D] = tmp;
+					else if(tmp < SZ_UINT16_MIN)
+						P0[index2D] = SZ_UINT16_MIN;
+					else
+						P0[index2D] = SZ_UINT16_MAX;						
+				}
+				else
+				{
+					type[index] = 0;
+
+					curValue = P0[index2D] = spaceFillingValue[index];
+					compressUInt16Value(curValue, minValue, byteSize, bytes);
+					memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+				}
+			}
+
+			/* Process Row-1 --> Row-r3-1 */
+			for (i = 1; i < r3; i++)
+			{
+				/* Process Row-i data 0: prediction = above (this layer) + same position (previous layer) - above (previous layer) */
+				index = l*r234+k*r34+i*r4;
+				index2D = i*r4;
+
+				pred2D = P0[index2D-r4] + P1[index2D] - P1[index2D-r4];
+				diff = spaceFillingValue[index] - pred2D;
+
+				itvNum = llabs(diff)/realPrecision + 1;
+
+				if (itvNum < exe_params->intvCapacity)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+					tmp = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+					if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+						P0[index2D] = tmp;
+					else if(tmp < SZ_UINT16_MIN)
+						P0[index2D] = SZ_UINT16_MIN;
+					else
+						P0[index2D] = SZ_UINT16_MAX;						
+				}
+				else
+				{
+					type[index] = 0;
+
+					curValue = P0[index2D] = spaceFillingValue[index];
+					compressUInt16Value(curValue, minValue, byteSize, bytes);
+					memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+				}
+
+				/* Process Row-i data 1 --> data r4-1: prediction combines the seven reconstructed neighbours of the enclosing 2x2x2 cube */
+				for (j = 1; j < r4; j++)
+				{
+					index = l*r234+k*r34+i*r4+j;
+					index2D = i*r4+j;
+
+					pred3D = P0[index2D-1] + P0[index2D-r4]+ P1[index2D] - P0[index2D-r4-1] - P1[index2D-r4] - P1[index2D-1] + P1[index2D-r4-1];
+					diff = spaceFillingValue[index] - pred3D;
+
+
+					itvNum = llabs(diff)/realPrecision + 1;
+
+					if (itvNum < exe_params->intvCapacity)
+					{
+						if (diff < 0) itvNum = -itvNum;
+						type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+						tmp = pred3D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+						if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+							P0[index2D] = tmp;
+						else if(tmp < SZ_UINT16_MIN)
+							P0[index2D] = SZ_UINT16_MIN;
+						else
+							P0[index2D] = SZ_UINT16_MAX;							
+					}
+					else
+					{
+						type[index] = 0;
+
+						curValue = P0[index2D] = spaceFillingValue[index];
+						compressUInt16Value(curValue, minValue, byteSize, bytes);
+						memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+					}
+				}
+			}
+			// The layer just finished becomes the "previous layer" for the next iteration.
+			uint16_t *Pt;
+			Pt = P1;
+			P1 = P0;
+			P0 = Pt;
+		}
+	}
+
+	free(P0);
+	free(P1);
+	// NOTE(review): this is the byte size of the exact-data buffer, whereas SZ_compress_uint16_1D_MDQ divides by byteSize -- confirm what new_TightDataPointStorageI expects.
+	size_t exactDataNum = exactDataByteArray->size;
+	
+	TightDataPointStorageI* tdps;	
+			
+	new_TightDataPointStorageI(&tdps, dataLength, exactDataNum, byteSize, 
+			type, exactDataByteArray->array, exactDataByteArray->size,  
+			realPrecision, minValue, quantization_intervals, SZ_UINT16);
+			
+	//release the temporaries owned by this function
+	free(type);	
+	free(exactDataByteArray); //only the wrapper struct is freed; its ->array was passed to tdps above and is presumably released with tdps (TODO confirm)
+	
+	return tdps;	
+}
+
+/* 4D entry point without range check or gzip stage; stores the raw values when the quantized stream is larger than the raw array. */
+void SZ_compress_args_uint16_NoCkRngeNoGzip_4D(unsigned char** newByteData, uint16_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, 
+size_t *outSize, int64_t valueRangeSize, int64_t minValue)
+{
+	const size_t elemCount = r1*r2*r3*r4;
+	TightDataPointStorageI* quantized = SZ_compress_uint16_4D_MDQ(oriData, r1, r2, r3, r4, realPrecision, valueRangeSize, minValue);
+
+	convertTDPStoFlatBytes_int(quantized, newByteData, outSize);
+	/* serialized form larger than the raw array: keep the original values instead */
+	if(elemCount*sizeof(uint16_t) < *outSize)
+		SZ_compress_args_uint16_StoreOriData(oriData, elemCount, quantized, newByteData, outSize);
+	free_TightDataPointStorageI(quantized);
+}
+
+/* Encodes an array flagged as all-same-data: only oriData[0] is stored, as two big-endian bytes. */
+void SZ_compress_args_uint16_withinRange(unsigned char** newByteData, uint16_t *oriData, size_t dataLength, size_t *outSize)
+{
+	TightDataPointStorageI* constant = (TightDataPointStorageI*) malloc(sizeof(TightDataPointStorageI));
+	unsigned char* single = (unsigned char*)malloc(sizeof(unsigned char)*2);
+	size_t flatSize;
+
+	/* the big-endian image of the first element is the whole payload */
+	int16ToBytes_bigEndian(single, oriData[0]);
+
+	constant->typeArray = NULL;
+	constant->exactDataBytes = single;
+	constant->exactDataBytes_size = 2;
+	constant->exactDataNum = 1;
+	constant->dataSeriesLength = dataLength;
+	constant->allSameData = 1;
+	constant->isLossless = 0;
+
+	convertTDPStoFlatBytes_int(constant, newByteData, &flatSize);
+	*outSize = flatSize;
+	free_TightDataPointStorageI(constant);
+}
+
+int SZ_compress_args_uint16_wRngeNoGzip(unsigned char** newByteData, uint16_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio)
+{
+	/* Range-checked compression without the gzip stage; unused leading dimensions are passed as 0. Returns an SZ status code. */
+	int status = SZ_SCES;
+	size_t dataLength = computeDataLength(r5,r4,r3,r2,r1);
+	int64_t valueRangeSize = 0;
+	uint16_t minValue = computeRangeSize_int(oriData, SZ_UINT16, dataLength, &valueRangeSize);
+	double realPrecision = getRealPrecision_int(valueRangeSize, errBoundMode, absErr_Bound, relBoundRatio, &status);
+		
+	if(valueRangeSize <= realPrecision)
+	{
+		SZ_compress_args_uint16_withinRange(newByteData, oriData, dataLength, outSize);
+	}
+	else
+	{
+		if(r5==0&&r4==0&&r3==0&&r2==0)
+		{
+			SZ_compress_args_uint16_NoCkRngeNoGzip_1D(newByteData, oriData, r1, realPrecision, outSize, valueRangeSize, minValue);
+		}
+		else if(r5==0&&r4==0&&r3==0)
+		{
+			SZ_compress_args_uint16_NoCkRngeNoGzip_2D(newByteData, oriData, r2, r1, realPrecision, outSize, valueRangeSize, minValue);
+		}
+		else if(r5==0&&r4==0)
+		{
+			SZ_compress_args_uint16_NoCkRngeNoGzip_3D(newByteData, oriData, r3, r2, r1, realPrecision, outSize, valueRangeSize, minValue);
+		}
+		else if(r5==0)
+		{
+			SZ_compress_args_uint16_NoCkRngeNoGzip_3D(newByteData, oriData, r4*r3, r2, r1, realPrecision, outSize, valueRangeSize, minValue); //4D input is handled as 3D by merging the two highest dimensions
+		}
+		else
+			status = SZ_DERR; //5 dimensions are not supported: report it instead of returning success with the outputs untouched
+	}
+	return status;
+}
+
+int SZ_compress_args_uint16(unsigned char** newByteData, uint16_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio)
+{
+	/* Top-level uint16 compression: dispatches on dimensionality (unused leading dimensions are 0), then applies zlib unless szMode is SZ_BEST_SPEED. Returns an SZ status code. */
+	confparams_cpr->errorBoundMode = errBoundMode;
+	if(errBoundMode>=PW_REL)
+	{
+		printf("Error: Current SZ version doesn't support integer data compression with point-wise relative error bound being based on pwrType=AVG\n");
+		exit(0);
+		return SZ_NSCS;
+	}
+	int status = SZ_SCES;
+	size_t dataLength = computeDataLength(r5,r4,r3,r2,r1);
+	int64_t valueRangeSize = 0;
+
+	uint16_t minValue = (uint16_t)computeRangeSize_int(oriData, SZ_UINT16, dataLength, &valueRangeSize);
+	double realPrecision = 0; 
+	
+	if(confparams_cpr->errorBoundMode==PSNR)
+	{
+		/* a PSNR target is converted into an absolute error bound */
+		confparams_cpr->errorBoundMode = ABS;
+		realPrecision = confparams_cpr->absErrBound = computeABSErrBoundFromPSNR(confparams_cpr->psnr, (double)confparams_cpr->predThreshold, (double)valueRangeSize);
+	}
+	else
+		realPrecision = getRealPrecision_int(valueRangeSize, errBoundMode, absErr_Bound, relBoundRatio, &status);
+
+	if(valueRangeSize <= realPrecision)
+	{
+		SZ_compress_args_uint16_withinRange(newByteData, oriData, dataLength, outSize);
+	}
+	else
+	{
+		size_t tmpOutSize = 0;
+		unsigned char* tmpByteData = NULL;
+		if (r2==0)
+		{
+			SZ_compress_args_uint16_NoCkRngeNoGzip_1D(&tmpByteData, oriData, r1, realPrecision, &tmpOutSize, valueRangeSize, minValue);
+		}
+		else
+		if (r3==0)
+		{
+			SZ_compress_args_uint16_NoCkRngeNoGzip_2D(&tmpByteData, oriData, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, minValue);
+		}
+		else
+		if (r4==0)
+		{
+			SZ_compress_args_uint16_NoCkRngeNoGzip_3D(&tmpByteData, oriData, r3, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, minValue);
+		}
+		else
+		if (r5==0)
+		{
+			SZ_compress_args_uint16_NoCkRngeNoGzip_4D(&tmpByteData, oriData, r4, r3, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, minValue);
+		}
+		else
+		{
+			printf("Error: doesn't support 5 dimensions for now.\n");
+			return SZ_DERR; //dimension error: nothing was compressed, so there is no buffer to post-process
+		}
+		//Call Gzip to do the further compression.
+		if(confparams_cpr->szMode==SZ_BEST_SPEED)
+		{
+			*outSize = tmpOutSize;
+			*newByteData = tmpByteData;
+		}
+		else if(confparams_cpr->szMode==SZ_BEST_COMPRESSION || confparams_cpr->szMode==SZ_DEFAULT_COMPRESSION)
+		{
+			*outSize = zlib_compress5(tmpByteData, tmpOutSize, newByteData, confparams_cpr->gzipMode);
+			free(tmpByteData);
+		}
+		else
+		{
+			printf("Error: Wrong setting of confparams_cpr->szMode in the uint16_t compression.\n");
+			status = SZ_MERR; //mode error			
+			free(tmpByteData); //the intermediate buffer is not handed to the caller on this path
+		}
+	}
+	
+	return status;
+}
diff --git a/thirdparty/SZ/sz/src/sz_uint32.c b/thirdparty/SZ/sz/src/sz_uint32.c
new file mode 100644
index 0000000000000000000000000000000000000000..29d596165910a4fd476179d73f1d917706a4c1d5
--- /dev/null
+++ b/thirdparty/SZ/sz/src/sz_uint32.c
@@ -0,0 +1,1267 @@
+/**
+ *  @file sz_uint32.c
+ *  @author Sheng Di
+ *  @date Aug, 2017
+ *  @brief sz_uint32, Compression and Decompression functions
+ *  (C) 2017 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <math.h>
+#include "sz.h"
+#include "CompressElement.h"
+#include "DynamicByteArray.h"
+#include "DynamicIntArray.h"
+#include "zlib.h"
+#include "rw.h"
+#include "TightDataPointStorageI.h"
+#include "sz_uint32.h"
+
+unsigned int optimize_intervals_uint32_1D(uint32_t *oriData, size_t dataLength, double realPrecision)
+{
+	/* Samples 1D prediction errors into a histogram and derives the quantization interval count from it. */
+	size_t pos, bucket;
+	int64_t predicted = 0, absErr;
+	size_t *histogram = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t));
+	memset(histogram, 0, confparams_cpr->maxRangeRadius*sizeof(size_t));
+	size_t totalSampleSize = dataLength/confparams_cpr->sampleDistance;
+
+	for(pos=2;pos<dataLength;pos++)
+	{
+		/* Only positions that are multiples of sampleDistance are sampled. */
+		if(pos%confparams_cpr->sampleDistance!=0)
+			continue;
+		predicted = oriData[pos-1]; /* previous-value predictor */
+		absErr = llabs(predicted - oriData[pos]);
+		bucket = (uint64_t)((absErr/realPrecision+1)/2);
+		if(bucket>=confparams_cpr->maxRangeRadius)
+			bucket = confparams_cpr->maxRangeRadius - 1; /* clamp into the last bucket */
+		histogram[bucket]++;
+	}
+
+	/* Walk the histogram until the running count exceeds totalSampleSize*predThreshold. */
+	size_t wanted = totalSampleSize*confparams_cpr->predThreshold;
+	size_t covered = 0;
+	pos = 0;
+	while(pos<confparams_cpr->maxRangeRadius)
+	{
+		covered += histogram[pos];
+		if(covered>wanted)
+			break;
+		pos++;
+	}
+	if(pos>=confparams_cpr->maxRangeRadius)
+		pos = confparams_cpr->maxRangeRadius-1;
+	free(histogram);
+
+	/* Two intervals per radius step, passed through roundUpToPowerOf2(), never below 32. */
+	unsigned int accIntervals = 2*(pos+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+	return powerOf2<32 ? 32 : powerOf2;
+}
+
+unsigned int optimize_intervals_uint32_2D(uint32_t *oriData, size_t r1, size_t r2, double realPrecision)
+{
+	/* Samples 2D prediction errors (left + up - upper-left) into a histogram and derives the interval count. */
+	size_t row, col, pos, bucket;
+	int64_t predicted = 0, absErr;
+	size_t *histogram = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t));
+	memset(histogram, 0, confparams_cpr->maxRangeRadius*sizeof(size_t));
+	size_t totalSampleSize = r1*r2/confparams_cpr->sampleDistance;
+
+	for(row=1;row<r1;row++)
+	{
+		for(col=1;col<r2;col++)
+		{
+			/* Sample only the points whose coordinate sum is a multiple of sampleDistance. */
+			if((row+col)%confparams_cpr->sampleDistance!=0)
+				continue;
+			pos = row*r2+col;
+			predicted = oriData[pos-1] + oriData[pos-r2] - oriData[pos-r2-1];
+			absErr = llabs(predicted - oriData[pos]);
+			bucket = (uint64_t)((absErr/realPrecision+1)/2);
+			if(bucket>=confparams_cpr->maxRangeRadius)
+				bucket = confparams_cpr->maxRangeRadius - 1; /* clamp into the last bucket */
+			histogram[bucket]++;
+		}
+	}
+
+	/* Walk the histogram until the running count exceeds totalSampleSize*predThreshold. */
+	size_t wanted = totalSampleSize*confparams_cpr->predThreshold;
+	size_t covered = 0;
+	size_t radius = 0;
+	while(radius<confparams_cpr->maxRangeRadius)
+	{
+		covered += histogram[radius];
+		if(covered>wanted)
+			break;
+		radius++;
+	}
+	if(radius>=confparams_cpr->maxRangeRadius)
+		radius = confparams_cpr->maxRangeRadius-1;
+	free(histogram);
+
+	unsigned int accIntervals = 2*(radius+1); /* two intervals per radius step */
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+	return powerOf2<32 ? 32 : powerOf2;
+}
+
+unsigned int optimize_intervals_uint32_3D(uint32_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision)
+{
+	/* Samples 3D prediction errors into a histogram and derives the quantization interval count from it. */
+	size_t plane, row, col, pos, bucket;
+	size_t r23=r2*r3;
+	int64_t predicted = 0, absErr;
+	size_t *histogram = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t));
+	memset(histogram, 0, confparams_cpr->maxRangeRadius*sizeof(size_t));
+	size_t totalSampleSize = (r1-1)*(r2-1)*(r3-1)/confparams_cpr->sampleDistance;
+
+	for(plane=1;plane<r1;plane++)
+	{
+		for(row=1;row<r2;row++)
+		{
+			for(col=1;col<r3;col++)
+			{
+				/* Sample only the points whose coordinate sum is a multiple of sampleDistance. */
+				if((plane+row+col)%confparams_cpr->sampleDistance!=0)
+					continue;
+				pos = plane*r23+row*r3+col;
+				predicted = oriData[pos-1] + oriData[pos-r3] + oriData[pos-r23]
+				- oriData[pos-1-r23] - oriData[pos-r3-1] - oriData[pos-r3-r23] + oriData[pos-r3-r23-1];
+				absErr = llabs(predicted - oriData[pos]);
+				bucket = (absErr/realPrecision+1)/2;
+				if(bucket>=confparams_cpr->maxRangeRadius)
+					bucket = confparams_cpr->maxRangeRadius - 1; /* clamp into the last bucket */
+				histogram[bucket]++;
+			}
+		}
+	}
+
+	/* Walk the histogram until the running count exceeds totalSampleSize*predThreshold. */
+	size_t wanted = totalSampleSize*confparams_cpr->predThreshold;
+	size_t covered = 0;
+	size_t radius = 0;
+	while(radius<confparams_cpr->maxRangeRadius)
+	{
+		covered += histogram[radius];
+		if(covered>wanted)
+			break;
+		radius++;
+	}
+	if(radius>=confparams_cpr->maxRangeRadius)
+		radius = confparams_cpr->maxRangeRadius-1;
+	free(histogram);
+
+	/* Two intervals per radius step, passed through roundUpToPowerOf2(), never below 32. */
+	unsigned int accIntervals = 2*(radius+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+	if(powerOf2<32)
+		powerOf2 = 32;
+	return powerOf2;
+}
+
+
+unsigned int optimize_intervals_uint32_4D(uint32_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision)
+{ /* Samples prediction errors of r1 x r2 x r3 x r4 data into a histogram and derives the interval count. */
+	size_t i,j,k,l, index;
+	size_t radiusIndex;
+	size_t r234=r2*r3*r4;
+	size_t r34=r3*r4;
+	int64_t pred_value = 0, pred_err;
+	size_t *intervals = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t)); /* histogram of error radii */
+	memset(intervals, 0, confparams_cpr->maxRangeRadius*sizeof(size_t));
+	size_t totalSampleSize = (r1-1)*(r2-1)*(r3-1)*(r4-1)/confparams_cpr->sampleDistance;
+	for(i=1;i<r1;i++)
+	{
+		for(j=1;j<r2;j++)
+		{
+			for(k=1;k<r3;k++)
+			{
+				for (l=1;l<r4;l++)
+				{
+					if((i+j+k+l)%confparams_cpr->sampleDistance==0) /* sample only these points */
+					{
+						index = i*r234+j*r34+k*r4+l;
+						pred_value = oriData[index-1] + oriData[index-r4] + oriData[index-r34] /* neighbours at strides 1, r4, r34 */
+								- oriData[index-1-r34] - oriData[index-r4-1] - oriData[index-r4-r34] + oriData[index-r4-r34-1];
+						pred_err = llabs(pred_value - oriData[index]);
+						radiusIndex = (uint64_t)((pred_err/realPrecision+1)/2);
+						if(radiusIndex>=confparams_cpr->maxRangeRadius)
+							radiusIndex = confparams_cpr->maxRangeRadius - 1; /* clamp into the last bucket */
+						intervals[radiusIndex]++;
+					}
+				}
+			}
+		}
+	}
+	//compute the appropriate number
+	size_t targetCount = totalSampleSize*confparams_cpr->predThreshold;
+	size_t sum = 0;
+	for(i=0;i<confparams_cpr->maxRangeRadius;i++)
+	{
+		sum += intervals[i];
+		if(sum>targetCount)
+			break;
+	}
+	if(i>=confparams_cpr->maxRangeRadius)
+		i = confparams_cpr->maxRangeRadius-1;
+	/* Two intervals per radius step, passed through roundUpToPowerOf2(), never below 32. */
+	unsigned int accIntervals = 2*(i+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+
+	if(powerOf2<32)
+		powerOf2 = 32;
+
+	free(intervals);
+	return powerOf2;
+}
+
+TightDataPointStorageI* SZ_compress_uint32_1D_MDQ(uint32_t *oriData, size_t dataLength, double realPrecision, int64_t valueRangeSize, int64_t minValue)
+{ /* Quantizes a 1D uint32_t series into per-element type codes plus exact bytes; returns them in a new TightDataPointStorageI. */
+	unsigned char bytes[8] = {0,0,0,0,0,0,0,0};
+	int byteSize = computeByteSizePerIntValue(valueRangeSize);
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+		quantization_intervals = optimize_intervals_uint32_1D(oriData, dataLength, realPrecision);
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	updateQuantizationInfo(quantization_intervals);	
+	size_t i;
+	/* type[i]: 0 = value stored exactly; otherwise intvRadius +/- the quantization step count. */
+	int* type = (int*) malloc(dataLength*sizeof(int));
+		
+	uint32_t* spaceFillingValue = oriData; //
+	
+	DynamicByteArray *exactDataByteArray;
+	new_DBA(&exactDataByteArray, DynArrayInitLen);
+		
+	int64_t last3CmprsData[3] = {0,0,0};
+	/* NOTE(review): elements 0 and 1 are always read and stored exactly, so dataLength >= 2 is assumed -- confirm callers. */
+	//add the first data	
+	type[0] = 0;
+	compressUInt32Value(spaceFillingValue[0], minValue, byteSize, bytes);
+	memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+	listAdd_int(last3CmprsData, spaceFillingValue[0]);
+		
+	type[1] = 0;
+	compressUInt32Value(spaceFillingValue[1], minValue, byteSize, bytes);
+	memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+	listAdd_int(last3CmprsData, spaceFillingValue[1]);
+	//printf("%.30G\n",last3CmprsData[0]);	
+	
+	int state;
+	double checkRadius = (exe_params->intvCapacity-1)*realPrecision; /* largest error that is still quantized */
+	int64_t curData;
+	uint32_t pred, predAbsErr;
+	double interval = 2*realPrecision; /* width of one quantization step */
+	
+	for(i=2;i<dataLength;i++)
+	{
+//		if(i==2869438)
+//			printf("i=%d\n", i);
+		curData = spaceFillingValue[i];
+		//pred = 2*last3CmprsData[0] - last3CmprsData[1];
+		pred = last3CmprsData[0]; /* presumably the most recently added value -- depends on listAdd_int, TODO confirm */
+		predAbsErr = llabs(curData - pred);	
+		if(predAbsErr<=checkRadius)
+		{
+			state = (predAbsErr/realPrecision+1)/2; /* step count, truncated to int */
+			if(curData>=pred)
+			{
+				type[i] = exe_params->intvRadius+state;
+				pred = pred + state*interval; /* double result is converted back to uint32_t */
+			}
+			else //curData<pred
+			{
+				type[i] = exe_params->intvRadius-state;
+				pred = pred - state*interval; /* double result is converted back to uint32_t */
+			}
+/*			if(type[i]==0)
+				printf("err:type[%d]=0\n", i);*/
+			listAdd_int(last3CmprsData, pred); /* history holds the quantized value, not the original */
+			continue;
+		}
+		
+		//unpredictable data processing		
+		type[i] = 0;
+		compressUInt32Value(curData, minValue, byteSize, bytes);
+		memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		listAdd_int(last3CmprsData, curData);
+	}//end of for
+	/* NOTE(review): here the count is size / byteSize; the 2D/3D/4D variants in this file pass the raw byte count -- confirm which is intended. */
+	size_t exactDataNum = exactDataByteArray->size / byteSize;
+	
+	TightDataPointStorageI* tdps;	
+			
+	new_TightDataPointStorageI(&tdps, dataLength, exactDataNum, byteSize, 
+			type, exactDataByteArray->array, exactDataByteArray->size,  
+			realPrecision, minValue, quantization_intervals, SZ_UINT32);
+
+//sdi:Debug
+/*	int sum =0;
+	for(i=0;i<dataLength;i++)
+		if(type[i]==0) sum++;
+	printf("opt_quantizations=%d, exactDataNum=%d, sum=%d\n",quantization_intervals, exactDataNum, sum);*/
+	
+	//free memory
+	free(type);	
+	free(exactDataByteArray); //exactDataByteArray->array has been released in free_TightDataPointStorageF(tdps);
+	/* Only the DynamicByteArray wrapper is freed above; its ->array was handed to new_TightDataPointStorageI. */
+	return tdps;
+}
+
+void SZ_compress_args_uint32_StoreOriData(uint32_t* oriData, size_t dataLength, TightDataPointStorageI* tdps, 
+unsigned char** newByteData, size_t *outSize)
+{
+	/* Writes the values unquantized behind the SZ header and flags tdps as lossless. */
+	/* Layout: 3 version bytes, 1 flag byte, MetaDataByteLength parameter bytes, SZ_SIZE_TYPE length bytes, payload. */
+	int intSize=sizeof(uint32_t);
+	size_t n;
+	unsigned char dsLengthBytes[8];
+
+	tdps->isLossless = 1;
+	size_t totalByteLength = 3 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + 1 + intSize*dataLength;
+	unsigned char* cursor = (unsigned char*)malloc(totalByteLength);
+	*newByteData = cursor;
+
+	for (n = 0; n < 3; n++)
+		*cursor++ = versionNumber[n];
+
+	/* Flag byte: 00010000 when SZ_SIZE_TYPE is 4, else 01010000 (01000000 indicates SZ_SIZE_TYPE=8). */
+	*cursor++ = (exe_params->SZ_SIZE_TYPE==4) ? 16 : 80;
+
+	convertSZParamsToBytes(confparams_cpr, cursor);
+	cursor += MetaDataByteLength;
+
+	sizeToBytes(dsLengthBytes,dataLength); /* SZ_SIZE_TYPE: 4 or 8 */
+	for (n = 0; n < exe_params->SZ_SIZE_TYPE; n++)
+		*cursor++ = dsLengthBytes[n];
+
+	/* cursor now sits at offset 4+MetaDataByteLength+SZ_SIZE_TYPE, where the payload starts. */
+	if(sysEndianType==BIG_ENDIAN_SYSTEM)
+		memcpy(cursor, oriData, dataLength*intSize);
+	else
+		for(n=0;n<dataLength;n++,cursor+=intSize)
+			int32ToBytes_bigEndian(cursor, oriData[n]);
+	*outSize = totalByteLength;
+}
+
+void SZ_compress_args_uint32_NoCkRngeNoGzip_1D(unsigned char** newByteData, uint32_t *oriData, 
+size_t dataLength, double realPrecision, size_t *outSize, int64_t valueRangeSize, uint32_t minValue)
+{
+	TightDataPointStorageI* tdps = SZ_compress_uint32_1D_MDQ(oriData, dataLength, realPrecision, valueRangeSize, minValue);
+	/* Flatten the quantized form; if it is larger than the raw input, store the original values instead. */
+	convertTDPStoFlatBytes_int(tdps, newByteData, outSize);
+	if(*outSize > dataLength*sizeof(uint32_t))
+		SZ_compress_args_uint32_StoreOriData(oriData, dataLength, tdps, newByteData, outSize); /* exactly dataLength elements: oriData holds no more */
+	free_TightDataPointStorageI(tdps);
+}
+
+TightDataPointStorageI* SZ_compress_uint32_2D_MDQ(uint32_t *oriData, size_t r1, size_t r2, double realPrecision, int64_t valueRangeSize, int64_t minValue)
+{ /* Quantizes an r1 x r2 uint32_t array (r2 varies fastest) into type codes plus exact bytes; returns a new TightDataPointStorageI. */
+	unsigned char bytes[8] = {0,0,0,0,0,0,0,0};
+	int byteSize = computeByteSizePerIntValue(valueRangeSize);
+	
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_uint32_2D(oriData, r1, r2, realPrecision);
+		updateQuantizationInfo(quantization_intervals);
+	}	
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	size_t i,j; 
+	int64_t pred1D, pred2D, curValue;
+	int64_t diff = 0.0;
+	double itvNum = 0;
+	uint32_t *P0, *P1;
+		
+	size_t dataLength = r1*r2;	
+	/* P1 holds the reconstructed previous row, P0 the row being built; they are swapped after every row. */
+	P0 = (uint32_t*)malloc(r2*sizeof(uint32_t));
+	memset(P0, 0, r2*sizeof(uint32_t));
+	P1 = (uint32_t*)malloc(r2*sizeof(uint32_t));
+	memset(P1, 0, r2*sizeof(uint32_t));
+	/* type[]: 0 = value stored exactly; otherwise intvRadius plus the signed quantization step. */
+	int* type = (int*) malloc(dataLength*sizeof(int));
+	//type[dataLength]=0;
+		
+	uint32_t* spaceFillingValue = oriData; //
+	
+	DynamicByteArray *exactDataByteArray;
+	new_DBA(&exactDataByteArray, DynArrayInitLen);	
+	/* The first element is always stored exactly. */
+	type[0] = 0;
+	curValue = P1[0] = spaceFillingValue[0];
+	compressUInt32Value(curValue, minValue, byteSize, bytes);
+	memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+
+	/* Process Row-0 data 1*/
+	/* NOTE(review): runs unconditionally; with r2 == 1, P1[1] lies outside the r2-element buffer -- confirm r2 >= 2 is guaranteed. */
+	pred1D = P1[0];
+	diff = spaceFillingValue[1] - pred1D;
+
+	itvNum =  llabs(diff)/realPrecision + 1;
+
+	if (itvNum < exe_params->intvCapacity)
+	{
+		if (diff < 0) itvNum = -itvNum;
+		type[1] = (int) (itvNum/2) + exe_params->intvRadius;
+		P1[1] = pred1D + 2 * (type[1] - exe_params->intvRadius) * realPrecision; /* reconstructed value, converted back to uint32_t */
+	}
+	else
+	{
+		type[1] = 0;
+		curValue = P1[1] = spaceFillingValue[1];
+		compressUInt32Value(curValue, minValue, byteSize, bytes);
+		memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+	}
+
+    /* Process Row-0 data 2 --> data r2-1 */
+	for (j = 2; j < r2; j++)
+	{
+		pred1D = 2*P1[j-1] - P1[j-2]; /* NOTE(review): operands are uint32_t, so this looks like it is evaluated unsigned before widening -- confirm */
+		diff = spaceFillingValue[j] - pred1D;
+
+		itvNum = llabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[j] = (int) (itvNum/2) + exe_params->intvRadius;
+			P1[j] = pred1D + 2 * (type[j] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[j] = 0;
+			curValue = P1[j] = spaceFillingValue[j];
+			compressUInt32Value(curValue, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		}
+	}
+
+	/* Process Row-1 --> Row-r1-1 */
+	size_t index;
+	for (i = 1; i < r1; i++)
+	{	
+		/* Process row-i data 0 */
+		index = i*r2;
+		pred1D = P1[0]; /* predicted from the element directly above */
+		diff = spaceFillingValue[index] - pred1D;
+
+		itvNum = llabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+			P0[0] = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[index] = 0;
+			curValue = P0[0] = spaceFillingValue[index];
+			compressUInt32Value(curValue, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		}
+									
+		/* Process row-i data 1 --> r2-1*/
+		for (j = 1; j < r2; j++)
+		{
+			index = i*r2+j;
+			pred2D = P0[j-1] + P1[j] - P1[j-1]; /* left + up - upper-left */
+
+			diff = spaceFillingValue[index] - pred2D;
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P0[j] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[index] = 0;
+				curValue = P0[j] = spaceFillingValue[index];
+				compressUInt32Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+		}
+		/* Swap the buffers: the row just built becomes the "previous row". */
+		uint32_t *Pt;
+		Pt = P1;
+		P1 = P0;
+		P0 = Pt;
+	}
+	/* NOTE(review): P0 is a separate allocation but is not freed when r2 == 1 -- confirm whether this is intentional. */
+	if(r2!=1)
+		free(P0);
+	free(P1);			
+	/* NOTE(review): this is a byte count (size is not divided by byteSize here) -- confirm what new_TightDataPointStorageI expects. */
+	size_t exactDataNum = exactDataByteArray->size;
+	
+	TightDataPointStorageI* tdps;	
+			
+	new_TightDataPointStorageI(&tdps, dataLength, exactDataNum, byteSize, 
+			type, exactDataByteArray->array, exactDataByteArray->size,  
+			realPrecision, minValue, quantization_intervals, SZ_UINT32);
+			
+	//free memory
+	free(type);	
+	free(exactDataByteArray); //exactDataByteArray->array has been released in free_TightDataPointStorageF(tdps);
+	
+	return tdps;	
+}
+
+/**
+ * Compresses an r1 x r2 uint32_t array; performs no range check and no gzip pass itself.
+ *
+ * Note: @r1 is the high dimension, @r2 is the low dimension.
+ * */
+void SZ_compress_args_uint32_NoCkRngeNoGzip_2D(unsigned char** newByteData, uint32_t *oriData, size_t r1, size_t r2, double realPrecision, size_t *outSize, 
+int64_t valueRangeSize, uint32_t minValue)
+{
+	/* Quantize and flatten; store the original values instead when the flattened stream is larger than the raw input. */
+	const size_t elemCount = r1*r2;
+	const size_t rawBytes = elemCount*sizeof(uint32_t);
+	TightDataPointStorageI* packed = SZ_compress_uint32_2D_MDQ(oriData, r1, r2, realPrecision, valueRangeSize, minValue);
+
+	convertTDPStoFlatBytes_int(packed, newByteData, outSize);
+	if(rawBytes < *outSize)
+		SZ_compress_args_uint32_StoreOriData(oriData, elemCount, packed, newByteData, outSize);
+	free_TightDataPointStorageI(packed);
+}
+
+TightDataPointStorageI* SZ_compress_uint32_3D_MDQ(uint32_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, int64_t valueRangeSize, int64_t minValue)
+{ /* Quantizes an r1 x r2 x r3 uint32_t array (r3 varies fastest) into type codes plus exact bytes; returns a new TightDataPointStorageI. */
+	unsigned char bytes[8] = {0,0,0,0,0,0,0,0};
+	int byteSize = computeByteSizePerIntValue(valueRangeSize);
+	
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_uint32_3D(oriData, r1, r2, r3, realPrecision);
+		updateQuantizationInfo(quantization_intervals);
+	}	
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	size_t i,j,k; 
+	int64_t pred1D, pred2D, pred3D, curValue;
+	int64_t diff = 0.0;
+	double itvNum = 0;
+	uint32_t *P0, *P1;
+		
+	size_t dataLength = r1*r2*r3;		
+	/* P1 holds the reconstructed previous r2 x r3 layer, P0 the layer being built; neither buffer is zero-initialised. */
+	size_t r23 = r2*r3;
+	P0 = (uint32_t*)malloc(r23*sizeof(uint32_t));
+	P1 = (uint32_t*)malloc(r23*sizeof(uint32_t));
+	/* type[]: 0 = value stored exactly; otherwise intvRadius plus the signed quantization step. */
+	int* type = (int*) malloc(dataLength*sizeof(int));
+
+	uint32_t* spaceFillingValue = oriData; //
+	
+	DynamicByteArray *exactDataByteArray;
+	new_DBA(&exactDataByteArray, DynArrayInitLen);	
+	/* Layer 0 is written into P1. Its first element is always stored exactly. */
+	type[0] = 0;
+	P1[0] = spaceFillingValue[0];
+	compressUInt32Value(spaceFillingValue[0], minValue, byteSize, bytes);
+	memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+
+	/* Process Row-0 data 1*/
+	/* NOTE(review): runs unconditionally; with r23 == 1, P1[1] lies outside the buffer -- confirm r3 >= 2 is guaranteed. */
+	pred1D = P1[0];
+	diff = spaceFillingValue[1] - pred1D;
+
+	itvNum = llabs(diff)/realPrecision + 1;
+
+	if (itvNum < exe_params->intvCapacity)
+	{
+		if (diff < 0) itvNum = -itvNum;
+		type[1] = (int) (itvNum/2) + exe_params->intvRadius;
+		P1[1] = pred1D + 2 * (type[1] - exe_params->intvRadius) * realPrecision; /* reconstructed value, converted back to uint32_t */
+	}
+	else
+	{
+		type[1] = 0;
+		curValue = P1[1] = spaceFillingValue[1];
+		compressUInt32Value(curValue, minValue, byteSize, bytes);
+		memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+	}
+
+    /* Process Row-0 data 2 --> data r3-1 */
+	for (j = 2; j < r3; j++)
+	{
+		pred1D = 2*P1[j-1] - P1[j-2]; /* NOTE(review): operands are uint32_t, so this looks like it is evaluated unsigned before widening -- confirm */
+		diff = spaceFillingValue[j] - pred1D;
+
+		itvNum = llabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[j] = (int) (itvNum/2) + exe_params->intvRadius;
+			P1[j] = pred1D + 2 * (type[j] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[j] = 0;
+			curValue = P1[j] = spaceFillingValue[j];
+			compressUInt32Value(curValue, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		}
+	}
+
+	/* Process Row-1 --> Row-r2-1 */
+	size_t index;
+	for (i = 1; i < r2; i++)
+	{
+		/* Process row-i data 0 */
+		index = i*r3;	
+		pred1D = P1[index-r3]; /* first element of the previous row */
+		diff = spaceFillingValue[index] - pred1D;
+
+		itvNum = llabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+			P1[index] = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[index] = 0;
+			curValue = P1[index] = spaceFillingValue[index];
+			compressUInt32Value(curValue, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		}
+
+		/* Process row-i data 1 --> data r3-1*/
+		for (j = 1; j < r3; j++)
+		{
+			index = i*r3+j;
+			pred2D = P1[index-1] + P1[index-r3] - P1[index-r3-1]; /* left + up - upper-left within layer 0 */
+
+			diff = spaceFillingValue[index] - pred2D;
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P1[index] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[index] = 0;
+				curValue = P1[index] = spaceFillingValue[index];
+				compressUInt32Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+		}
+	}
+
+
+	///////////////////////////	Process layer-1 --> layer-r1-1 ///////////////////////////
+
+	for (k = 1; k < r1; k++)
+	{
+		/* Process Row-0 data 0*/
+		index = k*r23;
+		pred1D = P1[0]; /* same position in the previous layer */
+		diff = spaceFillingValue[index] - pred1D;
+
+		itvNum = llabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+			P0[0] = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[index] = 0;
+			curValue = P0[0] = spaceFillingValue[index];
+			compressUInt32Value(curValue, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		}
+
+
+	    /* Process Row-0 data 1 --> data r3-1 */
+		for (j = 1; j < r3; j++)
+		{
+			//index = k*r2*r3+j;
+			index ++;
+			pred2D = P0[j-1] + P1[j] - P1[j-1]; /* left (this layer) + same cell (previous layer) - its left neighbour */
+			diff = spaceFillingValue[index] - pred2D;
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P0[j] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+/*				if(type[index]==0)
+					printf("err:type[%d]=0, index4\n", index);					*/
+			}
+			else
+			{
+				type[index] = 0;
+				curValue = P0[j] = spaceFillingValue[index];
+				compressUInt32Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+		}
+
+	    /* Process Row-1 --> Row-r2-1 */
+		size_t index2D; /* offset inside the current r2 x r3 layer */
+		for (i = 1; i < r2; i++)
+		{
+			/* Process Row-i data 0 */
+			index = k*r23 + i*r3;
+			index2D = i*r3;		
+			pred2D = P0[index2D-r3] + P1[index2D] - P1[index2D-r3]; /* up (this layer) + same cell (previous layer) - its upper neighbour */
+			diff = spaceFillingValue[index] - pred2D;
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P0[index2D] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[index] = 0;
+				curValue = P0[index2D] = spaceFillingValue[index];
+				compressUInt32Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+
+			/* Process Row-i data 1 --> data r3-1 */
+			for (j = 1; j < r3; j++)
+			{
+//				if(k==63&&i==43&&j==27)
+//					printf("i=%d\n", i);
+				//index = k*r2*r3 + i*r3 + j;			
+				index ++;
+				index2D = i*r3 + j;
+				pred3D = P0[index2D-1] + P0[index2D-r3]+ P1[index2D] - P0[index2D-r3-1] - P1[index2D-r3] - P1[index2D-1] + P1[index2D-r3-1]; /* 7-neighbour prediction across both layers */
+				diff = spaceFillingValue[index] - pred3D;
+
+				itvNum = llabs(diff)/realPrecision + 1;
+
+				if (itvNum < exe_params->intvCapacity)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+					P0[index2D] = pred3D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					type[index] = 0;
+					curValue = P0[index2D] = spaceFillingValue[index];
+					compressUInt32Value(curValue, minValue, byteSize, bytes);
+					memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+				}
+			}
+		}
+		/* Swap the buffers: the layer just built becomes the "previous layer". */
+		uint32_t *Pt;
+		Pt = P1;
+		P1 = P0;
+		P0 = Pt;
+	}
+	if(r23!=1) /* NOTE(review): P0 is a separate allocation but is not freed when r23 == 1 -- confirm whether this is intentional */
+		free(P0);
+	free(P1);
+	/* NOTE(review): this is a byte count (size is not divided by byteSize here) -- confirm what new_TightDataPointStorageI expects. */
+	size_t exactDataNum = exactDataByteArray->size;
+	
+	TightDataPointStorageI* tdps;	
+			
+	new_TightDataPointStorageI(&tdps, dataLength, exactDataNum, byteSize, 
+			type, exactDataByteArray->array, exactDataByteArray->size,  
+			realPrecision, minValue, quantization_intervals, SZ_UINT32);
+			
+	//free memory
+	free(type);	
+	free(exactDataByteArray); //exactDataByteArray->array has been released in free_TightDataPointStorageF(tdps);
+	
+	return tdps;	
+}
+
+
+void SZ_compress_args_uint32_NoCkRngeNoGzip_3D(unsigned char** newByteData, uint32_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t *outSize, 
+int64_t valueRangeSize, int64_t minValue)
+{
+	/* Quantize and flatten; store the original values instead when the flattened stream is larger than the raw input. */
+	const size_t elemCount = r1*r2*r3;
+	const size_t rawBytes = elemCount*sizeof(uint32_t);
+	TightDataPointStorageI* packed = SZ_compress_uint32_3D_MDQ(oriData, r1, r2, r3, realPrecision, valueRangeSize, minValue);
+
+	convertTDPStoFlatBytes_int(packed, newByteData, outSize);
+	if(rawBytes < *outSize)
+		SZ_compress_args_uint32_StoreOriData(oriData, elemCount, packed, newByteData, outSize);
+	free_TightDataPointStorageI(packed);
+}
+
+
+TightDataPointStorageI* SZ_compress_uint32_4D_MDQ(uint32_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, int64_t valueRangeSize, int64_t minValue)
+{ /* Quantizes an r1 x r2 x r3 x r4 uint32_t array (r4 varies fastest) as r1 independent 3D blocks; returns a new TightDataPointStorageI. */
+	unsigned char bytes[8] = {0,0,0,0,0,0,0,0};
+	int byteSize = computeByteSizePerIntValue(valueRangeSize);
+	
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_uint32_4D(oriData, r1, r2, r3, r4, realPrecision);
+		updateQuantizationInfo(quantization_intervals);
+	}	
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	size_t i,j,k; 
+	int64_t pred1D, pred2D, pred3D, curValue;
+	int64_t diff = 0.0;
+	double itvNum = 0;
+	uint32_t *P0, *P1;
+		
+	size_t dataLength = r1*r2*r3*r4;		
+
+	size_t r234 = r2*r3*r4;
+	size_t r34 = r3*r4;
+	/* P1 holds the reconstructed previous r3 x r4 layer, P0 the layer being built; they are swapped after every layer. */
+	P0 = (uint32_t*)malloc(r34*sizeof(uint32_t));
+	P1 = (uint32_t*)malloc(r34*sizeof(uint32_t));
+	/* type[]: 0 = value stored exactly; otherwise intvRadius plus the signed quantization step. */
+	int* type = (int*) malloc(dataLength*sizeof(int));
+
+	uint32_t* spaceFillingValue = oriData; //
+	
+	DynamicByteArray *exactDataByteArray;
+	new_DBA(&exactDataByteArray, DynArrayInitLen);	
+
+	size_t l;
+	for (l = 0; l < r1; l++)
+	{
+		/* Each l selects one r2 x r3 x r4 block; prediction never reaches across blocks. */
+		///////////////////////////	Process layer-0 ///////////////////////////
+		/* Process Row-0 data 0*/
+		size_t index = l*r234; /* offset in the full array */
+		size_t index2D = 0; /* offset inside the current r3 x r4 layer */
+
+		type[index] = 0; /* the first element of every block is stored exactly */
+		curValue = P1[index2D] = spaceFillingValue[index];
+		compressUInt32Value(curValue, minValue, byteSize, bytes);
+		memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+
+		/* Process Row-0 data 1*/
+		index = l*r234+1; /* NOTE(review): runs unconditionally; with r34 == 1, P1[1] lies outside the buffer -- confirm r4 >= 2 */
+		index2D = 1;
+
+		pred1D = P1[index2D-1];
+		diff = spaceFillingValue[index] - pred1D; /* must compare the element at index; curValue still holds element 0 of the block */
+
+		itvNum = llabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+			P1[index2D] = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision; /* reconstructed value, converted back to uint32_t */
+		}
+		else
+		{
+			type[index] = 0;
+			/* Unpredictable: store the element at index exactly (not element 0 of the array). */
+			curValue = P1[index2D] = spaceFillingValue[index];
+			compressUInt32Value(curValue, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		}
+
+		/* Process Row-0 data 2 --> data r4-1 */
+		for (j = 2; j < r4; j++)
+		{
+			index = l*r234+j;
+			index2D = j;
+
+			pred1D = 2*P1[index2D-1] - P1[index2D-2]; /* NOTE(review): operands are uint32_t, so this looks like it is evaluated unsigned before widening -- confirm */
+			diff = spaceFillingValue[index] - pred1D;
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P1[index2D] = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[index] = 0;
+
+				curValue = P1[index2D] = spaceFillingValue[index];
+				compressUInt32Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+		}
+
+		/* Process Row-1 --> Row-r3-1 */
+		for (i = 1; i < r3; i++)
+		{
+			/* Process row-i data 0 */
+			index = l*r234+i*r4;
+			index2D = i*r4;
+
+			pred1D = P1[index2D-r4]; /* first element of the previous row */
+			diff = spaceFillingValue[index] - pred1D;
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P1[index2D] = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[index] = 0;
+
+				curValue = P1[index2D] = spaceFillingValue[index];
+				compressUInt32Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+
+			/* Process row-i data 1 --> data r4-1*/
+			for (j = 1; j < r4; j++)
+			{
+				index = l*r234+i*r4+j;
+				index2D = i*r4+j;
+
+				pred2D = P1[index2D-1] + P1[index2D-r4] - P1[index2D-r4-1]; /* left + up - upper-left within layer 0 */
+
+				diff = spaceFillingValue[index] - pred2D;
+
+				itvNum = llabs(diff)/realPrecision + 1;
+
+				if (itvNum < exe_params->intvCapacity)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+					P1[index2D] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					type[index] = 0;
+
+					curValue = P1[index2D] = spaceFillingValue[index];
+					compressUInt32Value(curValue, minValue, byteSize, bytes);
+					memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+				}
+			}
+		}
+
+
+		///////////////////////////	Process layer-1 --> layer-r2-1 ///////////////////////////
+
+		for (k = 1; k < r2; k++)
+		{
+			/* Process Row-0 data 0*/
+			index = l*r234+k*r34;
+			index2D = 0;
+
+			pred1D = P1[index2D]; /* same position in the previous layer */
+			diff = spaceFillingValue[index] - pred1D;
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P0[index2D] = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[index] = 0;
+
+				curValue = P0[index2D] = spaceFillingValue[index];
+				compressUInt32Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+
+			/* Process Row-0 data 1 --> data r4-1 */
+			for (j = 1; j < r4; j++)
+			{
+				index = l*r234+k*r34+j;
+				index2D = j;
+
+				pred2D = P0[index2D-1] + P1[index2D] - P1[index2D-1]; /* left (this layer) + same cell (previous layer) - its left neighbour */
+				diff = spaceFillingValue[index] - pred2D;
+
+				itvNum = llabs(diff)/realPrecision + 1;
+
+				if (itvNum < exe_params->intvCapacity)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+					P0[index2D] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					type[index] = 0;
+
+					curValue = P0[index2D] = spaceFillingValue[index];
+					compressUInt32Value(curValue, minValue, byteSize, bytes);
+					memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+				}
+			}
+
+			/* Process Row-1 --> Row-r3-1 */
+			for (i = 1; i < r3; i++)
+			{
+				/* Process Row-i data 0 */
+				index = l*r234+k*r34+i*r4;
+				index2D = i*r4;
+
+				pred2D = P0[index2D-r4] + P1[index2D] - P1[index2D-r4]; /* up (this layer) + same cell (previous layer) - its upper neighbour */
+				diff = spaceFillingValue[index] - pred2D;
+
+				itvNum = llabs(diff)/realPrecision + 1;
+
+				if (itvNum < exe_params->intvCapacity)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+					P0[index2D] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					type[index] = 0;
+
+					curValue = P0[index2D] = spaceFillingValue[index];
+					compressUInt32Value(curValue, minValue, byteSize, bytes);
+					memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+				}
+
+				/* Process Row-i data 1 --> data r4-1 */
+				for (j = 1; j < r4; j++)
+				{
+					index = l*r234+k*r34+i*r4+j;
+					index2D = i*r4+j;
+
+					pred3D = P0[index2D-1] + P0[index2D-r4]+ P1[index2D] - P0[index2D-r4-1] - P1[index2D-r4] - P1[index2D-1] + P1[index2D-r4-1]; /* 7-neighbour prediction across both layers */
+					diff = spaceFillingValue[index] - pred3D;
+
+
+					itvNum = llabs(diff)/realPrecision + 1;
+
+					if (itvNum < exe_params->intvCapacity)
+					{
+						if (diff < 0) itvNum = -itvNum;
+						type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+						P0[index2D] = pred3D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+					}
+					else
+					{
+						type[index] = 0;
+
+						curValue = P0[index2D] = spaceFillingValue[index];
+						compressUInt32Value(curValue, minValue, byteSize, bytes);
+						memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+					}
+				}
+			}
+			/* Swap the buffers: the layer just built becomes the "previous layer". */
+			uint32_t *Pt;
+			Pt = P1;
+			P1 = P0;
+			P0 = Pt;
+		}
+	}
+
+	free(P0);
+	free(P1);
+	/* NOTE(review): this is a byte count (size is not divided by byteSize here) -- confirm what new_TightDataPointStorageI expects. */
+	size_t exactDataNum = exactDataByteArray->size;
+	
+	TightDataPointStorageI* tdps;	
+			
+	new_TightDataPointStorageI(&tdps, dataLength, exactDataNum, byteSize, 
+			type, exactDataByteArray->array, exactDataByteArray->size,  
+			realPrecision, minValue, quantization_intervals, SZ_UINT32);
+			
+	//free memory
+	free(type);	
+	free(exactDataByteArray); //exactDataByteArray->array has been released in free_TightDataPointStorageF(tdps);
+	
+	return tdps;	
+}
+
+/**
+ * Compress a 4D uint32_t array (no value-range check, no gzip stage).
+ *
+ * The data is quantized by SZ_compress_uint32_4D_MDQ() and flattened with
+ * convertTDPStoFlatBytes_int(). If the flattened stream is larger than the
+ * raw input (r1*r2*r3*r4*sizeof(uint32_t) bytes), the raw values are stored
+ * instead via SZ_compress_args_uint32_StoreOriData().
+ *
+ * @param newByteData    receives the output buffer
+ * @param oriData        input array of r1*r2*r3*r4 values
+ * @param r1,r2,r3,r4    dimension sizes as forwarded to the 4D quantizer
+ * @param realPrecision  error bound forwarded to the quantizer
+ * @param outSize        receives the output size in bytes
+ * @param valueRangeSize value range forwarded to the quantizer
+ * @param minValue       minimum value forwarded to the quantizer
+ */
+void SZ_compress_args_uint32_NoCkRngeNoGzip_4D(unsigned char** newByteData, uint32_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, 
+size_t *outSize, int64_t valueRangeSize, int64_t minValue)
+{
+	TightDataPointStorageI* tdps = SZ_compress_uint32_4D_MDQ(oriData, r1, r2, r3, r4, realPrecision, valueRangeSize, minValue);
+
+	convertTDPStoFlatBytes_int(tdps, newByteData, outSize);
+
+	size_t dataLength = r1*r2*r3*r4;
+	if(*outSize>dataLength*sizeof(uint32_t))
+	{
+		/* The fallback stores a different buffer in *newByteData; release
+		 * the flattened stream first so it is not leaked. */
+		free(*newByteData);
+		*newByteData = NULL;
+		SZ_compress_args_uint32_StoreOriData(oriData, dataLength, tdps, newByteData, outSize);
+	}
+
+	free_TightDataPointStorageI(tdps);
+}
+
+/**
+ * Encode a uint32_t array as a single representative value.
+ *
+ * Builds a temporary TightDataPointStorageI flagged allSameData whose only
+ * payload is oriData[0] in 4 big-endian bytes, flattens it with
+ * convertTDPStoFlatBytes_int(), and releases the temporary storage.
+ *
+ * @param newByteData receives the flattened output buffer
+ * @param oriData     input array; only element 0 is read
+ * @param dataLength  element count recorded in the storage header
+ * @param outSize     receives the flattened size in bytes
+ */
+void SZ_compress_args_uint32_withinRange(unsigned char** newByteData, uint32_t *oriData, size_t dataLength, size_t *outSize)
+{
+	size_t flatSize;
+	const uint32_t firstValue = oriData[0];
+	TightDataPointStorageI* storage = (TightDataPointStorageI*) malloc(sizeof(TightDataPointStorageI));
+
+	/* Header fields: one exact value, no quantization codes. */
+	storage->allSameData = 1;
+	storage->isLossless = 0;
+	storage->typeArray = NULL;
+	storage->dataSeriesLength = dataLength;
+	storage->exactDataNum = 1;
+	storage->exactDataBytes_size = 4;
+
+	/* Payload: the representative value, big-endian. */
+	storage->exactDataBytes = (unsigned char*)malloc(4*sizeof(unsigned char));
+	int32ToBytes_bigEndian(storage->exactDataBytes, firstValue);
+
+	convertTDPStoFlatBytes_int(storage, newByteData, &flatSize);
+	*outSize = flatSize;
+
+	free_TightDataPointStorageI(storage);
+}
+
+/**
+ * Compress a uint32_t array with a value-range check but no gzip stage.
+ *
+ * The number of dimensions is taken from which of r5..r2 are zero. When the
+ * value range does not exceed the resolved error bound, the array is encoded
+ * as a single value. A 4D input is handled by the 3D compressor with r4*r3
+ * as its slowest dimension.
+ *
+ * @param newByteData   receives the output buffer
+ * @param oriData       input array
+ * @param r5,r4,r3,r2,r1 dimension sizes, r1 fastest-varying; unused high
+ *                      dimensions are 0
+ * @param outSize       receives the output size in bytes
+ * @param errBoundMode  error-bound mode passed to getRealPrecision_int()
+ * @param absErr_Bound  absolute error bound
+ * @param relBoundRatio relative error-bound ratio
+ * @return status written by getRealPrecision_int() (initially SZ_SCES), or
+ *         SZ_DERR when five dimensions are requested; in that case
+ *         *newByteData is set to NULL and *outSize to 0.
+ */
+int SZ_compress_args_uint32_wRngeNoGzip(unsigned char** newByteData, uint32_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio)
+{
+	int status = SZ_SCES;
+	size_t dataLength = computeDataLength(r5,r4,r3,r2,r1);
+	int64_t valueRangeSize = 0;
+	
+	uint32_t minValue = (uint32_t)computeRangeSize_int(oriData, SZ_UINT32, dataLength, &valueRangeSize);
+	double realPrecision = getRealPrecision_int(valueRangeSize, errBoundMode, absErr_Bound, relBoundRatio, &status);
+		
+	if(valueRangeSize <= realPrecision)
+	{
+		SZ_compress_args_uint32_withinRange(newByteData, oriData, dataLength, outSize);
+	}
+	else if(r5==0&&r4==0&&r3==0&&r2==0)
+	{
+		SZ_compress_args_uint32_NoCkRngeNoGzip_1D(newByteData, oriData, r1, realPrecision, outSize, valueRangeSize, minValue);
+	}
+	else if(r5==0&&r4==0&&r3==0)
+	{
+		SZ_compress_args_uint32_NoCkRngeNoGzip_2D(newByteData, oriData, r2, r1, realPrecision, outSize, valueRangeSize, minValue);
+	}
+	else if(r5==0&&r4==0)
+	{
+		SZ_compress_args_uint32_NoCkRngeNoGzip_3D(newByteData, oriData, r3, r2, r1, realPrecision, outSize, valueRangeSize, minValue);
+	}
+	else if(r5==0)
+	{
+		/* 4D: fold the two slowest dimensions and reuse the 3D path. */
+		SZ_compress_args_uint32_NoCkRngeNoGzip_3D(newByteData, oriData, r4*r3, r2, r1, realPrecision, outSize, valueRangeSize, minValue);
+	}
+	else
+	{
+		/* Previously this case fell through and reported success without
+		 * ever writing the outputs. */
+		printf("Error: doesn't support 5 dimensions for now.\n");
+		*newByteData = NULL;
+		*outSize = 0;
+		status = SZ_DERR; //dimension error
+	}
+	return status;
+}
+
+/**
+ * Top-level uint32_t compression entry: quantization plus optional zlib pass.
+ *
+ * Records errBoundMode in confparams_cpr, resolves the error bound (PSNR mode
+ * is converted to an absolute bound), then either encodes the array as a
+ * single value (value range within the bound) or runs the 1D-4D quantizer
+ * selected by which of r2..r5 are zero. Depending on confparams_cpr->szMode
+ * the quantized stream is returned as is (SZ_BEST_SPEED) or passed through
+ * zlib_compress5() (SZ_BEST_COMPRESSION / SZ_DEFAULT_COMPRESSION).
+ *
+ * @param newByteData   receives the output buffer
+ * @param oriData       input array
+ * @param r5,r4,r3,r2,r1 dimension sizes, r1 fastest-varying; unused high
+ *                      dimensions are 0
+ * @param outSize       receives the output size in bytes
+ * @param errBoundMode  error-bound mode; modes >= PW_REL are rejected and
+ *                      terminate the process
+ * @param absErr_Bound  absolute error bound
+ * @param relBoundRatio relative error-bound ratio
+ * @return SZ_SCES (or the status written by getRealPrecision_int()) on
+ *         success, SZ_DERR for five dimensions, SZ_MERR for an unknown
+ *         szMode. On SZ_DERR / SZ_MERR *newByteData is NULL and *outSize 0.
+ */
+int SZ_compress_args_uint32(unsigned char** newByteData, uint32_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio)
+{
+	confparams_cpr->errorBoundMode = errBoundMode;
+	
+	if(errBoundMode>=PW_REL)
+	{
+		printf("Error: Current SZ version doesn't support integer data compression with point-wise relative error bound being based on pwrType=AVG\n");
+		exit(0);
+		return SZ_NSCS; /* not reached; kept in case exit() is stubbed out */
+	}
+	int status = SZ_SCES;
+	size_t dataLength = computeDataLength(r5,r4,r3,r2,r1);
+	int64_t valueRangeSize = 0;
+
+	uint32_t minValue = (uint32_t)computeRangeSize_int(oriData, SZ_UINT32, dataLength, &valueRangeSize);
+	double realPrecision = 0; 
+	
+	if(confparams_cpr->errorBoundMode==PSNR)
+	{
+		/* PSNR targets are translated into an equivalent absolute bound. */
+		confparams_cpr->errorBoundMode = ABS;
+		realPrecision = confparams_cpr->absErrBound = computeABSErrBoundFromPSNR(confparams_cpr->psnr, (double)confparams_cpr->predThreshold, (double)valueRangeSize);
+	}
+	else
+		realPrecision = getRealPrecision_int(valueRangeSize, errBoundMode, absErr_Bound, relBoundRatio, &status);
+
+	if(valueRangeSize <= realPrecision)
+	{
+		SZ_compress_args_uint32_withinRange(newByteData, oriData, dataLength, outSize);
+		return status;
+	}
+
+	size_t tmpOutSize = 0;
+	unsigned char* tmpByteData = NULL;
+	if (r2==0)
+	{
+		SZ_compress_args_uint32_NoCkRngeNoGzip_1D(&tmpByteData, oriData, r1, realPrecision, &tmpOutSize, valueRangeSize, minValue);
+	}
+	else if (r3==0)
+	{
+		SZ_compress_args_uint32_NoCkRngeNoGzip_2D(&tmpByteData, oriData, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, minValue);
+	}
+	else if (r4==0)
+	{
+		SZ_compress_args_uint32_NoCkRngeNoGzip_3D(&tmpByteData, oriData, r3, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, minValue);
+	}
+	else if (r5==0)
+	{
+		SZ_compress_args_uint32_NoCkRngeNoGzip_4D(&tmpByteData, oriData, r4, r3, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, minValue);
+	}
+	else
+	{
+		/* Nothing was produced: stop here instead of handing an
+		 * uninitialised pointer to the caller, zlib or free(). */
+		printf("Error: doesn't support 5 dimensions for now.\n");
+		*newByteData = NULL;
+		*outSize = 0;
+		return SZ_DERR; //dimension error
+	}
+
+	//Call Gzip to do the further compression.
+	if(confparams_cpr->szMode==SZ_BEST_SPEED)
+	{
+		*outSize = tmpOutSize;
+		*newByteData = tmpByteData;
+	}
+	else if(confparams_cpr->szMode==SZ_BEST_COMPRESSION || confparams_cpr->szMode==SZ_DEFAULT_COMPRESSION)
+	{
+		*outSize = zlib_compress5(tmpByteData, tmpOutSize, newByteData, confparams_cpr->gzipMode);
+		free(tmpByteData);
+	}
+	else
+	{
+		printf("Error: Wrong setting of confparams_cpr->szMode in the uint32_t compression.\n");
+		/* The intermediate stream is not handed out, so release it. */
+		free(tmpByteData);
+		*newByteData = NULL;
+		*outSize = 0;
+		status = SZ_MERR; //mode error			
+	}
+	
+	return status;
+}
diff --git a/thirdparty/SZ/sz/src/sz_uint64.c b/thirdparty/SZ/sz/src/sz_uint64.c
new file mode 100644
index 0000000000000000000000000000000000000000..b8cb8bc3d1e588ff8af9e91b8036ee83a332f274
--- /dev/null
+++ b/thirdparty/SZ/sz/src/sz_uint64.c
@@ -0,0 +1,1267 @@
+/**
+ *  @file sz_uint64.c
+ *  @author Sheng Di
+ *  @date Aug, 2017
+ *  @brief sz_uint64, Compression and Decompression functions
+ *  (C) 2017 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <math.h>
+#include "sz.h"
+#include "CompressElement.h"
+#include "DynamicByteArray.h"
+#include "DynamicIntArray.h"
+#include "zlib.h"
+#include "rw.h"
+#include "TightDataPointStorageI.h"
+#include "sz_uint64.h"
+
+/**
+ * Estimate how many quantization intervals a 1D uint64_t series needs.
+ *
+ * Every element whose index is a multiple of confparams_cpr->sampleDistance
+ * (starting from index 2) is predicted by its predecessor; the prediction
+ * error, in units of 2*realPrecision, is histogrammed. The smallest radius
+ * whose cumulative count exceeds predThreshold of the nominal sample count is
+ * doubled and rounded up to a power of two.
+ *
+ * @param oriData       input series
+ * @param dataLength    number of elements in oriData
+ * @param realPrecision error bound
+ * @return interval count, a power of two that is at least 32
+ */
+unsigned int optimize_intervals_uint64_1D(uint64_t *oriData, size_t dataLength, double realPrecision)
+{
+	size_t pos, bucket;
+	int64_t predicted, absErr;
+	size_t *histogram = (size_t*)calloc(confparams_cpr->maxRangeRadius, sizeof(size_t));
+	size_t sampleTotal = dataLength/confparams_cpr->sampleDistance;
+
+	for(pos=2; pos<dataLength; pos++)
+	{
+		if(pos%confparams_cpr->sampleDistance!=0)
+			continue;
+
+		predicted = oriData[pos-1];
+		absErr = llabs(predicted - (int64_t)(oriData[pos]));
+		bucket = (uint64_t)((absErr/realPrecision+1)/2);
+		/* Errors beyond the table are counted in the last bucket. */
+		if(bucket>=confparams_cpr->maxRangeRadius)
+			bucket = confparams_cpr->maxRangeRadius - 1;
+		histogram[bucket]++;
+	}
+
+	/* Walk the histogram until enough samples are covered. */
+	size_t threshold = sampleTotal*confparams_cpr->predThreshold;
+	size_t covered = 0;
+	for(pos=0; pos<confparams_cpr->maxRangeRadius; pos++)
+	{
+		covered += histogram[pos];
+		if(covered>threshold)
+			break;
+	}
+	free(histogram);
+
+	if(pos>=confparams_cpr->maxRangeRadius)
+		pos = confparams_cpr->maxRangeRadius-1;
+
+	unsigned int accIntervals = 2*(pos+1);
+	unsigned int rounded = roundUpToPowerOf2(accIntervals);
+
+	return rounded<32 ? 32 : rounded;
+}
+
+/**
+ * Estimate how many quantization intervals a 2D uint64_t array needs.
+ *
+ * Interior points (row >= 1, column >= 1) with (row+column) divisible by
+ * confparams_cpr->sampleDistance are predicted as left + above - above-left.
+ * The prediction error, in units of 2*realPrecision, is histogrammed; the
+ * smallest radius whose cumulative count exceeds predThreshold of the nominal
+ * sample count is doubled and rounded up to a power of two.
+ *
+ * @param oriData       input array, r1 rows of r2 values
+ * @param r1            number of rows (slow dimension)
+ * @param r2            number of columns (fast dimension)
+ * @param realPrecision error bound
+ * @return interval count, a power of two that is at least 32
+ */
+unsigned int optimize_intervals_uint64_2D(uint64_t *oriData, size_t r1, size_t r2, double realPrecision)
+{
+	size_t row, col, bucket;
+	int64_t predicted, absErr;
+	size_t *histogram = (size_t*)calloc(confparams_cpr->maxRangeRadius, sizeof(size_t));
+	size_t sampleTotal = (r1-1)*(r2-1)/confparams_cpr->sampleDistance;
+
+	for(row=1; row<r1; row++)
+	{
+		const uint64_t *cur = oriData + row*r2;
+		const uint64_t *above = cur - r2;
+
+		for(col=1; col<r2; col++)
+		{
+			if((row+col)%confparams_cpr->sampleDistance!=0)
+				continue;
+
+			predicted = cur[col-1] + above[col] - above[col-1];
+			absErr = llabs(predicted - (int64_t)(cur[col]));
+			bucket = (uint64_t)((absErr/realPrecision+1)/2);
+			/* Errors beyond the table are counted in the last bucket. */
+			if(bucket>=confparams_cpr->maxRangeRadius)
+				bucket = confparams_cpr->maxRangeRadius - 1;
+			histogram[bucket]++;
+		}
+	}
+
+	/* Walk the histogram until enough samples are covered. */
+	size_t threshold = sampleTotal*confparams_cpr->predThreshold;
+	size_t covered = 0;
+	size_t radius;
+	for(radius=0; radius<confparams_cpr->maxRangeRadius; radius++)
+	{
+		covered += histogram[radius];
+		if(covered>threshold)
+			break;
+	}
+	free(histogram);
+
+	if(radius>=confparams_cpr->maxRangeRadius)
+		radius = confparams_cpr->maxRangeRadius-1;
+
+	unsigned int accIntervals = 2*(radius+1);
+	unsigned int rounded = roundUpToPowerOf2(accIntervals);
+
+	return rounded<32 ? 32 : rounded;
+}
+
+/**
+ * Estimate how many quantization intervals a 3D uint64_t array needs.
+ *
+ * Interior points (all three indices >= 1) whose index sum is divisible by
+ * confparams_cpr->sampleDistance are predicted from their seven already
+ * visited neighbours (three face, three edge, one corner). The prediction
+ * error, in units of 2*realPrecision, is histogrammed; the smallest radius
+ * whose cumulative count exceeds predThreshold of the nominal sample count is
+ * doubled and rounded up to a power of two.
+ *
+ * @param oriData       input array of r1*r2*r3 values, r3 fastest-varying
+ * @param r1,r2,r3      dimension sizes
+ * @param realPrecision error bound
+ * @return interval count, a power of two that is at least 32
+ */
+unsigned int optimize_intervals_uint64_3D(uint64_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision)
+{
+	size_t plane, row, col, bucket;
+	size_t here, up, back, upBack;
+	const size_t planeSize = r2*r3;
+	int64_t predicted, absErr;
+	size_t *histogram = (size_t*)calloc(confparams_cpr->maxRangeRadius, sizeof(size_t));
+	size_t sampleTotal = (r1-1)*(r2-1)*(r3-1)/confparams_cpr->sampleDistance;
+
+	for(plane=1; plane<r1; plane++)
+	{
+		for(row=1; row<r2; row++)
+		{
+			for(col=1; col<r3; col++)
+			{
+				if((plane+row+col)%confparams_cpr->sampleDistance!=0)
+					continue;
+
+				here = plane*planeSize+row*r3+col;
+				up = here-r3;          /* previous row, same plane */
+				back = here-planeSize; /* same row, previous plane */
+				upBack = up-planeSize; /* previous row, previous plane */
+
+				predicted = oriData[here-1] + oriData[up] + oriData[back] 
+				- oriData[back-1] - oriData[up-1] - oriData[upBack] + oriData[upBack-1];
+				absErr = llabs(predicted - (int64_t)(oriData[here]));
+				bucket = (absErr/realPrecision+1)/2;
+				/* Errors beyond the table are counted in the last bucket. */
+				if(bucket>=confparams_cpr->maxRangeRadius)
+					bucket = confparams_cpr->maxRangeRadius - 1;
+				histogram[bucket]++;
+			}
+		}
+	}
+
+	/* Walk the histogram until enough samples are covered. */
+	size_t threshold = sampleTotal*confparams_cpr->predThreshold;
+	size_t covered = 0;
+	size_t radius;
+	for(radius=0; radius<confparams_cpr->maxRangeRadius; radius++)
+	{
+		covered += histogram[radius];
+		if(covered>threshold)
+			break;
+	}
+	free(histogram);
+
+	if(radius>=confparams_cpr->maxRangeRadius)
+		radius = confparams_cpr->maxRangeRadius-1;
+
+	unsigned int accIntervals = 2*(radius+1);
+	unsigned int rounded = roundUpToPowerOf2(accIntervals);
+
+	return rounded<32 ? 32 : rounded;
+}
+
+
+/**
+ * Estimate how many quantization intervals a 4D uint64_t array needs.
+ *
+ * Interior points (all four indices >= 1) whose index sum is divisible by
+ * confparams_cpr->sampleDistance are predicted from their seven neighbours
+ * within the three fastest dimensions (strides 1, r4 and r3*r4). The
+ * prediction error, in units of 2*realPrecision, is histogrammed; the
+ * smallest radius whose cumulative count exceeds predThreshold of the
+ * nominal sample count is doubled and rounded up to a power of two.
+ *
+ * @param oriData       input array of r1*r2*r3*r4 values, r4 fastest-varying
+ * @param r1,r2,r3,r4   dimension sizes
+ * @param realPrecision error bound
+ * @return interval count, a power of two that is at least 32
+ */
+unsigned int optimize_intervals_uint64_4D(uint64_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision)
+{
+	size_t i,j,k,l, index;
+	size_t radiusIndex;
+	size_t r234=r2*r3*r4;
+	size_t r34=r3*r4;
+	int64_t pred_value = 0, pred_err;
+	size_t *intervals = (size_t*)calloc(confparams_cpr->maxRangeRadius, sizeof(size_t));
+	size_t totalSampleSize = (r1-1)*(r2-1)*(r3-1)*(r4-1)/confparams_cpr->sampleDistance;
+	for(i=1;i<r1;i++)
+	{
+		for(j=1;j<r2;j++)
+		{
+			for(k=1;k<r3;k++)
+			{
+				for (l=1;l<r4;l++)
+				{
+					if((i+j+k+l)%confparams_cpr->sampleDistance==0)
+					{
+						index = i*r234+j*r34+k*r4+l;
+						/* The k-neighbour is one row back, i.e. index-r4.
+						 * The previous code read index-r3, which is a
+						 * different element whenever r3 != r4. */
+						pred_value = oriData[index-1] + oriData[index-r4] + oriData[index-r34]
+								- oriData[index-1-r34] - oriData[index-r4-1] - oriData[index-r4-r34] + oriData[index-r4-r34-1];
+						pred_err = llabs(pred_value - (int64_t)(oriData[index]));
+						radiusIndex = (uint64_t)((pred_err/realPrecision+1)/2);
+						/* Errors beyond the table are counted in the last bucket. */
+						if(radiusIndex>=confparams_cpr->maxRangeRadius)
+							radiusIndex = confparams_cpr->maxRangeRadius - 1;
+						intervals[radiusIndex]++;
+					}
+				}
+			}
+		}
+	}
+	//compute the appropriate number
+	size_t targetCount = totalSampleSize*confparams_cpr->predThreshold;
+	size_t sum = 0;
+	for(i=0;i<confparams_cpr->maxRangeRadius;i++)
+	{
+		sum += intervals[i];
+		if(sum>targetCount)
+			break;
+	}
+	if(i>=confparams_cpr->maxRangeRadius)
+		i = confparams_cpr->maxRangeRadius-1;
+
+	unsigned int accIntervals = 2*(i+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+
+	if(powerOf2<32)
+		powerOf2 = 32;
+
+	free(intervals);
+	return powerOf2;
+}
+
+TightDataPointStorageI* SZ_compress_uint64_1D_MDQ(uint64_t *oriData, size_t dataLength, double realPrecision, uint64_t valueRangeSize, uint64_t minValue)
+{	/*
+	 * Quantize a 1D uint64_t series against a previous-value predictor and
+	 * package the result in a new TightDataPointStorageI.
+	 *
+	 * oriData         input values; elements 0 and 1 are read
+	 *                 unconditionally, so at least two must be present
+	 * dataLength      number of elements in oriData
+	 * realPrecision   error bound; one quantization step is 2*realPrecision
+	 * valueRangeSize  used only to choose the byte width of exact values
+	 * minValue        forwarded to compressUInt64Value() and to the storage
+	 *
+	 * Returns the object created by new_TightDataPointStorageI(); the
+	 * wrappers in this file release it with free_TightDataPointStorageI().
+	 */
+	unsigned char bytes[8] = {0,0,0,0,0,0,0,0};
+	int byteSize = computeByteSizePerIntValue(valueRangeSize); /* bytes appended per exact value */
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+		quantization_intervals = optimize_intervals_uint64_1D(oriData, dataLength, realPrecision);
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	updateQuantizationInfo(quantization_intervals);	
+	size_t i;
+
+	int* type = (int*) malloc(dataLength*sizeof(int)); /* one code per element; 0 means "stored exactly" */
+		
+	uint64_t* spaceFillingValue = oriData; // alias of the input array
+	
+	DynamicByteArray *exactDataByteArray;
+	new_DBA(&exactDataByteArray, DynArrayInitLen);
+		
+	int64_t last3CmprsData[3] = {0,0,0}; /* history fed through listAdd_int(); slot 0 is used as the prediction below */
+				
+	//the first two elements are always stored exactly
+	type[0] = 0;
+	compressUInt64Value(spaceFillingValue[0], minValue, byteSize, bytes);
+	memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+	listAdd_int(last3CmprsData, spaceFillingValue[0]);
+		
+	type[1] = 0;
+	compressUInt64Value(spaceFillingValue[1], minValue, byteSize, bytes);
+	memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+	listAdd_int(last3CmprsData, spaceFillingValue[1]);
+	//printf("%.30G\n",last3CmprsData[0]);	
+	
+	int state;
+	double checkRadius = (exe_params->intvCapacity-1)*realPrecision; /* largest error that is still quantized */
+	int64_t curData;
+	int64_t pred, predAbsErr;
+	double interval = 2*realPrecision;
+	
+	for(i=2;i<dataLength;i++)
+	{
+//		if(i==2869438)
+//			printf("i=%d\n", i);
+		curData = spaceFillingValue[i];
+		//pred = 2*last3CmprsData[0] - last3CmprsData[1];
+		pred = last3CmprsData[0];
+		predAbsErr = llabs(curData - pred);	
+		if(predAbsErr<=checkRadius)
+		{
+			state = (predAbsErr/realPrecision+1)/2; /* number of whole intervals between pred and curData */
+			if(curData>=pred)
+			{
+				type[i] = exe_params->intvRadius+state;
+				pred = pred + state*interval;
+			}
+			else //curData<pred
+			{
+				type[i] = exe_params->intvRadius-state;
+				pred = pred - state*interval;
+			}
+/*			if(type[i]==0)
+				printf("err:type[%d]=0\n", i);*/
+			listAdd_int(last3CmprsData, pred); /* the history receives the quantized value, not curData */
+			continue;
+		}
+		
+		//unpredictable data processing		
+		type[i] = 0;
+		compressUInt64Value(curData, minValue, byteSize, bytes);
+		memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		listAdd_int(last3CmprsData, curData);
+	}//end of for
+		
+	size_t exactDataNum = exactDataByteArray->size / byteSize; /* byte count divided by bytes per value */
+	
+	TightDataPointStorageI* tdps;	
+			
+	new_TightDataPointStorageI(&tdps, dataLength, exactDataNum, byteSize, 
+			type, exactDataByteArray->array, exactDataByteArray->size,  
+			realPrecision, minValue, quantization_intervals, SZ_UINT64);
+
+//sdi:Debug
+/*	int sum =0;
+	for(i=0;i<dataLength;i++)
+		if(type[i]==0) sum++;
+	printf("opt_quantizations=%d, exactDataNum=%d, sum=%d\n",quantization_intervals, exactDataNum, sum);*/
+	
+	//free memory
+	free(type);	
+	free(exactDataByteArray); // frees only the wrapper struct; NOTE(review): ->array was passed to new_TightDataPointStorageI above, presumably tdps owns it now - confirm
+	
+	return tdps;
+}
+
+/**
+ * Serialize a uint64_t array without compression.
+ *
+ * Output layout, in order: three version bytes, one flag byte (16 when
+ * exe_params->SZ_SIZE_TYPE is 4, otherwise 80), MetaDataByteLength bytes
+ * written by convertSZParamsToBytes(), the element count in SZ_SIZE_TYPE
+ * bytes, then every value as 8 big-endian bytes.
+ *
+ * @param oriData     values to store
+ * @param dataLength  number of values read from oriData
+ * @param tdps        storage object; its isLossless flag is set to 1
+ * @param newByteData receives a newly malloc'ed buffer; any previous pointer
+ *                    value is overwritten, not freed
+ * @param outSize     receives the buffer size in bytes
+ */
+void SZ_compress_args_uint64_StoreOriData(uint64_t* oriData, size_t dataLength, TightDataPointStorageI* tdps, 
+unsigned char** newByteData, size_t *outSize)
+{
+	int intSize=sizeof(uint64_t);	
+	size_t n;
+	unsigned char dsLengthBytes[8];
+	size_t totalByteLength = 3 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + 1 + intSize*dataLength;
+
+	tdps->isLossless = 1;
+	*newByteData = (unsigned char*)malloc(totalByteLength);
+
+	/* Header is written through a moving cursor. */
+	unsigned char* cursor = *newByteData;
+	for (n = 0; n < 3; n++)
+		*cursor++ = versionNumber[n];
+
+	/* 00010000, plus 01000000 when SZ_SIZE_TYPE is 8 */
+	*cursor++ = (exe_params->SZ_SIZE_TYPE==4) ? 16 : 80;
+
+	convertSZParamsToBytes(confparams_cpr, cursor);
+	cursor += MetaDataByteLength;
+
+	sizeToBytes(dsLengthBytes,dataLength); //SZ_SIZE_TYPE: 4 or 8	
+	for (n = 0; n < exe_params->SZ_SIZE_TYPE; n++)
+		*cursor++ = dsLengthBytes[n];
+
+	/* Payload starts right after the fixed-size header. */
+	unsigned char* payload = (*newByteData)+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE;
+	if(sysEndianType==BIG_ENDIAN_SYSTEM)
+	{
+		memcpy(payload, oriData, dataLength*intSize);
+	}
+	else
+	{
+		for(n=0;n<dataLength;n++)
+		{
+			int64ToBytes_bigEndian(payload, oriData[n]);
+			payload += intSize;
+		}
+	}
+
+	*outSize = totalByteLength;
+}
+
+/**
+ * Compress a 1D uint64_t array (no value-range check, no gzip stage).
+ *
+ * The data is quantized by SZ_compress_uint64_1D_MDQ() and flattened with
+ * convertTDPStoFlatBytes_int(). If the flattened stream is larger than the
+ * raw input (dataLength*sizeof(uint64_t) bytes), the raw values are stored
+ * instead via SZ_compress_args_uint64_StoreOriData().
+ *
+ * @param newByteData    receives the output buffer
+ * @param oriData        input array of dataLength values
+ * @param dataLength     number of values
+ * @param realPrecision  error bound forwarded to the quantizer
+ * @param outSize        receives the output size in bytes
+ * @param valueRangeSize value range forwarded to the quantizer
+ * @param minValue       minimum value forwarded to the quantizer
+ */
+void SZ_compress_args_uint64_NoCkRngeNoGzip_1D(unsigned char** newByteData, uint64_t *oriData, 
+size_t dataLength, double realPrecision, size_t *outSize, uint64_t valueRangeSize, uint64_t minValue)
+{
+	TightDataPointStorageI* tdps = SZ_compress_uint64_1D_MDQ(oriData, dataLength, realPrecision, valueRangeSize, minValue);
+
+	convertTDPStoFlatBytes_int(tdps, newByteData, outSize);
+	if(*outSize > dataLength*sizeof(uint64_t))
+	{
+		/* StoreOriData() mallocs a new buffer and overwrites *newByteData,
+		 * so release the flattened stream first. */
+		free(*newByteData);
+		*newByteData = NULL;
+		/* Pass the real element count: the former dataLength+2 made
+		 * StoreOriData() read two values past the end of oriData. */
+		SZ_compress_args_uint64_StoreOriData(oriData, dataLength, tdps, newByteData, outSize);
+	}
+	free_TightDataPointStorageI(tdps);
+}
+
+TightDataPointStorageI* SZ_compress_uint64_2D_MDQ(uint64_t *oriData, size_t r1, size_t r2, double realPrecision, uint64_t valueRangeSize, uint64_t minValue)
+{	/*
+	 * Quantize a 2D uint64_t array (r1 rows of r2 values) row by row and
+	 * package the result in a new TightDataPointStorageI.
+	 *
+	 * Each value is predicted from already processed neighbours; if the
+	 * prediction error fits within the interval capacity a quantization
+	 * code is written to type[], otherwise type[] is 0 and the value is
+	 * appended to the exact-data byte array.
+	 *
+	 * oriData         input values; elements 0 and 1 are read
+	 *                 unconditionally
+	 * r1, r2          number of rows / values per row
+	 * realPrecision   error bound; one quantization step is 2*realPrecision
+	 * valueRangeSize  used only to choose the byte width of exact values
+	 * minValue        forwarded to compressUInt64Value() and to the storage
+	 *
+	 * Returns the object created by new_TightDataPointStorageI().
+	 */
+	unsigned char bytes[8] = {0,0,0,0,0,0,0,0};
+	int byteSize = computeByteSizePerIntValue(valueRangeSize); /* bytes appended per exact value */
+	
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_uint64_2D(oriData, r1, r2, realPrecision);
+		updateQuantizationInfo(quantization_intervals); /* only called in this branch, unlike the 1D variant */
+	}	
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	size_t i,j; 
+	int64_t pred1D, pred2D, curValue;
+	int64_t diff = 0.0;
+	double itvNum = 0;
+	uint64_t *P0, *P1; /* row buffers: P1 = previous row, P0 = row being written; swapped after each row */
+		
+	size_t dataLength = r1*r2;	
+	
+	P0 = (uint64_t*)malloc(r2*sizeof(uint64_t));
+	memset(P0, 0, r2*sizeof(uint64_t));
+	P1 = (uint64_t*)malloc(r2*sizeof(uint64_t));
+	memset(P1, 0, r2*sizeof(uint64_t));
+		
+	int* type = (int*) malloc(dataLength*sizeof(int)); /* one code per element; 0 means "stored exactly" */
+	//type[dataLength]=0;
+		
+	uint64_t* spaceFillingValue = oriData; // alias of the input array
+	
+	DynamicByteArray *exactDataByteArray;
+	new_DBA(&exactDataByteArray, DynArrayInitLen);	
+
+	type[0] = 0; /* element 0 is always stored exactly; row 0 is written straight into P1 */
+	curValue = P1[0] = spaceFillingValue[0];
+	compressUInt64Value(curValue, minValue, byteSize, bytes);
+	memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+
+	/* Process Row-0 data 1: predicted by the element to its left */
+	pred1D = P1[0];
+	diff = (int64_t)(spaceFillingValue[1]) - (int64_t)(pred1D);
+
+	itvNum =  llabs(diff)/realPrecision + 1;
+
+	if (itvNum < exe_params->intvCapacity)
+	{
+		if (diff < 0) itvNum = -itvNum;
+		type[1] = (int) (itvNum/2) + exe_params->intvRadius;
+		P1[1] = pred1D + 2 * (type[1] - exe_params->intvRadius) * realPrecision; /* buffer keeps the quantized value */
+	}
+	else
+	{
+		type[1] = 0;
+		curValue = P1[1] = spaceFillingValue[1];
+		compressUInt64Value(curValue, minValue, byteSize, bytes);
+		memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+	}
+
+    /* Process Row-0 data 2 --> data r2-1: linear extrapolation from the two previous buffer entries */
+	for (j = 2; j < r2; j++)
+	{
+		pred1D = 2*P1[j-1] - P1[j-2];
+		diff = (int64_t)(spaceFillingValue[j]) - (int64_t)(pred1D);
+
+		itvNum = llabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[j] = (int) (itvNum/2) + exe_params->intvRadius;
+			P1[j] = pred1D + 2 * (type[j] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[j] = 0;
+			curValue = P1[j] = spaceFillingValue[j];
+			compressUInt64Value(curValue, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		}
+	}
+
+	/* Process Row-1 --> Row-r1-1 */
+	size_t index;
+	for (i = 1; i < r1; i++)
+	{	
+		/* Process row-i data 0: predicted by the entry directly above (P1[0]) */
+		index = i*r2;
+		pred1D = P1[0];
+		diff = (int64_t)(spaceFillingValue[index]) - (int64_t)(pred1D);
+
+		itvNum = llabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+			P0[0] = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[index] = 0;
+			curValue = P0[0] = spaceFillingValue[index];
+			compressUInt64Value(curValue, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		}
+									
+		/* Process row-i data 1 --> r2-1: left + above - above-left */
+		for (j = 1; j < r2; j++)
+		{
+			index = i*r2+j;
+			pred2D = P0[j-1] + P1[j] - P1[j-1];
+
+			diff = (int64_t)(spaceFillingValue[index]) - (int64_t)(pred2D);
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P0[j] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[index] = 0;
+				curValue = P0[j] = spaceFillingValue[index];
+				compressUInt64Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+		}
+
+		uint64_t *Pt; /* swap: the row just written becomes the "previous row" */
+		Pt = P1;
+		P1 = P0;
+		P0 = Pt;
+	}
+	
+	if(r2!=1) /* NOTE(review): with r2==1 one of the two buffers is never freed here - looks like a leak, confirm */
+		free(P0);
+	free(P1);			
+	
+	size_t exactDataNum = exactDataByteArray->size; /* NOTE(review): the 1D variant divides size by byteSize here - confirm which is intended */
+	
+	TightDataPointStorageI* tdps;	
+			
+	new_TightDataPointStorageI(&tdps, dataLength, exactDataNum, byteSize, 
+			type, exactDataByteArray->array, exactDataByteArray->size,  
+			realPrecision, minValue, quantization_intervals, SZ_UINT64);
+			
+	//free memory
+	free(type);	
+	free(exactDataByteArray); // frees only the wrapper struct; NOTE(review): ->array was passed to new_TightDataPointStorageI above, presumably tdps owns it now - confirm
+	
+	return tdps;	
+}
+
+/**
+ * Compress a 2D uint64_t array (no value-range check, no gzip stage).
+ *
+ * The data is quantized by SZ_compress_uint64_2D_MDQ() and flattened with
+ * convertTDPStoFlatBytes_int(). If the flattened stream is larger than the
+ * raw input (r1*r2*sizeof(uint64_t) bytes), the raw values are stored
+ * instead via SZ_compress_args_uint64_StoreOriData().
+ *
+ * Note: @r1 is high dimension
+ * 		 @r2 is low dimension 
+ *
+ * @param newByteData    receives the output buffer
+ * @param oriData        input array of r1*r2 values
+ * @param realPrecision  error bound forwarded to the quantizer
+ * @param outSize        receives the output size in bytes
+ * @param valueRangeSize value range forwarded to the quantizer
+ * @param minValue       minimum value forwarded to the quantizer
+ * */
+void SZ_compress_args_uint64_NoCkRngeNoGzip_2D(unsigned char** newByteData, uint64_t *oriData, size_t r1, size_t r2, double realPrecision, size_t *outSize, 
+int64_t valueRangeSize, uint64_t minValue)
+{
+	TightDataPointStorageI* tdps = SZ_compress_uint64_2D_MDQ(oriData, r1, r2, realPrecision, valueRangeSize, minValue);
+
+	convertTDPStoFlatBytes_int(tdps, newByteData, outSize);
+
+	size_t dataLength = r1*r2;
+	if(*outSize>dataLength*sizeof(uint64_t))
+	{
+		/* StoreOriData() mallocs a new buffer and overwrites *newByteData,
+		 * so release the flattened stream first. */
+		free(*newByteData);
+		*newByteData = NULL;
+		SZ_compress_args_uint64_StoreOriData(oriData, dataLength, tdps, newByteData, outSize);
+	}
+	
+	free_TightDataPointStorageI(tdps);	
+}
+
+TightDataPointStorageI* SZ_compress_uint64_3D_MDQ(uint64_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, uint64_t valueRangeSize, uint64_t minValue)
+{	/*
+	 * Quantize a 3D uint64_t array (r1 layers of r2 rows of r3 values)
+	 * layer by layer and package the result in a new
+	 * TightDataPointStorageI.
+	 *
+	 * Each value is predicted from already processed neighbours; if the
+	 * prediction error fits within the interval capacity a quantization
+	 * code is written to type[], otherwise type[] is 0 and the value is
+	 * appended to the exact-data byte array.
+	 *
+	 * oriData         input values; elements 0 and 1 are read
+	 *                 unconditionally
+	 * r1, r2, r3      layers / rows per layer / values per row
+	 * realPrecision   error bound; one quantization step is 2*realPrecision
+	 * valueRangeSize  used only to choose the byte width of exact values
+	 * minValue        forwarded to compressUInt64Value() and to the storage
+	 *
+	 * Returns the object created by new_TightDataPointStorageI().
+	 */
+	unsigned char bytes[8] = {0,0,0,0,0,0,0,0};
+	int byteSize = computeByteSizePerIntValue(valueRangeSize); /* bytes appended per exact value */
+	
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_uint64_3D(oriData, r1, r2, r3, realPrecision);
+		updateQuantizationInfo(quantization_intervals); /* only called in this branch, unlike the 1D variant */
+	}	
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	size_t i,j,k; 
+	int64_t pred1D, pred2D, pred3D, curValue;
+	int64_t diff = 0.0;
+	double itvNum = 0;
+	uint64_t *P0, *P1; /* layer buffers of r2*r3 values: P1 = previous layer, P0 = layer being written */
+		
+	size_t dataLength = r1*r2*r3;		
+
+	size_t r23 = r2*r3;
+	P0 = (uint64_t*)malloc(r23*sizeof(uint64_t)); /* not zero-filled; every entry is written before it is read */
+	P1 = (uint64_t*)malloc(r23*sizeof(uint64_t));
+
+	int* type = (int*) malloc(dataLength*sizeof(int)); /* one code per element; 0 means "stored exactly" */
+
+	uint64_t* spaceFillingValue = oriData; // alias of the input array
+	
+	DynamicByteArray *exactDataByteArray;
+	new_DBA(&exactDataByteArray, DynArrayInitLen);	
+
+	type[0] = 0; /* element 0 is always stored exactly; layer 0 is written straight into P1 */
+	P1[0] = spaceFillingValue[0];
+	compressUInt64Value(spaceFillingValue[0], minValue, byteSize, bytes);
+	memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+
+	/* Process Row-0 data 1: predicted by the element to its left */
+	pred1D = P1[0];
+	diff = (int64_t)(spaceFillingValue[1]) - (int64_t)(pred1D);
+
+	itvNum = llabs(diff)/realPrecision + 1;
+
+	if (itvNum < exe_params->intvCapacity)
+	{
+		if (diff < 0) itvNum = -itvNum;
+		type[1] = (int) (itvNum/2) + exe_params->intvRadius;
+		P1[1] = pred1D + 2 * (type[1] - exe_params->intvRadius) * realPrecision; /* buffer keeps the quantized value */
+	}
+	else
+	{
+		type[1] = 0;
+		curValue = P1[1] = spaceFillingValue[1];
+		compressUInt64Value(curValue, minValue, byteSize, bytes);
+		memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+	}
+
+    /* Process Row-0 data 2 --> data r3-1: linear extrapolation from the two previous buffer entries */
+	for (j = 2; j < r3; j++)
+	{
+		pred1D = 2*P1[j-1] - P1[j-2];
+		diff = (int64_t)(spaceFillingValue[j]) - (int64_t)(pred1D);
+
+		itvNum = llabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[j] = (int) (itvNum/2) + exe_params->intvRadius;
+			P1[j] = pred1D + 2 * (type[j] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[j] = 0;
+			curValue = P1[j] = spaceFillingValue[j];
+			compressUInt64Value(curValue, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		}
+	}
+
+	/* Process Row-1 --> Row-r2-1 (still layer 0, so input index equals buffer index) */
+	size_t index;
+	for (i = 1; i < r2; i++)
+	{
+		/* Process row-i data 0: predicted by the entry one row up */
+		index = i*r3;	
+		pred1D = P1[index-r3];
+		diff = (int64_t)(spaceFillingValue[index]) - (int64_t)(pred1D);
+
+		itvNum = llabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+			P1[index] = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[index] = 0;
+			curValue = P1[index] = spaceFillingValue[index];
+			compressUInt64Value(curValue, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		}
+
+		/* Process row-i data 1 --> data r3-1: left + up - up-left */
+		for (j = 1; j < r3; j++)
+		{
+			index = i*r3+j;
+			pred2D = P1[index-1] + P1[index-r3] - P1[index-r3-1];
+
+			diff = (int64_t)(spaceFillingValue[index]) - (int64_t)(pred2D);
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P1[index] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[index] = 0;
+				curValue = P1[index] = spaceFillingValue[index];
+				compressUInt64Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+		}
+	}
+
+
+	///////////////////////////	Process layer-1 --> layer-r1-1 ///////////////////////////
+
+	for (k = 1; k < r1; k++)
+	{
+		/* Process Row-0 data 0: predicted by the same position in the previous layer */
+		index = k*r23;
+		pred1D = P1[0];
+		diff = (int64_t)(spaceFillingValue[index]) - (int64_t)(pred1D);
+
+		itvNum = llabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+			P0[0] = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			type[index] = 0;
+			curValue = P0[0] = spaceFillingValue[index];
+			compressUInt64Value(curValue, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		}
+
+
+	    /* Process Row-0 data 1 --> data r3-1: left (this layer) + same position and left of it (previous layer) */
+		for (j = 1; j < r3; j++)
+		{
+			//index = k*r2*r3+j;
+			index ++;
+			pred2D = P0[j-1] + P1[j] - P1[j-1];
+			diff = (int64_t)(spaceFillingValue[index]) - (int64_t)(pred2D);
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P0[j] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+/*				if(type[index]==0)
+					printf("err:type[%d]=0, index4\n", index);					*/
+			}
+			else
+			{
+				type[index] = 0;
+				curValue = P0[j] = spaceFillingValue[index];
+				compressUInt64Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+		}
+
+	    /* Process Row-1 --> Row-r2-1 */
+		size_t index2D; /* position inside the current layer, used to address P0/P1 */
+		for (i = 1; i < r2; i++)
+		{
+			/* Process Row-i data 0: up (this layer) + same position and up of it (previous layer) */
+			index = k*r23 + i*r3;
+			index2D = i*r3;		
+			pred2D = P0[index2D-r3] + P1[index2D] - P1[index2D-r3];
+			diff = (int64_t)(spaceFillingValue[index]) - (int64_t)(pred2D);
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				P0[index2D] = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				type[index] = 0;
+				curValue = P0[index2D] = spaceFillingValue[index];
+				compressUInt64Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+
+			/* Process Row-i data 1 --> data r3-1: seven-neighbour prediction across both layers */
+			for (j = 1; j < r3; j++)
+			{
+//				if(k==63&&i==43&&j==27)
+//					printf("i=%d\n", i);
+				//index = k*r2*r3 + i*r3 + j;			
+				index ++;
+				index2D = i*r3 + j;
+				pred3D = P0[index2D-1] + P0[index2D-r3]+ P1[index2D] - P0[index2D-r3-1] - P1[index2D-r3] - P1[index2D-1] + P1[index2D-r3-1];
+				diff = (int64_t)(spaceFillingValue[index]) - (int64_t)(pred3D);
+
+				itvNum = llabs(diff)/realPrecision + 1;
+
+				if (itvNum < exe_params->intvCapacity)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+					P0[index2D] = pred3D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					type[index] = 0;
+					curValue = P0[index2D] = spaceFillingValue[index];
+					compressUInt64Value(curValue, minValue, byteSize, bytes);
+					memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+				}
+			}
+		}
+
+		uint64_t *Pt; /* swap: the layer just written becomes the "previous layer" */
+		Pt = P1;
+		P1 = P0;
+		P0 = Pt;
+	}
+	if(r23!=1) /* NOTE(review): with r23==1 one of the two buffers is never freed here - looks like a leak, confirm */
+		free(P0);
+	free(P1);
+
+	size_t exactDataNum = exactDataByteArray->size; /* NOTE(review): the 1D variant divides size by byteSize here - confirm which is intended */
+	
+	TightDataPointStorageI* tdps;	
+			
+	new_TightDataPointStorageI(&tdps, dataLength, exactDataNum, byteSize, 
+			type, exactDataByteArray->array, exactDataByteArray->size,  
+			realPrecision, minValue, quantization_intervals, SZ_UINT64);
+			
+	//free memory
+	free(type);	
+	free(exactDataByteArray); // frees only the wrapper struct; NOTE(review): ->array was passed to new_TightDataPointStorageI above, presumably tdps owns it now - confirm
+	
+	return tdps;	
+}
+
+
+/**
+ * Compress a 3D uint64 array (no range check, no gzip stage).
+ *
+ * Runs the 3D quantization pass, flattens the resulting storage object into
+ * *newByteData / *outSize and, when the flattened stream is larger than the
+ * raw array (r1*r2*r3 * sizeof(uint64_t)), calls the StoreOriData routine
+ * with the original data instead.
+ */
+void SZ_compress_args_uint64_NoCkRngeNoGzip_3D(unsigned char** newByteData, uint64_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t *outSize, 
+uint64_t valueRangeSize, uint64_t minValue)
+{
+	const size_t numElements = r1*r2*r3;
+	const size_t rawBytes = numElements*sizeof(uint64_t);
+	TightDataPointStorageI* packed = SZ_compress_uint64_3D_MDQ(oriData, r1, r2, r3, realPrecision, valueRangeSize, minValue);
+
+	convertTDPStoFlatBytes_int(packed, newByteData, outSize);
+	if(rawBytes < *outSize)
+	{
+		/* compressed form is bigger than the input: keep the original data */
+		SZ_compress_args_uint64_StoreOriData(oriData, numElements, packed, newByteData, outSize);
+	}
+
+	free_TightDataPointStorageI(packed);
+}
+
+
+/**
+ * Encode one sample of the 4D uint64 quantization pass.
+ *
+ * The residual (actual - pred) is expressed in units of realPrecision. When the
+ * resulting interval number is below exe_params->intvCapacity, the quantization
+ * code is written to *typeSlot and the value rebuilt from that code is written
+ * to *reconSlot. Otherwise *typeSlot is set to 0, the exact value is kept in
+ * *reconSlot and its bytes are appended to exactDataByteArray.
+ *
+ * @param actual  the original sample
+ * @param pred    the prediction computed from already-processed neighbours
+ * @param bytes   scratch buffer used to serialize an exact value
+ */
+static void SZ_uint64_4D_quantizeOrStore(uint64_t actual, int64_t pred, double realPrecision,
+		int *typeSlot, uint64_t *reconSlot, uint64_t minValue, int byteSize,
+		unsigned char *bytes, DynamicByteArray *exactDataByteArray)
+{
+	int64_t diff = (int64_t)(actual) - (int64_t)(pred);
+	double itvNum = llabs(diff)/realPrecision + 1;
+
+	if (itvNum < exe_params->intvCapacity)
+	{
+		if (diff < 0) itvNum = -itvNum;
+		*typeSlot = (int) (itvNum/2) + exe_params->intvRadius;
+		*reconSlot = pred + 2 * (*typeSlot - exe_params->intvRadius) * realPrecision;
+	}
+	else
+	{
+		int64_t curValue;
+
+		*typeSlot = 0;
+		curValue = *reconSlot = actual;
+		compressUInt64Value(curValue, minValue, byteSize, bytes);
+		memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+	}
+}
+
+/**
+ * Quantize a 4D uint64 array (r1 slowest-varying, r4 fastest-varying).
+ *
+ * Each of the r1 outermost slices is processed independently as an r2 x r3 x r4
+ * volume: the first layer is predicted from neighbours inside the layer, later
+ * layers additionally use the previous layer. P1 holds the reconstructed values
+ * of the previous layer and P0 those of the layer being produced; they are
+ * swapped after every layer.
+ *
+ * Fixes relative to the previous revision:
+ *  - an unpredictable sample now stores spaceFillingValue[index] (it used to
+ *    store spaceFillingValue[0] for every position, unlike the 3D routine);
+ *  - "Row-0 data 1" now computes its residual from the actual sample instead
+ *    of from curValue, which was equal to the prediction and therefore always
+ *    produced a zero residual.
+ *
+ * NOTE(review): like the other *_MDQ routines, "data 1" of row 0 is accessed
+ * unconditionally, so this presumably expects r4 >= 2 -- confirm with callers.
+ *
+ * @return a newly created TightDataPointStorageI built from the type codes and
+ *         the exact-value bytes; the caller releases it.
+ */
+TightDataPointStorageI* SZ_compress_uint64_4D_MDQ(uint64_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, uint64_t valueRangeSize, uint64_t minValue)
+{
+	unsigned char bytes[8] = {0,0,0,0,0,0,0,0};
+	int byteSize = computeByteSizePerIntValue(valueRangeSize);
+
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_uint64_4D(oriData, r1, r2, r3, r4, realPrecision);
+		updateQuantizationInfo(quantization_intervals);
+	}
+	else
+		quantization_intervals = exe_params->intvCapacity;
+
+	size_t i,j,k,l;
+	int64_t pred, curValue;
+	uint64_t *P0, *P1, *Pt;
+
+	size_t dataLength = r1*r2*r3*r4;
+	size_t r234 = r2*r3*r4;
+	size_t r34 = r3*r4;
+
+	P0 = (uint64_t*)malloc(r34*sizeof(uint64_t));
+	P1 = (uint64_t*)malloc(r34*sizeof(uint64_t));
+
+	int* type = (int*) malloc(dataLength*sizeof(int));
+
+	uint64_t* spaceFillingValue = oriData;
+
+	DynamicByteArray *exactDataByteArray;
+	new_DBA(&exactDataByteArray, DynArrayInitLen);
+
+	for (l = 0; l < r1; l++)
+	{
+		size_t index, index2D;
+
+		/* ---------------- layer 0 of this slice ---------------- */
+
+		/* Row-0 data 0: no neighbour yet, always stored exactly */
+		index = l*r234;
+		index2D = 0;
+		type[index] = 0;
+		curValue = P1[index2D] = spaceFillingValue[index];
+		compressUInt64Value(curValue, minValue, byteSize, bytes);
+		memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+
+		/* Row-0 data 1: predicted by its left neighbour */
+		index = l*r234+1;
+		index2D = 1;
+		pred = P1[index2D-1];
+		SZ_uint64_4D_quantizeOrStore(spaceFillingValue[index], pred, realPrecision,
+				&type[index], &P1[index2D], minValue, byteSize, bytes, exactDataByteArray);
+
+		/* Row-0 data 2 --> data r4-1: linear extrapolation of the two left neighbours */
+		for (j = 2; j < r4; j++)
+		{
+			index = l*r234+j;
+			index2D = j;
+			pred = 2*P1[index2D-1] - P1[index2D-2];
+			SZ_uint64_4D_quantizeOrStore(spaceFillingValue[index], pred, realPrecision,
+					&type[index], &P1[index2D], minValue, byteSize, bytes, exactDataByteArray);
+		}
+
+		/* Row-1 --> Row-r3-1 */
+		for (i = 1; i < r3; i++)
+		{
+			/* row-i data 0: predicted by the element above */
+			index = l*r234+i*r4;
+			index2D = i*r4;
+			pred = P1[index2D-r4];
+			SZ_uint64_4D_quantizeOrStore(spaceFillingValue[index], pred, realPrecision,
+					&type[index], &P1[index2D], minValue, byteSize, bytes, exactDataByteArray);
+
+			/* row-i data 1 --> data r4-1: 2D predictor (left + above - above-left) */
+			for (j = 1; j < r4; j++)
+			{
+				index = l*r234+i*r4+j;
+				index2D = i*r4+j;
+				pred = P1[index2D-1] + P1[index2D-r4] - P1[index2D-r4-1];
+				SZ_uint64_4D_quantizeOrStore(spaceFillingValue[index], pred, realPrecision,
+						&type[index], &P1[index2D], minValue, byteSize, bytes, exactDataByteArray);
+			}
+		}
+
+		/* ---------------- layer 1 --> layer r2-1 ---------------- */
+
+		for (k = 1; k < r2; k++)
+		{
+			/* Row-0 data 0: predicted by the same position in the previous layer */
+			index = l*r234+k*r34;
+			index2D = 0;
+			pred = P1[index2D];
+			SZ_uint64_4D_quantizeOrStore(spaceFillingValue[index], pred, realPrecision,
+					&type[index], &P0[index2D], minValue, byteSize, bytes, exactDataByteArray);
+
+			/* Row-0 data 1 --> data r4-1 */
+			for (j = 1; j < r4; j++)
+			{
+				index = l*r234+k*r34+j;
+				index2D = j;
+				pred = P0[index2D-1] + P1[index2D] - P1[index2D-1];
+				SZ_uint64_4D_quantizeOrStore(spaceFillingValue[index], pred, realPrecision,
+						&type[index], &P0[index2D], minValue, byteSize, bytes, exactDataByteArray);
+			}
+
+			/* Row-1 --> Row-r3-1 */
+			for (i = 1; i < r3; i++)
+			{
+				/* Row-i data 0 */
+				index = l*r234+k*r34+i*r4;
+				index2D = i*r4;
+				pred = P0[index2D-r4] + P1[index2D] - P1[index2D-r4];
+				SZ_uint64_4D_quantizeOrStore(spaceFillingValue[index], pred, realPrecision,
+						&type[index], &P0[index2D], minValue, byteSize, bytes, exactDataByteArray);
+
+				/* Row-i data 1 --> data r4-1: full 3D predictor over both layers */
+				for (j = 1; j < r4; j++)
+				{
+					index = l*r234+k*r34+i*r4+j;
+					index2D = i*r4+j;
+					pred = P0[index2D-1] + P0[index2D-r4]+ P1[index2D] - P0[index2D-r4-1] - P1[index2D-r4] - P1[index2D-1] + P1[index2D-r4-1];
+					SZ_uint64_4D_quantizeOrStore(spaceFillingValue[index], pred, realPrecision,
+							&type[index], &P0[index2D], minValue, byteSize, bytes, exactDataByteArray);
+				}
+			}
+
+			/* the layer just produced becomes the "previous layer" */
+			Pt = P1;
+			P1 = P0;
+			P0 = Pt;
+		}
+	}
+
+	free(P0);
+	free(P1);
+
+	size_t exactDataNum = exactDataByteArray->size;
+
+	TightDataPointStorageI* tdps;
+
+	new_TightDataPointStorageI(&tdps, dataLength, exactDataNum, byteSize,
+			type, exactDataByteArray->array, exactDataByteArray->size,
+			realPrecision, minValue, quantization_intervals, SZ_UINT64);
+
+	//free memory
+	free(type);
+	free(exactDataByteArray); //only the wrapper is freed here; per the original note, ->array is released together with tdps
+
+	return tdps;
+}
+
+/**
+ * Compress a 4D uint64 array (no range check, no gzip stage).
+ *
+ * Runs the 4D quantization pass, flattens the result into *newByteData /
+ * *outSize and, if that stream is larger than the raw array
+ * (r1*r2*r3*r4 * sizeof(uint64_t)), calls the StoreOriData routine instead.
+ */
+void SZ_compress_args_uint64_NoCkRngeNoGzip_4D(unsigned char** newByteData, uint64_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, 
+size_t *outSize, uint64_t valueRangeSize, uint64_t minValue)
+{
+	const size_t numElements = r1*r2*r3*r4;
+	const size_t rawBytes = numElements*sizeof(uint64_t);
+	TightDataPointStorageI* packed = SZ_compress_uint64_4D_MDQ(oriData, r1, r2, r3, r4, realPrecision, valueRangeSize, minValue);
+
+	convertTDPStoFlatBytes_int(packed, newByteData, outSize);
+	if(rawBytes < *outSize)
+	{
+		/* compressed form is bigger than the input: keep the original data */
+		SZ_compress_args_uint64_StoreOriData(oriData, numElements, packed, newByteData, outSize);
+	}
+
+	free_TightDataPointStorageI(packed);
+}
+
+/**
+ * Emit the "all values are the same" representation of a uint64 array.
+ *
+ * Builds a storage object flagged allSameData whose only exact value is
+ * oriData[0] (serialized big-endian into 8 bytes), flattens it into
+ * *newByteData and reports the flattened size through *outSize.
+ */
+void SZ_compress_args_uint64_withinRange(unsigned char** newByteData, uint64_t *oriData, size_t dataLength, size_t *outSize)
+{
+	size_t flatSize;
+	uint64_t representative = oriData[0];
+	TightDataPointStorageI* single = (TightDataPointStorageI*) malloc(sizeof(TightDataPointStorageI));
+
+	/* only the fields set below are initialized on this path */
+	single->typeArray = NULL;
+	single->allSameData = 1;
+	single->isLossless = 0;
+	single->dataSeriesLength = dataLength;
+
+	/* one exact value, stored as 8 bytes */
+	single->exactDataNum = 1;
+	single->exactDataBytes_size = 8;
+	single->exactDataBytes = (unsigned char*)malloc(sizeof(unsigned char)*8);
+	int64ToBytes_bigEndian(single->exactDataBytes, representative);
+
+	convertTDPStoFlatBytes_int(single, newByteData, &flatSize);
+	*outSize = flatSize;
+
+	free_TightDataPointStorageI(single);
+}
+
+/**
+ * Compress a uint64 array with a value-range check but without the gzip stage.
+ *
+ * Unused dimensions are passed as 0 (r1 is always used). When the value range
+ * does not exceed the real precision, the "all same" representation is
+ * written. Otherwise the call is dispatched on the number of non-zero
+ * dimensions; 4D input is handed to the 3D routine with r4*r3 as its first
+ * extent. Nothing is produced when r5 is non-zero.
+ *
+ * @return the status left by getRealPrecision_int (initially SZ_SCES).
+ */
+int SZ_compress_args_uint64_wRngeNoGzip(unsigned char** newByteData, uint64_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio)
+{
+	int status = SZ_SCES;
+	size_t dataLength = computeDataLength(r5,r4,r3,r2,r1);
+	int64_t valueRangeSize = 0;
+
+	uint64_t minValue = computeRangeSize_int(oriData, SZ_UINT64, dataLength, &valueRangeSize);
+	double realPrecision = getRealPrecision_int(valueRangeSize, errBoundMode, absErr_Bound, relBoundRatio, &status);
+
+	if(valueRangeSize <= realPrecision)
+	{
+		SZ_compress_args_uint64_withinRange(newByteData, oriData, dataLength, outSize);
+		return status;
+	}
+
+	/* five-dimensional input is not handled on this path */
+	if(r5!=0)
+		return status;
+
+	if(r4!=0)
+	{
+		/* 4D: the two outer dimensions are merged and the 3D routine is used */
+		SZ_compress_args_uint64_NoCkRngeNoGzip_3D(newByteData, oriData, r4*r3, r2, r1, realPrecision, outSize, valueRangeSize, minValue);
+	}
+	else if(r3!=0)
+	{
+		SZ_compress_args_uint64_NoCkRngeNoGzip_3D(newByteData, oriData, r3, r2, r1, realPrecision, outSize, valueRangeSize, minValue);
+	}
+	else if(r2!=0)
+	{
+		SZ_compress_args_uint64_NoCkRngeNoGzip_2D(newByteData, oriData, r2, r1, realPrecision, outSize, valueRangeSize, minValue);
+	}
+	else
+	{
+		SZ_compress_args_uint64_NoCkRngeNoGzip_1D(newByteData, oriData, r1, realPrecision, outSize, valueRangeSize, minValue);
+	}
+
+	return status;
+}
+
+/**
+ * Top-level uint64 compression entry point.
+ *
+ * Unused dimensions are passed as 0 (r1 is always used). The routine derives
+ * the real precision from the error-bound settings, writes the "all same"
+ * representation when the value range does not exceed that precision, and
+ * otherwise runs the 1D..4D quantization routine followed by the optional
+ * zlib stage selected by confparams_cpr->szMode.
+ *
+ * Error handling:
+ *  - errBoundMode >= PW_REL prints a message and terminates the process via
+ *    exit(0) (unchanged behaviour; the following return is never reached);
+ *  - five-dimensional input returns SZ_DERR with *newByteData = NULL and
+ *    *outSize = 0. Previously an uninitialized buffer pointer was handed to
+ *    the caller or passed to zlib_compress5()/free();
+ *  - an unknown szMode returns SZ_MERR; the intermediate buffer is now freed
+ *    instead of leaked, and the outputs are left untouched.
+ *
+ * @return SZ_SCES (or the status set by getRealPrecision_int), SZ_DERR or SZ_MERR.
+ */
+int SZ_compress_args_uint64(unsigned char** newByteData, uint64_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio)
+{
+	confparams_cpr->errorBoundMode = errBoundMode;
+	
+	if(errBoundMode>=PW_REL)
+	{
+		printf("Error: Current SZ version doesn't support integer data compression with point-wise relative error bound being based on pwrType=AVG\n");
+		exit(0);
+		return SZ_NSCS;
+	}
+	int status = SZ_SCES;
+	size_t dataLength = computeDataLength(r5,r4,r3,r2,r1);
+	int64_t valueRangeSize = 0;
+
+	uint64_t minValue = (uint64_t)computeRangeSize_int(oriData, SZ_UINT64, dataLength, &valueRangeSize);
+	double realPrecision = 0; 
+	
+	if(confparams_cpr->errorBoundMode==PSNR)
+	{
+		/* a PSNR target is converted into an absolute error bound */
+		confparams_cpr->errorBoundMode = ABS;
+		realPrecision = confparams_cpr->absErrBound = computeABSErrBoundFromPSNR(confparams_cpr->psnr, (double)confparams_cpr->predThreshold, (double)valueRangeSize);
+	}
+	else
+		realPrecision = getRealPrecision_int(valueRangeSize, errBoundMode, absErr_Bound, relBoundRatio, &status);
+
+	if(valueRangeSize <= realPrecision)
+	{
+		SZ_compress_args_uint64_withinRange(newByteData, oriData, dataLength, outSize);
+		return status;
+	}
+
+	size_t tmpOutSize = 0;
+	unsigned char* tmpByteData = NULL;
+	if (r2==0)
+	{
+		SZ_compress_args_uint64_NoCkRngeNoGzip_1D(&tmpByteData, oriData, r1, realPrecision, &tmpOutSize, valueRangeSize, minValue);
+	}
+	else if (r3==0)
+	{
+		SZ_compress_args_uint64_NoCkRngeNoGzip_2D(&tmpByteData, oriData, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, minValue);
+	}
+	else if (r4==0)
+	{
+		SZ_compress_args_uint64_NoCkRngeNoGzip_3D(&tmpByteData, oriData, r3, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, minValue);
+	}
+	else if (r5==0)
+	{
+		SZ_compress_args_uint64_NoCkRngeNoGzip_4D(&tmpByteData, oriData, r4, r3, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, minValue);
+	}
+	else
+	{
+		printf("Error: doesn't support 5 dimensions for now.\n");
+		/* nothing was compressed: hand back well-defined outputs and stop here */
+		*newByteData = NULL;
+		*outSize = 0;
+		return SZ_DERR; //dimension error
+	}
+
+	//Call Gzip to do the further compression.
+	if(confparams_cpr->szMode==SZ_BEST_SPEED)
+	{
+		*outSize = tmpOutSize;
+		*newByteData = tmpByteData;
+	}
+	else if(confparams_cpr->szMode==SZ_BEST_COMPRESSION || confparams_cpr->szMode==SZ_DEFAULT_COMPRESSION)
+	{
+		*outSize = zlib_compress5(tmpByteData, tmpOutSize, newByteData, confparams_cpr->gzipMode);
+		free(tmpByteData);
+	}
+	else
+	{
+		printf("Error: Wrong setting of confparams_cpr->szMode in the uint64_t compression.\n");
+		free(tmpByteData); /* the intermediate stream is not returned on this path */
+		status = SZ_MERR; //mode error			
+	}
+	
+	return status;
+}
diff --git a/thirdparty/SZ/sz/src/sz_uint8.c b/thirdparty/SZ/sz/src/sz_uint8.c
new file mode 100644
index 0000000000000000000000000000000000000000..6ca4ae48a3bd7a12f6ea1342ea7ffabc8a9b3f1d
--- /dev/null
+++ b/thirdparty/SZ/sz/src/sz_uint8.c
@@ -0,0 +1,1384 @@
+/**
+ *  @file sz_uint8.c
+ *  @author Sheng Di
+ *  @date Aug, 2017
+ *  @brief sz_uint8, Compression and Decompression functions
+ *  (C) 2017 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <math.h>
+#include "sz.h"
+#include "CompressElement.h"
+#include "DynamicByteArray.h"
+#include "DynamicIntArray.h"
+#include "zlib.h"
+#include "rw.h"
+#include "TightDataPointStorageI.h"
+#include "sz_uint8.h"
+
+/**
+ * Choose the number of quantization intervals for a 1D uint8 array.
+ *
+ * Every element whose index is a multiple of confparams_cpr->sampleDistance
+ * (starting at index 2) is predicted by its left neighbour; the quantization
+ * radius needed for its prediction error is counted in a histogram capped at
+ * maxRangeRadius-1. The smallest radius whose cumulated count exceeds
+ * predThreshold * (dataLength / sampleDistance) determines the result.
+ *
+ * @return 2*(radius+1) passed through roundUpToPowerOf2(), but at least 32.
+ */
+unsigned int optimize_intervals_uint8_1D(uint8_t *oriData, size_t dataLength, double realPrecision)
+{
+	size_t pos, bucket, cut;
+	int64_t predicted = 0, absErr;
+	size_t covered = 0;
+	size_t *histogram = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t));
+	memset(histogram, 0, confparams_cpr->maxRangeRadius*sizeof(size_t));
+	size_t sampleCount = dataLength/confparams_cpr->sampleDistance;
+
+	for(pos=2; pos<dataLength; pos++)
+	{
+		if(pos%confparams_cpr->sampleDistance!=0)
+			continue; /* not a sample point */
+
+		predicted = oriData[pos-1];
+		absErr = llabs(predicted - oriData[pos]);
+		bucket = (uint64_t)((absErr/realPrecision+1)/2);
+		if(bucket>=confparams_cpr->maxRangeRadius)
+			bucket = confparams_cpr->maxRangeRadius - 1;
+		histogram[bucket]++;
+	}
+
+	/* find the radius covering the requested share of the samples */
+	size_t threshold = sampleCount*confparams_cpr->predThreshold;
+	for(cut=0; cut<confparams_cpr->maxRangeRadius; cut++)
+	{
+		covered += histogram[cut];
+		if(covered>threshold)
+			break;
+	}
+	if(cut>=confparams_cpr->maxRangeRadius)
+		cut = confparams_cpr->maxRangeRadius-1;
+
+	unsigned int accIntervals = 2*(cut+1);
+	unsigned int result = roundUpToPowerOf2(accIntervals);
+
+	free(histogram);
+	return result<32 ? 32 : result;
+}
+
+/**
+ * Choose the number of quantization intervals for a 2D uint8 array (r1 x r2).
+ *
+ * Interior elements with (i+j) a multiple of confparams_cpr->sampleDistance
+ * are predicted from their left, upper and upper-left neighbours; the
+ * quantization radius of each prediction error is counted in a histogram
+ * capped at maxRangeRadius-1. The smallest radius whose cumulated count
+ * exceeds predThreshold * ((r1-1)*(r2-1) / sampleDistance) determines the result.
+ *
+ * @return 2*(radius+1) passed through roundUpToPowerOf2(), but at least 32.
+ */
+unsigned int optimize_intervals_uint8_2D(uint8_t *oriData, size_t r1, size_t r2, double realPrecision)
+{
+	size_t row, col, at, bucket, cut;
+	int64_t predicted = 0, absErr;
+	size_t covered = 0;
+	size_t *histogram = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t));
+	memset(histogram, 0, confparams_cpr->maxRangeRadius*sizeof(size_t));
+	size_t sampleCount = (r1-1)*(r2-1)/confparams_cpr->sampleDistance;
+
+	for(row=1; row<r1; row++)
+	{
+		for(col=1; col<r2; col++)
+		{
+			if((row+col)%confparams_cpr->sampleDistance!=0)
+				continue; /* not a sample point */
+
+			at = row*r2+col;
+			predicted = oriData[at-1] + oriData[at-r2] - oriData[at-r2-1];
+			absErr = llabs(predicted - oriData[at]);
+			bucket = (uint64_t)((absErr/realPrecision+1)/2);
+			if(bucket>=confparams_cpr->maxRangeRadius)
+				bucket = confparams_cpr->maxRangeRadius - 1;
+			histogram[bucket]++;
+		}
+	}
+
+	/* find the radius covering the requested share of the samples */
+	size_t threshold = sampleCount*confparams_cpr->predThreshold;
+	for(cut=0; cut<confparams_cpr->maxRangeRadius; cut++)
+	{
+		covered += histogram[cut];
+		if(covered>threshold)
+			break;
+	}
+	if(cut>=confparams_cpr->maxRangeRadius)
+		cut = confparams_cpr->maxRangeRadius-1;
+
+	unsigned int accIntervals = 2*(cut+1);
+	unsigned int result = roundUpToPowerOf2(accIntervals);
+
+	free(histogram);
+	return result<32 ? 32 : result;
+}
+
+/**
+ * Choose the number of quantization intervals for a 3D uint8 array (r1 x r2 x r3).
+ *
+ * Interior elements with (i+j+k) a multiple of confparams_cpr->sampleDistance
+ * are predicted from their seven already-visited neighbours; the quantization
+ * radius of each prediction error is counted in a histogram capped at
+ * maxRangeRadius-1. The smallest radius whose cumulated count exceeds
+ * predThreshold * ((r1-1)*(r2-1)*(r3-1) / sampleDistance) determines the result.
+ *
+ * @return 2*(radius+1) passed through roundUpToPowerOf2(), but at least 32.
+ */
+unsigned int optimize_intervals_uint8_3D(uint8_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision)
+{
+	size_t plane, row, col, at, bucket, cut;
+	size_t r23=r2*r3;
+	int64_t predicted = 0, absErr;
+	size_t covered = 0;
+	size_t *histogram = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t));
+	memset(histogram, 0, confparams_cpr->maxRangeRadius*sizeof(size_t));
+	size_t sampleCount = (r1-1)*(r2-1)*(r3-1)/confparams_cpr->sampleDistance;
+
+	for(plane=1; plane<r1; plane++)
+	{
+		for(row=1; row<r2; row++)
+		{
+			for(col=1; col<r3; col++)
+			{
+				if((plane+row+col)%confparams_cpr->sampleDistance!=0)
+					continue; /* not a sample point */
+
+				at = plane*r23+row*r3+col;
+				predicted = oriData[at-1] + oriData[at-r3] + oriData[at-r23] 
+				- oriData[at-1-r23] - oriData[at-r3-1] - oriData[at-r3-r23] + oriData[at-r3-r23-1];
+				absErr = llabs(predicted - oriData[at]);
+				bucket = (absErr/realPrecision+1)/2;
+				if(bucket>=confparams_cpr->maxRangeRadius)
+				{
+					bucket = confparams_cpr->maxRangeRadius - 1;
+				}
+				histogram[bucket]++;
+			}
+		}
+	}
+
+	/* find the radius covering the requested share of the samples */
+	size_t threshold = sampleCount*confparams_cpr->predThreshold;
+	for(cut=0; cut<confparams_cpr->maxRangeRadius; cut++)
+	{
+		covered += histogram[cut];
+		if(covered>threshold)
+			break;
+	}
+	if(cut>=confparams_cpr->maxRangeRadius)
+		cut = confparams_cpr->maxRangeRadius-1;
+
+	unsigned int accIntervals = 2*(cut+1);
+	unsigned int result = roundUpToPowerOf2(accIntervals);
+
+	free(histogram);
+	return result<32 ? 32 : result;
+}
+
+
+/**
+ * Choose the number of quantization intervals for a 4D uint8 array
+ * (r1 x r2 x r3 x r4, r4 fastest-varying).
+ *
+ * Interior elements with (i+j+k+l) a multiple of confparams_cpr->sampleDistance
+ * are predicted from their neighbours along the three innermost dimensions
+ * (strides 1, r4 and r3*r4); the quantization radius of each prediction error
+ * is counted in a histogram capped at maxRangeRadius-1. The smallest radius
+ * whose cumulated count exceeds
+ * predThreshold * ((r1-1)*(r2-1)*(r3-1)*(r4-1) / sampleDistance) determines
+ * the result.
+ *
+ * Fix: the "previous row" neighbour is now read at index-r4. It used to be
+ * read at index-r3, which is not the row stride of this layout and disagreed
+ * with the other terms of the same predictor.
+ *
+ * @return 2*(radius+1) passed through roundUpToPowerOf2(), but at least 32.
+ */
+unsigned int optimize_intervals_uint8_4D(uint8_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision)
+{
+	size_t i,j,k,l, index;
+	size_t radiusIndex;
+	size_t r234=r2*r3*r4;
+	size_t r34=r3*r4;
+	int64_t pred_value = 0, pred_err;
+	size_t *intervals = (size_t*)malloc(confparams_cpr->maxRangeRadius*sizeof(size_t));
+	memset(intervals, 0, confparams_cpr->maxRangeRadius*sizeof(size_t));
+	size_t totalSampleSize = (r1-1)*(r2-1)*(r3-1)*(r4-1)/confparams_cpr->sampleDistance;
+	for(i=1;i<r1;i++)
+	{
+		for(j=1;j<r2;j++)
+		{
+			for(k=1;k<r3;k++)
+			{
+				for (l=1;l<r4;l++)
+				{
+					if((i+j+k+l)%confparams_cpr->sampleDistance==0)
+					{
+						index = i*r234+j*r34+k*r4+l;
+						/* 3D predictor over the (j,k,l) neighbourhood */
+						pred_value = oriData[index-1] + oriData[index-r4] + oriData[index-r34]
+								- oriData[index-1-r34] - oriData[index-r4-1] - oriData[index-r4-r34] + oriData[index-r4-r34-1];
+						pred_err = llabs(pred_value - oriData[index]);
+						radiusIndex = (uint64_t)((pred_err/realPrecision+1)/2);
+						if(radiusIndex>=confparams_cpr->maxRangeRadius)
+							radiusIndex = confparams_cpr->maxRangeRadius - 1;
+						intervals[radiusIndex]++;
+					}
+				}
+			}
+		}
+	}
+	//compute the appropriate number
+	size_t targetCount = totalSampleSize*confparams_cpr->predThreshold;
+	size_t sum = 0;
+	for(i=0;i<confparams_cpr->maxRangeRadius;i++)
+	{
+		sum += intervals[i];
+		if(sum>targetCount)
+			break;
+	}
+	if(i>=confparams_cpr->maxRangeRadius)
+		i = confparams_cpr->maxRangeRadius-1;
+
+	unsigned int accIntervals = 2*(i+1);
+	unsigned int powerOf2 = roundUpToPowerOf2(accIntervals);
+
+	/* never go below 32 intervals */
+	if(powerOf2<32)
+		powerOf2 = 32;
+
+	free(intervals);
+	return powerOf2;
+}
+
+/**
+ * Quantize a 1D uint8 array.
+ *
+ * The first two elements are always stored exactly. Every later element is
+ * predicted by the most recent value in the history list (the reconstructed
+ * predecessor). If the prediction error is within
+ * (intvCapacity-1)*realPrecision, a quantization code is recorded in the type
+ * array and the reconstructed value, clamped to [SZ_UINT8_MIN, SZ_UINT8_MAX],
+ * is pushed to the history. Otherwise the type is 0 and the exact value is
+ * appended to the exact-data byte array.
+ *
+ * NOTE(review): elements 0 and 1 are accessed unconditionally, so this
+ * presumably expects dataLength >= 2 -- confirm with callers.
+ *
+ * @return a newly created TightDataPointStorageI; the caller releases it.
+ */
+TightDataPointStorageI* SZ_compress_uint8_1D_MDQ(uint8_t *oriData, size_t dataLength, double realPrecision, int64_t valueRangeSize, int64_t minValue)
+{
+	unsigned char bytes[8] = {0,0,0,0,0,0,0,0};
+	int byteSize = computeByteSizePerIntValue(valueRangeSize);
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode!=1)
+		quantization_intervals = exe_params->intvCapacity;
+	else
+		quantization_intervals = optimize_intervals_uint8_1D(oriData, dataLength, realPrecision);
+	updateQuantizationInfo(quantization_intervals);
+
+	int* type = (int*) malloc(dataLength*sizeof(int));
+
+	DynamicByteArray *exactDataByteArray;
+	new_DBA(&exactDataByteArray, DynArrayInitLen);
+
+	int64_t history[3] = {0,0,0};
+	size_t pos;
+
+	/* the first two samples have no usable prediction: keep them exactly */
+	for(pos=0; pos<2; pos++)
+	{
+		type[pos] = 0;
+		compressUInt8Value(oriData[pos], minValue, byteSize, bytes);
+		memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		listAdd_int(history, oriData[pos]);
+	}
+
+	/* computed after updateQuantizationInfo(), as in the original ordering */
+	double checkRadius = (exe_params->intvCapacity-1)*realPrecision;
+	double interval = 2*realPrecision;
+
+	for(pos=2; pos<dataLength; pos++)
+	{
+		int64_t curData = oriData[pos];
+		int64_t pred = history[0];
+		int64_t predAbsErr = llabs(curData - pred);
+
+		if(predAbsErr<=checkRadius)
+		{
+			/* predictable: record the signed interval count around the radius */
+			int state = (predAbsErr/realPrecision+1)/2;
+			if(curData<pred)
+			{
+				type[pos] = exe_params->intvRadius-state;
+				pred = pred - state*interval;
+			}
+			else
+			{
+				type[pos] = exe_params->intvRadius+state;
+				pred = pred + state*interval;
+			}
+			if(pred>SZ_UINT8_MAX) pred = SZ_UINT8_MAX;
+			if(pred<SZ_UINT8_MIN) pred = SZ_UINT8_MIN;
+			listAdd_int(history, pred);
+		}
+		else
+		{
+			/* unpredictable: keep the exact value */
+			type[pos] = 0;
+			compressUInt8Value(curData, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			listAdd_int(history, curData);
+		}
+	}
+
+	size_t exactDataNum = exactDataByteArray->size / byteSize;
+
+	TightDataPointStorageI* tdps;
+
+	new_TightDataPointStorageI(&tdps, dataLength, exactDataNum, byteSize, 
+			type, exactDataByteArray->array, exactDataByteArray->size,  
+			realPrecision, minValue, quantization_intervals, SZ_UINT8);
+
+	/* only the wrapper is freed here; per the original note, ->array is released together with tdps */
+	free(type);
+	free(exactDataByteArray);
+
+	return tdps;
+}
+
+/**
+ * Write the original uint8 data into a freshly allocated output stream and
+ * mark tdps as lossless.
+ *
+ * Layout produced: 3 version bytes, 1 flag byte (16 when SZ_SIZE_TYPE is 4,
+ * otherwise 80), MetaDataByteLength bytes of parameters, SZ_SIZE_TYPE bytes
+ * holding dataLength, then the data bytes. *newByteData is overwritten with
+ * the new buffer and *outSize receives its total length.
+ */
+void SZ_compress_args_uint8_StoreOriData(uint8_t* oriData, size_t dataLength, TightDataPointStorageI* tdps, 
+unsigned char** newByteData, size_t *outSize)
+{
+	int intSize=sizeof(uint8_t);
+	size_t cursor = 0, n;
+	unsigned char dsLengthBytes[8];
+	size_t totalByteLength = 3 + MetaDataByteLength + exe_params->SZ_SIZE_TYPE + 1 + intSize*dataLength;
+
+	tdps->isLossless = 1;
+	*newByteData = (unsigned char*)malloc(totalByteLength);
+	unsigned char* out = *newByteData;
+
+	/* version */
+	for (n = 0; n < 3; n++)
+		out[cursor++] = versionNumber[n];
+
+	/* flag byte: 00010000, plus 01000000 when SZ_SIZE_TYPE is 8 */
+	out[cursor++] = (exe_params->SZ_SIZE_TYPE==4) ? 16 : 80;
+
+	/* compression parameters */
+	convertSZParamsToBytes(confparams_cpr, &(out[cursor]));
+	cursor = cursor + MetaDataByteLength;
+
+	/* element count, SZ_SIZE_TYPE (4 or 8) bytes */
+	sizeToBytes(dsLengthBytes,dataLength);
+	for (n = 0; n < exe_params->SZ_SIZE_TYPE; n++)
+		out[cursor++] = dsLengthBytes[n];
+
+	/* payload */
+	unsigned char* payload = out+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE;
+	if(sysEndianType==BIG_ENDIAN_SYSTEM)
+	{
+		memcpy(payload, oriData, dataLength*intSize);
+	}
+	else
+	{
+		for(n=0; n<dataLength; n++)
+		{
+			*payload = oriData[n];
+			payload += intSize;
+		}
+	}
+
+	*outSize = totalByteLength;
+}
+
+/**
+ * Compress a 1D uint8 array (no range check, no gzip stage).
+ *
+ * Runs the 1D quantization pass, flattens the result into *newByteData /
+ * *outSize and, if that stream is larger than the raw array, calls the
+ * StoreOriData routine with the original data instead.
+ *
+ * Fix: the fallback now passes dataLength. It used to pass dataLength+2,
+ * which made StoreOriData copy two bytes from beyond the end of oriData and
+ * record a length that does not match the input; the 2D variant already
+ * passes the exact length.
+ */
+void SZ_compress_args_uint8_NoCkRngeNoGzip_1D(unsigned char** newByteData, uint8_t *oriData, 
+size_t dataLength, double realPrecision, size_t *outSize, int64_t valueRangeSize, uint8_t minValue)
+{
+	TightDataPointStorageI* tdps = SZ_compress_uint8_1D_MDQ(oriData, dataLength, realPrecision, valueRangeSize, minValue);
+
+	convertTDPStoFlatBytes_int(tdps, newByteData, outSize);
+	if(*outSize > dataLength*sizeof(uint8_t))
+		SZ_compress_args_uint8_StoreOriData(oriData, dataLength, tdps, newByteData, outSize);
+	free_TightDataPointStorageI(tdps);
+}
+
+/**
+ * Encode one sample of the 2D uint8 quantization pass.
+ *
+ * When the residual (actual - predicted), in units of realPrecision, fits
+ * below exe_params->intvCapacity, the quantization code is written to *typeOut
+ * and the value rebuilt from that code, clamped to
+ * [SZ_UINT8_MIN, SZ_UINT8_MAX], is returned. Otherwise *typeOut is 0, the
+ * exact value is appended to exactDataByteArray and returned unchanged.
+ */
+static uint8_t SZ_uint8_2D_quantizeOrStore(uint8_t actual, int64_t predicted, double realPrecision,
+		int *typeOut, int64_t minValue, int byteSize,
+		unsigned char *bytes, DynamicByteArray *exactDataByteArray)
+{
+	int diff = actual - predicted;
+	double itvNum = llabs(diff)/realPrecision + 1;
+
+	if (itvNum < exe_params->intvCapacity)
+	{
+		int64_t rebuilt;
+
+		if (diff < 0) itvNum = -itvNum;
+		*typeOut = (int) (itvNum/2) + exe_params->intvRadius;
+		rebuilt = predicted + 2 * (*typeOut - exe_params->intvRadius) * realPrecision;
+		if(rebuilt >= SZ_UINT8_MIN&&rebuilt<SZ_UINT8_MAX)
+			return rebuilt;
+		if(rebuilt < SZ_UINT8_MIN)
+			return SZ_UINT8_MIN;
+		return SZ_UINT8_MAX;
+	}
+	else
+	{
+		int64_t curValue = actual;
+
+		*typeOut = 0;
+		compressUInt8Value(curValue, minValue, byteSize, bytes);
+		memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		return actual;
+	}
+}
+
+/**
+ * Quantize a 2D uint8 array (r1 rows of r2 elements).
+ *
+ * Row 0 is predicted along the row only (left neighbour for element 1, linear
+ * extrapolation of the two left neighbours afterwards); later rows use the
+ * element above for column 0 and left + above - above-left elsewhere. Two
+ * row buffers hold reconstructed values: one for the previous row and one for
+ * the row being produced; they are swapped after each row.
+ *
+ * NOTE(review): element 1 of row 0 is accessed unconditionally, so this
+ * presumably expects r2 >= 2 -- confirm with callers.
+ *
+ * @return a newly created TightDataPointStorageI; the caller releases it.
+ */
+TightDataPointStorageI* SZ_compress_uint8_2D_MDQ(uint8_t *oriData, size_t r1, size_t r2, double realPrecision, int64_t valueRangeSize, int64_t minValue)
+{
+	unsigned char bytes[8] = {0,0,0,0,0,0,0,0};
+	int byteSize = computeByteSizePerIntValue(valueRangeSize);
+
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_uint8_2D(oriData, r1, r2, realPrecision);
+		updateQuantizationInfo(quantization_intervals);
+	}
+	else
+		quantization_intervals = exe_params->intvCapacity;
+
+	size_t row, col, index;
+	int64_t pred, curValue;
+	uint8_t *rowCur, *rowAbove, *rowSwap;
+
+	size_t dataLength = r1*r2;
+
+	rowCur = (uint8_t*)malloc(r2*sizeof(uint8_t));
+	memset(rowCur, 0, r2*sizeof(uint8_t));
+	rowAbove = (uint8_t*)malloc(r2*sizeof(uint8_t));
+	memset(rowAbove, 0, r2*sizeof(uint8_t));
+
+	int* type = (int*) malloc(dataLength*sizeof(int));
+
+	DynamicByteArray *exactDataByteArray;
+	new_DBA(&exactDataByteArray, DynArrayInitLen);
+
+	/* Row-0 data 0: always stored exactly */
+	type[0] = 0;
+	curValue = rowAbove[0] = oriData[0];
+	compressUInt8Value(curValue, minValue, byteSize, bytes);
+	memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+
+	/* Row-0 data 1: predicted by its left neighbour */
+	pred = rowAbove[0];
+	rowAbove[1] = SZ_uint8_2D_quantizeOrStore(oriData[1], pred, realPrecision,
+			&type[1], minValue, byteSize, bytes, exactDataByteArray);
+
+	/* Row-0 data 2 --> data r2-1 */
+	for (col = 2; col < r2; col++)
+	{
+		pred = 2*rowAbove[col-1] - rowAbove[col-2];
+		rowAbove[col] = SZ_uint8_2D_quantizeOrStore(oriData[col], pred, realPrecision,
+				&type[col], minValue, byteSize, bytes, exactDataByteArray);
+	}
+
+	/* Row-1 --> Row-r1-1 */
+	for (row = 1; row < r1; row++)
+	{
+		/* row data 0: predicted by the element above */
+		index = row*r2;
+		pred = rowAbove[0];
+		rowCur[0] = SZ_uint8_2D_quantizeOrStore(oriData[index], pred, realPrecision,
+				&type[index], minValue, byteSize, bytes, exactDataByteArray);
+
+		/* row data 1 --> r2-1: left + above - above-left */
+		for (col = 1; col < r2; col++)
+		{
+			index = row*r2+col;
+			pred = rowCur[col-1] + rowAbove[col] - rowAbove[col-1];
+			rowCur[col] = SZ_uint8_2D_quantizeOrStore(oriData[index], pred, realPrecision,
+					&type[index], minValue, byteSize, bytes, exactDataByteArray);
+		}
+
+		/* the row just produced becomes the "row above" */
+		rowSwap = rowAbove;
+		rowAbove = rowCur;
+		rowCur = rowSwap;
+	}
+
+	if(r2!=1)
+		free(rowCur);
+	free(rowAbove);
+
+	size_t exactDataNum = exactDataByteArray->size;
+
+	TightDataPointStorageI* tdps;
+
+	new_TightDataPointStorageI(&tdps, dataLength, exactDataNum, byteSize, 
+			type, exactDataByteArray->array, exactDataByteArray->size,  
+			realPrecision, minValue, quantization_intervals, SZ_UINT8);
+
+	/* only the wrapper is freed here; per the original note, ->array is released together with tdps */
+	free(type);
+	free(exactDataByteArray);
+
+	return tdps;
+}
+
+/**
+ * Compress a 2D uint8 array (no range check, no gzip stage).
+ *
+ * Runs the 2D quantization pass, flattens the result into *newByteData /
+ * *outSize and, if that stream is larger than the raw array
+ * (r1*r2 * sizeof(uint8_t)), calls the StoreOriData routine instead.
+ *
+ * Note: @r1 is high dimension
+ * 		 @r2 is low dimension 
+ * */
+void SZ_compress_args_uint8_NoCkRngeNoGzip_2D(unsigned char** newByteData, uint8_t *oriData, size_t r1, size_t r2, double realPrecision, size_t *outSize, 
+int64_t valueRangeSize, uint8_t minValue)
+{
+	const size_t numElements = r1*r2;
+	const size_t rawBytes = numElements*sizeof(uint8_t);
+	TightDataPointStorageI* packed = SZ_compress_uint8_2D_MDQ(oriData, r1, r2, realPrecision, valueRangeSize, minValue);
+
+	convertTDPStoFlatBytes_int(packed, newByteData, outSize);
+	if(rawBytes < *outSize)
+	{
+		/* compressed form is bigger than the input: keep the original data */
+		SZ_compress_args_uint8_StoreOriData(oriData, numElements, packed, newByteData, outSize);
+	}
+
+	free_TightDataPointStorageI(packed);
+}
+
+/**
+ * Quantizes a 3D uint8_t array (r1 x r2 x r3, r3 varying fastest) against
+ * predictions computed from previously reconstructed neighbours.
+ *
+ * For every sample the prediction error is expressed as a number of intervals
+ * of width 2*realPrecision. If that number is below exe_params->intvCapacity
+ * the interval code is written to type[] and the reconstructed (clamped)
+ * value is kept for later predictions; otherwise type[] is set to 0 and the
+ * sample itself is appended to the exact-data byte array through
+ * compressUInt8Value / memcpyDBA_Data.
+ *
+ * Two r2*r3-byte buffers hold reconstructed values: P1 is the previous layer,
+ * P0 the layer being built; they are swapped after each layer k >= 1.
+ *
+ * @param oriData        input samples, read at indices 0 .. r1*r2*r3-1
+ * @param r1             number of layers (slowest dimension)
+ * @param r2             rows per layer
+ * @param r3             samples per row (fastest dimension)
+ * @param realPrecision  half of the quantization interval width
+ * @param valueRangeSize passed to computeByteSizePerIntValue to size exact values
+ * @param minValue       forwarded to compressUInt8Value for exact values
+ * @return the storage object filled in by new_TightDataPointStorageI
+ *
+ * NOTE(review): element 1 of the first row is handled before any dimension
+ * check (oriData[1], type[1] and P1[1] are accessed unconditionally), so the
+ * routine presumably expects r3 >= 2 - confirm against callers.
+ */
+TightDataPointStorageI* SZ_compress_uint8_3D_MDQ(uint8_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, int64_t valueRangeSize, int64_t minValue)
+{
+	unsigned char bytes[8] = {0,0,0,0,0,0,0,0};
+	int byteSize = computeByteSizePerIntValue(valueRangeSize);
+	
+	/* Either re-derive the interval count from the data or use the configured one. */
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_uint8_3D(oriData, r1, r2, r3, realPrecision);
+		updateQuantizationInfo(quantization_intervals);
+	}	
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	size_t i,j,k; 
+	int64_t pred1D, pred2D, pred3D, curValue, tmp;
+	int diff = 0;
+	double itvNum = 0;
+	uint8_t *P0, *P1;
+		
+	size_t dataLength = r1*r2*r3;		
+
+	size_t r23 = r2*r3;
+	P0 = (uint8_t*)malloc(r23*sizeof(uint8_t));
+	P1 = (uint8_t*)malloc(r23*sizeof(uint8_t));
+
+	int* type = (int*) malloc(dataLength*sizeof(int));
+
+	uint8_t* spaceFillingValue = oriData; //
+	
+	DynamicByteArray *exactDataByteArray;
+	new_DBA(&exactDataByteArray, DynArrayInitLen);	
+
+	///////////////////////////	Process layer-0 ///////////////////////////
+	/* Row-0 data 0: nothing to predict from, always stored exactly. */
+	type[0] = 0;
+	P1[0] = spaceFillingValue[0];
+	compressUInt8Value(spaceFillingValue[0], minValue, byteSize, bytes);
+	memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+
+	/* Process Row-0 data 1: predicted by its left neighbour */
+	pred1D = P1[0];
+	diff = spaceFillingValue[1] - pred1D;
+
+	itvNum = llabs(diff)/realPrecision + 1;
+
+	if (itvNum < exe_params->intvCapacity)
+	{
+		if (diff < 0) itvNum = -itvNum;
+		type[1] = (int) (itvNum/2) + exe_params->intvRadius;
+		tmp = pred1D + 2 * (type[1] - exe_params->intvRadius) * realPrecision;
+		/* keep the reconstructed value inside the uint8_t range */
+		if(tmp >= SZ_UINT8_MIN&&tmp<SZ_UINT8_MAX)
+			P1[1] = tmp;
+		else if(tmp < SZ_UINT8_MIN)
+			P1[1] = SZ_UINT8_MIN;
+		else
+			P1[1] = SZ_UINT8_MAX;		
+	}
+	else
+	{
+		type[1] = 0;
+		curValue = P1[1] = spaceFillingValue[1];
+		compressUInt8Value(curValue, minValue, byteSize, bytes);
+		memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+	}
+
+    /* Process Row-0 data 2 --> data r3-1: linear extrapolation of the two left neighbours */
+	for (j = 2; j < r3; j++)
+	{
+		pred1D = 2*P1[j-1] - P1[j-2];
+		diff = spaceFillingValue[j] - pred1D;
+
+		itvNum = llabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[j] = (int) (itvNum/2) + exe_params->intvRadius;
+			tmp = pred1D + 2 * (type[j] - exe_params->intvRadius) * realPrecision;
+			if(tmp >= SZ_UINT8_MIN&&tmp<SZ_UINT8_MAX)
+				P1[j] = tmp;
+			else if(tmp < SZ_UINT8_MIN)
+				P1[j] = SZ_UINT8_MIN;
+			else
+				P1[j] = SZ_UINT8_MAX;			
+		}
+		else
+		{
+			type[j] = 0;
+			curValue = P1[j] = spaceFillingValue[j];
+			compressUInt8Value(curValue, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		}
+	}
+
+	/* Process Row-1 --> Row-r2-1 */
+	size_t index;
+	for (i = 1; i < r2; i++)
+	{
+		/* Process row-i data 0: predicted by the sample directly above it */
+		index = i*r3;	
+		pred1D = P1[index-r3];
+		diff = spaceFillingValue[index] - pred1D;
+
+		itvNum = llabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+			tmp = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			if(tmp >= SZ_UINT8_MIN&&tmp<SZ_UINT8_MAX)
+				P1[index] = tmp;
+			else if(tmp < SZ_UINT8_MIN)
+				P1[index] = SZ_UINT8_MIN;
+			else
+				P1[index] = SZ_UINT8_MAX;			
+		}
+		else
+		{
+			type[index] = 0;
+			curValue = P1[index] = spaceFillingValue[index];
+			compressUInt8Value(curValue, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		}
+
+		/* Process row-i data 1 --> data r3-1: left + above - above-left */
+		for (j = 1; j < r3; j++)
+		{
+			index = i*r3+j;
+			pred2D = P1[index-1] + P1[index-r3] - P1[index-r3-1];
+
+			diff = spaceFillingValue[index] - pred2D;
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				tmp = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_UINT8_MIN&&tmp<SZ_UINT8_MAX)
+					P1[index] = tmp;
+				else if(tmp < SZ_UINT8_MIN)
+					P1[index] = SZ_UINT8_MIN;
+				else
+					P1[index] = SZ_UINT8_MAX;				
+			}
+			else
+			{
+				type[index] = 0;
+				curValue = P1[index] = spaceFillingValue[index];
+				compressUInt8Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+		}
+	}
+
+
+	///////////////////////////	Process layer-1 --> layer-r1-1 ///////////////////////////
+
+	for (k = 1; k < r1; k++)
+	{
+		/* Process Row-0 data 0: predicted by the same position in the previous layer */
+		index = k*r23;
+		pred1D = P1[0];
+		diff = spaceFillingValue[index] - pred1D;
+
+		itvNum = llabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+			tmp = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			if(tmp >= SZ_UINT8_MIN&&tmp<SZ_UINT8_MAX)
+				P0[0] = tmp;
+			else if(tmp < SZ_UINT8_MIN)
+				P0[0] = SZ_UINT8_MIN;
+			else
+				P0[0] = SZ_UINT8_MAX;
+		}
+		else
+		{
+			type[index] = 0;
+			curValue = P0[0] = spaceFillingValue[index];
+			compressUInt8Value(curValue, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		}
+
+
+	    /* Process Row-0 data 1 --> data r3-1: left (this layer) + same/left positions of previous layer */
+		for (j = 1; j < r3; j++)
+		{
+			/* index == k*r2*r3+j, advanced incrementally */
+			index ++;
+			pred2D = P0[j-1] + P1[j] - P1[j-1];
+			diff = spaceFillingValue[index] - pred2D;
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				tmp = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_UINT8_MIN&&tmp<SZ_UINT8_MAX)
+					P0[j] = tmp;
+				else if(tmp < SZ_UINT8_MIN)
+					P0[j] = SZ_UINT8_MIN;
+				else
+					P0[j] = SZ_UINT8_MAX;				
+			}
+			else
+			{
+				type[index] = 0;
+				curValue = P0[j] = spaceFillingValue[index];
+				compressUInt8Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+		}
+
+	    /* Process Row-1 --> Row-r2-1 */
+		size_t index2D;
+		for (i = 1; i < r2; i++)
+		{
+			/* Process Row-i data 0: above (this layer) + same/above positions of previous layer */
+			index = k*r23 + i*r3;
+			index2D = i*r3;		
+			pred2D = P0[index2D-r3] + P1[index2D] - P1[index2D-r3];
+			diff = spaceFillingValue[index] - pred2D;
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				tmp = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_UINT8_MIN&&tmp<SZ_UINT8_MAX)
+					P0[index2D] = tmp;
+				else if(tmp < SZ_UINT8_MIN)
+					P0[index2D] = SZ_UINT8_MIN;
+				else
+					P0[index2D] = SZ_UINT8_MAX;
+			}
+			else
+			{
+				type[index] = 0;
+				curValue = P0[index2D] = spaceFillingValue[index];
+				compressUInt8Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+
+			/* Process Row-i data 1 --> data r3-1: seven-neighbour 3D prediction */
+			for (j = 1; j < r3; j++)
+			{
+				/* index == k*r2*r3 + i*r3 + j, advanced incrementally */
+				index ++;
+				index2D = i*r3 + j;
+				pred3D = P0[index2D-1] + P0[index2D-r3]+ P1[index2D] - P0[index2D-r3-1] - P1[index2D-r3] - P1[index2D-1] + P1[index2D-r3-1];
+				diff = spaceFillingValue[index] - pred3D;
+
+				itvNum = llabs(diff)/realPrecision + 1;
+
+				if (itvNum < exe_params->intvCapacity)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+					tmp = pred3D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+					if(tmp >= SZ_UINT8_MIN&&tmp<SZ_UINT8_MAX)
+						P0[index2D] = tmp;
+					else if(tmp < SZ_UINT8_MIN)
+						P0[index2D] = SZ_UINT8_MIN;
+					else
+						P0[index2D] = SZ_UINT8_MAX;
+				}
+				else
+				{
+					type[index] = 0;
+					curValue = P0[index2D] = spaceFillingValue[index];
+					compressUInt8Value(curValue, minValue, byteSize, bytes);
+					memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+				}
+			}
+		}
+
+		/* The layer just built becomes the "previous layer" of the next iteration. */
+		uint8_t *Pt;
+		Pt = P1;
+		P1 = P0;
+		P0 = Pt;
+	}
+	/* Both layer buffers were malloc'ed above and are owned by this function,
+	   whichever way the swaps left them, so both are always released. */
+	free(P0);
+	free(P1);
+
+	size_t exactDataNum = exactDataByteArray->size;
+	
+	TightDataPointStorageI* tdps;	
+			
+	new_TightDataPointStorageI(&tdps, dataLength, exactDataNum, byteSize, 
+			type, exactDataByteArray->array, exactDataByteArray->size,  
+			realPrecision, minValue, quantization_intervals, SZ_UINT8);
+			
+	//free memory
+	free(type);	
+	/* Only the wrapper struct is freed here; exactDataByteArray->array was
+	   handed to new_TightDataPointStorageI above and is not released in this function. */
+	free(exactDataByteArray);
+	
+	return tdps;	
+}
+
+
+/**
+ * Compresses a 3D uint8_t array (r1 x r2 x r3) into a flat byte stream.
+ *
+ * The representation returned by SZ_compress_uint8_3D_MDQ is flattened into
+ * *newByteData / *outSize. When that flat encoding is larger than the raw
+ * input (r1*r2*r3 bytes), SZ_compress_args_uint8_StoreOriData is called on
+ * the same output arguments instead.
+ */
+void SZ_compress_args_uint8_NoCkRngeNoGzip_3D(unsigned char** newByteData, uint8_t *oriData, size_t r1, size_t r2, size_t r3, double realPrecision, size_t *outSize, 
+int64_t valueRangeSize, int64_t minValue)
+{
+	size_t elemCount = r1*r2*r3;
+	TightDataPointStorageI* packed = SZ_compress_uint8_3D_MDQ(oriData, r1, r2, r3, realPrecision, valueRangeSize, minValue);
+
+	convertTDPStoFlatBytes_int(packed, newByteData, outSize);
+
+	/* The flat encoding did not beat the raw size: take the store-original path. */
+	if(elemCount*sizeof(uint8_t) < *outSize)
+		SZ_compress_args_uint8_StoreOriData(oriData, elemCount, packed, newByteData, outSize);
+
+	free_TightDataPointStorageI(packed);
+}
+
+
+/**
+ * Quantizes a 4D uint8_t array (r1 x r2 x r3 x r4, r4 varying fastest) against
+ * predictions computed from previously reconstructed neighbours.
+ *
+ * The array is processed as r1 independent 3D blocks of r2*r3*r4 samples:
+ * predictions never reach across the outermost index l. Inside a block the
+ * scheme is the same as in SZ_compress_uint8_3D_MDQ: the prediction error is
+ * expressed as a number of intervals of width 2*realPrecision; if it is below
+ * exe_params->intvCapacity the interval code goes to type[], otherwise
+ * type[] is 0 and the sample itself is appended to the exact-data byte array.
+ *
+ * Two r3*r4-byte buffers hold reconstructed values: P1 is the previous layer,
+ * P0 the layer being built; they are swapped after each layer k >= 1.
+ *
+ * @param oriData        input samples, read at indices 0 .. r1*r2*r3*r4-1
+ * @param r1             number of independent 3D blocks (slowest dimension)
+ * @param r2             layers per block
+ * @param r3             rows per layer
+ * @param r4             samples per row (fastest dimension)
+ * @param realPrecision  half of the quantization interval width
+ * @param valueRangeSize passed to computeByteSizePerIntValue to size exact values
+ * @param minValue       forwarded to compressUInt8Value for exact values
+ * @return the storage object filled in by new_TightDataPointStorageI
+ *
+ * NOTE(review): element 1 of each block's first row is handled before any
+ * dimension check, so the routine presumably expects r4 >= 2 - confirm.
+ */
+TightDataPointStorageI* SZ_compress_uint8_4D_MDQ(uint8_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, int64_t valueRangeSize, int64_t minValue)
+{
+	unsigned char bytes[8] = {0,0,0,0,0,0,0,0};
+	int byteSize = computeByteSizePerIntValue(valueRangeSize);
+	
+	/* Either re-derive the interval count from the data or use the configured one. */
+	unsigned int quantization_intervals;
+	if(exe_params->optQuantMode==1)
+	{
+		quantization_intervals = optimize_intervals_uint8_4D(oriData, r1, r2, r3, r4, realPrecision);
+		updateQuantizationInfo(quantization_intervals);
+	}	
+	else
+		quantization_intervals = exe_params->intvCapacity;
+	size_t i,j,k; 
+	int64_t pred1D, pred2D, pred3D, curValue, tmp;
+	int diff = 0;
+	double itvNum = 0;
+	uint8_t *P0, *P1;
+		
+	size_t dataLength = r1*r2*r3*r4;		
+
+	size_t r234 = r2*r3*r4;
+	size_t r34 = r3*r4;
+
+	P0 = (uint8_t*)malloc(r34*sizeof(uint8_t));
+	P1 = (uint8_t*)malloc(r34*sizeof(uint8_t));
+	
+	int* type = (int*) malloc(dataLength*sizeof(int));
+
+	uint8_t* spaceFillingValue = oriData; //
+	
+	DynamicByteArray *exactDataByteArray;
+	new_DBA(&exactDataByteArray, DynArrayInitLen);	
+
+	size_t l;
+	for (l = 0; l < r1; l++)
+	{
+
+		///////////////////////////	Process layer-0 ///////////////////////////
+		/* Process Row-0 data 0: nothing to predict from, always stored exactly */
+		size_t index = l*r234;
+		size_t index2D = 0;
+
+		type[index] = 0;
+		curValue = P1[index2D] = spaceFillingValue[index];
+		compressUInt8Value(curValue, minValue, byteSize, bytes);
+		memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+
+		/* Process Row-0 data 1: predicted by its left neighbour */
+		index = l*r234+1;
+		index2D = 1;
+
+		pred1D = P1[index2D-1];
+		/* The residual is taken on the sample at `index`; curValue still holds
+		   the block's element 0, which equals the prediction. */
+		diff = spaceFillingValue[index] - pred1D;
+
+		itvNum = llabs(diff)/realPrecision + 1;
+
+		if (itvNum < exe_params->intvCapacity)
+		{
+			if (diff < 0) itvNum = -itvNum;
+			type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+			tmp = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+			/* keep the reconstructed value inside the uint8_t range */
+			if(tmp >= SZ_UINT8_MIN&&tmp<SZ_UINT8_MAX)
+				P1[index2D] = tmp;
+			else if(tmp < SZ_UINT8_MIN)
+				P1[index2D] = SZ_UINT8_MIN;
+			else
+				P1[index2D] = SZ_UINT8_MAX;			
+		}
+		else
+		{
+			type[index] = 0;
+
+			/* The exact value recorded is the sample being processed (same in
+			   every unpredictable branch below). */
+			curValue = P1[index2D] = spaceFillingValue[index];
+			compressUInt8Value(curValue, minValue, byteSize, bytes);
+			memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+		}
+
+		/* Process Row-0 data 2 --> data r4-1: linear extrapolation of the two left neighbours */
+		for (j = 2; j < r4; j++)
+		{
+			index = l*r234+j;
+			index2D = j;
+
+			pred1D = 2*P1[index2D-1] - P1[index2D-2];
+			diff = spaceFillingValue[index] - pred1D;
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				tmp = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_UINT8_MIN&&tmp<SZ_UINT8_MAX)
+					P1[index2D] = tmp;
+				else if(tmp < SZ_UINT8_MIN)
+					P1[index2D] = SZ_UINT8_MIN;
+				else
+					P1[index2D] = SZ_UINT8_MAX;					
+			}
+			else
+			{
+				type[index] = 0;
+
+				curValue = P1[index2D] = spaceFillingValue[index];
+				compressUInt8Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+		}
+
+		/* Process Row-1 --> Row-r3-1 */
+		for (i = 1; i < r3; i++)
+		{
+			/* Process row-i data 0: predicted by the sample directly above it */
+			index = l*r234+i*r4;
+			index2D = i*r4;
+
+			pred1D = P1[index2D-r4];
+			diff = spaceFillingValue[index] - pred1D;
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				tmp = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_UINT8_MIN&&tmp<SZ_UINT8_MAX)
+					P1[index2D] = tmp;
+				else if(tmp < SZ_UINT8_MIN)
+					P1[index2D] = SZ_UINT8_MIN;
+				else
+					P1[index2D] = SZ_UINT8_MAX;					
+			}
+			else
+			{
+				type[index] = 0;
+
+				curValue = P1[index2D] = spaceFillingValue[index];
+				compressUInt8Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+
+			/* Process row-i data 1 --> data r4-1: left + above - above-left */
+			for (j = 1; j < r4; j++)
+			{
+				index = l*r234+i*r4+j;
+				index2D = i*r4+j;
+
+				pred2D = P1[index2D-1] + P1[index2D-r4] - P1[index2D-r4-1];
+
+				diff = spaceFillingValue[index] - pred2D;
+
+				itvNum = llabs(diff)/realPrecision + 1;
+
+				if (itvNum < exe_params->intvCapacity)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+					tmp = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+					if(tmp >= SZ_UINT8_MIN&&tmp<SZ_UINT8_MAX)
+						P1[index2D] = tmp;
+					else if(tmp < SZ_UINT8_MIN)
+						P1[index2D] = SZ_UINT8_MIN;
+					else
+						P1[index2D] = SZ_UINT8_MAX;						
+				}
+				else
+				{
+					type[index] = 0;
+
+					curValue = P1[index2D] = spaceFillingValue[index];
+					compressUInt8Value(curValue, minValue, byteSize, bytes);
+					memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+				}
+			}
+		}
+
+
+		///////////////////////////	Process layer-1 --> layer-r2-1 ///////////////////////////
+
+		for (k = 1; k < r2; k++)
+		{
+			/* Process Row-0 data 0: predicted by the same position in the previous layer */
+			index = l*r234+k*r34;
+			index2D = 0;
+
+			pred1D = P1[index2D];
+			diff = spaceFillingValue[index] - pred1D;
+
+			itvNum = llabs(diff)/realPrecision + 1;
+
+			if (itvNum < exe_params->intvCapacity)
+			{
+				if (diff < 0) itvNum = -itvNum;
+				type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+				tmp = pred1D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_UINT8_MIN&&tmp<SZ_UINT8_MAX)
+					P0[index2D] = tmp;
+				else if(tmp < SZ_UINT8_MIN)
+					P0[index2D] = SZ_UINT8_MIN;
+				else
+					P0[index2D] = SZ_UINT8_MAX;					
+			}
+			else
+			{
+				type[index] = 0;
+
+				curValue = P0[index2D] = spaceFillingValue[index];
+				compressUInt8Value(curValue, minValue, byteSize, bytes);
+				memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+			}
+
+			/* Process Row-0 data 1 --> data r4-1: left (this layer) + same/left positions of previous layer */
+			for (j = 1; j < r4; j++)
+			{
+				index = l*r234+k*r34+j;
+				index2D = j;
+
+				pred2D = P0[index2D-1] + P1[index2D] - P1[index2D-1];
+				diff = spaceFillingValue[index] - pred2D;
+
+				itvNum = llabs(diff)/realPrecision + 1;
+
+				if (itvNum < exe_params->intvCapacity)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+					tmp = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+					if(tmp >= SZ_UINT8_MIN&&tmp<SZ_UINT8_MAX)
+						P0[index2D] = tmp;
+					else if(tmp < SZ_UINT8_MIN)
+						P0[index2D] = SZ_UINT8_MIN;
+					else
+						P0[index2D] = SZ_UINT8_MAX;						
+				}
+				else
+				{
+					type[index] = 0;
+
+					curValue = P0[index2D] = spaceFillingValue[index];
+					compressUInt8Value(curValue, minValue, byteSize, bytes);
+					memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+				}
+			}
+
+			/* Process Row-1 --> Row-r3-1 */
+			for (i = 1; i < r3; i++)
+			{
+				/* Process Row-i data 0: above (this layer) + same/above positions of previous layer */
+				index = l*r234+k*r34+i*r4;
+				index2D = i*r4;
+
+				pred2D = P0[index2D-r4] + P1[index2D] - P1[index2D-r4];
+				diff = spaceFillingValue[index] - pred2D;
+
+				itvNum = llabs(diff)/realPrecision + 1;
+
+				if (itvNum < exe_params->intvCapacity)
+				{
+					if (diff < 0) itvNum = -itvNum;
+					type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+					tmp = pred2D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+					if(tmp >= SZ_UINT8_MIN&&tmp<SZ_UINT8_MAX)
+						P0[index2D] = tmp;
+					else if(tmp < SZ_UINT8_MIN)
+						P0[index2D] = SZ_UINT8_MIN;
+					else
+						P0[index2D] = SZ_UINT8_MAX;						
+				}
+				else
+				{
+					type[index] = 0;
+
+					curValue = P0[index2D] = spaceFillingValue[index];
+					compressUInt8Value(curValue, minValue, byteSize, bytes);
+					memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+				}
+
+				/* Process Row-i data 1 --> data r4-1: seven-neighbour 3D prediction */
+				for (j = 1; j < r4; j++)
+				{
+					index = l*r234+k*r34+i*r4+j;
+					index2D = i*r4+j;
+
+					pred3D = P0[index2D-1] + P0[index2D-r4]+ P1[index2D] - P0[index2D-r4-1] - P1[index2D-r4] - P1[index2D-1] + P1[index2D-r4-1];
+					diff = spaceFillingValue[index] - pred3D;
+
+
+					itvNum = llabs(diff)/realPrecision + 1;
+
+					if (itvNum < exe_params->intvCapacity)
+					{
+						if (diff < 0) itvNum = -itvNum;
+						type[index] = (int) (itvNum/2) + exe_params->intvRadius;
+						tmp = pred3D + 2 * (type[index] - exe_params->intvRadius) * realPrecision;
+						if(tmp >= SZ_UINT8_MIN&&tmp<SZ_UINT8_MAX)
+							P0[index2D] = tmp;
+						else if(tmp < SZ_UINT8_MIN)
+							P0[index2D] = SZ_UINT8_MIN;
+						else
+							P0[index2D] = SZ_UINT8_MAX;							
+					}
+					else
+					{
+						type[index] = 0;
+
+						curValue = P0[index2D] = spaceFillingValue[index];
+						compressUInt8Value(curValue, minValue, byteSize, bytes);
+						memcpyDBA_Data(exactDataByteArray, bytes, byteSize);
+					}
+				}
+			}
+
+			/* The layer just built becomes the "previous layer" of the next iteration. */
+			uint8_t *Pt;
+			Pt = P1;
+			P1 = P0;
+			P0 = Pt;
+		}
+	}
+
+	free(P0);
+	free(P1);
+
+	size_t exactDataNum = exactDataByteArray->size;
+	
+	TightDataPointStorageI* tdps;	
+			
+	new_TightDataPointStorageI(&tdps, dataLength, exactDataNum, byteSize, 
+			type, exactDataByteArray->array, exactDataByteArray->size,  
+			realPrecision, minValue, quantization_intervals, SZ_UINT8);
+			
+	//free memory
+	free(type);	
+	/* Only the wrapper struct is freed here; exactDataByteArray->array was
+	   handed to new_TightDataPointStorageI above and is not released in this function. */
+	free(exactDataByteArray);
+	
+	return tdps;	
+}
+
+/**
+ * Compresses a 4D uint8_t array (r1 x r2 x r3 x r4) into a flat byte stream.
+ *
+ * The representation returned by SZ_compress_uint8_4D_MDQ is flattened into
+ * *newByteData / *outSize. When that flat encoding is larger than the raw
+ * input (r1*r2*r3*r4 bytes), SZ_compress_args_uint8_StoreOriData is called on
+ * the same output arguments instead.
+ */
+void SZ_compress_args_uint8_NoCkRngeNoGzip_4D(unsigned char** newByteData, uint8_t *oriData, size_t r1, size_t r2, size_t r3, size_t r4, double realPrecision, 
+size_t *outSize, int64_t valueRangeSize, int64_t minValue)
+{
+	size_t elemCount = r1*r2*r3*r4;
+	TightDataPointStorageI* packed = SZ_compress_uint8_4D_MDQ(oriData, r1, r2, r3, r4, realPrecision, valueRangeSize, minValue);
+
+	convertTDPStoFlatBytes_int(packed, newByteData, outSize);
+
+	/* The flat encoding did not beat the raw size: take the store-original path. */
+	if(elemCount*sizeof(uint8_t) < *outSize)
+		SZ_compress_args_uint8_StoreOriData(oriData, elemCount, packed, newByteData, outSize);
+
+	free_TightDataPointStorageI(packed);
+}
+
+/**
+ * Encodes a uint8_t array by a single representative value.
+ *
+ * A temporary storage object is marked allSameData, given a one-byte exact
+ * data buffer holding oriData[0], flattened with convertTDPStoFlatBytes_int
+ * and then released. Only the first sample of oriData is read.
+ *
+ * @param newByteData receives the flat byte stream
+ * @param oriData     input samples (only element 0 is used)
+ * @param dataLength  recorded as the series length
+ * @param outSize     receives the size reported by convertTDPStoFlatBytes_int
+ */
+void SZ_compress_args_uint8_withinRange(unsigned char** newByteData, uint8_t *oriData, size_t dataLength, size_t *outSize)
+{
+	TightDataPointStorageI* constStore = (TightDataPointStorageI*) malloc(sizeof(TightDataPointStorageI));
+	size_t flatSize;
+
+	constStore->typeArray = NULL;
+	constStore->allSameData = 1;
+	constStore->dataSeriesLength = dataLength;
+	constStore->isLossless = 0;
+
+	/* One exact value of one byte: the first sample stands in for the whole series. */
+	constStore->exactDataBytes = (unsigned char*)malloc(sizeof(unsigned char));
+	constStore->exactDataNum = 1;
+	constStore->exactDataBytes_size = 1;
+	constStore->exactDataBytes[0] = oriData[0];
+
+	convertTDPStoFlatBytes_int(constStore, newByteData, &flatSize);
+	*outSize = flatSize;
+
+	free_TightDataPointStorageI(constStore);
+}
+
+/**
+ * Compresses a uint8_t array of up to four dimensions without the trailing
+ * zlib pass, after checking the value range against the error bound.
+ *
+ * Unused leading dimensions are passed as 0 (r1 is the fastest-varying one).
+ * If the value range does not exceed the real precision the whole array is
+ * encoded by SZ_compress_args_uint8_withinRange; otherwise the 1D/2D/3D
+ * routine matching the dimensionality is used. A 4D input is processed by
+ * the 3D routine with its two slowest dimensions merged (r4*r3).
+ *
+ * @param newByteData  receives the compressed byte stream
+ * @param oriData      input samples
+ * @param r5..r1       dimensions, r5 slowest; 0 marks an unused dimension
+ * @param outSize      receives the compressed size
+ * @param errBoundMode,absErr_Bound,relBoundRatio  forwarded to getRealPrecision_int
+ * @return SZ_SCES unless getRealPrecision_int changes the status, or SZ_DERR
+ *         for a 5-dimensional request (outputs are left untouched then)
+ */
+int SZ_compress_args_uint8_wRngeNoGzip(unsigned char** newByteData, uint8_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio)
+{
+	int status = SZ_SCES;
+	size_t dataLength = computeDataLength(r5,r4,r3,r2,r1);
+	int64_t valueRangeSize = 0;
+	
+	uint8_t minValue = (uint8_t)computeRangeSize_int(oriData, SZ_UINT8, dataLength, &valueRangeSize);
+	double realPrecision = getRealPrecision_int(valueRangeSize, errBoundMode, absErr_Bound, relBoundRatio, &status);
+		
+	if(valueRangeSize <= realPrecision)
+	{
+		SZ_compress_args_uint8_withinRange(newByteData, oriData, dataLength, outSize);
+	}
+	else
+	{
+		if(r5==0&&r4==0&&r3==0&&r2==0)
+		{
+			SZ_compress_args_uint8_NoCkRngeNoGzip_1D(newByteData, oriData, r1, realPrecision, outSize, valueRangeSize, minValue);
+		}
+		else if(r5==0&&r4==0&&r3==0)
+		{
+			SZ_compress_args_uint8_NoCkRngeNoGzip_2D(newByteData, oriData, r2, r1, realPrecision, outSize, valueRangeSize, minValue);
+		}
+		else if(r5==0&&r4==0)
+		{
+			SZ_compress_args_uint8_NoCkRngeNoGzip_3D(newByteData, oriData, r3, r2, r1, realPrecision, outSize, valueRangeSize, minValue);
+		}
+		else if(r5==0)
+		{
+			/* 4D input: the two slowest dimensions are merged and the 3D routine is used. */
+			SZ_compress_args_uint8_NoCkRngeNoGzip_3D(newByteData, oriData, r4*r3, r2, r1, realPrecision, outSize, valueRangeSize, minValue);
+		}
+		else
+		{
+			/* No branch produces output for five dimensions; report it instead of
+			   returning success with *newByteData and *outSize never written. */
+			printf("Error: doesn't support 5 dimensions for now.\n");
+			status = SZ_DERR; //dimension error
+		}
+	}
+	return status;
+}
+
+/**
+ * Top-level compression entry point for uint8_t arrays of up to four dimensions.
+ *
+ * Unused leading dimensions are passed as 0 (r1 is the fastest-varying one).
+ * The error bound mode is recorded in confparams_cpr; PSNR mode is converted
+ * to an absolute bound. If the value range does not exceed the resulting
+ * precision the array is encoded by SZ_compress_args_uint8_withinRange.
+ * Otherwise the dimension-specific routine produces an intermediate stream
+ * that is returned as is (SZ_BEST_SPEED) or passed through zlib_compress5
+ * (SZ_BEST_COMPRESSION / SZ_DEFAULT_COMPRESSION).
+ *
+ * @param newByteData  receives the compressed byte stream
+ * @param oriData      input samples
+ * @param r5..r1       dimensions, r5 slowest; 0 marks an unused dimension
+ * @param outSize      receives the compressed size
+ * @param errBoundMode,absErr_Bound,relBoundRatio  error bound configuration
+ * @return SZ_SCES on success, SZ_DERR for 5 dimensions, SZ_MERR for an
+ *         unsupported confparams_cpr->szMode; on SZ_DERR / SZ_MERR the
+ *         outputs are not written
+ *
+ * NOTE(review): a point-wise relative bound (errBoundMode >= PW_REL) ends the
+ * process through exit(0), i.e. with a success exit code - confirm that this
+ * is intended.
+ */
+int SZ_compress_args_uint8(unsigned char** newByteData, uint8_t *oriData, 
+size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, size_t *outSize, 
+int errBoundMode, double absErr_Bound, double relBoundRatio)
+{
+	confparams_cpr->errorBoundMode = errBoundMode;
+	
+	if(errBoundMode>=PW_REL)
+	{
+		printf("Error: Current SZ version doesn't support integer data compression with point-wise relative error bound being based on pwrType=AVG\n");
+		exit(0);
+		return SZ_NSCS;
+	}
+	int status = SZ_SCES;
+	size_t dataLength = computeDataLength(r5,r4,r3,r2,r1);
+	int64_t valueRangeSize = 0;
+
+	uint8_t minValue = (uint8_t)computeRangeSize_int(oriData, SZ_UINT8, dataLength, &valueRangeSize);
+	double realPrecision = 0; 
+	
+	if(confparams_cpr->errorBoundMode==PSNR)
+	{
+		/* A PSNR target is turned into an absolute error bound. */
+		confparams_cpr->errorBoundMode = ABS;
+		realPrecision = confparams_cpr->absErrBound = computeABSErrBoundFromPSNR(confparams_cpr->psnr, (double)confparams_cpr->predThreshold, (double)valueRangeSize);
+	}
+	else
+		realPrecision = getRealPrecision_int(valueRangeSize, errBoundMode, absErr_Bound, relBoundRatio, &status);
+
+	if(valueRangeSize <= realPrecision)
+	{
+		SZ_compress_args_uint8_withinRange(newByteData, oriData, dataLength, outSize);
+	}
+	else
+	{
+		size_t tmpOutSize = 0;
+		unsigned char* tmpByteData = NULL;
+		if (r2==0)
+		{
+			SZ_compress_args_uint8_NoCkRngeNoGzip_1D(&tmpByteData, oriData, r1, realPrecision, &tmpOutSize, valueRangeSize, minValue);
+		}
+		else
+		if (r3==0)
+		{
+			SZ_compress_args_uint8_NoCkRngeNoGzip_2D(&tmpByteData, oriData, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, minValue);
+		}
+		else
+		if (r4==0)
+		{
+			SZ_compress_args_uint8_NoCkRngeNoGzip_3D(&tmpByteData, oriData, r3, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, minValue);
+		}
+		else
+		if (r5==0)
+		{
+			SZ_compress_args_uint8_NoCkRngeNoGzip_4D(&tmpByteData, oriData, r4, r3, r2, r1, realPrecision, &tmpOutSize, valueRangeSize, minValue);
+		}
+		else
+		{
+			printf("Error: doesn't support 5 dimensions for now.\n");
+			/* No intermediate stream exists: leave before tmpByteData is
+			   published, compressed or freed below. */
+			return SZ_DERR; //dimension error
+		}
+		//Call Gzip to do the further compression.
+		if(confparams_cpr->szMode==SZ_BEST_SPEED)
+		{
+			/* Ownership of the intermediate stream moves to the caller. */
+			*outSize = tmpOutSize;
+			*newByteData = tmpByteData;
+		}
+		else if(confparams_cpr->szMode==SZ_BEST_COMPRESSION || confparams_cpr->szMode==SZ_DEFAULT_COMPRESSION)
+		{
+			*outSize = zlib_compress5(tmpByteData, tmpOutSize, newByteData, confparams_cpr->gzipMode);
+			free(tmpByteData);
+		}
+		else
+		{
+			printf("Error: Wrong setting of confparams_cpr->szMode in the uint8_t compression.\n");
+			status = SZ_MERR; //mode error			
+			/* The intermediate stream is not handed to the caller on this path. */
+			free(tmpByteData);
+		}
+	}
+	
+	return status;
+}
diff --git a/thirdparty/SZ/sz/src/szd_double.c b/thirdparty/SZ/sz/src/szd_double.c
new file mode 100644
index 0000000000000000000000000000000000000000..1440e2d012977829e6fc4c907e6ebb0ac3e24c48
--- /dev/null
+++ b/thirdparty/SZ/sz/src/szd_double.c
@@ -0,0 +1,1875 @@
+/**
+ *  @file szd_double.c
+ *  @author Sheng Di and Dingwen Tao
+ *  @date Aug, 2016
+ *  @brief Decompression routines for double-precision floating-point data.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdlib.h> 
+#include <stdio.h>
+#include <string.h>
+#include "szd_double.h"
+#include "TightDataPointStorageD.h"
+#include "sz.h"
+#include "Huffman.h"
+#include "szd_double_pwr.h"
+#include "szd_double_ts.h"
+
+/**
+ * Decompresses a byte stream into a newly allocated array of doubles.
+ *
+ * Unless cmpSize matches one of the two fixed sizes tested below, the first
+ * two bytes decide (via isZlibFormat) whether the stream must first be
+ * inflated with zlib_uncompress5; confparams_dec->szMode is updated
+ * accordingly, except in SZ_TEMPORAL_COMPRESSION mode. The resulting bytes
+ * are parsed into a TightDataPointStorageD and expanded either by a direct
+ * copy (lossless payload) or by the getSnapshotData_double_{1..4}D routine
+ * matching the dimensionality.
+ *
+ * @param newData   receives the decompressed array (not written when the
+ *                  dimension count is unsupported)
+ * @param r5..r1    dimensions, r5 slowest; 0 marks an unused dimension
+ * @param cmpBytes  compressed input; never freed by this function
+ * @param cmpSize   size of cmpBytes in bytes
+ * @return SZ_SCES on success, SZ_MERR for an unexpected szMode, SZ_DERR when
+ *         more than four dimensions are requested
+ */
+int SZ_decompress_args_double(double** newData, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, unsigned char* cmpBytes, size_t cmpSize)
+{
+	int status = SZ_SCES;
+	size_t dataLength = computeDataLength(r5,r4,r3,r2,r1);
+	
+	size_t targetUncompressSize = dataLength <<3; //i.e., *8
+	//tmpSize must be "much" smaller than dataLength
+	size_t i, tmpSize = 12+MetaDataByteLength+exe_params->SZ_SIZE_TYPE;
+	unsigned char* szTmpBytes;
+	/* Set only when szTmpBytes is a buffer obtained here from zlib_uncompress5.
+	   Deciding this again at the end from confparams_dec->szMode and cmpSize is
+	   fragile: that size test differs from the one below, and szMode is global
+	   state, so the caller's cmpBytes could end up being freed. */
+	int ownsTmpBytes = 0;
+	if(cmpSize!=12+4+MetaDataByteLength && cmpSize!=12+8+MetaDataByteLength)
+	{
+		int isZlib = isZlibFormat(cmpBytes[0], cmpBytes[1]);
+		if(confparams_dec->szMode!=SZ_TEMPORAL_COMPRESSION)
+		{
+			if(isZlib)
+				confparams_dec->szMode = SZ_BEST_COMPRESSION;
+			else
+				confparams_dec->szMode = SZ_BEST_SPEED;			
+		}
+		if(confparams_dec->szMode==SZ_BEST_SPEED)
+		{
+			/* Not zlib-wrapped: parse the caller's buffer in place. */
+			tmpSize = cmpSize;
+			szTmpBytes = cmpBytes;	
+		}	
+		else if(confparams_dec->szMode==SZ_BEST_COMPRESSION || confparams_dec->szMode==SZ_DEFAULT_COMPRESSION || confparams_dec->szMode==SZ_TEMPORAL_COMPRESSION)
+		{
+			if(targetUncompressSize<MIN_ZLIB_DEC_ALLOMEM_BYTES) //Considering the minimum size
+				targetUncompressSize = MIN_ZLIB_DEC_ALLOMEM_BYTES; 			
+			tmpSize = zlib_uncompress5(cmpBytes, (unsigned long)cmpSize, &szTmpBytes, (unsigned long)targetUncompressSize+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE);			
+			ownsTmpBytes = 1;
+		}
+		else
+		{
+			printf("Wrong value of confparams_dec->szMode in the double compressed bytes.\n");
+			status = SZ_MERR;
+			return status;
+		}	
+	}
+	else
+		szTmpBytes = cmpBytes;
+
+	/* Parse the (possibly inflated) stream into its storage representation. */
+	TightDataPointStorageD* tdps;
+	int errBoundMode = new_TightDataPointStorageD_fromFlatBytes(&tdps, szTmpBytes, tmpSize);
+
+	int dim = computeDimension(r5,r4,r3,r2,r1);
+	int doubleSize = sizeof(double);
+	if(tdps->isLossless)
+	{
+		/* Raw doubles follow the header; they are copied directly on a
+		   big-endian host and converted one by one otherwise. */
+		*newData = (double*)malloc(doubleSize*dataLength);
+		if(sysEndianType==BIG_ENDIAN_SYSTEM)
+		{
+			memcpy(*newData, szTmpBytes+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE, dataLength*doubleSize);
+		}
+		else
+		{
+			unsigned char* p = szTmpBytes+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE;
+			for(i=0;i<dataLength;i++,p+=doubleSize)
+				(*newData)[i] = bytesToDouble(p);
+		}		
+	}
+	else if (dim == 1)
+		getSnapshotData_double_1D(newData,r1,tdps, errBoundMode);
+	else
+	if (dim == 2)
+		getSnapshotData_double_2D(newData,r2,r1,tdps, errBoundMode);
+	else
+	if (dim == 3)
+		getSnapshotData_double_3D(newData,r3,r2,r1,tdps, errBoundMode);
+	else
+	if (dim == 4)
+		getSnapshotData_double_4D(newData,r4,r3,r2,r1,tdps, errBoundMode);
+	else
+	{
+		printf("Error: currently support only at most 4 dimensions!\n");
+		status = SZ_DERR;
+	}
+	free_TightDataPointStorageD2(tdps);
+	if(ownsTmpBytes)
+		free(szTmpBytes);	
+	return status;
+}
+
+/**
+ * Rebuilds a 1D series of doubles from its storage representation.
+ *
+ * The Huffman-coded type array is decoded first. A type code of 0 marks a
+ * sample stored exactly: its bytes are assembled from the leading bytes shared
+ * with the previous exact sample, the next bytes of exactMidBytes and, when
+ * reqLength is not a multiple of 8, a few bits taken from residualMidBits;
+ * medianValue is added to the decoded number. Any other code reconstructs the
+ * sample as the previous output plus (code - intvRadius) intervals of width
+ * 2*realPrecision.
+ *
+ * @param data              receives a malloc'ed array of dataSeriesLength doubles
+ * @param dataSeriesLength  number of samples to reconstruct
+ * @param tdps              parsed compressed representation
+ */
+void decompressDataSeries_double_1D(double** data, size_t dataSeriesLength, TightDataPointStorageD* tdps) 
+{
+	updateQuantizationInfo(tdps->intervals);
+
+	/* bitPos: running bit offset into residualMidBits; bytePos: byte currently
+	   read from it; leadPos: next entry of the leading-byte-count array. */
+	size_t pos, b, bitPos = 0, bytePos = 0, leadPos = 0;
+	unsigned char* leadCounts;
+	double quantStep = tdps->realPrecision*2;
+
+	convertByteArray2IntArray_fast_2b(tdps->exactDataNum, tdps->leadNumArray, tdps->leadNumArray_size, &leadCounts);
+	*data = (double*)malloc(sizeof(double)*dataSeriesLength);
+
+	int* codes = (int*)malloc(dataSeriesLength*sizeof(int));
+
+	HuffmanTree* tree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(tree, tdps->typeArray, dataSeriesLength, codes);
+	SZ_ReleaseHuffman(tree);
+
+	unsigned char prevBytes[8];
+	unsigned char nowBytes[8];
+	memset(prevBytes, 0, 8);
+
+	size_t midByteCursor = 0;
+	int fullBytes = tdps->reqLength/8;
+	int tailBits = tdps->reqLength%8;
+	double median = tdps->medianValue;
+
+	for (pos = 0; pos < dataSeriesLength; pos++) {
+		int code = codes[pos];
+		if (code != 0) {
+			/* Quantized sample: previous output shifted by the signed interval offset. */
+			(*data)[pos] = (*data)[pos-1] + (code-exe_params->intvRadius)*quantStep;
+			continue;
+		}
+
+		/* Exactly stored sample: first fetch its trailing sub-byte bits. */
+		int tail = 0;
+		if (tailBits != 0) {
+			int bitInByte = bitPos % 8;
+			int shiftRight = getRightMovingSteps(bitInByte, tailBits);
+			if (shiftRight > 0) {
+				/* bits lie inside the current byte and do not reach its end */
+				int mask = getRightMovingCode(bitInByte, tailBits);
+				tail = (tdps->residualMidBits[bytePos] & mask) >> shiftRight;
+			} else if (shiftRight == 0) {
+				/* bits end exactly at the byte boundary */
+				int mask = getRightMovingCode(bitInByte, tailBits);
+				tail = (tdps->residualMidBits[bytePos] & mask);
+				bytePos++;
+			} else {
+				/* bits straddle the current byte and the next one */
+				int headMask = getLeftMovingCode(bitInByte);
+				int restMask = getRightMovingCode(bitInByte, tailBits);
+				int shiftLeft = -shiftRight;
+				tail = (tdps->residualMidBits[bytePos] & headMask) << shiftLeft;
+				bytePos++;
+				tail = tail | ((tdps->residualMidBits[bytePos] & restMask) >> (8 - shiftLeft));
+			}
+			bitPos += tailBits;
+		}
+
+		/* Assemble the stored bytes: shared prefix, explicit middle bytes, tail bits. */
+		memset(nowBytes, 0, 8);
+		unsigned char lead = leadCounts[leadPos++];
+		memcpy(nowBytes, prevBytes, lead);
+		for (b = lead; b < fullBytes; b++)
+			nowBytes[b] = tdps->exactMidBytes[midByteCursor++];
+		if (tailBits != 0)
+			nowBytes[fullBytes] = (unsigned char) (tail << (8 - tailBits));
+
+		(*data)[pos] = bytesToDouble(nowBytes) + median;
+		memcpy(prevBytes, nowBytes, 8);
+	}
+	
+#ifdef HAVE_TIMECMPR	
+	if(confparams_dec->szMode == SZ_TEMPORAL_COMPRESSION)
+		memcpy(multisteps->hist_data, (*data), dataSeriesLength*sizeof(double));
+#endif	
+	
+	free(leadCounts);
+	free(codes);
+}
+
+void decompressDataSeries_double_2D(double** data, size_t r1, size_t r2, TightDataPointStorageD* tdps) 
+{ // Decode tdps into a newly malloc'ed array of r1*r2 doubles returned through *data; this function does not free it.
+	updateQuantizationInfo(tdps->intervals); // presumably refreshes exe_params->intvRadius, read by the predicted branches below -- TODO confirm
+	// Layout: r1 rows of r2 values, row after row. Each element is either "predicted" (type != 0: predictor
+	// plus a quantized correction) or "exact" (type == 0: rebuilt byte pattern plus tdps->medianValue).
+	size_t j, k = 0, p = 0, l = 0; // k counts the residual bits consumed so far
+	// (only k % 8 is used), p is the index of the
+	// current byte in tdps->residualMidBits, l is the
+	// index of the next leadNum entry to consume
+	size_t dataSeriesLength = r1*r2;
+	// NOTE(review): element order and predictors presumably have to match the compressor exactly -- confirm before changing.
+
+	unsigned char* leadNum; // set through &leadNum below, freed at the end; one entry is consumed per exact value
+	double realPrecision = tdps->realPrecision; // predicted values move in steps of 2 * realPrecision
+
+	convertByteArray2IntArray_fast_2b(tdps->exactDataNum, tdps->leadNumArray, tdps->leadNumArray_size, &leadNum);
+
+	*data = (double*)malloc(sizeof(double)*dataSeriesLength); // NOTE(review): result is not checked for NULL
+
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int)); // one code per element; NOTE(review): not checked for NULL
+
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type); // presumably fills type[] from tdps->typeArray -- TODO confirm
+	SZ_ReleaseHuffman(huffmanTree);	
+
+	unsigned char preBytes[8]; // byte pattern of the previous exact value (all zero before the first one)
+	unsigned char curBytes[8]; // byte pattern of the exact value being rebuilt
+
+	memset(preBytes, 0, 8);
+
+	size_t curByteIndex = 0; // next unread byte of tdps->exactMidBytes
+	int reqBytesLength, resiBitsLength, resiBits; 
+	unsigned char leadingNum;	
+	double medianValue, exactData;
+	int type_;
+
+	reqBytesLength = tdps->reqLength/8; // whole bytes kept per exact value
+	resiBitsLength = tdps->reqLength%8; // leftover bits kept after those bytes
+	medianValue = tdps->medianValue; // added back to every exact value
+
+	double pred1D, pred2D;
+	size_t ii, jj;
+
+	/* Process Row-0, data 0: always read as an exact value (type[0] is not consulted) */
+
+	// compute resiBits: take the next resiBitsLength bits from tdps->residualMidBits
+	resiBits = 0;
+	if (resiBitsLength != 0) {
+		int kMod8 = k % 8; // bit position within the current byte
+		int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+		if (rightMovSteps > 0) { // field read from byte p alone; p stays on this byte
+			int code = getRightMovingCode(kMod8, resiBitsLength);
+			resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+		} else if (rightMovSteps < 0) { // field assembled from byte p and the byte after it
+			int code1 = getLeftMovingCode(kMod8);
+			int code2 = getRightMovingCode(kMod8, resiBitsLength);
+			int leftMovSteps = -rightMovSteps;
+			rightMovSteps = 8 - leftMovSteps;
+			resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+			p++;
+			resiBits = resiBits
+					| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+		} else // rightMovSteps == 0: field read from byte p alone, then p moves to the next byte
+		{
+			int code = getRightMovingCode(kMod8, resiBitsLength);
+			resiBits = (tdps->residualMidBits[p] & code);
+			p++;
+		}
+		k += resiBitsLength;
+	}
+
+	// recover the exact data: rebuild the byte pattern in curBytes, convert it, add medianValue
+	memset(curBytes, 0, 8);
+	leadingNum = leadNum[l++]; // how many leading bytes are copied from the previous exact value
+	memcpy(curBytes, preBytes, leadingNum);
+	for (j = leadingNum; j < reqBytesLength; j++)
+		curBytes[j] = tdps->exactMidBytes[curByteIndex++]; // remaining whole bytes come from tdps->exactMidBytes
+	if (resiBitsLength != 0) {
+		unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength)); // residual bits go in the high end of the byte
+		curBytes[reqBytesLength] = resiByte;
+	}
+
+	exactData = bytesToDouble(curBytes);
+	(*data)[0] = exactData + medianValue;
+	memcpy(preBytes,curBytes,8); // becomes the "previous" pattern for the next exact value
+	// The same "compute resiBits" / "recover the exact data" sequence is repeated for every type_ == 0 element below.
+	/* Process Row-0, data 1 */
+	type_ = type[1]; // NOTE(review): index 1 is read and written unconditionally, so this assumes dataSeriesLength >= 2 -- confirm
+	if (type_ != 0)
+	{
+		pred1D = (*data)[0]; // only one earlier value exists, so it is the prediction
+		(*data)[1] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision; // prediction + signed code * step
+	}
+	else
+	{
+		// compute resiBits
+		resiBits = 0;
+		if (resiBitsLength != 0) {
+			int kMod8 = k % 8;
+			int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+			if (rightMovSteps > 0) {
+				int code = getRightMovingCode(kMod8, resiBitsLength);
+				resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+			} else if (rightMovSteps < 0) {
+				int code1 = getLeftMovingCode(kMod8);
+				int code2 = getRightMovingCode(kMod8, resiBitsLength);
+				int leftMovSteps = -rightMovSteps;
+				rightMovSteps = 8 - leftMovSteps;
+				resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+				p++;
+				resiBits = resiBits
+						| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+			} else // rightMovSteps == 0
+			{
+				int code = getRightMovingCode(kMod8, resiBitsLength);
+				resiBits = (tdps->residualMidBits[p] & code);
+				p++;
+			}
+			k += resiBitsLength;
+		}
+
+		// recover the exact data
+		memset(curBytes, 0, 8);
+		leadingNum = leadNum[l++];
+		memcpy(curBytes, preBytes, leadingNum);
+		for (j = leadingNum; j < reqBytesLength; j++)
+			curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+		if (resiBitsLength != 0) {
+			unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+			curBytes[reqBytesLength] = resiByte;
+		}
+		
+		exactData = bytesToDouble(curBytes);
+		(*data)[1] = exactData + medianValue;
+		memcpy(preBytes,curBytes,8);
+	}
+
+	/* Process Row-0, data 2 --> data r2-1 */
+	for (jj = 2; jj < r2; jj++)
+	{
+		type_ = type[jj];
+		if (type_ != 0)
+		{
+			pred1D = 2*(*data)[jj-1] - (*data)[jj-2];			// straight-line extrapolation of the two previous values
+			(*data)[jj] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			// compute resiBits
+			resiBits = 0;
+			if (resiBitsLength != 0) {
+				int kMod8 = k % 8;
+				int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+				if (rightMovSteps > 0) {
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+				} else if (rightMovSteps < 0) {
+					int code1 = getLeftMovingCode(kMod8);
+					int code2 = getRightMovingCode(kMod8, resiBitsLength);
+					int leftMovSteps = -rightMovSteps;
+					rightMovSteps = 8 - leftMovSteps;
+					resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+					p++;
+					resiBits = resiBits
+							| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+				} else // rightMovSteps == 0
+				{
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code);
+					p++;
+				}
+				k += resiBitsLength;
+			}
+
+			// recover the exact data
+			memset(curBytes, 0, 8);
+			leadingNum = leadNum[l++];
+			memcpy(curBytes, preBytes, leadingNum);
+			for (j = leadingNum; j < reqBytesLength; j++)
+				curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+			if (resiBitsLength != 0) {
+				unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+				curBytes[reqBytesLength] = resiByte;
+			}
+
+			exactData = bytesToDouble(curBytes);
+			(*data)[jj] = exactData + medianValue;
+			memcpy(preBytes,curBytes,8);
+		}
+	}
+
+	size_t index;
+	/* Process Row-1 --> Row-r1-1 */
+	for (ii = 1; ii < r1; ii++)
+	{
+		/* Process row-ii data 0 */
+		index = ii*r2;
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			pred1D = (*data)[index-r2]; // first value of a row: predict from the value directly above
+			(*data)[index] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			// compute resiBits
+			resiBits = 0;
+			if (resiBitsLength != 0) {
+				int kMod8 = k % 8;
+				int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+				if (rightMovSteps > 0) {
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+				} else if (rightMovSteps < 0) {
+					int code1 = getLeftMovingCode(kMod8);
+					int code2 = getRightMovingCode(kMod8, resiBitsLength);
+					int leftMovSteps = -rightMovSteps;
+					rightMovSteps = 8 - leftMovSteps;
+					resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+					p++;
+					resiBits = resiBits
+							| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+				} else // rightMovSteps == 0
+				{
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code);
+					p++;
+				}
+				k += resiBitsLength;
+			}
+
+			// recover the exact data
+			memset(curBytes, 0, 8);
+			leadingNum = leadNum[l++];
+			memcpy(curBytes, preBytes, leadingNum);
+			for (j = leadingNum; j < reqBytesLength; j++)
+				curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+			if (resiBitsLength != 0) {
+				unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+				curBytes[reqBytesLength] = resiByte;
+			}
+
+			exactData = bytesToDouble(curBytes);
+			(*data)[index] = exactData + medianValue;
+			memcpy(preBytes,curBytes,8);
+		}
+
+		/* Process row-ii data 1 --> r2-1*/
+		for (jj = 1; jj < r2; jj++)
+		{
+			index = ii*r2+jj;
+			pred2D = (*data)[index-1] + (*data)[index-r2] - (*data)[index-r2-1]; // left + above - above-left
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				// compute resiBits
+				resiBits = 0;
+				if (resiBitsLength != 0) {
+					int kMod8 = k % 8;
+					int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+					if (rightMovSteps > 0) {
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+					} else if (rightMovSteps < 0) {
+						int code1 = getLeftMovingCode(kMod8);
+						int code2 = getRightMovingCode(kMod8, resiBitsLength);
+						int leftMovSteps = -rightMovSteps;
+						rightMovSteps = 8 - leftMovSteps;
+						resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+						p++;
+						resiBits = resiBits
+								| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+					} else // rightMovSteps == 0
+					{
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code);
+						p++;
+					}
+					k += resiBitsLength;
+				}
+
+				// recover the exact data
+				memset(curBytes, 0, 8);
+				leadingNum = leadNum[l++];
+				memcpy(curBytes, preBytes, leadingNum);
+				for (j = leadingNum; j < reqBytesLength; j++)
+					curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+				if (resiBitsLength != 0) {
+					unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+					curBytes[reqBytesLength] = resiByte;
+				}
+
+				exactData = bytesToDouble(curBytes);
+				(*data)[index] = exactData + medianValue;
+				memcpy(preBytes,curBytes,8);
+			}
+		}
+	}
+
+#ifdef HAVE_TIMECMPR	
+	if(confparams_dec->szMode == SZ_TEMPORAL_COMPRESSION) // temporal mode: also copy the decoded values into multisteps->hist_data
+		memcpy(multisteps->hist_data, (*data), dataSeriesLength*sizeof(double));
+#endif	
+
+	free(leadNum); // scratch buffers only; *data stays allocated
+	free(type);
+	return;
+}
+
+void decompressDataSeries_double_3D(double** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageD* tdps) 
+{ // Decode tdps into a newly malloc'ed array of r1*r2*r3 doubles returned through *data; this function does not free it.
+	updateQuantizationInfo(tdps->intervals); // presumably refreshes exe_params->intvRadius, read by the predicted branches below -- TODO confirm
+	size_t j, k = 0, p = 0, l = 0; // k counts the residual bits consumed so far
+	// (only k % 8 is used), p is the index of the
+	// current byte in tdps->residualMidBits, l is the
+	// index of the next leadNum entry to consume
+	size_t dataSeriesLength = r1*r2*r3;
+	size_t r23 = r2*r3; // number of elements in one layer
+	// Layout: r1 layers of r2 rows of r3 values. Each element is either "predicted" (type != 0) or "exact" (type == 0).
+
+	unsigned char* leadNum; // set through &leadNum below, freed at the end; one entry is consumed per exact value
+	double realPrecision = tdps->realPrecision; // predicted values move in steps of 2 * realPrecision
+
+	convertByteArray2IntArray_fast_2b(tdps->exactDataNum, tdps->leadNumArray, tdps->leadNumArray_size, &leadNum);
+
+	*data = (double*)malloc(sizeof(double)*dataSeriesLength); // NOTE(review): result is not checked for NULL
+
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int)); // one code per element; NOTE(review): not checked for NULL
+
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type); // presumably fills type[] from tdps->typeArray -- TODO confirm
+	SZ_ReleaseHuffman(huffmanTree);	
+
+	unsigned char preBytes[8]; // byte pattern of the previous exact value (all zero before the first one)
+	unsigned char curBytes[8]; // byte pattern of the exact value being rebuilt
+
+	memset(preBytes, 0, 8);
+
+	size_t curByteIndex = 0; // next unread byte of tdps->exactMidBytes
+	int reqBytesLength, resiBitsLength, resiBits;
+	unsigned char leadingNum;
+	double medianValue, exactData;
+	int type_;
+
+	reqBytesLength = tdps->reqLength/8; // whole bytes kept per exact value
+	resiBitsLength = tdps->reqLength%8; // leftover bits kept after those bytes
+	medianValue = tdps->medianValue; // added back to every exact value
+
+	double pred1D, pred2D, pred3D;
+	size_t ii, jj, kk;
+
+	///////////////////////////	Process layer-0 ///////////////////////////
+	/* Process Row-0 data 0: always read as an exact value (type[0] is not consulted) */
+	// compute resiBits: take the next resiBitsLength bits from tdps->residualMidBits
+	resiBits = 0;
+	if (resiBitsLength != 0) {
+		int kMod8 = k % 8; // bit position within the current byte
+		int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+		if (rightMovSteps > 0) { // field read from byte p alone; p stays on this byte
+			int code = getRightMovingCode(kMod8, resiBitsLength);
+			resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+		} else if (rightMovSteps < 0) { // field assembled from byte p and the byte after it
+			int code1 = getLeftMovingCode(kMod8);
+			int code2 = getRightMovingCode(kMod8, resiBitsLength);
+			int leftMovSteps = -rightMovSteps;
+			rightMovSteps = 8 - leftMovSteps;
+			resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+			p++;
+			resiBits = resiBits
+					| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+		} else // rightMovSteps == 0: field read from byte p alone, then p moves to the next byte
+		{
+			int code = getRightMovingCode(kMod8, resiBitsLength);
+			resiBits = (tdps->residualMidBits[p] & code);
+			p++;
+		}
+		k += resiBitsLength;
+	}
+
+	// recover the exact data: rebuild the byte pattern in curBytes, convert it, add medianValue
+	memset(curBytes, 0, 8);
+	leadingNum = leadNum[l++]; // how many leading bytes are copied from the previous exact value
+	memcpy(curBytes, preBytes, leadingNum);
+	for (j = leadingNum; j < reqBytesLength; j++)
+		curBytes[j] = tdps->exactMidBytes[curByteIndex++]; // remaining whole bytes come from tdps->exactMidBytes
+	if (resiBitsLength != 0) {
+		unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength)); // residual bits go in the high end of the byte
+		curBytes[reqBytesLength] = resiByte;
+	}
+
+	exactData = bytesToDouble(curBytes);
+	(*data)[0] = exactData + medianValue;
+	memcpy(preBytes,curBytes,8); // becomes the "previous" pattern for the next exact value
+	// The same "compute resiBits" / "recover the exact data" sequence is repeated for every type_ == 0 element below.
+	/* Process Row-0, data 1 */
+	pred1D = (*data)[0]; // only one earlier value exists, so it is the prediction
+
+	type_ = type[1]; // NOTE(review): index 1 is read and written unconditionally, so this assumes dataSeriesLength >= 2 -- confirm
+	if (type_ != 0)
+	{
+		(*data)[1] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision; // prediction + signed code * step
+	}
+	else
+	{
+		// compute resiBits
+		resiBits = 0;
+		if (resiBitsLength != 0) {
+			int kMod8 = k % 8;
+			int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+			if (rightMovSteps > 0) {
+				int code = getRightMovingCode(kMod8, resiBitsLength);
+				resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+			} else if (rightMovSteps < 0) {
+				int code1 = getLeftMovingCode(kMod8);
+				int code2 = getRightMovingCode(kMod8, resiBitsLength);
+				int leftMovSteps = -rightMovSteps;
+				rightMovSteps = 8 - leftMovSteps;
+				resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+				p++;
+				resiBits = resiBits
+						| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+			} else // rightMovSteps == 0
+			{
+				int code = getRightMovingCode(kMod8, resiBitsLength);
+				resiBits = (tdps->residualMidBits[p] & code);
+				p++;
+			}
+			k += resiBitsLength;
+		}
+
+		// recover the exact data
+		memset(curBytes, 0, 8);
+		leadingNum = leadNum[l++];
+		memcpy(curBytes, preBytes, leadingNum);
+		for (j = leadingNum; j < reqBytesLength; j++)
+			curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+		if (resiBitsLength != 0) {
+			unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+			curBytes[reqBytesLength] = resiByte;
+		}
+
+		exactData = bytesToDouble(curBytes);
+		(*data)[1] = exactData + medianValue;
+		memcpy(preBytes,curBytes,8);
+	}
+
+	/* Process Row-0, data 2 --> data r3-1 */
+	for (jj = 2; jj < r3; jj++)
+	{
+		pred1D = 2*(*data)[jj-1] - (*data)[jj-2]; // straight-line extrapolation of the two previous values
+
+		type_ = type[jj];
+		if (type_ != 0)
+		{
+			(*data)[jj] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			// compute resiBits
+			resiBits = 0;
+			if (resiBitsLength != 0) {
+				int kMod8 = k % 8;
+				int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+				if (rightMovSteps > 0) {
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+				} else if (rightMovSteps < 0) {
+					int code1 = getLeftMovingCode(kMod8);
+					int code2 = getRightMovingCode(kMod8, resiBitsLength);
+					int leftMovSteps = -rightMovSteps;
+					rightMovSteps = 8 - leftMovSteps;
+					resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+					p++;
+					resiBits = resiBits
+							| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+				} else // rightMovSteps == 0
+				{
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code);
+					p++;
+				}
+				k += resiBitsLength;
+			}
+
+			// recover the exact data
+			memset(curBytes, 0, 8);
+			leadingNum = leadNum[l++];
+			memcpy(curBytes, preBytes, leadingNum);
+			for (j = leadingNum; j < reqBytesLength; j++)
+				curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+			if (resiBitsLength != 0) {
+				unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+				curBytes[reqBytesLength] = resiByte;
+			}
+
+			exactData = bytesToDouble(curBytes);
+			(*data)[jj] = exactData + medianValue;
+			memcpy(preBytes,curBytes,8);
+		}
+	}
+
+	size_t index;
+	/* Process Row-1 --> Row-r2-1 */
+	for (ii = 1; ii < r2; ii++)
+	{
+		/* Process row-ii data 0 */
+		index = ii*r3;
+		pred1D = (*data)[index-r3]; // first value of a row: predict from the value directly above
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			(*data)[index] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			// compute resiBits
+			resiBits = 0;
+			if (resiBitsLength != 0) {
+				int kMod8 = k % 8;
+				int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+				if (rightMovSteps > 0) {
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+				} else if (rightMovSteps < 0) {
+					int code1 = getLeftMovingCode(kMod8);
+					int code2 = getRightMovingCode(kMod8, resiBitsLength);
+					int leftMovSteps = -rightMovSteps;
+					rightMovSteps = 8 - leftMovSteps;
+					resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+					p++;
+					resiBits = resiBits
+							| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+				} else // rightMovSteps == 0
+				{
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code);
+					p++;
+				}
+				k += resiBitsLength;
+			}
+
+			// recover the exact data
+			memset(curBytes, 0, 8);
+			leadingNum = leadNum[l++];
+			memcpy(curBytes, preBytes, leadingNum);
+			for (j = leadingNum; j < reqBytesLength; j++)
+				curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+			if (resiBitsLength != 0) {
+				unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+				curBytes[reqBytesLength] = resiByte;
+			}
+
+			exactData = bytesToDouble(curBytes);
+			(*data)[index] = exactData + medianValue;
+			memcpy(preBytes,curBytes,8);
+		}
+
+		/* Process row-ii data 1 --> r3-1*/
+		for (jj = 1; jj < r3; jj++)
+		{
+			index = ii*r3+jj;
+			pred2D = (*data)[index-1] + (*data)[index-r3] - (*data)[index-r3-1]; // left + above - above-left
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				// compute resiBits
+				resiBits = 0;
+				if (resiBitsLength != 0) {
+					int kMod8 = k % 8;
+					int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+					if (rightMovSteps > 0) {
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+					} else if (rightMovSteps < 0) {
+						int code1 = getLeftMovingCode(kMod8);
+						int code2 = getRightMovingCode(kMod8, resiBitsLength);
+						int leftMovSteps = -rightMovSteps;
+						rightMovSteps = 8 - leftMovSteps;
+						resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+						p++;
+						resiBits = resiBits
+								| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+					} else // rightMovSteps == 0
+					{
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code);
+						p++;
+					}
+					k += resiBitsLength;
+				}
+
+				// recover the exact data
+				memset(curBytes, 0, 8);
+				leadingNum = leadNum[l++];
+				memcpy(curBytes, preBytes, leadingNum);
+				for (j = leadingNum; j < reqBytesLength; j++)
+					curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+				if (resiBitsLength != 0) {
+					unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+					curBytes[reqBytesLength] = resiByte;
+				}
+
+				exactData = bytesToDouble(curBytes);
+				(*data)[index] = exactData + medianValue;
+				memcpy(preBytes,curBytes,8);
+			}
+		}
+	}
+
+	///////////////////////////	Process layer-1 --> layer-r1-1 ///////////////////////////
+
+	for (kk = 1; kk < r1; kk++)
+	{
+		/* Process Row-0 data 0*/
+		index = kk*r23;
+		pred1D = (*data)[index-r23]; // first value of a layer: predict from the same position in the previous layer
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			(*data)[index] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			// compute resiBits
+			resiBits = 0;
+			if (resiBitsLength != 0) {
+				int kMod8 = k % 8;
+				int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+				if (rightMovSteps > 0) {
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+				} else if (rightMovSteps < 0) {
+					int code1 = getLeftMovingCode(kMod8);
+					int code2 = getRightMovingCode(kMod8, resiBitsLength);
+					int leftMovSteps = -rightMovSteps;
+					rightMovSteps = 8 - leftMovSteps;
+					resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+					p++;
+					resiBits = resiBits
+							| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+				} else // rightMovSteps == 0
+				{
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code);
+					p++;
+				}
+				k += resiBitsLength;
+			}
+
+			// recover the exact data
+			memset(curBytes, 0, 8);
+			leadingNum = leadNum[l++];
+			memcpy(curBytes, preBytes, leadingNum);
+			for (j = leadingNum; j < reqBytesLength; j++)
+				curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+			if (resiBitsLength != 0) {
+				unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+				curBytes[reqBytesLength] = resiByte;
+			}
+
+			exactData = bytesToDouble(curBytes);
+			(*data)[index] = exactData + medianValue;
+			memcpy(preBytes,curBytes,8);
+		}
+
+		/* Process Row-0 data 1 --> data r3-1 */
+		for (jj = 1; jj < r3; jj++)
+		{
+			index = kk*r23+jj;
+			pred2D = (*data)[index-1] + (*data)[index-r23] - (*data)[index-r23-1]; // left + previous layer - previous layer's left
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				// compute resiBits
+				resiBits = 0;
+				if (resiBitsLength != 0) {
+					int kMod8 = k % 8;
+					int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+					if (rightMovSteps > 0) {
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+					} else if (rightMovSteps < 0) {
+						int code1 = getLeftMovingCode(kMod8);
+						int code2 = getRightMovingCode(kMod8, resiBitsLength);
+						int leftMovSteps = -rightMovSteps;
+						rightMovSteps = 8 - leftMovSteps;
+						resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+						p++;
+						resiBits = resiBits
+								| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+					} else // rightMovSteps == 0
+					{
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code);
+						p++;
+					}
+					k += resiBitsLength;
+				}
+
+				// recover the exact data
+				memset(curBytes, 0, 8);
+				leadingNum = leadNum[l++];
+				memcpy(curBytes, preBytes, leadingNum);
+				for (j = leadingNum; j < reqBytesLength; j++)
+					curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+				if (resiBitsLength != 0) {
+					unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+					curBytes[reqBytesLength] = resiByte;
+				}
+
+				exactData = bytesToDouble(curBytes);
+				(*data)[index] = exactData + medianValue;
+				memcpy(preBytes,curBytes,8);
+			}
+		}
+
+		/* Process Row-1 --> Row-r2-1 */
+		for (ii = 1; ii < r2; ii++)
+		{
+			/* Process Row-i data 0 */
+			index = kk*r23 + ii*r3;
+			pred2D = (*data)[index-r3] + (*data)[index-r23] - (*data)[index-r23-r3]; // above + previous layer - previous layer's above
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				// compute resiBits
+				resiBits = 0;
+				if (resiBitsLength != 0) {
+					int kMod8 = k % 8;
+					int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+					if (rightMovSteps > 0) {
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+					} else if (rightMovSteps < 0) {
+						int code1 = getLeftMovingCode(kMod8);
+						int code2 = getRightMovingCode(kMod8, resiBitsLength);
+						int leftMovSteps = -rightMovSteps;
+						rightMovSteps = 8 - leftMovSteps;
+						resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+						p++;
+						resiBits = resiBits
+								| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+					} else // rightMovSteps == 0
+					{
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code);
+						p++;
+					}
+					k += resiBitsLength;
+				}
+
+				// recover the exact data
+				memset(curBytes, 0, 8);
+				leadingNum = leadNum[l++];
+				memcpy(curBytes, preBytes, leadingNum);
+				for (j = leadingNum; j < reqBytesLength; j++)
+					curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+				if (resiBitsLength != 0) {
+					unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+					curBytes[reqBytesLength] = resiByte;
+				}
+
+				exactData = bytesToDouble(curBytes);
+				(*data)[index] = exactData + medianValue;
+				memcpy(preBytes,curBytes,8);
+			}
+
+			/* Process Row-i data 1 --> data r3-1 */
+			for (jj = 1; jj < r3; jj++)
+			{
+				index = kk*r23 + ii*r3 + jj;
+				pred3D = (*data)[index-1] + (*data)[index-r3] + (*data)[index-r23] // + left, above, previous layer
+					- (*data)[index-r3-1] - (*data)[index-r23-r3] - (*data)[index-r23-1] + (*data)[index-r23-r3-1]; // - the three two-step diagonals, + the three-step diagonal
+
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					(*data)[index] = pred3D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					// compute resiBits
+					resiBits = 0;
+					if (resiBitsLength != 0) {
+						int kMod8 = k % 8;
+						int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+						if (rightMovSteps > 0) {
+							int code = getRightMovingCode(kMod8, resiBitsLength);
+							resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+						} else if (rightMovSteps < 0) {
+							int code1 = getLeftMovingCode(kMod8);
+							int code2 = getRightMovingCode(kMod8, resiBitsLength);
+							int leftMovSteps = -rightMovSteps;
+							rightMovSteps = 8 - leftMovSteps;
+							resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+							p++;
+							resiBits = resiBits
+									| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+						} else // rightMovSteps == 0
+						{
+							int code = getRightMovingCode(kMod8, resiBitsLength);
+							resiBits = (tdps->residualMidBits[p] & code);
+							p++;
+						}
+						k += resiBitsLength;
+					}
+
+					// recover the exact data
+					memset(curBytes, 0, 8);
+					leadingNum = leadNum[l++];
+					memcpy(curBytes, preBytes, leadingNum);
+					for (j = leadingNum; j < reqBytesLength; j++)
+						curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+					if (resiBitsLength != 0) {
+						unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+						curBytes[reqBytesLength] = resiByte;
+					}
+
+					exactData = bytesToDouble(curBytes);
+					(*data)[index] = exactData + medianValue;
+					memcpy(preBytes,curBytes,8);
+				}
+			}
+		}
+	}
+
+#ifdef HAVE_TIMECMPR	
+	if(confparams_dec->szMode == SZ_TEMPORAL_COMPRESSION) // temporal mode: also copy the decoded values into multisteps->hist_data
+		memcpy(multisteps->hist_data, (*data), dataSeriesLength*sizeof(double));
+#endif	
+
+	free(leadNum); // scratch buffers only; *data stays allocated
+	free(type);
+	return;
+}
+
+void decompressDataSeries_double_4D(double** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageD* tdps)
+{
+	updateQuantizationInfo(tdps->intervals);
+	size_t j, k = 0, p = 0, l = 0; // k is to track the location of residual_bit
+	// in resiMidBits, p is to track the
+	// byte_index of resiMidBits, l is for
+	// leadNum
+	size_t dataSeriesLength = r1*r2*r3*r4;
+	size_t r234 = r2*r3*r4;
+	size_t r34 = r3*r4;
+//	printf ("%d %d %d\n", r1, r2, r3, r4);
+
+	unsigned char* leadNum;
+	double realPrecision = tdps->realPrecision;
+
+	convertByteArray2IntArray_fast_2b(tdps->exactDataNum, tdps->leadNumArray, tdps->leadNumArray_size, &leadNum);
+
+	*data = (double*)malloc(sizeof(double)*dataSeriesLength);
+
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int));
+
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);	
+
+	unsigned char preBytes[8];
+	unsigned char curBytes[8];
+
+	memset(preBytes, 0, 8);
+
+	size_t curByteIndex = 0;
+	int reqBytesLength, resiBitsLength, resiBits;
+	unsigned char leadingNum;
+	double medianValue, exactData;
+	int type_;
+
+	reqBytesLength = tdps->reqLength/8;
+	resiBitsLength = tdps->reqLength%8;
+	medianValue = tdps->medianValue;
+
+	double pred1D, pred2D, pred3D;
+	size_t ii, jj, kk, ll;
+	size_t index;
+
+	for (ll = 0; ll < r1; ll++)
+	{
+
+		///////////////////////////	Process layer-0 ///////////////////////////
+		/* Process Row-0 data 0*/
+		index = ll*r234;
+
+		// compute resiBits
+		resiBits = 0;
+		if (resiBitsLength != 0) {
+			int kMod8 = k % 8;
+			int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+			if (rightMovSteps > 0) {
+				int code = getRightMovingCode(kMod8, resiBitsLength);
+				resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+			} else if (rightMovSteps < 0) {
+				int code1 = getLeftMovingCode(kMod8);
+				int code2 = getRightMovingCode(kMod8, resiBitsLength);
+				int leftMovSteps = -rightMovSteps;
+				rightMovSteps = 8 - leftMovSteps;
+				resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+				p++;
+				resiBits = resiBits
+						| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+			} else // rightMovSteps == 0
+			{
+				int code = getRightMovingCode(kMod8, resiBitsLength);
+				resiBits = (tdps->residualMidBits[p] & code);
+				p++;
+			}
+			k += resiBitsLength;
+		}
+
+		// recover the exact data
+		memset(curBytes, 0, 8);
+		leadingNum = leadNum[l++];
+		memcpy(curBytes, preBytes, leadingNum);
+		for (j = leadingNum; j < reqBytesLength; j++)
+			curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+		if (resiBitsLength != 0) {
+			unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+			curBytes[reqBytesLength] = resiByte;
+		}
+
+		exactData = bytesToDouble(curBytes);
+		(*data)[index] = exactData + medianValue;
+		memcpy(preBytes,curBytes,8);
+
+		/* Process Row-0, data 1 */
+		index = ll*r234+1;
+
+		pred1D = (*data)[index-1];
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			(*data)[index] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			// compute resiBits
+			resiBits = 0;
+			if (resiBitsLength != 0) {
+				int kMod8 = k % 8;
+				int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+				if (rightMovSteps > 0) {
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+				} else if (rightMovSteps < 0) {
+					int code1 = getLeftMovingCode(kMod8);
+					int code2 = getRightMovingCode(kMod8, resiBitsLength);
+					int leftMovSteps = -rightMovSteps;
+					rightMovSteps = 8 - leftMovSteps;
+					resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+					p++;
+					resiBits = resiBits
+							| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+				} else // rightMovSteps == 0
+				{
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code);
+					p++;
+				}
+				k += resiBitsLength;
+			}
+
+			// recover the exact data
+			memset(curBytes, 0, 8);
+			leadingNum = leadNum[l++];
+			memcpy(curBytes, preBytes, leadingNum);
+			for (j = leadingNum; j < reqBytesLength; j++)
+				curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+			if (resiBitsLength != 0) {
+				unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+				curBytes[reqBytesLength] = resiByte;
+			}
+
+			exactData = bytesToDouble(curBytes);
+			(*data)[index] = exactData + medianValue;
+			memcpy(preBytes,curBytes,8);
+		}
+
+		/* Process Row-0, data 2 --> data r4-1 */
+		for (jj = 2; jj < r4; jj++)
+		{
+			index = ll*r234+jj;
+
+			pred1D = 2*(*data)[index-1] - (*data)[index-2];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				// compute resiBits
+				resiBits = 0;
+				if (resiBitsLength != 0) {
+					int kMod8 = k % 8;
+					int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+					if (rightMovSteps > 0) {
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+					} else if (rightMovSteps < 0) {
+						int code1 = getLeftMovingCode(kMod8);
+						int code2 = getRightMovingCode(kMod8, resiBitsLength);
+						int leftMovSteps = -rightMovSteps;
+						rightMovSteps = 8 - leftMovSteps;
+						resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+						p++;
+						resiBits = resiBits
+								| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+					} else // rightMovSteps == 0
+					{
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code);
+						p++;
+					}
+					k += resiBitsLength;
+				}
+
+				// recover the exact data
+				memset(curBytes, 0, 8);
+				leadingNum = leadNum[l++];
+				memcpy(curBytes, preBytes, leadingNum);
+				for (j = leadingNum; j < reqBytesLength; j++)
+					curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+				if (resiBitsLength != 0) {
+					unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+					curBytes[reqBytesLength] = resiByte;
+				}
+
+				exactData = bytesToDouble(curBytes);
+				(*data)[index] = exactData + medianValue;
+				memcpy(preBytes,curBytes,8);
+			}
+		}
+
+		/* Process Row-1 --> Row-r3-1 */
+		for (ii = 1; ii < r3; ii++)
+		{
+			/* Process row-ii data 0 */
+			index = ll*r234+ii*r4;
+
+			pred1D = (*data)[index-r4];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				// compute resiBits
+				resiBits = 0;
+				if (resiBitsLength != 0) {
+					int kMod8 = k % 8;
+					int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+					if (rightMovSteps > 0) {
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+					} else if (rightMovSteps < 0) {
+						int code1 = getLeftMovingCode(kMod8);
+						int code2 = getRightMovingCode(kMod8, resiBitsLength);
+						int leftMovSteps = -rightMovSteps;
+						rightMovSteps = 8 - leftMovSteps;
+						resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+						p++;
+						resiBits = resiBits
+								| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+					} else // rightMovSteps == 0
+					{
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code);
+						p++;
+					}
+					k += resiBitsLength;
+				}
+
+				// recover the exact data
+				memset(curBytes, 0, 8);
+				leadingNum = leadNum[l++];
+				memcpy(curBytes, preBytes, leadingNum);
+				for (j = leadingNum; j < reqBytesLength; j++)
+					curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+				if (resiBitsLength != 0) {
+					unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+					curBytes[reqBytesLength] = resiByte;
+				}
+
+				exactData = bytesToDouble(curBytes);
+				(*data)[index] = exactData + medianValue;
+				memcpy(preBytes,curBytes,8);
+			}
+
+			/* Process row-ii data 1 --> r4-1*/
+			for (jj = 1; jj < r4; jj++)
+			{
+				index = ll*r234+ii*r4+jj;
+
+				pred2D = (*data)[index-1] + (*data)[index-r4] - (*data)[index-r4-1];
+
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					// compute resiBits
+					resiBits = 0;
+					if (resiBitsLength != 0) {
+						int kMod8 = k % 8;
+						int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+						if (rightMovSteps > 0) {
+							int code = getRightMovingCode(kMod8, resiBitsLength);
+							resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+						} else if (rightMovSteps < 0) {
+							int code1 = getLeftMovingCode(kMod8);
+							int code2 = getRightMovingCode(kMod8, resiBitsLength);
+							int leftMovSteps = -rightMovSteps;
+							rightMovSteps = 8 - leftMovSteps;
+							resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+							p++;
+							resiBits = resiBits
+									| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+						} else // rightMovSteps == 0
+						{
+							int code = getRightMovingCode(kMod8, resiBitsLength);
+							resiBits = (tdps->residualMidBits[p] & code);
+							p++;
+						}
+						k += resiBitsLength;
+					}
+
+					// recover the exact data
+					memset(curBytes, 0, 8);
+					leadingNum = leadNum[l++];
+					memcpy(curBytes, preBytes, leadingNum);
+					for (j = leadingNum; j < reqBytesLength; j++)
+						curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+					if (resiBitsLength != 0) {
+						unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+						curBytes[reqBytesLength] = resiByte;
+					}
+
+					exactData = bytesToDouble(curBytes);
+					(*data)[index] = exactData + medianValue;
+					memcpy(preBytes,curBytes,8);
+				}
+			}
+		}
+
+		///////////////////////////	Process layer-1 --> layer-r2-1 ///////////////////////////
+
+		for (kk = 1; kk < r2; kk++)
+		{
+			/* Process Row-0 data 0*/
+			index = ll*r234+kk*r34;
+
+			pred1D = (*data)[index-r34];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				// compute resiBits
+				resiBits = 0;
+				if (resiBitsLength != 0) {
+					int kMod8 = k % 8;
+					int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+					if (rightMovSteps > 0) {
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+					} else if (rightMovSteps < 0) {
+						int code1 = getLeftMovingCode(kMod8);
+						int code2 = getRightMovingCode(kMod8, resiBitsLength);
+						int leftMovSteps = -rightMovSteps;
+						rightMovSteps = 8 - leftMovSteps;
+						resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+						p++;
+						resiBits = resiBits
+								| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+					} else // rightMovSteps == 0
+					{
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code);
+						p++;
+					}
+					k += resiBitsLength;
+				}
+
+				// recover the exact data
+				memset(curBytes, 0, 8);
+				leadingNum = leadNum[l++];
+				memcpy(curBytes, preBytes, leadingNum);
+				for (j = leadingNum; j < reqBytesLength; j++)
+					curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+				if (resiBitsLength != 0) {
+					unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+					curBytes[reqBytesLength] = resiByte;
+				}
+
+				exactData = bytesToDouble(curBytes);
+				(*data)[index] = exactData + medianValue;
+				memcpy(preBytes,curBytes,8);
+			}
+
+			/* Process Row-0 data 1 --> data r4-1 */
+			for (jj = 1; jj < r4; jj++)
+			{
+				index = ll*r234+kk*r34+jj;
+
+				pred2D = (*data)[index-1] + (*data)[index-r34] - (*data)[index-r34-1];
+
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					// compute resiBits
+					resiBits = 0;
+					if (resiBitsLength != 0) {
+						int kMod8 = k % 8;
+						int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+						if (rightMovSteps > 0) {
+							int code = getRightMovingCode(kMod8, resiBitsLength);
+							resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+						} else if (rightMovSteps < 0) {
+							int code1 = getLeftMovingCode(kMod8);
+							int code2 = getRightMovingCode(kMod8, resiBitsLength);
+							int leftMovSteps = -rightMovSteps;
+							rightMovSteps = 8 - leftMovSteps;
+							resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+							p++;
+							resiBits = resiBits
+									| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+						} else // rightMovSteps == 0
+						{
+							int code = getRightMovingCode(kMod8, resiBitsLength);
+							resiBits = (tdps->residualMidBits[p] & code);
+							p++;
+						}
+						k += resiBitsLength;
+					}
+
+					// recover the exact data
+					memset(curBytes, 0, 8);
+					leadingNum = leadNum[l++];
+					memcpy(curBytes, preBytes, leadingNum);
+					for (j = leadingNum; j < reqBytesLength; j++)
+						curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+					if (resiBitsLength != 0) {
+						unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+						curBytes[reqBytesLength] = resiByte;
+					}
+
+					exactData = bytesToDouble(curBytes);
+					(*data)[index] = exactData + medianValue;
+					memcpy(preBytes,curBytes,8);
+				}
+			}
+
+			/* Process Row-1 --> Row-r3-1 */
+			for (ii = 1; ii < r3; ii++)
+			{
+				/* Process Row-i data 0 */
+				index = ll*r234+kk*r34+ii*r4;
+
+				pred2D = (*data)[index-r4] + (*data)[index-r34] - (*data)[index-r34-r4];
+
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					// compute resiBits
+					resiBits = 0;
+					if (resiBitsLength != 0) {
+						int kMod8 = k % 8;
+						int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+						if (rightMovSteps > 0) {
+							int code = getRightMovingCode(kMod8, resiBitsLength);
+							resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+						} else if (rightMovSteps < 0) {
+							int code1 = getLeftMovingCode(kMod8);
+							int code2 = getRightMovingCode(kMod8, resiBitsLength);
+							int leftMovSteps = -rightMovSteps;
+							rightMovSteps = 8 - leftMovSteps;
+							resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+							p++;
+							resiBits = resiBits
+									| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+						} else // rightMovSteps == 0
+						{
+							int code = getRightMovingCode(kMod8, resiBitsLength);
+							resiBits = (tdps->residualMidBits[p] & code);
+							p++;
+						}
+						k += resiBitsLength;
+					}
+
+					// recover the exact data
+					memset(curBytes, 0, 8);
+					leadingNum = leadNum[l++];
+					memcpy(curBytes, preBytes, leadingNum);
+					for (j = leadingNum; j < reqBytesLength; j++)
+						curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+					if (resiBitsLength != 0) {
+						unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+						curBytes[reqBytesLength] = resiByte;
+					}
+
+					exactData = bytesToDouble(curBytes);
+					(*data)[index] = exactData + medianValue;
+					memcpy(preBytes,curBytes,8);
+				}
+
+				/* Process Row-i data 1 --> data r4-1 */
+				for (jj = 1; jj < r4; jj++)
+				{
+					index = ll*r234+kk*r34+ii*r4+jj;
+
+					pred3D = (*data)[index-1] + (*data)[index-r4] + (*data)[index-r34]
+							- (*data)[index-r4-1] - (*data)[index-r34-r4] - (*data)[index-r34-1] + (*data)[index-r34-r4-1];
+
+					type_ = type[index];
+					if (type_ != 0)
+					{
+						(*data)[index] = pred3D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+					}
+					else
+					{
+						// compute resiBits
+						resiBits = 0;
+						if (resiBitsLength != 0) {
+							int kMod8 = k % 8;
+							int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+							if (rightMovSteps > 0) {
+								int code = getRightMovingCode(kMod8, resiBitsLength);
+								resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+							} else if (rightMovSteps < 0) {
+								int code1 = getLeftMovingCode(kMod8);
+								int code2 = getRightMovingCode(kMod8, resiBitsLength);
+								int leftMovSteps = -rightMovSteps;
+								rightMovSteps = 8 - leftMovSteps;
+								resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+								p++;
+								resiBits = resiBits
+										| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+							} else // rightMovSteps == 0
+							{
+								int code = getRightMovingCode(kMod8, resiBitsLength);
+								resiBits = (tdps->residualMidBits[p] & code);
+								p++;
+							}
+							k += resiBitsLength;
+						}
+
+						// recover the exact data
+						memset(curBytes, 0, 8);
+						leadingNum = leadNum[l++];
+						memcpy(curBytes, preBytes, leadingNum);
+						for (j = leadingNum; j < reqBytesLength; j++)
+							curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+						if (resiBitsLength != 0) {
+							unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+							curBytes[reqBytesLength] = resiByte;
+						}
+
+						exactData = bytesToDouble(curBytes);
+						(*data)[index] = exactData + medianValue;
+						memcpy(preBytes,curBytes,8);
+					}
+				}
+			}
+		}
+	}
+
+//I didn't implement time-based compression for 4D actually. 
+//#ifdef HAVE_TIMECMPR	
+//	if(confparams_dec->szMode == SZ_TEMPORAL_COMPRESSION)
+//		memcpy(multisteps->hist_data, (*data), dataSeriesLength*sizeof(double));
+//#endif	
+
+	free(leadNum);
+	free(type);
+	return;
+}
+
+/**
+ * Rebuild a 1D double series from a decoded storage descriptor.
+ *
+ * @param data             out: receives a heap array of dataSeriesLength doubles
+ *                         (presumably released by the caller with free())
+ * @param dataSeriesLength number of elements to produce
+ * @param tdps             storage descriptor holding the encoded streams
+ * @param errBoundMode     modes below PW_REL use the plain decompressor, all
+ *                         other modes use the pwr decompressors
+ */
+void getSnapshotData_double_1D(double** data, size_t dataSeriesLength, TightDataPointStorageD* tdps, int errBoundMode) 
+{
+	size_t i;
+
+	/* Constant series: the single value is kept in exactMidBytes. */
+	if (tdps->allSameData) {
+		double value = bytesToDouble(tdps->exactMidBytes);
+		*data = (double*)malloc(sizeof(double)*dataSeriesLength);
+		for (i = 0; i < dataSeriesLength; i++)
+			(*data)[i] = value;
+		return;
+	}
+
+	/* No reserved-value mask: hand `data` straight to the decompressor. */
+	if (tdps->rtypeArray == NULL) {
+		if (errBoundMode >= PW_REL) {
+			//decompressDataSeries_double_1D_pwr(data, dataSeriesLength, tdps);
+			decompressDataSeries_double_1D_pwrgroup(data, dataSeriesLength, tdps);
+			return;
+		}
+#ifdef HAVE_TIMECMPR
+		/* compressionType 0 is a snapshot; any other value goes through the
+		 * time-step decompressor together with multisteps. */
+		if (confparams_dec->szMode == SZ_TEMPORAL_COMPRESSION && multisteps->compressionType != 0) {
+			decompressDataSeries_double_1D_ts(data, dataSeriesLength, multisteps, tdps);
+			return;
+		}
+#endif
+		decompressDataSeries_double_1D(data, dataSeriesLength, tdps);
+		return;
+	}
+
+	/* Reserved-value mask present: rtypes[i] == 1 marks a slot holding
+	 * tdps->reservedValue, rtypes[i] == 0 a slot taken from the decompressed
+	 * stream (consumed in order). */
+	*data = (double*)malloc(sizeof(double)*dataSeriesLength);
+	int* rtypes;
+	int validLength = computeBitNumRequired(dataSeriesLength);
+	decompressBitArraybySimpleLZ77(&rtypes, tdps->rtypeArray, tdps->rtypeArray_size, dataSeriesLength, validLength);
+
+	// get the decompressed data
+	// NOTE(review): the mask-free path above uses the _pwrgroup variant while
+	// this path uses _pwr -- confirm the asymmetry is intended.
+	double* decmpData;
+	if (errBoundMode < PW_REL)
+		decompressDataSeries_double_1D(&decmpData, dataSeriesLength, tdps);
+	else
+		decompressDataSeries_double_1D_pwr(&decmpData, dataSeriesLength, tdps);
+
+	// interleave reserved values and decompressed values in a single pass
+	size_t k = 0;
+	for (i = 0; i < dataSeriesLength; i++) {
+		if (rtypes[i] == 1)
+			(*data)[i] = tdps->reservedValue;
+		else if (rtypes[i] == 0)
+			(*data)[i] = decmpData[k++];
+	}
+	free(decmpData);
+	free(rtypes);
+}
+
+/**
+ * Rebuild an r1 x r2 double series from a decoded storage descriptor.
+ *
+ * @param data         out: receives a heap array of r1*r2 doubles
+ *                     (presumably released by the caller with free())
+ * @param r1           first dimension passed on to the 2D decompressors
+ * @param r2           second dimension passed on to the 2D decompressors
+ * @param tdps         storage descriptor holding the encoded streams
+ * @param errBoundMode modes below PW_REL use the plain decompressor, all
+ *                     other modes use the pwr decompressor
+ */
+void getSnapshotData_double_2D(double** data, size_t r1, size_t r2, TightDataPointStorageD* tdps, int errBoundMode) 
+{
+	size_t i;
+	size_t dataSeriesLength = r1*r2;
+
+	/* Constant series: the single value is kept in exactMidBytes. */
+	if (tdps->allSameData) {
+		double value = bytesToDouble(tdps->exactMidBytes);
+		*data = (double*)malloc(sizeof(double)*dataSeriesLength);
+		for (i = 0; i < dataSeriesLength; i++)
+			(*data)[i] = value;
+		return;
+	}
+
+	/* No reserved-value mask: hand `data` straight to the decompressor. */
+	if (tdps->rtypeArray == NULL) {
+		if (errBoundMode >= PW_REL) {
+			decompressDataSeries_double_2D_pwr(data, r1, r2, tdps);
+			return;
+		}
+#ifdef HAVE_TIMECMPR
+		/* compressionType 0 is a snapshot; any other value goes through the
+		 * 1D time-step decompressor over the flattened length. */
+		if (confparams_dec->szMode == SZ_TEMPORAL_COMPRESSION && multisteps->compressionType != 0) {
+			decompressDataSeries_double_1D_ts(data, dataSeriesLength, multisteps, tdps);
+			return;
+		}
+#endif
+		decompressDataSeries_double_2D(data, r1, r2, tdps);
+		return;
+	}
+
+	/* Reserved-value mask present: rtypes[i] == 1 marks a slot holding
+	 * tdps->reservedValue, rtypes[i] == 0 a slot taken from the decompressed
+	 * stream (consumed in order). */
+	*data = (double*)malloc(sizeof(double)*dataSeriesLength);
+	int* rtypes;
+	int validLength = computeBitNumRequired(dataSeriesLength);
+	decompressBitArraybySimpleLZ77(&rtypes, tdps->rtypeArray, tdps->rtypeArray_size, dataSeriesLength, validLength);
+
+	// get the decompressed data
+	double* decmpData;
+	if (errBoundMode < PW_REL)
+		decompressDataSeries_double_2D(&decmpData, r1, r2, tdps);
+	else
+		decompressDataSeries_double_2D_pwr(&decmpData, r1, r2, tdps);
+
+	// interleave reserved values and decompressed values in a single pass
+	size_t k = 0;
+	for (i = 0; i < dataSeriesLength; i++) {
+		if (rtypes[i] == 1)
+			(*data)[i] = tdps->reservedValue;
+		else if (rtypes[i] == 0)
+			(*data)[i] = decmpData[k++];
+	}
+	free(decmpData);
+	free(rtypes);
+}
+
+/**
+ * Rebuild an r1 x r2 x r3 double series from a decoded storage descriptor.
+ *
+ * @param data         out: receives a heap array of r1*r2*r3 doubles
+ *                     (presumably released by the caller with free())
+ * @param r1           first dimension passed on to the 3D decompressors
+ * @param r2           second dimension passed on to the 3D decompressors
+ * @param r3           third dimension passed on to the 3D decompressors
+ * @param tdps         storage descriptor holding the encoded streams
+ * @param errBoundMode modes below PW_REL use the plain decompressor, all
+ *                     other modes use the pwr decompressor
+ */
+void getSnapshotData_double_3D(double** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageD* tdps, int errBoundMode) 
+{
+	size_t i;
+	size_t dataSeriesLength = r1*r2*r3;
+
+	/* Constant series: the single value is kept in exactMidBytes. */
+	if (tdps->allSameData) {
+		double value = bytesToDouble(tdps->exactMidBytes);
+		*data = (double*)malloc(sizeof(double)*dataSeriesLength);
+		for (i = 0; i < dataSeriesLength; i++)
+			(*data)[i] = value;
+		return;
+	}
+
+	/* No reserved-value mask: hand `data` straight to the decompressor. */
+	if (tdps->rtypeArray == NULL) {
+		if (errBoundMode >= PW_REL) {
+			decompressDataSeries_double_3D_pwr(data, r1, r2, r3, tdps);
+			return;
+		}
+#ifdef HAVE_TIMECMPR
+		/* compressionType 0 is a snapshot; any other value goes through the
+		 * 1D time-step decompressor over the flattened length. */
+		if (confparams_dec->szMode == SZ_TEMPORAL_COMPRESSION && multisteps->compressionType != 0) {
+			decompressDataSeries_double_1D_ts(data, dataSeriesLength, multisteps, tdps);
+			return;
+		}
+#endif
+		decompressDataSeries_double_3D(data, r1, r2, r3, tdps);
+		return;
+	}
+
+	/* Reserved-value mask present: rtypes[i] == 1 marks a slot holding
+	 * tdps->reservedValue, rtypes[i] == 0 a slot taken from the decompressed
+	 * stream (consumed in order). */
+	*data = (double*)malloc(sizeof(double)*dataSeriesLength);
+	int* rtypes;
+	int validLength = computeBitNumRequired(dataSeriesLength);
+	decompressBitArraybySimpleLZ77(&rtypes, tdps->rtypeArray, tdps->rtypeArray_size, dataSeriesLength, validLength);
+
+	// get the decompressed data
+	double* decmpData;
+	if (errBoundMode < PW_REL)
+		decompressDataSeries_double_3D(&decmpData, r1, r2, r3, tdps);
+	else
+		decompressDataSeries_double_3D_pwr(&decmpData, r1, r2, r3, tdps);
+
+	// interleave reserved values and decompressed values in a single pass
+	size_t k = 0;
+	for (i = 0; i < dataSeriesLength; i++) {
+		if (rtypes[i] == 1)
+			(*data)[i] = tdps->reservedValue;
+		else if (rtypes[i] == 0)
+			(*data)[i] = decmpData[k++];
+	}
+	free(decmpData);
+	free(rtypes);
+}
+
+/**
+ * Rebuild an r1 x r2 x r3 x r4 double series from a decoded storage descriptor.
+ *
+ * For modes at or above PW_REL there is no dedicated 4D pwr decompressor yet;
+ * the 3D pwr decompressor is called with the first two dimensions merged
+ * (r1*r2, r3, r4).
+ *
+ * @param data         out: receives a heap array of r1*r2*r3*r4 doubles
+ *                     (presumably released by the caller with free())
+ * @param r1           first dimension
+ * @param r2           second dimension
+ * @param r3           third dimension
+ * @param r4           fourth dimension
+ * @param tdps         storage descriptor holding the encoded streams
+ * @param errBoundMode modes below PW_REL use the plain 4D decompressor, all
+ *                     other modes use the 3D pwr decompressor as described
+ */
+void getSnapshotData_double_4D(double** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageD* tdps, int errBoundMode)
+{
+	size_t i;
+	size_t dataSeriesLength = r1*r2*r3*r4;
+
+	/* Constant series: the single value is kept in exactMidBytes. */
+	if (tdps->allSameData) {
+		double value = bytesToDouble(tdps->exactMidBytes);
+		*data = (double*)malloc(sizeof(double)*dataSeriesLength);
+		for (i = 0; i < dataSeriesLength; i++)
+			(*data)[i] = value;
+		return;
+	}
+
+	/* No reserved-value mask: hand `data` straight to the decompressor. */
+	if (tdps->rtypeArray == NULL) {
+		if (errBoundMode >= PW_REL) {
+			//ToDO
+			//decompressDataSeries_double_4D_pwr(data, r1, r2, r3, r4, tdps);
+			decompressDataSeries_double_3D_pwr(data, r1*r2, r3, r4, tdps);
+			return;
+		}
+#ifdef HAVE_TIMECMPR
+		/* compressionType 0 selects the plain 4D path; any other value goes
+		 * through the 1D time-step decompressor over the flattened length. */
+		if (confparams_dec->szMode == SZ_TEMPORAL_COMPRESSION && multisteps->compressionType != 0) {
+			decompressDataSeries_double_1D_ts(data, dataSeriesLength, multisteps, tdps);
+			return;
+		}
+#endif
+		decompressDataSeries_double_4D(data, r1, r2, r3, r4, tdps);
+		return;
+	}
+
+	/* Reserved-value mask present: rtypes[i] == 1 marks a slot holding
+	 * tdps->reservedValue, rtypes[i] == 0 a slot taken from the decompressed
+	 * stream (consumed in order). */
+	*data = (double*)malloc(sizeof(double)*dataSeriesLength);
+	int* rtypes;
+	int validLength = computeBitNumRequired(dataSeriesLength);
+	decompressBitArraybySimpleLZ77(&rtypes, tdps->rtypeArray, tdps->rtypeArray_size, dataSeriesLength, validLength);
+
+	// get the decompressed data
+	double* decmpData;
+	if (errBoundMode < PW_REL) {
+		decompressDataSeries_double_4D(&decmpData, r1, r2, r3, r4, tdps);
+	} else {
+		//ToDo
+		//decompressDataSeries_double_4D_pwr(&decmpData, r1, r2, r3, r4, tdps);
+		decompressDataSeries_double_3D_pwr(&decmpData, r1*r2, r3, r4, tdps);
+	}
+
+	// interleave reserved values and decompressed values in a single pass
+	size_t k = 0;
+	for (i = 0; i < dataSeriesLength; i++) {
+		if (rtypes[i] == 1)
+			(*data)[i] = tdps->reservedValue;
+		else if (rtypes[i] == 0)
+			(*data)[i] = decmpData[k++];
+	}
+	free(decmpData);
+	free(rtypes);
+}
diff --git a/thirdparty/SZ/sz/src/szd_double_pwr.c b/thirdparty/SZ/sz/src/szd_double_pwr.c
new file mode 100644
index 0000000000000000000000000000000000000000..f4a6fd861df7f8a1fd5fa231a009a3dda69114b4
--- /dev/null
+++ b/thirdparty/SZ/sz/src/szd_double_pwr.c
@@ -0,0 +1,1350 @@
+/**
+ *  @file szd_double_pwr.c
+ *  @author Sheng Di
+ *  @date May, 2016
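+ *  @brief Decompression of double data series whose precision is stored per segment/block (the *_pwr routines).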
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdlib.h> 
+#include <stdio.h>
+#include <string.h>
+#include "TightDataPointStorageD.h"
+#include "sz.h"
+#include "Huffman.h"
+//#include "rw.h"
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wchar-subscripts"
+
+/**
+ * Decode a 1D double series whose precision changes every tdps->segment_size
+ * elements.
+ *
+ * Allocates *data (dataSeriesLength doubles) with malloc and fills it element
+ * by element. A per-element type code, Huffman-decoded from tdps->typeArray,
+ * selects how each value is rebuilt:
+ *   - type 0: the value is reassembled from leading bytes shared with the
+ *     previous exact value, bytes from tdps->exactMidBytes and bits from
+ *     tdps->residualMidBits, then medianValue is added;
+ *   - any other type: the previous element plus
+ *     (type - exe_params->intvRadius) * interval.
+ *
+ * @param data             out: receives the newly allocated result array
+ * @param dataSeriesLength number of elements to decode
+ * @param tdps             storage descriptor holding the encoded streams
+ */
+void decompressDataSeries_double_1D_pwr(double** data, size_t dataSeriesLength, TightDataPointStorageD* tdps) 
+{
+	updateQuantizationInfo(tdps->intervals);
+	unsigned char tmpPrecBytes[8] = {0}; //used when needing to convert bytes to double values
+	unsigned char* bp = tdps->pwrErrBoundBytes;
+	size_t i, j, k = 0, p = 0, l = 0; // k is to track the location of residual_bit
+								// in resiMidBits, p is to track the
+								// byte_index of resiMidBits, l is for
+								// leadNum
+
+	unsigned char* leadNum;
+	double interval = 0;// = (double)tdps->realPrecision*2;
+	
+	// unpack the leading-byte counts, one entry per exactly stored value
+	convertByteArray2IntArray_fast_2b(tdps->exactDataNum, tdps->leadNumArray, tdps->leadNumArray_size, &leadNum);
+	*data = (double*)malloc(sizeof(double)*dataSeriesLength);
+
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int));
+
+	// decode one type code per element; the tree is only needed for this step
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);	
+	
+	unsigned char preBytes[8];
+	unsigned char curBytes[8];
+	
+	memset(preBytes, 0, 8);
+
+	size_t curByteIndex = 0;
+	int reqLength = 0, reqBytesLength = 0, resiBitsLength = 0, resiBits = 0; 
+	unsigned char leadingNum;	
+	double medianValue, exactData, predValue = 0, realPrecision = 0;
+	
+	medianValue = tdps->medianValue;
+	
+	int type_, updateReqLength = 0;
+	for (i = 0; i < dataSeriesLength; i++) 
+	{
+		// start of a new segment: read its precision, stored as the first two
+		// bytes of an otherwise zeroed 8-byte double image
+		if(i%tdps->segment_size==0)
+		{
+			tmpPrecBytes[0] = *(bp++);
+			tmpPrecBytes[1] = *(bp++);
+			memset(&tmpPrecBytes[2], 0, 6*sizeof(unsigned char));
+
+			realPrecision = bytesToDouble(tmpPrecBytes);
+			interval = realPrecision*2;
+			// force the required length to be recomputed for this segment
+			updateReqLength = 0;
+		}
+		type_ = type[i];
+		switch (type_) {
+		case 0:
+			// compute resiBits
+			// the required bit length is derived lazily, at the first exact
+			// value met in each segment; medianValue is passed by address, so
+			// the helper may overwrite it
+			if(updateReqLength==0)
+			{
+				computeReqLength_double(realPrecision, tdps->radExpo, &reqLength, &medianValue);
+				reqBytesLength = reqLength/8;
+				resiBitsLength = reqLength%8;	
+				updateReqLength = 1;	
+			}
+			resiBits = 0;
+			if (resiBitsLength != 0) {
+				int kMod8 = k % 8;
+				int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+				if (rightMovSteps > 0) {
+					// residual bits sit entirely inside the current byte
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+				} else if (rightMovSteps < 0) {
+					// residual bits straddle two bytes: combine the tail of
+					// byte p with the head of byte p+1
+					int code1 = getLeftMovingCode(kMod8);
+					int code2 = getRightMovingCode(kMod8, resiBitsLength);
+					int leftMovSteps = -rightMovSteps;
+					rightMovSteps = 8 - leftMovSteps;
+					resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+					p++;
+					resiBits = resiBits
+							| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+				} else // rightMovSteps == 0
+				{
+					// residual bits end exactly on the byte boundary
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code);
+					p++;
+				}
+				k += resiBitsLength;
+			}
+
+			// recover the exact data
+			// the first leadingNum bytes are copied from the previous exact
+			// value, the remaining whole bytes come from exactMidBytes
+			memset(curBytes, 0, 8);
+			leadingNum = leadNum[l++];
+			memcpy(curBytes, preBytes, leadingNum);
+			for (j = leadingNum; j < reqBytesLength; j++)
+				curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+			if (resiBitsLength != 0) {
+				// residual bits occupy the high end of the next byte
+				unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+				curBytes[reqBytesLength] = resiByte;
+			}
+			
+			exactData = bytesToDouble(curBytes);
+			(*data)[i] = exactData + medianValue;
+			memcpy(preBytes,curBytes,8);
+			break;
+		default:
+			//predValue = 2 * (*data)[i-1] - (*data)[i-2];
+			// NOTE(review): reads (*data)[i-1]; this relies on type[0] being 0
+			// so the branch is never taken at i == 0 -- confirm the encoder
+			// guarantees that.
+			predValue = (*data)[i-1];
+			(*data)[i] = predValue + (type_-exe_params->intvRadius)*interval;
+			break;
+		}
+		//printf("%.30G\n",(*data)[i]);
+	}
+	free(leadNum);
+	free(type);
+	return;
+}
+
+/**
+ * Expand the two-byte precision codes in tdps->pwrErrBoundBytes into doubles.
+ *
+ * Each code supplies the first two bytes of an 8-byte image whose remaining
+ * bytes are zero; the image is converted with bytesToDouble. Codes are read
+ * sequentially and written row by row into an R1 x R2 array.
+ *
+ * @param R1        number of rows
+ * @param R2        number of columns
+ * @param blockSize not used by this function
+ * @param tdps      storage descriptor providing pwrErrBoundBytes
+ * @return malloc'ed array of R1*R2 doubles (presumably freed by the caller)
+ */
+double* extractRealPrecision_2D_double(size_t R1, size_t R2, int blockSize, TightDataPointStorageD* tdps)
+{
+	size_t row, col;
+	const unsigned char* src = tdps->pwrErrBoundBytes;
+	unsigned char widened[8] = {0};
+	double* precisions = (double*)malloc(sizeof(double)*R1*R2);
+	double* out = precisions;
+
+	for (row = 0; row < R1; row++)
+	{
+		for (col = 0; col < R2; col++)
+		{
+			/* only the two leading bytes vary; the other six stay zero */
+			widened[0] = src[0];
+			widened[1] = src[1];
+			src += 2;
+			*out++ = bytesToDouble(widened);
+		}
+	}
+	return precisions;
+}
+
+void decompressDataSeries_double_2D_pwr(double** data, size_t r1, size_t r2, TightDataPointStorageD* tdps) 
+{
+	updateQuantizationInfo(tdps->intervals);
+	//printf("tdps->intervals=%d, exe_params->intvRadius=%d\n", tdps->intervals, exe_params->intvRadius);
+	
+	size_t j, k = 0, p = 0, l = 0; // k is to track the location of residual_bit
+	// in resiMidBits, p is to track the
+	// byte_index of resiMidBits, l is for
+	// leadNum
+	size_t dataSeriesLength = r1*r2;
+	//	printf ("%d %d\n", r1, r2);
+
+	unsigned char* leadNum;
+
+	convertByteArray2IntArray_fast_2b(tdps->exactDataNum, tdps->leadNumArray, tdps->leadNumArray_size, &leadNum);
+
+	*data = (double*)malloc(sizeof(double)*dataSeriesLength);
+
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int));
+
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);	
+
+	unsigned char preBytes[8];
+	unsigned char curBytes[8];
+
+	memset(preBytes, 0, 8);
+
+	size_t curByteIndex = 0;
+	int reqLength, reqBytesLength, resiBitsLength, resiBits; 
+	unsigned char leadingNum;	
+	double medianValue, exactData, realPrecision;
+	int type_;
+	double pred1D, pred2D;
+	size_t ii, jj, II = 0, JJ = 0, updateReqLength = 1;
+
+	int blockSize = computeBlockEdgeSize_2D(tdps->segment_size);
+	size_t R1 = 1+(r1-1)/blockSize;
+	size_t R2 = 1+(r2-1)/blockSize;		
+	double* pwrErrBound = extractRealPrecision_2D_double(R1, R2, blockSize, tdps);
+
+	realPrecision = pwrErrBound[0];	
+	computeReqLength_double(realPrecision, tdps->radExpo, &reqLength, &medianValue);
+	reqBytesLength = reqLength/8;
+	resiBitsLength = reqLength%8;
+
+	/* Process Row-0, data 0 */
+
+	// compute resiBits
+	resiBits = 0;
+	if (resiBitsLength != 0) {
+		int kMod8 = k % 8;
+		int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+		if (rightMovSteps > 0) {
+			int code = getRightMovingCode(kMod8, resiBitsLength);
+			resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+		} else if (rightMovSteps < 0) {
+			int code1 = getLeftMovingCode(kMod8);
+			int code2 = getRightMovingCode(kMod8, resiBitsLength);
+			int leftMovSteps = -rightMovSteps;
+			rightMovSteps = 8 - leftMovSteps;
+			resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+			p++;
+			resiBits = resiBits
+					| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+		} else // rightMovSteps == 0
+		{
+			int code = getRightMovingCode(kMod8, resiBitsLength);
+			resiBits = (tdps->residualMidBits[p] & code);
+			p++;
+		}
+		k += resiBitsLength;
+	}
+
+	// recover the exact data
+	memset(curBytes, 0, 8);
+	leadingNum = leadNum[l++];
+	memcpy(curBytes, preBytes, leadingNum);
+	for (j = leadingNum; j < reqBytesLength; j++)
+		curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+	if (resiBitsLength != 0) {
+		unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+		curBytes[reqBytesLength] = resiByte;
+	}
+
+	exactData = bytesToDouble(curBytes);
+	(*data)[0] = exactData + medianValue;
+	memcpy(preBytes,curBytes,8);
+
+	/* Process Row-0, data 1 */
+	type_ = type[1]; 
+	if (type_ != 0)
+	{
+		pred1D = (*data)[0];		
+		(*data)[1] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+	}
+	else
+	{
+		// compute resiBits
+		resiBits = 0;
+		if (resiBitsLength != 0) {
+			int kMod8 = k % 8;
+			int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+			if (rightMovSteps > 0) {
+				int code = getRightMovingCode(kMod8, resiBitsLength);
+				resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+			} else if (rightMovSteps < 0) {
+				int code1 = getLeftMovingCode(kMod8);
+				int code2 = getRightMovingCode(kMod8, resiBitsLength);
+				int leftMovSteps = -rightMovSteps;
+				rightMovSteps = 8 - leftMovSteps;
+				resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+				p++;
+				resiBits = resiBits
+						| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+			} else // rightMovSteps == 0
+			{
+				int code = getRightMovingCode(kMod8, resiBitsLength);
+				resiBits = (tdps->residualMidBits[p] & code);
+				p++;
+			}
+			k += resiBitsLength;
+		}
+
+		// recover the exact data
+		memset(curBytes, 0, 8);
+		leadingNum = leadNum[l++];
+		memcpy(curBytes, preBytes, leadingNum);
+		for (j = leadingNum; j < reqBytesLength; j++)
+			curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+		if (resiBitsLength != 0) {
+			unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+			curBytes[reqBytesLength] = resiByte;
+		}
+		
+		exactData = bytesToDouble(curBytes);
+		(*data)[1] = exactData + medianValue;
+		memcpy(preBytes,curBytes,8);
+	}
+
+	/* Process Row-0, data 2 --> data r2-1 */
+	for (jj = 2; jj < r2; jj++)
+	{
+		if(jj%blockSize==0)
+		{
+			II = 0;
+			JJ++;
+			realPrecision = pwrErrBound[JJ];
+			updateReqLength = 0;			
+		}		
+		
+		type_ = type[jj];
+		if (type_ != 0)
+		{			
+			pred1D = 2*(*data)[jj-1] - (*data)[jj-2];
+			(*data)[jj] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			if(updateReqLength==0)
+			{
+				computeReqLength_double(realPrecision, tdps->radExpo, &reqLength, &medianValue);
+				reqBytesLength = reqLength/8;
+				resiBitsLength = reqLength%8;				
+				updateReqLength = 1;
+			}			
+			// compute resiBits
+			resiBits = 0;
+			if (resiBitsLength != 0) {
+				int kMod8 = k % 8;
+				int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+				if (rightMovSteps > 0) {
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+				} else if (rightMovSteps < 0) {
+					int code1 = getLeftMovingCode(kMod8);
+					int code2 = getRightMovingCode(kMod8, resiBitsLength);
+					int leftMovSteps = -rightMovSteps;
+					rightMovSteps = 8 - leftMovSteps;
+					resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+					p++;
+					resiBits = resiBits
+							| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+				} else // rightMovSteps == 0
+				{
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code);
+					p++;
+				}
+				k += resiBitsLength;
+			}
+
+			// recover the exact data
+			memset(curBytes, 0, 8);
+			leadingNum = leadNum[l++];
+			memcpy(curBytes, preBytes, leadingNum);
+			for (j = leadingNum; j < reqBytesLength; j++)
+				curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+			if (resiBitsLength != 0) {
+				unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+				curBytes[reqBytesLength] = resiByte;
+			}
+
+			exactData = bytesToDouble(curBytes);
+			(*data)[jj] = exactData + medianValue;
+			memcpy(preBytes,curBytes,8);
+		}
+	}
+
+	size_t index;
+	/* Process Row-1 --> Row-r1-1 */
+	for (ii = 1; ii < r1; ii++)
+	{
+		/* Process row-ii data 0 */
+		if(ii%blockSize==0)
+			II++;
+		JJ = 0;
+		realPrecision = pwrErrBound[II*R2+JJ];				
+		updateReqLength = 0;
+
+		index = ii*r2;
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			pred1D = (*data)[index-r2];
+			(*data)[index] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			// compute resiBits
+			if(updateReqLength==0)
+			{
+				computeReqLength_double(realPrecision, tdps->radExpo, &reqLength, &medianValue);
+				reqBytesLength = reqLength/8;
+				resiBitsLength = reqLength%8;				
+				updateReqLength = 1;
+			}
+			
+			resiBits = 0;
+			if (resiBitsLength != 0) {
+				int kMod8 = k % 8;
+				int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+				if (rightMovSteps > 0) {
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+				} else if (rightMovSteps < 0) {
+					int code1 = getLeftMovingCode(kMod8);
+					int code2 = getRightMovingCode(kMod8, resiBitsLength);
+					int leftMovSteps = -rightMovSteps;
+					rightMovSteps = 8 - leftMovSteps;
+					resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+					p++;
+					resiBits = resiBits
+							| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+				} else // rightMovSteps == 0
+				{
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code);
+					p++;
+				}
+				k += resiBitsLength;
+			}
+
+			// recover the exact data
+			memset(curBytes, 0, 8);
+			leadingNum = leadNum[l++];
+			memcpy(curBytes, preBytes, leadingNum);
+			for (j = leadingNum; j < reqBytesLength; j++)
+				curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+			if (resiBitsLength != 0) {
+				unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+				curBytes[reqBytesLength] = resiByte;
+			}
+
+			exactData = bytesToDouble(curBytes);
+			(*data)[index] = exactData + medianValue;
+			memcpy(preBytes,curBytes,8);
+		}
+
+		/* Process row-ii data 1 --> r2-1*/
+		for (jj = 1; jj < r2; jj++)
+		{
+			index = ii*r2+jj;
+
+			if(jj%blockSize==0)
+				JJ++;
+			realPrecision = pwrErrBound[II*R2+JJ];			
+			updateReqLength = 0;
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				pred2D = (*data)[index-1] + (*data)[index-r2] - (*data)[index-r2-1];
+				(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				// compute resiBits
+				if(updateReqLength==0)
+				{
+					computeReqLength_double(realPrecision, tdps->radExpo, &reqLength, &medianValue);
+					reqBytesLength = reqLength/8;
+					resiBitsLength = reqLength%8;				
+					updateReqLength = 1;
+				}						
+				
+				resiBits = 0;
+				if (resiBitsLength != 0) {
+					int kMod8 = k % 8;
+					int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+					if (rightMovSteps > 0) {
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+					} else if (rightMovSteps < 0) {
+						int code1 = getLeftMovingCode(kMod8);
+						int code2 = getRightMovingCode(kMod8, resiBitsLength);
+						int leftMovSteps = -rightMovSteps;
+						rightMovSteps = 8 - leftMovSteps;
+						resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+						p++;
+						resiBits = resiBits
+								| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+					} else // rightMovSteps == 0
+					{
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code);
+						p++;
+					}
+					k += resiBitsLength;
+				}
+
+				// recover the exact data
+				memset(curBytes, 0, 8);
+				leadingNum = leadNum[l++];
+				memcpy(curBytes, preBytes, leadingNum);
+				for (j = leadingNum; j < reqBytesLength; j++)
+					curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+				if (resiBitsLength != 0) {
+					unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+					curBytes[reqBytesLength] = resiByte;
+				}
+
+				exactData = bytesToDouble(curBytes);
+				(*data)[index] = exactData + medianValue;
+				memcpy(preBytes,curBytes,8);
+			}
+		}
+	}
+
+	free(pwrErrBound);
+	free(leadNum);
+	free(type);
+	return;
+}
+
+/*
+ * Decode the R1*R2*R3 per-block error bounds of a 3D dataset from tdps->pwrErrBoundBytes.
+ * Each bound occupies 2 bytes in the stream; they become the first two bytes of an otherwise
+ * zero 8-byte buffer handed to bytesToDouble(). blockSize is unused (kept for existing callers).
+ * Returns a malloc'ed array of R1*R2*R3 doubles in (R1, R2, R3) row-major order; the caller frees it.
+ */
+double* extractRealPrecision_3D_double(size_t R1, size_t R2, size_t R3, int blockSize, TightDataPointStorageD* tdps)
+{
+	size_t idx, p = 0;
+	size_t nbBounds = R1*R2*R3;
+	unsigned char* bytes = tdps->pwrErrBoundBytes;
+	// bytesToDouble() is given 8-byte buffers elsewhere in this file, so the scratch buffer must be
+	// 8 bytes as well; with the former 4-byte array the conversion presumably read past its end.
+	unsigned char tmpBytes[8] = {0};
+	double* result = (double*)malloc(sizeof(double)*nbBounds);
+	for(idx=0;idx<nbBounds;idx++)
+	{
+		tmpBytes[0] = bytes[p++];
+		tmpBytes[1] = bytes[p++];
+		result[idx]=bytesToDouble(tmpBytes);
+	}
+	return result;
+}
+
+void decompressDataSeries_double_3D_pwr(double** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageD* tdps) 
+{
+	updateQuantizationInfo(tdps->intervals);
+	size_t j, k = 0, p = 0, l = 0; // k is to track the location of residual_bit
+	// in resiMidBits, p is to track the
+	// byte_index of resiMidBits, l is for
+	// leadNum
+	size_t dataSeriesLength = r1*r2*r3;
+	size_t r23 = r2*r3;
+	// Rebuilds the r1*r2*r3 doubles of a 3D dataset into a newly malloc'ed *data (left to the caller to free),
+	// using one error bound per block of blockSize points along each dimension (pwrErrBound).
+	unsigned char* leadNum;
+	// leadNum[]: for each explicitly stored value, how many leading bytes it copies from the previous one.
+	convertByteArray2IntArray_fast_2b(tdps->exactDataNum, tdps->leadNumArray, tdps->leadNumArray_size, &leadNum);
+
+	*data = (double*)malloc(sizeof(double)*dataSeriesLength);
+	// type[]: one decoded code per point; 0 = value stored explicitly, otherwise a quantization code.
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int));
+
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);	
+	// preBytes holds the byte pattern of the most recent explicitly stored value.
+	unsigned char preBytes[8];
+	unsigned char curBytes[8];
+
+	memset(preBytes, 0, 8);
+
+	size_t curByteIndex = 0;
+	int reqLength, reqBytesLength, resiBitsLength, resiBits; 
+	unsigned char leadingNum;
+	double medianValue, exactData, realPrecision;
+	int type_;
+	double pred1D, pred2D, pred3D;
+	size_t ii, jj, kk, II = 0, JJ = 0, KK = 0, updateReqLength = 1;
+	// R1/R2/R3: number of blocks along each dimension; pwrErrBound holds one bound per block.
+	int blockSize = computeBlockEdgeSize_3D(tdps->segment_size);
+	size_t R1 = 1+(r1-1)/blockSize;
+	size_t R2 = 1+(r2-1)/blockSize;		
+	size_t R3 = 1+(r3-1)/blockSize;
+	size_t R23 = R2*R3;
+	double* pwrErrBound = extractRealPrecision_3D_double(R1, R2, R3, blockSize, tdps);
+
+	realPrecision = pwrErrBound[0];	
+	computeReqLength_double(realPrecision, tdps->radExpo, &reqLength, &medianValue);
+	reqBytesLength = reqLength/8;
+	resiBitsLength = reqLength%8;
+
+	///////////////////////////	Process layer-0 ///////////////////////////
+	/* Process Row-0 data 0*/
+	// compute resiBits
+	resiBits = 0;
+	if (resiBitsLength != 0) {
+		int kMod8 = k % 8;
+		int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+		if (rightMovSteps > 0) {
+			int code = getRightMovingCode(kMod8, resiBitsLength);
+			resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+		} else if (rightMovSteps < 0) {
+			int code1 = getLeftMovingCode(kMod8);
+			int code2 = getRightMovingCode(kMod8, resiBitsLength);
+			int leftMovSteps = -rightMovSteps;
+			rightMovSteps = 8 - leftMovSteps;
+			resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+			p++;
+			resiBits = resiBits
+					| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+		} else // rightMovSteps == 0
+		{
+			int code = getRightMovingCode(kMod8, resiBitsLength);
+			resiBits = (tdps->residualMidBits[p] & code);
+			p++;
+		}
+		k += resiBitsLength;
+	}
+
+	// recover the exact data
+	memset(curBytes, 0, 8);
+	leadingNum = leadNum[l++];
+	memcpy(curBytes, preBytes, leadingNum);
+	for (j = leadingNum; j < reqBytesLength; j++)
+		curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+	if (resiBitsLength != 0) {
+		unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+		curBytes[reqBytesLength] = resiByte;
+	}
+
+	exactData = bytesToDouble(curBytes);
+	(*data)[0] = exactData + medianValue;
+	memcpy(preBytes,curBytes,8);
+
+	/* Process Row-0, data 1 */
+	pred1D = (*data)[0];
+
+	type_ = type[1];
+	if (type_ != 0)
+	{
+		(*data)[1] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+	}
+	else
+	{
+		// compute resiBits
+		resiBits = 0;
+		if (resiBitsLength != 0) {
+			int kMod8 = k % 8;
+			int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+			if (rightMovSteps > 0) {
+				int code = getRightMovingCode(kMod8, resiBitsLength);
+				resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+			} else if (rightMovSteps < 0) {
+				int code1 = getLeftMovingCode(kMod8);
+				int code2 = getRightMovingCode(kMod8, resiBitsLength);
+				int leftMovSteps = -rightMovSteps;
+				rightMovSteps = 8 - leftMovSteps;
+				resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+				p++;
+				resiBits = resiBits
+						| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+			} else // rightMovSteps == 0
+			{
+				int code = getRightMovingCode(kMod8, resiBitsLength);
+				resiBits = (tdps->residualMidBits[p] & code);
+				p++;
+			}
+			k += resiBitsLength;
+		}
+
+		// recover the exact data
+		memset(curBytes, 0, 8);
+		leadingNum = leadNum[l++];
+		memcpy(curBytes, preBytes, leadingNum);
+		for (j = leadingNum; j < reqBytesLength; j++)
+			curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+		if (resiBitsLength != 0) {
+			unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+			curBytes[reqBytesLength] = resiByte;
+		}
+
+		exactData = bytesToDouble(curBytes);
+		(*data)[1] = exactData + medianValue;
+		memcpy(preBytes,curBytes,8);
+	}
+
+	/* Process Row-0, data 2 --> data r3-1 */
+	for (jj = 2; jj < r3; jj++)
+	{
+		if(jj%blockSize==0)
+		{
+			KK = 0;//dimension 1 (top)
+			II = 0;//dimension 2 (mid)
+			JJ++;
+			realPrecision = pwrErrBound[JJ];
+			updateReqLength = 0;			// bound changed: reqLength is recomputed before the next explicit value is read
+		}		
+		type_ = type[jj];
+		if (type_ != 0)
+		{
+			pred1D = 2*(*data)[jj-1] - (*data)[jj-2];
+			(*data)[jj] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			// compute resiBits
+			if(updateReqLength==0)
+			{
+				computeReqLength_double(realPrecision, tdps->radExpo, &reqLength, &medianValue);
+				reqBytesLength = reqLength/8;
+				resiBitsLength = reqLength%8;				
+				updateReqLength = 1;
+			}
+
+			resiBits = 0;
+			if (resiBitsLength != 0) {
+				int kMod8 = k % 8;
+				int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+				if (rightMovSteps > 0) {
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+				} else if (rightMovSteps < 0) {
+					int code1 = getLeftMovingCode(kMod8);
+					int code2 = getRightMovingCode(kMod8, resiBitsLength);
+					int leftMovSteps = -rightMovSteps;
+					rightMovSteps = 8 - leftMovSteps;
+					resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+					p++;
+					resiBits = resiBits
+							| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+				} else // rightMovSteps == 0
+				{
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code);
+					p++;
+				}
+				k += resiBitsLength;
+			}
+
+			// recover the exact data
+			memset(curBytes, 0, 8);
+			leadingNum = leadNum[l++];
+			memcpy(curBytes, preBytes, leadingNum);
+			for (j = leadingNum; j < reqBytesLength; j++)
+				curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+			if (resiBitsLength != 0) {
+				unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+				curBytes[reqBytesLength] = resiByte;
+			}
+
+			exactData = bytesToDouble(curBytes);
+			(*data)[jj] = exactData + medianValue;
+			memcpy(preBytes,curBytes,8);
+		}
+	}
+
+	size_t index;
+	/* Process Row-1 --> Row-r2-1 */
+	for (ii = 1; ii < r2; ii++)
+	{
+		/* Process row-ii data 0 */		
+		if(ii%blockSize==0)
+			II++;		
+		JJ = 0;
+		realPrecision = pwrErrBound[II*R3+JJ];
+		updateReqLength = 0;		
+
+		index = ii*r3;
+		
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			pred1D = (*data)[index-r3];			
+			(*data)[index] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			// compute resiBits
+			if(updateReqLength==0)
+			{
+				computeReqLength_double(realPrecision, tdps->radExpo, &reqLength, &medianValue);
+				reqBytesLength = reqLength/8;
+				resiBitsLength = reqLength%8;				
+				updateReqLength = 1;
+			}
+			
+			resiBits = 0;
+			if (resiBitsLength != 0) {
+				int kMod8 = k % 8;
+				int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+				if (rightMovSteps > 0) {
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+				} else if (rightMovSteps < 0) {
+					int code1 = getLeftMovingCode(kMod8);
+					int code2 = getRightMovingCode(kMod8, resiBitsLength);
+					int leftMovSteps = -rightMovSteps;
+					rightMovSteps = 8 - leftMovSteps;
+					resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+					p++;
+					resiBits = resiBits
+							| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+				} else // rightMovSteps == 0
+				{
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code);
+					p++;
+				}
+				k += resiBitsLength;
+			}
+
+			// recover the exact data
+			memset(curBytes, 0, 8);
+			leadingNum = leadNum[l++];
+			memcpy(curBytes, preBytes, leadingNum);
+			for (j = leadingNum; j < reqBytesLength; j++)
+				curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+			if (resiBitsLength != 0) {
+				unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+				curBytes[reqBytesLength] = resiByte;
+			}
+
+			exactData = bytesToDouble(curBytes);
+			(*data)[index] = exactData + medianValue;
+			memcpy(preBytes,curBytes,8);
+		}
+
+		/* Process row-ii data 1 --> r3-1*/
+		for (jj = 1; jj < r3; jj++)
+		{
+			index = ii*r3+jj;
+
+			if(jj%blockSize==0)
+				JJ++;
+			realPrecision = pwrErrBound[II*R3+JJ];			
+			updateReqLength = 0;			
+			
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				pred2D = (*data)[index-1] + (*data)[index-r3] - (*data)[index-r3-1];				
+				(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				// compute resiBits
+				if(updateReqLength==0)
+				{
+					computeReqLength_double(realPrecision, tdps->radExpo, &reqLength, &medianValue);
+					reqBytesLength = reqLength/8;
+					resiBitsLength = reqLength%8;				
+					updateReqLength = 1;
+				}
+
+				resiBits = 0;
+				if (resiBitsLength != 0) {
+					int kMod8 = k % 8;
+					int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+					if (rightMovSteps > 0) {
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+					} else if (rightMovSteps < 0) {
+						int code1 = getLeftMovingCode(kMod8);
+						int code2 = getRightMovingCode(kMod8, resiBitsLength);
+						int leftMovSteps = -rightMovSteps;
+						rightMovSteps = 8 - leftMovSteps;
+						resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+						p++;
+						resiBits = resiBits
+								| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+					} else // rightMovSteps == 0
+					{
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code);
+						p++;
+					}
+					k += resiBitsLength;
+				}
+
+				// recover the exact data
+				memset(curBytes, 0, 8);
+				leadingNum = leadNum[l++];
+				memcpy(curBytes, preBytes, leadingNum);
+				for (j = leadingNum; j < reqBytesLength; j++)
+					curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+				if (resiBitsLength != 0) {
+					unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+					curBytes[reqBytesLength] = resiByte;
+				}
+
+				exactData = bytesToDouble(curBytes);
+				(*data)[index] = exactData + medianValue;
+				memcpy(preBytes,curBytes,8);
+			}
+		}
+	}
+
+	///////////////////////////	Process layer-1 --> layer-r1-1 ///////////////////////////
+	// First point: predicted from the previous layer; first row / first column: 2D prediction; other points: 3D prediction.
+	for (kk = 1; kk < r1; kk++)
+	{
+		/* Process Row-0 data 0*/
+		index = kk*r23;		
+		if(kk%blockSize==0)
+			KK++;
+		II = 0;
+		JJ = 0;
+
+		realPrecision = pwrErrBound[KK*R23];			
+		updateReqLength = 0;			
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			pred1D = (*data)[index-r23];			
+			(*data)[index] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			// compute resiBits
+			if(updateReqLength==0)
+			{
+				computeReqLength_double(realPrecision, tdps->radExpo, &reqLength, &medianValue);
+				reqBytesLength = reqLength/8;
+				resiBitsLength = reqLength%8;				
+				updateReqLength = 1;
+			}
+
+			resiBits = 0;
+			if (resiBitsLength != 0) {
+				int kMod8 = k % 8;
+				int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+				if (rightMovSteps > 0) {
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+				} else if (rightMovSteps < 0) {
+					int code1 = getLeftMovingCode(kMod8);
+					int code2 = getRightMovingCode(kMod8, resiBitsLength);
+					int leftMovSteps = -rightMovSteps;
+					rightMovSteps = 8 - leftMovSteps;
+					resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+					p++;
+					resiBits = resiBits
+							| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+				} else // rightMovSteps == 0
+				{
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code);
+					p++;
+				}
+				k += resiBitsLength;
+			}
+
+			// recover the exact data
+			memset(curBytes, 0, 8);
+			leadingNum = leadNum[l++];
+			memcpy(curBytes, preBytes, leadingNum);
+			for (j = leadingNum; j < reqBytesLength; j++)
+				curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+			if (resiBitsLength != 0) {
+				unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+				curBytes[reqBytesLength] = resiByte;
+			}
+
+			exactData = bytesToDouble(curBytes);
+			(*data)[index] = exactData + medianValue;
+			memcpy(preBytes,curBytes,8);
+		}
+
+		/* Process Row-0 data 1 --> data r3-1 */
+		for (jj = 1; jj < r3; jj++)
+		{
+			index = kk*r23+jj;
+
+			if(jj%blockSize==0)
+				JJ++;
+
+			realPrecision = pwrErrBound[KK*R23+JJ];			
+			updateReqLength = 0;			
+			
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				pred2D = (*data)[index-1] + (*data)[index-r23] - (*data)[index-r23-1];			
+				(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				// compute resiBits
+				if(updateReqLength==0)
+				{
+					computeReqLength_double(realPrecision, tdps->radExpo, &reqLength, &medianValue);
+					reqBytesLength = reqLength/8;
+					resiBitsLength = reqLength%8;				
+					updateReqLength = 1;
+				}
+			
+				resiBits = 0;
+				if (resiBitsLength != 0) {
+					int kMod8 = k % 8;
+					int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+					if (rightMovSteps > 0) {
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+					} else if (rightMovSteps < 0) {
+						int code1 = getLeftMovingCode(kMod8);
+						int code2 = getRightMovingCode(kMod8, resiBitsLength);
+						int leftMovSteps = -rightMovSteps;
+						rightMovSteps = 8 - leftMovSteps;
+						resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+						p++;
+						resiBits = resiBits
+								| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+					} else // rightMovSteps == 0
+					{
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code);
+						p++;
+					}
+					k += resiBitsLength;
+				}
+
+				// recover the exact data
+				memset(curBytes, 0, 8);
+				leadingNum = leadNum[l++];
+				memcpy(curBytes, preBytes, leadingNum);
+				for (j = leadingNum; j < reqBytesLength; j++)
+					curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+				if (resiBitsLength != 0) {
+					unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+					curBytes[reqBytesLength] = resiByte;
+				}
+
+				exactData = bytesToDouble(curBytes);
+				(*data)[index] = exactData + medianValue;
+				memcpy(preBytes,curBytes,8);
+			}
+		}
+
+		/* Process Row-1 --> Row-r2-1 */
+		for (ii = 1; ii < r2; ii++)
+		{
+			/* Process Row-i data 0 */
+			index = kk*r23 + ii*r3;
+			
+			if(ii%blockSize==0)
+				II++;
+			JJ = 0;
+			
+			realPrecision = pwrErrBound[KK*R23+II*R3];			
+			updateReqLength = 0;						
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				pred2D = (*data)[index-r3] + (*data)[index-r23] - (*data)[index-r23-r3];				
+				(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				// compute resiBits
+				if(updateReqLength==0)
+				{
+					computeReqLength_double(realPrecision, tdps->radExpo, &reqLength, &medianValue);
+					reqBytesLength = reqLength/8;
+					resiBitsLength = reqLength%8;				
+					updateReqLength = 1;
+				}
+
+				resiBits = 0;
+				if (resiBitsLength != 0) {
+					int kMod8 = k % 8;
+					int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+					if (rightMovSteps > 0) {
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+					} else if (rightMovSteps < 0) {
+						int code1 = getLeftMovingCode(kMod8);
+						int code2 = getRightMovingCode(kMod8, resiBitsLength);
+						int leftMovSteps = -rightMovSteps;
+						rightMovSteps = 8 - leftMovSteps;
+						resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+						p++;
+						resiBits = resiBits
+								| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+					} else // rightMovSteps == 0
+					{
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code);
+						p++;
+					}
+					k += resiBitsLength;
+				}
+
+				// recover the exact data
+				memset(curBytes, 0, 8);
+				leadingNum = leadNum[l++];
+				memcpy(curBytes, preBytes, leadingNum);
+				for (j = leadingNum; j < reqBytesLength; j++)
+					curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+				if (resiBitsLength != 0) {
+					unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+					curBytes[reqBytesLength] = resiByte;
+				}
+
+				exactData = bytesToDouble(curBytes);
+				(*data)[index] = exactData + medianValue;
+				memcpy(preBytes,curBytes,8);
+			}
+
+			/* Process Row-i data 1 --> data r3-1 */
+			for (jj = 1; jj < r3; jj++)
+			{
+				index = kk*r23 + ii*r3 + jj;
+				if(jj%blockSize==0)
+					JJ++;
+
+				realPrecision = pwrErrBound[KK*R23+II*R3+JJ];			
+				updateReqLength = 0;				
+
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					pred3D = (*data)[index-1] + (*data)[index-r3] + (*data)[index-r23]
+					- (*data)[index-r3-1] - (*data)[index-r23-r3] - (*data)[index-r23-1] + (*data)[index-r23-r3-1];					
+					(*data)[index] = pred3D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					// compute resiBits
+					if(updateReqLength==0)
+					{
+						computeReqLength_double(realPrecision, tdps->radExpo, &reqLength, &medianValue);
+						reqBytesLength = reqLength/8;
+						resiBitsLength = reqLength%8;				
+						updateReqLength = 1;
+					}
+				
+					resiBits = 0;
+					if (resiBitsLength != 0) {
+						int kMod8 = k % 8;
+						int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+						if (rightMovSteps > 0) {
+							int code = getRightMovingCode(kMod8, resiBitsLength);
+							resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+						} else if (rightMovSteps < 0) {
+							int code1 = getLeftMovingCode(kMod8);
+							int code2 = getRightMovingCode(kMod8, resiBitsLength);
+							int leftMovSteps = -rightMovSteps;
+							rightMovSteps = 8 - leftMovSteps;
+							resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+							p++;
+							resiBits = resiBits
+									| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+						} else // rightMovSteps == 0
+						{
+							int code = getRightMovingCode(kMod8, resiBitsLength);
+							resiBits = (tdps->residualMidBits[p] & code);
+							p++;
+						}
+						k += resiBitsLength;
+					}
+
+					// recover the exact data
+					memset(curBytes, 0, 8);
+					leadingNum = leadNum[l++];
+					memcpy(curBytes, preBytes, leadingNum);
+					for (j = leadingNum; j < reqBytesLength; j++)
+						curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+					if (resiBitsLength != 0) {
+						unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+						curBytes[reqBytesLength] = resiByte;
+					}
+
+					exactData = bytesToDouble(curBytes);
+					(*data)[index] = exactData + medianValue;
+					memcpy(preBytes,curBytes,8);
+				}
+			}
+		}
+	}
+	// *data is the result and is deliberately not freed here; only the scratch arrays are released.
+	free(pwrErrBound);
+	free(leadNum);
+	free(type);
+	return;
+}
+
+/*
+ * Decompress a 1D double series in which every point carries a signed group ID
+ * and, unless it is stored explicitly, is rebuilt from the previous value of its group.
+ *
+ * data             : out; *data receives a malloc'ed array of dataSeriesLength doubles (caller frees).
+ * dataSeriesLength : number of points to reconstruct.
+ * tdps             : storage holding the compressed fields read here.
+ *
+ * Side effect: overwrites exe_params->intvRadius.
+ */
+void decompressDataSeries_double_1D_pwrgroup(double** data, size_t dataSeriesLength, TightDataPointStorageD* tdps) 
+{
+	double *posGroups, *negGroups, *groups;
+	// Single-slot "groups" used for IDs +1 / -1. Zero-initialised so that a quantized point arriving
+	// before any explicit point of that group reads a defined value instead of an indeterminate one.
+	double pos_01_group = 0, neg_01_group = 0;
+	int *posFlags, *negFlags;
+
+	updateQuantizationInfo(tdps->intervals);
+	unsigned char* leadNum;
+	double interval;// = (double)tdps->realPrecision*2;
+	convertByteArray2IntArray_fast_2b(tdps->exactDataNum, tdps->leadNumArray, tdps->leadNumArray_size, &leadNum);
+
+	*data = (double*)malloc(sizeof(double)*dataSeriesLength);
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int));
+
+	// Decode one code per point; 0 marks a point whose value is stored explicitly.
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);
+
+	createRangeGroups_double(&posGroups, &negGroups, &posFlags, &negFlags);
+
+	double realGroupPrecision;
+	double realPrecision = tdps->realPrecision;
+	//note that the groupID values here are [1,2,3,....,18] or [-1,-2,...,-18]
+	char* groupID = decompressGroupIDArray(tdps->pwrErrBoundBytes, tdps->dataSeriesLength);
+
+	double* groupErrorBounds = generateGroupErrBounds(confparams_dec->errorBoundMode, realPrecision, confparams_dec->pw_relBoundRatio);
+	exe_params->intvRadius = generateGroupMaxIntervalCount(groupErrorBounds);
+
+	size_t nbBins = (size_t)(1/confparams_dec->pw_relBoundRatio + 0.5);
+	if(nbBins%2==1)
+		nbBins++;
+	exe_params->intvRadius = nbBins; // final value; replaces the one assigned just above
+
+	unsigned char preBytes[8];
+	unsigned char curBytes[8];
+	memset(preBytes, 0, 8);
+
+	size_t curByteIndex = 0;
+	int reqLength, reqBytesLength = 0, resiBitsLength = 0, resiBits;
+	unsigned char leadingNum;
+	double medianValue, exactData, curValue, predValue;
+	medianValue = tdps->medianValue;
+
+	size_t i, j, k = 0, p = 0, l = 0; // k is to track the location of residual_bit
+							// in resiMidBits, p is to track the
+							// byte_index of resiMidBits, l is for
+							// leadNum
+
+	int type_, updateReqLength = 0;
+	// Group IDs are negative for negative values. Plain char is unsigned on some ABIs (e.g. ARM, PowerPC),
+	// which would make the "<= -2" test below unreachable, so the ID is held in an explicitly signed type.
+	signed char rawGrpID = 0;
+	int indexGrpID = 0;
+	for (i = 0; i < dataSeriesLength; i++) 
+	{
+		rawGrpID = (signed char)groupID[i];
+
+		if(rawGrpID >= 2)
+		{
+			groups = posGroups;
+			indexGrpID = rawGrpID - 2;
+		}
+		else if(rawGrpID <= -2)
+		{
+			groups = negGroups;
+			indexGrpID = -rawGrpID - 2;
+		}
+		else if(rawGrpID == 1)
+		{
+			groups = &pos_01_group;
+			indexGrpID = 0;
+		}
+		else //rawGrpID == -1
+		{
+			groups = &neg_01_group;
+			indexGrpID = 0;
+		}
+
+		type_ = type[i];
+		switch (type_) {
+		case 0:
+			// compute resiBits
+			if(updateReqLength==0)
+			{
+				computeReqLength_double(realPrecision, tdps->radExpo, &reqLength, &medianValue);
+				reqBytesLength = reqLength/8;
+				resiBitsLength = reqLength%8;
+				updateReqLength = 1;
+			}
+			resiBits = 0;
+			if (resiBitsLength != 0) {
+				int kMod8 = k % 8;
+				int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+				if (rightMovSteps > 0) {
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+				} else if (rightMovSteps < 0) {
+					int code1 = getLeftMovingCode(kMod8);
+					int code2 = getRightMovingCode(kMod8, resiBitsLength);
+					int leftMovSteps = -rightMovSteps;
+					rightMovSteps = 8 - leftMovSteps;
+					resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+					p++;
+					resiBits = resiBits
+							| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+				} else // rightMovSteps == 0
+				{
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code);
+					p++;
+				}
+				k += resiBitsLength;
+			}
+
+			// recover the exact data
+			memset(curBytes, 0, 8);
+			leadingNum = leadNum[l++];
+			memcpy(curBytes, preBytes, leadingNum);
+			for (j = leadingNum; j < reqBytesLength; j++)
+				curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+			if (resiBitsLength != 0) {
+				unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+				curBytes[reqBytesLength] = resiByte;
+			}
+
+			exactData = bytesToDouble(curBytes);
+			exactData = exactData + medianValue;
+			(*data)[i] = exactData;
+			memcpy(preBytes,curBytes,8);
+
+			groups[indexGrpID] = exactData;
+
+			break;
+		default:
+			predValue = groups[indexGrpID]; //Here, groups[indexGrpID] is the previous value.
+			realGroupPrecision = groupErrorBounds[indexGrpID];
+			interval = realGroupPrecision*2;
+
+			curValue = predValue + (type_-exe_params->intvRadius)*interval;
+
+			// A reconstructed value whose sign contradicts the sign of its group ID is clamped to 0.
+			if((curValue>0&&rawGrpID<0)||(curValue<0&&rawGrpID>0))
+				curValue = 0;
+
+			(*data)[i] = curValue;
+			groups[indexGrpID] = curValue;
+			break;
+		}
+	}
+
+	free(leadNum);
+	free(type);
+
+	free(posGroups);
+	free(negGroups);
+	free(posFlags);
+	free(negFlags);
+	free(groupErrorBounds);
+	free(groupID);
+}
+#pragma GCC diagnostic pop
diff --git a/thirdparty/SZ/sz/src/szd_double_ts.c b/thirdparty/SZ/sz/src/szd_double_ts.c
new file mode 100644
index 0000000000000000000000000000000000000000..50c8c495951bee0398d2eb63a7f4bee31cb91bee
--- /dev/null
+++ b/thirdparty/SZ/sz/src/szd_double_ts.c
@@ -0,0 +1,114 @@
+/**
+ *  @file szd_double_ts.c
+ *  @author Sheng Di and Dingwen Tao
+ *  @date Aug, 2016
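+ *  @brief Decompression of 1D double-precision data that can use the previous snapshot (multisteps->hist_data) as predictor.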
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdlib.h> 
+#include <stdio.h>
+#include <string.h>
+#include "szd_double.h"
+#include "TightDataPointStorageD.h"
+#include "sz.h"
+#include "Huffman.h"
+#include "szd_double_ts.h"
+/* Decode a 1-D double series from tdps into *data; in SZ_TEMPORAL_COMPRESSION mode quantized points are predicted from multisteps->hist_data[i], and hist_data is always overwritten with the result. */
+void decompressDataSeries_double_1D_ts(double** data, size_t dataSeriesLength, sz_multisteps* multisteps, TightDataPointStorageD* tdps) 
+{
+	double* lastSnapshotData = (double*)multisteps->hist_data; // history buffer, read as the predictor below
+	updateQuantizationInfo(tdps->intervals);
+	size_t i, j, k = 0, p = 0, l = 0; // k: running bit offset into
+								// tdps->residualMidBits, p: byte index
+								// into tdps->residualMidBits, l: index
+								// of the next leadNum entry to consume
+	unsigned char* leadNum;
+	double interval = tdps->realPrecision*2; // step between adjacent quantization codes
+	// leadNum is presumably allocated and filled by the callee; it is freed at the end of this function
+	convertByteArray2IntArray_fast_2b(tdps->exactDataNum, tdps->leadNumArray, tdps->leadNumArray_size, &leadNum);
+	*data = (double*)malloc(sizeof(double)*dataSeriesLength); // output array; not freed here (presumably owned by the caller)
+
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int)); // one Huffman-decoded code per element
+	
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);	
+
+	unsigned char preBytes[8]; // bytes of the most recent explicitly stored value
+	unsigned char curBytes[8]; // bytes of the value currently being rebuilt
+	
+	memset(preBytes, 0, 8);
+
+	size_t curByteIndex = 0; // read position in tdps->exactMidBytes
+	int reqBytesLength, resiBitsLength, resiBits; 
+	unsigned char leadingNum;	
+	double medianValue, exactData, predValue = 0; // predValue stays 0 unless temporal mode assigns it
+	
+	reqBytesLength = tdps->reqLength/8; // whole bytes stored per explicit value
+	resiBitsLength = tdps->reqLength%8; // additional bits stored per explicit value
+	medianValue = tdps->medianValue;
+	
+	int type_;
+	for (i = 0; i < dataSeriesLength; i++) {
+		type_ = type[i];
+		switch (type_) {
+		case 0: // code 0: the value was stored explicitly instead of as a quantization code
+			// compute resiBits: extract the next resiBitsLength bits from residualMidBits
+			resiBits = 0;
+			if (resiBitsLength != 0) {
+				int kMod8 = k % 8; // bit offset within a byte
+				int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+				if (rightMovSteps > 0) { // bits are taken from byte p only; p is not advanced
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+				} else if (rightMovSteps < 0) { // bits are assembled from byte p and byte p+1
+					int code1 = getLeftMovingCode(kMod8);
+					int code2 = getRightMovingCode(kMod8, resiBitsLength);
+					int leftMovSteps = -rightMovSteps;
+					rightMovSteps = 8 - leftMovSteps;
+					resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+					p++;
+					resiBits = resiBits
+							| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+				} else // rightMovSteps == 0: masked bits of byte p are used unshifted, then p advances
+				{
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code);
+					p++;
+				}
+				k += resiBitsLength;
+			}
+
+			// recover the exact data: leadingNum bytes come from preBytes, the rest from exactMidBytes plus the residual bits
+			memset(curBytes, 0, 8);
+			leadingNum = leadNum[l++];
+			memcpy(curBytes, preBytes, leadingNum);
+			for (j = leadingNum; j < reqBytesLength; j++)
+				curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+			if (resiBitsLength != 0) {
+				unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength)); // residual bits occupy the high end of the byte
+				curBytes[reqBytesLength] = resiByte;
+			}
+			
+			exactData = bytesToDouble(curBytes);
+			(*data)[i] = exactData + medianValue; // medianValue is added back to the decoded bytes
+			memcpy(preBytes,curBytes,8);
+			break;
+		default: // non-zero code: prediction plus a signed multiple of interval
+			//predValue = (*data)[i-1];
+			if(confparams_dec->szMode == SZ_TEMPORAL_COMPRESSION)
+				predValue = lastSnapshotData[i]; // same index in the history buffer
+			(*data)[i] = predValue + (type_-exe_params->intvRadius)*interval;
+			break;
+		}
+		//printf("%.30G\n",(*data)[i]);
+	}
+	// store the decoded series in the history buffer (hist_data must hold at least dataSeriesLength doubles)
+	memcpy(multisteps->hist_data, (*data), dataSeriesLength*sizeof(double));
+	
+	free(leadNum);
+	free(type);
+	return;
+}
diff --git a/thirdparty/SZ/sz/src/szd_float.c b/thirdparty/SZ/sz/src/szd_float.c
new file mode 100644
index 0000000000000000000000000000000000000000..5a420afeeec036848fe70ba58c3db3a150c3d657
--- /dev/null
+++ b/thirdparty/SZ/sz/src/szd_float.c
@@ -0,0 +1,2179 @@
+/**
+ *  @file szd_float.c
+ *  @author Sheng Di and Dingwen Tao
+ *  @date Aug, 2016
+ *  @brief Decompression routines for single-precision (float) data.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdlib.h> 
+#include <stdio.h>
+#include <string.h>
+#include "szd_float.h"
+#include "TightDataPointStorageF.h"
+#include "sz.h"
+#include "Huffman.h"
+#include "szd_float_pwr.h"
+#include "szd_float_ts.h"
+
+/**
+ * Decompress an SZ-compressed float stream.
+ * r5..r1 are the dimension sizes, passed to computeDataLength()/computeDimension();
+ * at most 4 dimensions are handled. cmpBytes/cmpSize is the compressed input; it is
+ * read but never freed here. In the lossless branch *newData is malloc'ed here,
+ * otherwise it is handed to the getSnapshotData_float_*D helpers (which presumably
+ * allocate it). *newData is left untouched on the two error paths.
+ * @return status SUCCESSFUL (SZ_SCES) or not (other error codes)
+ */
+int SZ_decompress_args_float(float** newData, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, unsigned char* cmpBytes, size_t cmpSize)
+{
+	int status = SZ_SCES;
+	size_t dataLength = computeDataLength(r5,r4,r3,r2,r1);
+	int ownsTmpBytes = 0; //becomes 1 only when szTmpBytes is the buffer produced by zlib_uncompress5
+	
+	size_t targetUncompressSize = dataLength <<2; //i.e., *4
+	//tmpSize must be "much" smaller than dataLength
+	size_t i, tmpSize = 8+MetaDataByteLength+exe_params->SZ_SIZE_TYPE;
+	unsigned char* szTmpBytes;	
+	
+	if(cmpSize!=8+4+MetaDataByteLength && cmpSize!=8+8+MetaDataByteLength) //4,8 means two possibilities of SZ_SIZE_TYPE
+	{
+		int isZlib = isZlibFormat(cmpBytes[0], cmpBytes[1]);
+		if(confparams_dec->szMode!=SZ_TEMPORAL_COMPRESSION) //temporal mode is kept as configured; otherwise the mode is derived from the stream header
+		{
+			if(isZlib)
+				confparams_dec->szMode = SZ_BEST_COMPRESSION;
+			else
+				confparams_dec->szMode = SZ_BEST_SPEED;			
+		}
+		
+		if(confparams_dec->szMode==SZ_BEST_SPEED)
+		{
+			tmpSize = cmpSize;
+			szTmpBytes = cmpBytes; //parsed in place, still owned by the caller
+		}
+		else if(confparams_dec->szMode==SZ_BEST_COMPRESSION || confparams_dec->szMode==SZ_DEFAULT_COMPRESSION || confparams_dec->szMode==SZ_TEMPORAL_COMPRESSION)
+		{
+			if(targetUncompressSize<MIN_ZLIB_DEC_ALLOMEM_BYTES) //Considering the minimum size
+				targetUncompressSize = MIN_ZLIB_DEC_ALLOMEM_BYTES; 
+			tmpSize = zlib_uncompress5(cmpBytes, (unsigned long)cmpSize, &szTmpBytes, (unsigned long)targetUncompressSize+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE);//size passed = payload + 4+MetaDataByteLength+SZ_SIZE_TYPE, the same offset the lossless branch skips below
+			ownsTmpBytes = 1; //szTmpBytes now refers to a buffer that must be released before returning
+		}
+		else
+		{
+			printf("Wrong value of confparams_dec->szMode in the float compressed bytes.\n");
+			status = SZ_MERR;
+			return status;
+		}	
+	}
+	else
+		szTmpBytes = cmpBytes; //stream of exactly 8+{4,8}+MetaDataByteLength bytes is parsed in place
+	//convert szTmpBytes to a TightDataPointStorageF view, then to the data array
+	TightDataPointStorageF* tdps;
+	int errBoundMode = new_TightDataPointStorageF_fromFlatBytes(&tdps, szTmpBytes, tmpSize);
+	int dim = computeDimension(r5,r4,r3,r2,r1);	
+	int floatSize = sizeof(float);
+	if(tdps->isLossless)
+	{
+		*newData = (float*)malloc(floatSize*dataLength);
+		if(sysEndianType==BIG_ENDIAN_SYSTEM)
+		{
+			memcpy(*newData, szTmpBytes+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE, dataLength*floatSize);
+		}
+		else
+		{
+			unsigned char* p = szTmpBytes+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE;
+			for(i=0;i<dataLength;i++,p+=floatSize)
+				(*newData)[i] = bytesToFloat(p);
+		}		
+	}
+	else if (dim == 1)
+		getSnapshotData_float_1D(newData,r1,tdps, errBoundMode);
+	else
+	if (dim == 2)
+		getSnapshotData_float_2D(newData,r2,r1,tdps, errBoundMode);
+	else
+	if (dim == 3)
+		getSnapshotData_float_3D(newData,r3,r2,r1,tdps, errBoundMode);
+	else
+	if (dim == 4)
+		getSnapshotData_float_4D(newData,r4,r3,r2,r1,tdps, errBoundMode);
+	else
+	{
+		printf("Error: currently support only at most 4 dimensions!\n");
+		status = SZ_DERR;
+	}
+	free_TightDataPointStorageF2(tdps);
+	if(ownsTmpBytes) //release only what this function obtained itself; never the caller's cmpBytes
+		free(szTmpBytes);
+	return status;
+}
+/* Decode a 1-D float series from tdps into a newly malloc'ed *data of dataSeriesLength elements; quantized points are predicted from the previously decoded element. */
+void decompressDataSeries_float_1D(float** data, size_t dataSeriesLength, TightDataPointStorageF* tdps) 
+{
+	updateQuantizationInfo(tdps->intervals);
+	size_t i, j, k = 0, p = 0, l = 0; // k: running bit offset into
+								// tdps->residualMidBits, p: byte index
+								// into tdps->residualMidBits, l: index
+								// of the next leadNum entry to consume
+	unsigned char* leadNum;
+	double interval = tdps->realPrecision*2; // step between adjacent quantization codes
+	// leadNum is presumably allocated and filled by the callee; it is freed at the end of this function
+	convertByteArray2IntArray_fast_2b(tdps->exactDataNum, tdps->leadNumArray, tdps->leadNumArray_size, &leadNum);
+
+	*data = (float*)malloc(sizeof(float)*dataSeriesLength); // output array; not freed here (presumably owned by the caller)
+
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int)); // one Huffman-decoded code per element
+	
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);	
+
+	unsigned char preBytes[4]; // bytes of the most recent explicitly stored value
+	unsigned char curBytes[4]; // bytes of the value currently being rebuilt
+	
+	memset(preBytes, 0, 4);
+
+	size_t curByteIndex = 0; // read position in tdps->exactMidBytes
+	int reqBytesLength, resiBitsLength, resiBits; 
+	unsigned char leadingNum;	
+	float medianValue, exactData, predValue;
+	
+	reqBytesLength = tdps->reqLength/8; // whole bytes stored per explicit value
+	resiBitsLength = tdps->reqLength%8; // additional bits stored per explicit value
+	medianValue = tdps->medianValue;
+	
+	int type_;
+	for (i = 0; i < dataSeriesLength; i++) {	
+		type_ = type[i];
+		switch (type_) {
+		case 0: // code 0: the value was stored explicitly instead of as a quantization code
+			// compute resiBits: extract the next resiBitsLength bits from residualMidBits
+			resiBits = 0;
+			if (resiBitsLength != 0) {
+				int kMod8 = k % 8; // bit offset within a byte
+				int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+				if (rightMovSteps > 0) { // bits are taken from byte p only; p is not advanced
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+				} else if (rightMovSteps < 0) { // bits are assembled from byte p and byte p+1
+					int code1 = getLeftMovingCode(kMod8);
+					int code2 = getRightMovingCode(kMod8, resiBitsLength);
+					int leftMovSteps = -rightMovSteps;
+					rightMovSteps = 8 - leftMovSteps;
+					resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+					p++;
+					resiBits = resiBits
+							| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+				} else // rightMovSteps == 0: masked bits of byte p are used unshifted, then p advances
+				{
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code);
+					p++;
+				}
+				k += resiBitsLength;
+			}
+
+			// recover the exact data: leadingNum bytes come from preBytes, the rest from exactMidBytes plus the residual bits
+			memset(curBytes, 0, 4);
+			leadingNum = leadNum[l++];
+			memcpy(curBytes, preBytes, leadingNum);
+			for (j = leadingNum; j < reqBytesLength; j++)
+				curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+			if (resiBitsLength != 0) {
+				unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength)); // residual bits occupy the high end of the byte
+				curBytes[reqBytesLength] = resiByte;
+			}
+			
+			exactData = bytesToFloat(curBytes);
+			(*data)[i] = exactData + medianValue; // medianValue is added back to the decoded bytes
+			memcpy(preBytes,curBytes,4);
+			break;
+		default: // non-zero code: prediction plus a signed multiple of interval
+			//predValue = 2 * (*data)[i-1] - (*data)[i-2];
+			predValue = (*data)[i-1]; // NOTE(review): reads index i-1, so this relies on type[0] being 0 -- confirm against the compressor
+			(*data)[i] = predValue + (type_-exe_params->intvRadius)*interval;
+			break;
+		}
+		//printf("%.30G\n",(*data)[i]);
+	}
+	// multisteps is not a parameter here; it must be visible at file scope when HAVE_TIMECMPR is defined
+#ifdef HAVE_TIMECMPR	
+	if(confparams_dec->szMode == SZ_TEMPORAL_COMPRESSION)
+		memcpy(multisteps->hist_data, (*data), dataSeriesLength*sizeof(float));
+#endif	
+	
+	free(leadNum);
+	free(type);
+	return;
+}
+/* Decode a 2-D float array (r1 rows of r2 elements, index = ii*r2+jj) from tdps into a newly malloc'ed *data; quantized points use 1-D predictors on the first row/column and a 2-D predictor elsewhere. */
+void decompressDataSeries_float_2D(float** data, size_t r1, size_t r2, TightDataPointStorageF* tdps) 
+{
+	updateQuantizationInfo(tdps->intervals);
+	//printf("tdps->intervals=%d, exe_params->intvRadius=%d\n", tdps->intervals, exe_params->intvRadius);
+	
+	size_t j, k = 0, p = 0, l = 0; // k: running bit offset into
+	// tdps->residualMidBits, p: byte index
+	// into tdps->residualMidBits, l: index
+	// of the next leadNum entry to consume
+	size_t dataSeriesLength = r1*r2;
+	//	printf ("%d %d\n", r1, r2);
+
+	unsigned char* leadNum;
+	double realPrecision = tdps->realPrecision; // adjacent quantization codes are 2*realPrecision apart (see the formulas below)
+
+	convertByteArray2IntArray_fast_2b(tdps->exactDataNum, tdps->leadNumArray, tdps->leadNumArray_size, &leadNum);
+
+	*data = (float*)malloc(sizeof(float)*dataSeriesLength); // output array; not freed here (presumably owned by the caller)
+
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int)); // one Huffman-decoded code per element
+
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);	
+
+	unsigned char preBytes[4]; // bytes of the most recent explicitly stored value
+	unsigned char curBytes[4]; // bytes of the value currently being rebuilt
+
+	memset(preBytes, 0, 4);
+
+	size_t curByteIndex = 0; // read position in tdps->exactMidBytes
+	int reqBytesLength, resiBitsLength, resiBits; 
+	unsigned char leadingNum;	
+	float medianValue, exactData;
+	int type_;
+
+	reqBytesLength = tdps->reqLength/8; // whole bytes stored per explicit value
+	resiBitsLength = tdps->reqLength%8; // additional bits stored per explicit value
+	medianValue = tdps->medianValue;
+	
+	float pred1D, pred2D;
+	size_t ii, jj;
+
+	/* Process Row-0, data 0: always rebuilt from the explicitly stored bytes; type[0] is not consulted */
+
+	// compute resiBits
+	resiBits = 0;
+	if (resiBitsLength != 0) {
+		int kMod8 = k % 8;
+		int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+		if (rightMovSteps > 0) {
+			int code = getRightMovingCode(kMod8, resiBitsLength);
+			resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+		} else if (rightMovSteps < 0) {
+			int code1 = getLeftMovingCode(kMod8);
+			int code2 = getRightMovingCode(kMod8, resiBitsLength);
+			int leftMovSteps = -rightMovSteps;
+			rightMovSteps = 8 - leftMovSteps;
+			resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+			p++;
+			resiBits = resiBits
+					| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+		} else // rightMovSteps == 0
+		{
+			int code = getRightMovingCode(kMod8, resiBitsLength);
+			resiBits = (tdps->residualMidBits[p] & code);
+			p++;
+		}
+		k += resiBitsLength;
+	}
+
+	// recover the exact data: leadingNum bytes come from preBytes, the rest from exactMidBytes plus the residual bits
+	memset(curBytes, 0, 4);
+	leadingNum = leadNum[l++];
+	memcpy(curBytes, preBytes, leadingNum);
+	for (j = leadingNum; j < reqBytesLength; j++)
+		curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+	if (resiBitsLength != 0) {
+		unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+		curBytes[reqBytesLength] = resiByte;
+	}
+
+	exactData = bytesToFloat(curBytes);
+	(*data)[0] = exactData + medianValue;
+	memcpy(preBytes,curBytes,4);
+
+	/* Process Row-0, data 1: predicted from data 0. NOTE(review): type[1] and (*data)[1] are accessed unconditionally, which assumes r1*r2 >= 2 -- confirm */
+	type_ = type[1]; 
+	if (type_ != 0)
+	{
+		pred1D = (*data)[0];
+		(*data)[1] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+	}
+	else
+	{
+		// compute resiBits
+		resiBits = 0;
+		if (resiBitsLength != 0) {
+			int kMod8 = k % 8;
+			int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+			if (rightMovSteps > 0) {
+				int code = getRightMovingCode(kMod8, resiBitsLength);
+				resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+			} else if (rightMovSteps < 0) {
+				int code1 = getLeftMovingCode(kMod8);
+				int code2 = getRightMovingCode(kMod8, resiBitsLength);
+				int leftMovSteps = -rightMovSteps;
+				rightMovSteps = 8 - leftMovSteps;
+				resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+				p++;
+				resiBits = resiBits
+						| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+			} else // rightMovSteps == 0
+			{
+				int code = getRightMovingCode(kMod8, resiBitsLength);
+				resiBits = (tdps->residualMidBits[p] & code);
+				p++;
+			}
+			k += resiBitsLength;
+		}
+
+		// recover the exact data
+		memset(curBytes, 0, 4);
+		leadingNum = leadNum[l++];
+		memcpy(curBytes, preBytes, leadingNum);
+		for (j = leadingNum; j < reqBytesLength; j++)
+			curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+		if (resiBitsLength != 0) {
+			unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+			curBytes[reqBytesLength] = resiByte;
+		}
+
+		exactData = bytesToFloat(curBytes);
+		(*data)[1] = exactData + medianValue;
+		memcpy(preBytes,curBytes,4);
+	}
+
+	/* Process Row-0, data 2 --> data r2-1: predicted by linear extrapolation from the two previous elements */
+	for (jj = 2; jj < r2; jj++)
+	{
+		type_ = type[jj];
+		if (type_ != 0)
+		{
+			pred1D = 2*(*data)[jj-1] - (*data)[jj-2];				
+			(*data)[jj] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			// compute resiBits
+			resiBits = 0;
+			if (resiBitsLength != 0) {
+				int kMod8 = k % 8;
+				int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+				if (rightMovSteps > 0) {
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+				} else if (rightMovSteps < 0) {
+					int code1 = getLeftMovingCode(kMod8);
+					int code2 = getRightMovingCode(kMod8, resiBitsLength);
+					int leftMovSteps = -rightMovSteps;
+					rightMovSteps = 8 - leftMovSteps;
+					resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+					p++;
+					resiBits = resiBits
+							| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+				} else // rightMovSteps == 0
+				{
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code);
+					p++;
+				}
+				k += resiBitsLength;
+			}
+
+			// recover the exact data
+			memset(curBytes, 0, 4);
+			leadingNum = leadNum[l++];
+			memcpy(curBytes, preBytes, leadingNum);
+			for (j = leadingNum; j < reqBytesLength; j++)
+				curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+			if (resiBitsLength != 0) {
+				unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+				curBytes[reqBytesLength] = resiByte;
+			}
+
+			exactData = bytesToFloat(curBytes);
+			(*data)[jj] = exactData + medianValue;
+			memcpy(preBytes,curBytes,4);
+		}
+	}
+
+	size_t index;
+	/* Process Row-1 --> Row-r1-1 */
+	for (ii = 1; ii < r1; ii++)
+	{
+		/* Process row-ii data 0: predicted from the element directly above (index-r2) */
+		index = ii*r2;
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			pred1D = (*data)[index-r2];		
+			(*data)[index] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			// compute resiBits
+			resiBits = 0;
+			if (resiBitsLength != 0) {
+				int kMod8 = k % 8;
+				int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+				if (rightMovSteps > 0) {
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+				} else if (rightMovSteps < 0) {
+					int code1 = getLeftMovingCode(kMod8);
+					int code2 = getRightMovingCode(kMod8, resiBitsLength);
+					int leftMovSteps = -rightMovSteps;
+					rightMovSteps = 8 - leftMovSteps;
+					resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+					p++;
+					resiBits = resiBits
+							| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+				} else // rightMovSteps == 0
+				{
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code);
+					p++;
+				}
+				k += resiBitsLength;
+			}
+
+			// recover the exact data
+			memset(curBytes, 0, 4);
+			leadingNum = leadNum[l++];
+			memcpy(curBytes, preBytes, leadingNum);
+			for (j = leadingNum; j < reqBytesLength; j++)
+				curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+			if (resiBitsLength != 0) {
+				unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+				curBytes[reqBytesLength] = resiByte;
+			}
+
+			exactData = bytesToFloat(curBytes);
+			(*data)[index] = exactData + medianValue;
+			memcpy(preBytes,curBytes,4);
+		}
+
+		/* Process row-ii data 1 --> r2-1: 2-D prediction = left + above - above-left */
+		for (jj = 1; jj < r2; jj++)
+		{
+			index = ii*r2+jj;
+			pred2D = (*data)[index-1] + (*data)[index-r2] - (*data)[index-r2-1];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				// compute resiBits
+				resiBits = 0;
+				if (resiBitsLength != 0) {
+					int kMod8 = k % 8;
+					int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+					if (rightMovSteps > 0) {
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+					} else if (rightMovSteps < 0) {
+						int code1 = getLeftMovingCode(kMod8);
+						int code2 = getRightMovingCode(kMod8, resiBitsLength);
+						int leftMovSteps = -rightMovSteps;
+						rightMovSteps = 8 - leftMovSteps;
+						resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+						p++;
+						resiBits = resiBits
+								| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+					} else // rightMovSteps == 0
+					{
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code);
+						p++;
+					}
+					k += resiBitsLength;
+				}
+
+				// recover the exact data
+				memset(curBytes, 0, 4);
+				leadingNum = leadNum[l++];
+				memcpy(curBytes, preBytes, leadingNum);
+				for (j = leadingNum; j < reqBytesLength; j++)
+					curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+				if (resiBitsLength != 0) {
+					unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+					curBytes[reqBytesLength] = resiByte;
+				}
+
+				exactData = bytesToFloat(curBytes);
+				(*data)[index] = exactData + medianValue;
+				memcpy(preBytes,curBytes,4);
+			}
+		}
+	}
+
+#ifdef HAVE_TIMECMPR	
+	if(confparams_dec->szMode == SZ_TEMPORAL_COMPRESSION)
+		memcpy(multisteps->hist_data, (*data), dataSeriesLength*sizeof(float)); // multisteps is not a parameter; it must be visible at file scope in this build
+#endif	
+
+	free(leadNum);
+	free(type);
+	return;
+}
+
+void decompressDataSeries_float_3D(float** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageF* tdps) 
+{
+	updateQuantizationInfo(tdps->intervals);
+	size_t j, k = 0, p = 0, l = 0; // k is to track the location of residual_bit
+	// in resiMidBits, p is to track the
+	// byte_index of resiMidBits, l is for
+	// leadNum
+	size_t dataSeriesLength = r1*r2*r3;
+	size_t r23 = r2*r3;
+	unsigned char* leadNum;
+	double realPrecision = tdps->realPrecision;
+
+	//TODO
+	convertByteArray2IntArray_fast_2b(tdps->exactDataNum, tdps->leadNumArray, tdps->leadNumArray_size, &leadNum);
+
+	*data = (float*)malloc(sizeof(float)*dataSeriesLength);
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int));
+
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);	
+
+	unsigned char preBytes[4];
+	unsigned char curBytes[4];
+
+	memset(preBytes, 0, 4);
+	size_t curByteIndex = 0;
+	int reqBytesLength, resiBitsLength, resiBits;
+	unsigned char leadingNum;
+	float medianValue, exactData;
+	int type_;
+
+	reqBytesLength = tdps->reqLength/8;
+	resiBitsLength = tdps->reqLength%8;
+	medianValue = tdps->medianValue;
+	
+	float pred1D, pred2D, pred3D;
+	size_t ii, jj, kk;
+
+	///////////////////////////	Process layer-0 ///////////////////////////
+	/* Process Row-0 data 0*/
+	// compute resiBits
+	resiBits = 0;
+	if (resiBitsLength != 0) {
+		int kMod8 = k % 8;
+		int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+		if (rightMovSteps > 0) {
+			int code = getRightMovingCode(kMod8, resiBitsLength);
+			resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+		} else if (rightMovSteps < 0) {
+			int code1 = getLeftMovingCode(kMod8);
+			int code2 = getRightMovingCode(kMod8, resiBitsLength);
+			int leftMovSteps = -rightMovSteps;
+			rightMovSteps = 8 - leftMovSteps;
+			resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+			p++;
+			resiBits = resiBits
+					| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+		} else // rightMovSteps == 0
+		{
+			int code = getRightMovingCode(kMod8, resiBitsLength);
+			resiBits = (tdps->residualMidBits[p] & code);
+			p++;
+		}
+		k += resiBitsLength;
+	}
+
+	// recover the exact data
+	memset(curBytes, 0, 4);
+	leadingNum = leadNum[l++];
+	memcpy(curBytes, preBytes, leadingNum);
+	for (j = leadingNum; j < reqBytesLength; j++)
+		curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+	if (resiBitsLength != 0) {
+		unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+		curBytes[reqBytesLength] = resiByte;
+	}
+	exactData = bytesToFloat(curBytes);
+	(*data)[0] = exactData + medianValue;
+	memcpy(preBytes,curBytes,4);
+
+	/* Process Row-0, data 1 */
+	pred1D = (*data)[0];
+
+	type_ = type[1];
+	if (type_ != 0)
+	{
+		(*data)[1] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+	}
+	else
+	{
+		// compute resiBits
+		resiBits = 0;
+		if (resiBitsLength != 0) {
+			int kMod8 = k % 8;
+			int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+			if (rightMovSteps > 0) {
+				int code = getRightMovingCode(kMod8, resiBitsLength);
+				resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+			} else if (rightMovSteps < 0) {
+				int code1 = getLeftMovingCode(kMod8);
+				int code2 = getRightMovingCode(kMod8, resiBitsLength);
+				int leftMovSteps = -rightMovSteps;
+				rightMovSteps = 8 - leftMovSteps;
+				resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+				p++;
+				resiBits = resiBits
+						| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+			} else // rightMovSteps == 0
+			{
+				int code = getRightMovingCode(kMod8, resiBitsLength);
+				resiBits = (tdps->residualMidBits[p] & code);
+				p++;
+			}
+			k += resiBitsLength;
+		}
+
+		// recover the exact data
+		memset(curBytes, 0, 4);
+		leadingNum = leadNum[l++];
+		memcpy(curBytes, preBytes, leadingNum);
+		for (j = leadingNum; j < reqBytesLength; j++)
+			curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+		if (resiBitsLength != 0) {
+			unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+			curBytes[reqBytesLength] = resiByte;
+		}
+
+		exactData = bytesToFloat(curBytes);
+		(*data)[1] = exactData + medianValue;
+		memcpy(preBytes,curBytes,4);
+	}
+	/* Process Row-0, data 2 --> data r3-1 */
+	for (jj = 2; jj < r3; jj++)
+	{
+		pred1D = 2*(*data)[jj-1] - (*data)[jj-2];
+
+		type_ = type[jj];
+		if (type_ != 0)
+		{
+			(*data)[jj] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			// compute resiBits
+			resiBits = 0;
+			if (resiBitsLength != 0) {
+				int kMod8 = k % 8;
+				int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+				if (rightMovSteps > 0) {
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+				} else if (rightMovSteps < 0) {
+					int code1 = getLeftMovingCode(kMod8);
+					int code2 = getRightMovingCode(kMod8, resiBitsLength);
+					int leftMovSteps = -rightMovSteps;
+					rightMovSteps = 8 - leftMovSteps;
+					resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+					p++;
+					resiBits = resiBits
+							| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+				} else // rightMovSteps == 0
+				{
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code);
+					p++;
+				}
+				k += resiBitsLength;
+			}
+
+			// recover the exact data
+			memset(curBytes, 0, 4);
+			leadingNum = leadNum[l++];
+			memcpy(curBytes, preBytes, leadingNum);
+			for (j = leadingNum; j < reqBytesLength; j++)
+				curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+			if (resiBitsLength != 0) {
+				unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+				curBytes[reqBytesLength] = resiByte;
+			}
+
+			exactData = bytesToFloat(curBytes);
+			(*data)[jj] = exactData + medianValue;
+			memcpy(preBytes,curBytes,4);
+		}
+	}
+
+	size_t index;
+	/* Process Row-1 --> Row-r2-1 */
+	for (ii = 1; ii < r2; ii++)
+	{
+		/* Process row-ii data 0 */
+		index = ii*r3;
+		pred1D = (*data)[index-r3];
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			(*data)[index] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			// compute resiBits
+			resiBits = 0;
+			if (resiBitsLength != 0) {
+				int kMod8 = k % 8;
+				int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+				if (rightMovSteps > 0) {
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+				} else if (rightMovSteps < 0) {
+					int code1 = getLeftMovingCode(kMod8);
+					int code2 = getRightMovingCode(kMod8, resiBitsLength);
+					int leftMovSteps = -rightMovSteps;
+					rightMovSteps = 8 - leftMovSteps;
+					resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+					p++;
+					resiBits = resiBits
+							| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+				} else // rightMovSteps == 0
+				{
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code);
+					p++;
+				}
+				k += resiBitsLength;
+			}
+
+			// recover the exact data
+			memset(curBytes, 0, 4);
+			leadingNum = leadNum[l++];
+			memcpy(curBytes, preBytes, leadingNum);
+			for (j = leadingNum; j < reqBytesLength; j++)
+				curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+			if (resiBitsLength != 0) {
+				unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+				curBytes[reqBytesLength] = resiByte;
+			}
+
+			exactData = bytesToFloat(curBytes);
+			(*data)[index] = exactData + medianValue;
+			memcpy(preBytes,curBytes,4);
+		}
+
+		/* Process row-ii data 1 --> r3-1*/
+		for (jj = 1; jj < r3; jj++)
+		{
+			index = ii*r3+jj;
+			pred2D = (*data)[index-1] + (*data)[index-r3] - (*data)[index-r3-1];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				// compute resiBits
+				resiBits = 0;
+				if (resiBitsLength != 0) {
+					int kMod8 = k % 8;
+					int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+					if (rightMovSteps > 0) {
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+					} else if (rightMovSteps < 0) {
+						int code1 = getLeftMovingCode(kMod8);
+						int code2 = getRightMovingCode(kMod8, resiBitsLength);
+						int leftMovSteps = -rightMovSteps;
+						rightMovSteps = 8 - leftMovSteps;
+						resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+						p++;
+						resiBits = resiBits
+								| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+					} else // rightMovSteps == 0
+					{
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code);
+						p++;
+					}
+					k += resiBitsLength;
+				}
+
+				// recover the exact data
+				memset(curBytes, 0, 4);
+				leadingNum = leadNum[l++];
+				memcpy(curBytes, preBytes, leadingNum);
+				for (j = leadingNum; j < reqBytesLength; j++)
+					curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+				if (resiBitsLength != 0) {
+					unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+					curBytes[reqBytesLength] = resiByte;
+				}
+
+				exactData = bytesToFloat(curBytes);
+				(*data)[index] = exactData + medianValue;
+				memcpy(preBytes,curBytes,4);
+			}
+		}
+	}
+
+	///////////////////////////	Process layer-1 --> layer-r1-1 ///////////////////////////
+
+	for (kk = 1; kk < r1; kk++)
+	{
+		/* Process Row-0 data 0*/
+		index = kk*r23;
+		pred1D = (*data)[index-r23];
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			(*data)[index] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			// compute resiBits
+			resiBits = 0;
+			if (resiBitsLength != 0) {
+				int kMod8 = k % 8;
+				int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+				if (rightMovSteps > 0) {
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+				} else if (rightMovSteps < 0) {
+					int code1 = getLeftMovingCode(kMod8);
+					int code2 = getRightMovingCode(kMod8, resiBitsLength);
+					int leftMovSteps = -rightMovSteps;
+					rightMovSteps = 8 - leftMovSteps;
+					resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+					p++;
+					resiBits = resiBits
+							| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+				} else // rightMovSteps == 0
+				{
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code);
+					p++;
+				}
+				k += resiBitsLength;
+			}
+
+			// recover the exact data
+			memset(curBytes, 0, 4);
+			leadingNum = leadNum[l++];
+			memcpy(curBytes, preBytes, leadingNum);
+			for (j = leadingNum; j < reqBytesLength; j++)
+				curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+			if (resiBitsLength != 0) {
+				unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+				curBytes[reqBytesLength] = resiByte;
+			}
+
+			exactData = bytesToFloat(curBytes);
+			(*data)[index] = exactData + medianValue;
+			memcpy(preBytes,curBytes,4);
+		}
+
+		/* Process Row-0 data 1 --> data r3-1 */
+		for (jj = 1; jj < r3; jj++)
+		{
+			index = kk*r23+jj;
+			pred2D = (*data)[index-1] + (*data)[index-r23] - (*data)[index-r23-1];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				// compute resiBits
+				resiBits = 0;
+				if (resiBitsLength != 0) {
+					int kMod8 = k % 8;
+					int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+					if (rightMovSteps > 0) {
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+					} else if (rightMovSteps < 0) {
+						int code1 = getLeftMovingCode(kMod8);
+						int code2 = getRightMovingCode(kMod8, resiBitsLength);
+						int leftMovSteps = -rightMovSteps;
+						rightMovSteps = 8 - leftMovSteps;
+						resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+						p++;
+						resiBits = resiBits
+								| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+					} else // rightMovSteps == 0
+					{
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code);
+						p++;
+					}
+					k += resiBitsLength;
+				}
+
+				// recover the exact data
+				memset(curBytes, 0, 4);
+				leadingNum = leadNum[l++];
+				memcpy(curBytes, preBytes, leadingNum);
+				for (j = leadingNum; j < reqBytesLength; j++)
+					curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+				if (resiBitsLength != 0) {
+					unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+					curBytes[reqBytesLength] = resiByte;
+				}
+
+				exactData = bytesToFloat(curBytes);
+				(*data)[index] = exactData + medianValue;
+				memcpy(preBytes,curBytes,4);
+			}
+		}
+
+		/* Process Row-1 --> Row-r2-1 */
+		for (ii = 1; ii < r2; ii++)
+		{
+			/* Process Row-i data 0 */
+			index = kk*r23 + ii*r3;
+			pred2D = (*data)[index-r3] + (*data)[index-r23] - (*data)[index-r23-r3];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				// compute resiBits
+				resiBits = 0;
+				if (resiBitsLength != 0) {
+					int kMod8 = k % 8;
+					int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+					if (rightMovSteps > 0) {
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+					} else if (rightMovSteps < 0) {
+						int code1 = getLeftMovingCode(kMod8);
+						int code2 = getRightMovingCode(kMod8, resiBitsLength);
+						int leftMovSteps = -rightMovSteps;
+						rightMovSteps = 8 - leftMovSteps;
+						resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+						p++;
+						resiBits = resiBits
+								| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+					} else // rightMovSteps == 0
+					{
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code);
+						p++;
+					}
+					k += resiBitsLength;
+				}
+
+				// recover the exact data
+				memset(curBytes, 0, 4);
+				leadingNum = leadNum[l++];
+				memcpy(curBytes, preBytes, leadingNum);
+				for (j = leadingNum; j < reqBytesLength; j++)
+					curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+				if (resiBitsLength != 0) {
+					unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+					curBytes[reqBytesLength] = resiByte;
+				}
+
+				exactData = bytesToFloat(curBytes);
+				(*data)[index] = exactData + medianValue;
+				memcpy(preBytes,curBytes,4);
+			}
+
+			/* Process Row-i data 1 --> data r3-1 */
+			for (jj = 1; jj < r3; jj++)
+			{
+				index = kk*r23 + ii*r3 + jj;
+				pred3D = (*data)[index-1] + (*data)[index-r3] + (*data)[index-r23]
+					- (*data)[index-r3-1] - (*data)[index-r23-r3] - (*data)[index-r23-1] + (*data)[index-r23-r3-1];
+
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					(*data)[index] = pred3D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					// compute resiBits
+					resiBits = 0;
+					if (resiBitsLength != 0) {
+						int kMod8 = k % 8;
+						int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+						if (rightMovSteps > 0) {
+							int code = getRightMovingCode(kMod8, resiBitsLength);
+							resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+						} else if (rightMovSteps < 0) {
+							int code1 = getLeftMovingCode(kMod8);
+							int code2 = getRightMovingCode(kMod8, resiBitsLength);
+							int leftMovSteps = -rightMovSteps;
+							rightMovSteps = 8 - leftMovSteps;
+							resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+							p++;
+							resiBits = resiBits
+									| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+						} else // rightMovSteps == 0
+						{
+							int code = getRightMovingCode(kMod8, resiBitsLength);
+							resiBits = (tdps->residualMidBits[p] & code);
+							p++;
+						}
+						k += resiBitsLength;
+					}
+
+					// recover the exact data
+					memset(curBytes, 0, 4);
+					leadingNum = leadNum[l++];
+					memcpy(curBytes, preBytes, leadingNum);
+					for (j = leadingNum; j < reqBytesLength; j++)
+						curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+					if (resiBitsLength != 0) {
+						unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+						curBytes[reqBytesLength] = resiByte;
+					}
+
+					exactData = bytesToFloat(curBytes);
+					(*data)[index] = exactData + medianValue;
+					memcpy(preBytes,curBytes,4);
+				}
+			}
+		}
+	}
+	
+#ifdef HAVE_TIMECMPR	
+	if(confparams_dec->szMode == SZ_TEMPORAL_COMPRESSION)
+		memcpy(multisteps->hist_data, (*data), dataSeriesLength*sizeof(float));
+#endif		
+
+	free(leadNum);
+	free(type);
+	return;
+}
+
+
+/*
+ * Read cursor over the streams of a TightDataPointStorageF that hold the
+ * values stored exactly (type code 0). One instance is threaded through the
+ * whole of decompressDataSeries_float_4D so that values are consumed strictly
+ * in decode order.
+ */
+typedef struct ExactFloatReader_4D
+{
+	TightDataPointStorageF* tdps;
+	unsigned char* leadNum;     /* per exact value: number of leading bytes copied from the previous exact value */
+	size_t bitPos;              /* running bit offset into tdps->residualMidBits */
+	size_t resiByteIdx;         /* current byte index into tdps->residualMidBits */
+	size_t leadIdx;             /* next entry of leadNum to consume */
+	size_t midByteIdx;          /* next byte of tdps->exactMidBytes to consume */
+	unsigned char preBytes[4];  /* byte image of the previously recovered exact value */
+	int reqBytesLength;         /* tdps->reqLength / 8 */
+	int resiBitsLength;         /* tdps->reqLength % 8 */
+	float medianValue;          /* added back to every recovered exact value */
+} ExactFloatReader_4D;
+
+/*
+ * Recover the next exactly-stored value and advance the reader.
+ *
+ * The 4-byte image is assembled from: leadNum[] leading bytes reused from the
+ * previous exact value, whole bytes taken from exactMidBytes up to
+ * reqBytesLength, and (when resiBitsLength != 0) resiBitsLength extra bits
+ * taken from residualMidBits and left-aligned in the following byte.
+ * Returns bytesToFloat(image) + medianValue.
+ */
+static float readExactFloat_4D(ExactFloatReader_4D* rd)
+{
+	TightDataPointStorageF* tdps = rd->tdps;
+	unsigned char curBytes[4];
+	unsigned char leadingNum;
+	int resiBits = 0;
+	size_t j;
+	float exactData, value;
+
+	/* Pull resiBitsLength bits out of the packed residual bit stream. */
+	if (rd->resiBitsLength != 0) {
+		int kMod8 = rd->bitPos % 8;
+		int rightMovSteps = getRightMovingSteps(kMod8, rd->resiBitsLength);
+		if (rightMovSteps > 0) {
+			/* bits lie inside the current byte and do not reach its end */
+			int code = getRightMovingCode(kMod8, rd->resiBitsLength);
+			resiBits = (tdps->residualMidBits[rd->resiByteIdx] & code) >> rightMovSteps;
+		} else if (rightMovSteps < 0) {
+			/* bits straddle the current byte and the next one */
+			int code1 = getLeftMovingCode(kMod8);
+			int code2 = getRightMovingCode(kMod8, rd->resiBitsLength);
+			int leftMovSteps = -rightMovSteps;
+			rightMovSteps = 8 - leftMovSteps;
+			resiBits = (tdps->residualMidBits[rd->resiByteIdx] & code1) << leftMovSteps;
+			rd->resiByteIdx++;
+			resiBits = resiBits
+					| ((tdps->residualMidBits[rd->resiByteIdx] & code2) >> rightMovSteps);
+		} else {
+			/* bits end exactly on the byte boundary: consume the byte */
+			int code = getRightMovingCode(kMod8, rd->resiBitsLength);
+			resiBits = (tdps->residualMidBits[rd->resiByteIdx] & code);
+			rd->resiByteIdx++;
+		}
+		rd->bitPos += rd->resiBitsLength;
+	}
+
+	/* Rebuild the byte image of the float. */
+	memset(curBytes, 0, 4);
+	leadingNum = rd->leadNum[rd->leadIdx++];
+	memcpy(curBytes, rd->preBytes, leadingNum);
+	for (j = leadingNum; j < rd->reqBytesLength; j++)
+		curBytes[j] = tdps->exactMidBytes[rd->midByteIdx++];
+	if (rd->resiBitsLength != 0) {
+		unsigned char resiByte = (unsigned char) (resiBits << (8 - rd->resiBitsLength));
+		curBytes[rd->reqBytesLength] = resiByte;
+	}
+
+	exactData = bytesToFloat(curBytes);
+	/* stored to a float before returning, as the original stored it to the output array */
+	value = exactData + rd->medianValue;
+	memcpy(rd->preBytes, curBytes, 4);
+	return value;
+}
+
+/*
+ * Decode one point: type_ == 0 means the value was stored exactly; any other
+ * code is a quantization bin applied as an offset to the prediction pred.
+ */
+static float decodeFloat_4D(ExactFloatReader_4D* rd, float pred, int type_, double realPrecision)
+{
+	float value;
+
+	if (type_ == 0)
+		return readExactFloat_4D(rd);
+
+	value = pred + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+	return value;
+}
+
+/*
+ * Decompress a 4D float series of r1*r2*r3*r4 values described by tdps.
+ *
+ * data  out: *data is set to a buffer malloc'ed here holding the decoded
+ *       values; it is not freed in this function.
+ * r1..r4 dimension sizes; r4 is the contiguous (fastest varying) one.
+ * tdps  compressed representation; intervals, realPrecision, exactDataNum,
+ *       leadNumArray, stateNum, typeArray, reqLength, medianValue,
+ *       exactMidBytes and residualMidBits are read.
+ *
+ * Each of the r1 outermost slices is decoded as an independent r2 x r3 x r4
+ * block: its first point is always read as an exact value and no prediction
+ * refers to a point of another slice.
+ *
+ * NOTE(review): element 1 of every slice is decoded unconditionally, so the
+ * code appears to assume r4 >= 2 - confirm against the compressor.
+ * NOTE(review): neither malloc result is checked.
+ */
+void decompressDataSeries_float_4D(float** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageF* tdps)
+{
+	updateQuantizationInfo(tdps->intervals);
+	size_t dataSeriesLength = r1*r2*r3*r4;
+	size_t r234 = r2*r3*r4;
+	size_t r34 = r3*r4;
+	double realPrecision = tdps->realPrecision;
+	ExactFloatReader_4D rd;
+
+	convertByteArray2IntArray_fast_2b(tdps->exactDataNum, tdps->leadNumArray, tdps->leadNumArray_size, &rd.leadNum);
+
+	*data = (float*)malloc(sizeof(float)*dataSeriesLength);
+	float* out = *data;
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int));
+
+	/* Per-point type codes are Huffman coded in tdps->typeArray. */
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);
+
+	rd.tdps = tdps;
+	rd.bitPos = 0;
+	rd.resiByteIdx = 0;
+	rd.leadIdx = 0;
+	rd.midByteIdx = 0;
+	memset(rd.preBytes, 0, 4);
+	rd.reqBytesLength = tdps->reqLength/8;
+	rd.resiBitsLength = tdps->reqLength%8;
+	rd.medianValue = tdps->medianValue;
+
+	float pred;
+	size_t ii, jj, kk, ll;
+	size_t index;
+
+	for (ll = 0; ll < r1; ll++)
+	{
+		/////////////////////////// Process layer-0 ///////////////////////////
+
+		/* Row-0 data 0: always stored exactly, its type code is not consulted */
+		index = ll*r234;
+		out[index] = readExactFloat_4D(&rd);
+
+		/* Row-0 data 1: predicted by the left neighbour */
+		index = ll*r234+1;
+		pred = out[index-1];
+		out[index] = decodeFloat_4D(&rd, pred, type[index], realPrecision);
+
+		/* Row-0 data 2 --> data r4-1: linear extrapolation of the two left neighbours */
+		for (jj = 2; jj < r4; jj++)
+		{
+			index = ll*r234+jj;
+			pred = 2*out[index-1] - out[index-2];
+			out[index] = decodeFloat_4D(&rd, pred, type[index], realPrecision);
+		}
+
+		/* Row-1 --> Row-r3-1 */
+		for (ii = 1; ii < r3; ii++)
+		{
+			/* row-ii data 0: predicted by the point one row up */
+			index = ll*r234+ii*r4;
+			pred = out[index-r4];
+			out[index] = decodeFloat_4D(&rd, pred, type[index], realPrecision);
+
+			/* row-ii data 1 --> r4-1: 2D predictor within the layer */
+			for (jj = 1; jj < r4; jj++)
+			{
+				index = ll*r234+ii*r4+jj;
+				pred = out[index-1] + out[index-r4] - out[index-r4-1];
+				out[index] = decodeFloat_4D(&rd, pred, type[index], realPrecision);
+			}
+		}
+
+		/////////////////////////// Process layer-1 --> layer-r2-1 ///////////////////////////
+
+		for (kk = 1; kk < r2; kk++)
+		{
+			/* Row-0 data 0: predicted by the same position in the previous layer */
+			index = ll*r234+kk*r34;
+			pred = out[index-r34];
+			out[index] = decodeFloat_4D(&rd, pred, type[index], realPrecision);
+
+			/* Row-0 data 1 --> data r4-1: 2D predictor across (column, layer) */
+			for (jj = 1; jj < r4; jj++)
+			{
+				index = ll*r234+kk*r34+jj;
+				pred = out[index-1] + out[index-r34] - out[index-r34-1];
+				out[index] = decodeFloat_4D(&rd, pred, type[index], realPrecision);
+			}
+
+			/* Row-1 --> Row-r3-1 */
+			for (ii = 1; ii < r3; ii++)
+			{
+				/* Row-i data 0: 2D predictor across (row, layer) */
+				index = ll*r234+kk*r34+ii*r4;
+				pred = out[index-r4] + out[index-r34] - out[index-r34-r4];
+				out[index] = decodeFloat_4D(&rd, pred, type[index], realPrecision);
+
+				/* Row-i data 1 --> data r4-1: 3D predictor over the 7 decoded neighbours */
+				for (jj = 1; jj < r4; jj++)
+				{
+					index = ll*r234+kk*r34+ii*r4+jj;
+					pred = out[index-1] + out[index-r4] + out[index-r34]
+							- out[index-r4-1] - out[index-r34-r4] - out[index-r34-1] + out[index-r34-r4-1];
+					out[index] = decodeFloat_4D(&rd, pred, type[index], realPrecision);
+				}
+			}
+		}
+	}
+
+	/* Time-based compression is not implemented for 4D, so multisteps->hist_data is not updated here. */
+
+	free(rd.leadNum);
+	free(type);
+	return;
+}
+
+/*
+ * Rebuild a 1D float snapshot of dataSeriesLength values from tdps.
+ *
+ * data             out: *data receives a heap buffer with the result. It is
+ *                  either malloc'ed here or set by the decompressor that is
+ *                  called; this function never frees it.
+ * dataSeriesLength number of values to produce.
+ * tdps             decoded storage; allSameData, exactMidBytes, rtypeArray,
+ *                  rtypeArray_size and reservedValue are read here.
+ * errBoundMode     values below PW_REL select the plain decompressor, all
+ *                  other values a *_pwr* variant.
+ *
+ * Three cases are handled:
+ *   1. tdps->allSameData: every element is the float stored in exactMidBytes.
+ *   2. tdps->rtypeArray == NULL: the decompressor writes straight into data.
+ *   3. otherwise: rtypeArray is a per-element mask; elements marked 1 take
+ *      tdps->reservedValue, elements marked 0 take successive decompressed
+ *      values.
+ */
+void getSnapshotData_float_1D(float** data, size_t dataSeriesLength, TightDataPointStorageF* tdps, int errBoundMode)
+{
+	size_t i;
+
+	/* Case 1: constant series. */
+	if (tdps->allSameData) {
+		float value = bytesToFloat(tdps->exactMidBytes);
+		*data = (float*)malloc(sizeof(float)*dataSeriesLength);
+		for (i = 0; i < dataSeriesLength; i++)
+			(*data)[i] = value;
+		return;
+	}
+
+	/* Case 2: no reserved-value mask, decode directly into the caller's pointer. */
+	if (tdps->rtypeArray == NULL) {
+		if (errBoundMode >= PW_REL) {
+			decompressDataSeries_float_1D_pwrgroup(data, dataSeriesLength, tdps);
+			return;
+		}
+#ifdef HAVE_TIMECMPR
+		/* In temporal mode, compressionType 0 is a snapshot; any other value uses the time-series decoder. */
+		if (confparams_dec->szMode == SZ_TEMPORAL_COMPRESSION && multisteps->compressionType != 0) {
+			decompressDataSeries_float_1D_ts(data, dataSeriesLength, multisteps, tdps);
+			return;
+		}
+#endif
+		decompressDataSeries_float_1D(data, dataSeriesLength, tdps);
+		return;
+	}
+
+	/* Case 3: merge reserved values with the decompressed ones. */
+	*data = (float*)malloc(sizeof(float)*dataSeriesLength);
+
+	int* rtypes;
+	int validLength = computeBitNumRequired(dataSeriesLength);
+	decompressBitArraybySimpleLZ77(&rtypes, tdps->rtypeArray, tdps->rtypeArray_size, dataSeriesLength, validLength);
+
+	/* insert the reserved values */
+	for (i = 0; i < dataSeriesLength; i++) {
+		if (rtypes[i] == 1)
+			(*data)[i] = tdps->reservedValue;
+	}
+
+	/*
+	 * get the decompressed data
+	 * NOTE(review): this path calls decompressDataSeries_float_1D_pwr while the
+	 * mask-free path above calls decompressDataSeries_float_1D_pwrgroup, and the
+	 * temporal (_ts) decoder is never used here - confirm both are intended.
+	 */
+	float* decmpData;
+	if (errBoundMode < PW_REL)
+		decompressDataSeries_float_1D(&decmpData, dataSeriesLength, tdps);
+	else
+		decompressDataSeries_float_1D_pwr(&decmpData, dataSeriesLength, tdps);
+
+	/* insert the decompressed data into the positions not marked as reserved */
+	size_t k = 0;
+	for (i = 0; i < dataSeriesLength; i++) {
+		if (rtypes[i] == 0) {
+			(*data)[i] = decmpData[k++];
+		}
+	}
+
+	free(decmpData);
+	free(rtypes);
+}
+
+/*
+ * Rebuild a 2D float snapshot of r1*r2 values from tdps.
+ *
+ * data         out: *data receives a heap buffer with the result. It is
+ *              either malloc'ed here or set by the decompressor that is
+ *              called; this function never frees it.
+ * r1, r2       dimension sizes; the element count is r1*r2.
+ * tdps         decoded storage; allSameData, exactMidBytes, rtypeArray,
+ *              rtypeArray_size and reservedValue are read here.
+ * errBoundMode values below PW_REL select the plain decompressor, all other
+ *              values the *_pwr variant.
+ *
+ * Three cases are handled:
+ *   1. tdps->allSameData: every element is the float stored in exactMidBytes.
+ *   2. tdps->rtypeArray == NULL: the decompressor writes straight into data.
+ *   3. otherwise: rtypeArray is a per-element mask; elements marked 1 take
+ *      tdps->reservedValue, elements marked 0 take successive decompressed
+ *      values.
+ */
+void getSnapshotData_float_2D(float** data, size_t r1, size_t r2, TightDataPointStorageF* tdps, int errBoundMode) 
+{
+	size_t i;
+	size_t dataSeriesLength = r1*r2;
+
+	/* Case 1: constant series. */
+	if (tdps->allSameData) {
+		float value = bytesToFloat(tdps->exactMidBytes);
+		*data = (float*)malloc(sizeof(float)*dataSeriesLength);
+		for (i = 0; i < dataSeriesLength; i++)
+			(*data)[i] = value;
+		return;
+	}
+
+	/* Case 2: no reserved-value mask, decode directly into the caller's pointer. */
+	if (tdps->rtypeArray == NULL) {
+		if (errBoundMode >= PW_REL) {
+			decompressDataSeries_float_2D_pwr(data, r1, r2, tdps);
+			return;
+		}
+#ifdef HAVE_TIMECMPR
+		/* In temporal mode, compressionType 0 is a snapshot; any other value uses the flattened time-series decoder. */
+		if (confparams_dec->szMode == SZ_TEMPORAL_COMPRESSION && multisteps->compressionType != 0) {
+			decompressDataSeries_float_1D_ts(data, r1*r2, multisteps, tdps);
+			return;
+		}
+#endif
+		decompressDataSeries_float_2D(data, r1, r2, tdps);
+		return;
+	}
+
+	/* Case 3: merge reserved values with the decompressed ones. */
+	*data = (float*)malloc(sizeof(float)*dataSeriesLength);
+
+	int* rtypes;
+	int validLength = computeBitNumRequired(dataSeriesLength);
+	decompressBitArraybySimpleLZ77(&rtypes, tdps->rtypeArray, tdps->rtypeArray_size, dataSeriesLength, validLength);
+
+	/* insert the reserved values */
+	for (i = 0; i < dataSeriesLength; i++) {
+		if (rtypes[i] == 1)
+			(*data)[i] = tdps->reservedValue;
+	}
+
+	/*
+	 * get the decompressed data
+	 * NOTE(review): the temporal (_ts) decoder is never used on this path - confirm.
+	 */
+	float* decmpData;
+	if (errBoundMode < PW_REL)
+		decompressDataSeries_float_2D(&decmpData, r1, r2, tdps);
+	else
+		decompressDataSeries_float_2D_pwr(&decmpData, r1, r2, tdps);
+
+	/* insert the decompressed data into the positions not marked as reserved */
+	size_t k = 0;
+	for (i = 0; i < dataSeriesLength; i++) {
+		if (rtypes[i] == 0) {
+			(*data)[i] = decmpData[k++];
+		}
+	}
+
+	free(decmpData);
+	free(rtypes);
+}
+
+/*
+ * Rebuild a 3D float snapshot of r1*r2*r3 values from tdps.
+ *
+ * data         out: *data receives a heap buffer with the result. It is
+ *              either malloc'ed here or set by the decompressor that is
+ *              called; this function never frees it.
+ * r1, r2, r3   dimension sizes; the element count is r1*r2*r3.
+ * tdps         decoded storage; allSameData, exactMidBytes, rtypeArray,
+ *              rtypeArray_size and reservedValue are read here.
+ * errBoundMode values below PW_REL select the plain decompressor, all other
+ *              values the *_pwr variant.
+ *
+ * Three cases are handled:
+ *   1. tdps->allSameData: every element is the float stored in exactMidBytes.
+ *   2. tdps->rtypeArray == NULL: the decompressor writes straight into data.
+ *   3. otherwise: rtypeArray is a per-element mask; elements marked 1 take
+ *      tdps->reservedValue, elements marked 0 take successive decompressed
+ *      values.
+ */
+void getSnapshotData_float_3D(float** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageF* tdps, int errBoundMode)
+{
+	size_t i;
+	size_t dataSeriesLength = r1*r2*r3;
+
+	/* Case 1: constant series. */
+	if (tdps->allSameData) {
+		float value = bytesToFloat(tdps->exactMidBytes);
+		*data = (float*)malloc(sizeof(float)*dataSeriesLength);
+		for (i = 0; i < dataSeriesLength; i++)
+			(*data)[i] = value;
+		return;
+	}
+
+	/* Case 2: no reserved-value mask, decode directly into the caller's pointer. */
+	if (tdps->rtypeArray == NULL) {
+		if (errBoundMode >= PW_REL) {
+			decompressDataSeries_float_3D_pwr(data, r1, r2, r3, tdps);
+			return;
+		}
+#ifdef HAVE_TIMECMPR
+		/* In temporal mode, compressionType 0 is a snapshot; any other value uses the flattened time-series decoder. */
+		if (confparams_dec->szMode == SZ_TEMPORAL_COMPRESSION && multisteps->compressionType != 0) {
+			decompressDataSeries_float_1D_ts(data, r1*r2*r3, multisteps, tdps);
+			return;
+		}
+#endif
+		decompressDataSeries_float_3D(data, r1, r2, r3, tdps);
+		return;
+	}
+
+	/* Case 3: merge reserved values with the decompressed ones. */
+	*data = (float*)malloc(sizeof(float)*dataSeriesLength);
+
+	int* rtypes;
+	int validLength = computeBitNumRequired(dataSeriesLength);
+	decompressBitArraybySimpleLZ77(&rtypes, tdps->rtypeArray, tdps->rtypeArray_size, dataSeriesLength, validLength);
+
+	/* insert the reserved values */
+	for (i = 0; i < dataSeriesLength; i++) {
+		if (rtypes[i] == 1)
+			(*data)[i] = tdps->reservedValue;
+	}
+
+	/*
+	 * get the decompressed data
+	 * NOTE(review): the temporal (_ts) decoder is never used on this path - confirm.
+	 */
+	float* decmpData;
+	if (errBoundMode < PW_REL)
+		decompressDataSeries_float_3D(&decmpData, r1, r2, r3, tdps);
+	else
+		decompressDataSeries_float_3D_pwr(&decmpData, r1, r2, r3, tdps);
+
+	/* insert the decompressed data into the positions not marked as reserved */
+	size_t k = 0;
+	for (i = 0; i < dataSeriesLength; i++) {
+		if (rtypes[i] == 0) {
+			(*data)[i] = decmpData[k++];
+		}
+	}
+
+	free(decmpData);
+	free(rtypes);
+}
+
+/*
+ * Rebuild a 4D float snapshot of r1*r2*r3*r4 values from tdps.
+ *
+ * data           out: *data receives a heap buffer with the result. It is
+ *                either malloc'ed here or set by the decompressor that is
+ *                called; this function never frees it.
+ * r1, r2, r3, r4 dimension sizes; the element count is r1*r2*r3*r4.
+ * tdps           decoded storage; allSameData, exactMidBytes, rtypeArray,
+ *                rtypeArray_size and reservedValue are read here.
+ * errBoundMode   values below PW_REL select the plain 4D decompressor. For
+ *                all other values there is no 4D *_pwr decompressor yet
+ *                (ToDo), so the 3D one is called with r1 and r2 folded into
+ *                a single leading dimension of r1*r2.
+ *
+ * Three cases are handled:
+ *   1. tdps->allSameData: every element is the float stored in exactMidBytes.
+ *   2. tdps->rtypeArray == NULL: the decompressor writes straight into data.
+ *   3. otherwise: rtypeArray is a per-element mask; elements marked 1 take
+ *      tdps->reservedValue, elements marked 0 take successive decompressed
+ *      values.
+ */
+void getSnapshotData_float_4D(float** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageF* tdps, int errBoundMode)
+{
+	size_t i;
+	size_t dataSeriesLength = r1*r2*r3*r4;
+
+	/* Case 1: constant series. */
+	if (tdps->allSameData) {
+		float value = bytesToFloat(tdps->exactMidBytes);
+		*data = (float*)malloc(sizeof(float)*dataSeriesLength);
+		for (i = 0; i < dataSeriesLength; i++)
+			(*data)[i] = value;
+		return;
+	}
+
+	/* Case 2: no reserved-value mask, decode directly into the caller's pointer. */
+	if (tdps->rtypeArray == NULL) {
+		if (errBoundMode >= PW_REL) {
+			/* ToDo: switch to decompressDataSeries_float_4D_pwr(data, r1, r2, r3, r4, tdps) once it exists */
+			decompressDataSeries_float_3D_pwr(data, r1*r2, r3, r4, tdps);
+			return;
+		}
+#ifdef HAVE_TIMECMPR
+		/* In temporal mode, compressionType 0 is a snapshot; any other value uses the flattened time-series decoder. */
+		if (confparams_dec->szMode == SZ_TEMPORAL_COMPRESSION && multisteps->compressionType != 0) {
+			decompressDataSeries_float_1D_ts(data, r1*r2*r3*r4, multisteps, tdps);
+			return;
+		}
+#endif
+		decompressDataSeries_float_4D(data, r1, r2, r3, r4, tdps);
+		return;
+	}
+
+	/* Case 3: merge reserved values with the decompressed ones. */
+	*data = (float*)malloc(sizeof(float)*dataSeriesLength);
+
+	int* rtypes;
+	int validLength = computeBitNumRequired(dataSeriesLength);
+	decompressBitArraybySimpleLZ77(&rtypes, tdps->rtypeArray, tdps->rtypeArray_size, dataSeriesLength, validLength);
+
+	/* insert the reserved values */
+	for (i = 0; i < dataSeriesLength; i++) {
+		if (rtypes[i] == 1)
+			(*data)[i] = tdps->reservedValue;
+	}
+
+	/*
+	 * get the decompressed data
+	 * NOTE(review): the temporal (_ts) decoder is never used on this path - confirm.
+	 */
+	float* decmpData;
+	if (errBoundMode < PW_REL) {
+		decompressDataSeries_float_4D(&decmpData, r1, r2, r3, r4, tdps);
+	} else {
+		/* ToDo: switch to decompressDataSeries_float_4D_pwr(&decmpData, r1, r2, r3, r4, tdps) once it exists */
+		decompressDataSeries_float_3D_pwr(&decmpData, r1*r2, r3, r4, tdps);
+	}
+
+	/* insert the decompressed data into the positions not marked as reserved */
+	size_t k = 0;
+	for (i = 0; i < dataSeriesLength; i++) {
+		if (rtypes[i] == 0) {
+			(*data)[i] = decmpData[k++];
+		}
+	}
+
+	free(decmpData);
+	free(rtypes);
+}
+
+/**
+ * Decode one 3D block of block_dim_0 x block_dim_1 x block_dim_2 floats into data.
+ *
+ * Every element has one code in type[], in block order (layer, row, column).
+ * Code 0 copies the next unpredictable_data entry verbatim; any other code
+ * stores prediction + 2*(code - exe_params->intvRadius)*realPrecision, where
+ * the prediction is built from neighbours that are already decoded.
+ *
+ * @param data               where the block's first element is written
+ * @param mean               prediction used for the block's first element
+ * @param dim_0              not used
+ * @param dim_1              with dim_2, gives the distance between layers in data
+ *                           (dim_1*dim_2 floats)
+ * @param dim_2              distance between consecutive rows in data, in floats
+ * @param block_dim_0        number of layers in the block
+ * @param block_dim_1        number of rows per layer
+ * @param block_dim_2        number of elements per row
+ * @param realPrecision      half of the quantization step
+ * @param type               quantization codes, one per block element
+ * @param unpredictable_data values for the elements whose code is 0
+ * @return number of entries consumed from unpredictable_data
+ */
+size_t decompressDataSeries_float_3D_RA_block(float * data, float mean, size_t dim_0, size_t dim_1, size_t dim_2, size_t block_dim_0, size_t block_dim_1, size_t block_dim_2, double realPrecision, int * type, float * unpredictable_data){
+
+	size_t dim0_offset = dim_1 * dim_2;	// floats between consecutive layers in data
+	size_t dim1_offset = dim_2;	// floats between consecutive rows in data
+
+	size_t unpredictable_count = 0;
+	size_t r1, r2, r3;
+	r1 = block_dim_0;
+	r2 = block_dim_1;
+	r3 = block_dim_2;
+
+	float * cur_data_pos = data;
+	float * last_row_pos;
+	float pred1D, pred2D, pred3D;
+	size_t i, j, k;
+	size_t r23 = r2*r3;
+	int type_;
+	// Process Row-0 data 0: predicted from the block mean
+	pred1D = mean;
+	type_ = type[0];
+	if (type_ != 0){
+		cur_data_pos[0] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+	}
+	else{
+		// code 0: take the next stored value as is
+		cur_data_pos[0] = unpredictable_data[unpredictable_count ++];
+	}
+
+	/* Process Row-0 data 1: predicted from data 0 (done unconditionally, even if r3 < 2) */
+	pred1D = cur_data_pos[0];
+	type_ = type[1];
+	if (type_ != 0){
+		cur_data_pos[1] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+	}
+	else{
+		cur_data_pos[1] = unpredictable_data[unpredictable_count ++];
+	}
+	/* Process Row-0 data 2 --> data r3-1: extrapolated from the two previous values */
+	for (j = 2; j < r3; j++){
+		pred1D = 2*cur_data_pos[j-1] - cur_data_pos[j-2];
+		type_ = type[j];
+		if (type_ != 0){
+			cur_data_pos[j] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else{
+			cur_data_pos[j] = unpredictable_data[unpredictable_count ++];
+		}
+	}
+
+	last_row_pos = cur_data_pos;
+	cur_data_pos += dim1_offset;
+
+	/* Process Row-1 --> Row-r2-1 of layer 0 */
+	size_t index;
+	for (i = 1; i < r2; i++)
+	{
+		/* Process row-i data 0: predicted from the element above it */
+		index = i*r3;	
+		pred1D = last_row_pos[0];
+		type_ = type[index];
+		if (type_ != 0){
+			cur_data_pos[0] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else{
+			cur_data_pos[0] = unpredictable_data[unpredictable_count ++];
+		}
+		/* Process row-i data 1 --> data r3-1: left + above - above-left */
+		for (j = 1; j < r3; j++)
+		{
+			index = i*r3+j;
+			pred2D = cur_data_pos[j-1] + last_row_pos[j] - last_row_pos[j-1];
+			type_ = type[index];
+			if (type_ != 0){
+				cur_data_pos[j] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else{
+				cur_data_pos[j] = unpredictable_data[unpredictable_count ++];
+			}
+		}
+		last_row_pos = cur_data_pos;
+		cur_data_pos += dim1_offset;
+	}
+	// r2 rows were stepped through; move on to row 0 of the next layer of data
+	cur_data_pos += dim0_offset - r2 * dim1_offset;
+
+	///////////////////////////	Process layer-1 --> layer-r1-1 ///////////////////////////
+
+	for (k = 1; k < r1; k++)
+	{
+		// NOTE(review): dim0_offset is size_t, so "- dim0_offset" wraps to a huge
+		// unsigned value; the subscripts below rely on the pointer arithmetic
+		// wrapping back to the same position in the previous layer.
+		/* Process Row-0 data 0: predicted from the same position in the previous layer */
+		index = k*r23;
+		pred1D = cur_data_pos[- dim0_offset];
+		type_ = type[index];
+		if (type_ != 0){
+			cur_data_pos[0] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else{
+			cur_data_pos[0] = unpredictable_data[unpredictable_count ++];
+		}
+		/* Process Row-0 data 1 --> data r3-1: left + previous layer - previous layer's left */
+		for (j = 1; j < r3; j++)
+		{
+			// index == k*r2*r3 + j
+			index ++;
+			pred2D = cur_data_pos[j-1] + cur_data_pos[j - dim0_offset] - cur_data_pos[j - 1 - dim0_offset];
+			type_ = type[index];
+			if (type_ != 0){
+				cur_data_pos[j] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else{
+				cur_data_pos[j] = unpredictable_data[unpredictable_count ++];
+			}
+		}
+		last_row_pos = cur_data_pos;
+		cur_data_pos += dim1_offset;
+
+		/* Process Row-1 --> Row-r2-1 */
+		for (i = 1; i < r2; i++)
+		{
+			/* Process Row-i data 0: above + previous layer - previous layer's above */
+			index = k*r23 + i*r3;
+			pred2D = last_row_pos[0] + cur_data_pos[- dim0_offset] - last_row_pos[- dim0_offset];
+			type_ = type[index];
+			if (type_ != 0){
+				cur_data_pos[0] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else{
+				cur_data_pos[0] = unpredictable_data[unpredictable_count ++];
+			}
+
+			/* Process Row-i data 1 --> data r3-1: combines the seven decoded neighbours */
+			for (j = 1; j < r3; j++)
+			{
+				// index == k*r2*r3 + i*r3 + j
+				index ++;
+				pred3D = cur_data_pos[j-1] + last_row_pos[j]+ cur_data_pos[j - dim0_offset] - last_row_pos[j-1] - last_row_pos[j - dim0_offset] - cur_data_pos[j-1 - dim0_offset] + last_row_pos[j-1 - dim0_offset];
+				type_ = type[index];
+				if (type_ != 0){
+					cur_data_pos[j] = pred3D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				}
+				else{
+					cur_data_pos[j] = unpredictable_data[unpredictable_count ++];
+				}
+			}
+			last_row_pos = cur_data_pos;
+			cur_data_pos += dim1_offset;
+		}
+		// move on to row 0 of the next layer of data
+		cur_data_pos += dim0_offset - r2 * dim1_offset;
+	}
+
+	return unpredictable_count;
+}
+
+/* Decode one 1D block: writes block_dim_0 floats to data[0..block_dim_0-1].
+ * A zero code in type[] takes the next unpredictable_data entry; any other
+ * code gives previous value + 2*(code - exe_params->intvRadius)*realPrecision,
+ * with mean standing in for the previous value of the first element.
+ * Returns the number of unpredictable_data entries used. dim_0 is not used. */
+size_t decompressDataSeries_float_1D_RA_block(float * data, float mean, size_t dim_0, size_t block_dim_0, double realPrecision, int * type, float * unpredictable_data){
+	size_t consumed = 0;	/* entries read from unpredictable_data so far */
+	float prev = mean;	/* most recently decoded value (mean before the first) */
+	size_t pos;
+
+	for (pos = 0; pos < block_dim_0; pos++) {
+		int code = type[pos];
+
+		if (code != 0) {
+			/* quantized offset from the previous value */
+			data[pos] = prev + 2 * (code - exe_params->intvRadius) * realPrecision;
+		}
+		else {
+			/* stored verbatim */
+			data[pos] = unpredictable_data[consumed++];
+		}
+		prev = data[pos];
+	}
+	return consumed;
+}
+
+/**
+ * Decode one 2D block of block_dim_0 x block_dim_1 floats into data.
+ *
+ * Consecutive block rows are dim_1 floats apart in data; dim_0 is not used.
+ * type[] holds one code per element in block row-major order: code 0 copies
+ * the next unpredictable_data entry verbatim, any other code stores
+ * prediction + 2*(code - exe_params->intvRadius)*realPrecision.
+ *
+ * Predictions: the block mean for element (0,0); the left neighbour for (0,1);
+ * a two-point extrapolation along the rest of row 0; the value above for the
+ * first element of later rows; left + above - above-left everywhere else.
+ *
+ * Returns the number of entries consumed from unpredictable_data.
+ */
+size_t decompressDataSeries_float_2D_RA_block(float * data, float mean, size_t dim_0, size_t dim_1, size_t block_dim_0, size_t block_dim_1, double realPrecision, int * type, float * unpredictable_data){
+	const size_t rows = block_dim_0;
+	const size_t cols = block_dim_1;
+	const size_t stride = dim_1;	/* distance between consecutive rows of data */
+	size_t used = 0;	/* entries consumed from unpredictable_data */
+	size_t r, c;
+	float *row = data;	/* row being decoded */
+	float *above;	/* previously decoded row */
+	float guess;
+	int code;
+
+	/* Row 0, element 0 */
+	guess = mean;
+	code = type[0];
+	if (code == 0) {
+		row[0] = unpredictable_data[used++];
+	}
+	else {
+		row[0] = guess + 2 * (code - exe_params->intvRadius) * realPrecision;
+	}
+
+	/* Row 0, element 1 (decoded unconditionally, even when cols < 2) */
+	guess = row[0];
+	code = type[1];
+	if (code == 0) {
+		row[1] = unpredictable_data[used++];
+	}
+	else {
+		row[1] = guess + 2 * (code - exe_params->intvRadius) * realPrecision;
+	}
+
+	/* Row 0, elements 2 .. cols-1 */
+	for (c = 2; c < cols; c++) {
+		guess = 2*row[c-1] - row[c-2];
+		code = type[c];
+		if (code == 0) {
+			row[c] = unpredictable_data[used++];
+		}
+		else {
+			row[c] = guess + 2 * (code - exe_params->intvRadius) * realPrecision;
+		}
+	}
+
+	/* Rows 1 .. rows-1 */
+	for (r = 1; r < rows; r++) {
+		above = row;
+		row += stride;
+
+		/* Element 0 of the row */
+		code = type[r*cols];
+		if (code == 0) {
+			row[0] = unpredictable_data[used++];
+		}
+		else {
+			guess = above[0];
+			row[0] = guess + 2 * (code - exe_params->intvRadius) * realPrecision;
+		}
+
+		/* Elements 1 .. cols-1 of the row */
+		for (c = 1; c < cols; c++) {
+			guess = row[c-1] + above[c] - above[c-1];
+			code = type[r*cols + c];
+			if (code == 0) {
+				row[c] = unpredictable_data[used++];
+			}
+			else {
+				row[c] = guess + 2 * (code - exe_params->intvRadius) * realPrecision;
+			}
+		}
+	}
+
+	return used;
+}
+
diff --git a/thirdparty/SZ/sz/src/szd_float_pwr.c b/thirdparty/SZ/sz/src/szd_float_pwr.c
new file mode 100644
index 0000000000000000000000000000000000000000..4ab18341f0fe136627920e0dd8c6264dff802a14
--- /dev/null
+++ b/thirdparty/SZ/sz/src/szd_float_pwr.c
@@ -0,0 +1,1352 @@
+/**
+ *  @file szd_float_pwr.c
+ *  @author Sheng Di
+ *  @date Aug, 2016
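+ *  @brief Float decompression routines used when the error-bound mode is PW_REL or above (per-segment / per-block precisions).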
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdlib.h> 
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include "TightDataPointStorageF.h"
+#include "CompressElement.h"
+#include "sz.h"
+#include "Huffman.h"
+#include "sz_float_pwr.h"
+//#include "rw.h"
+//
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wchar-subscripts"
+
+
+/* Decode dataSeriesLength floats into a newly malloc'ed *data.
+ * A fresh error bound is read from tdps->pwrErrBoundBytes, two bytes at a time,
+ * whenever i is a multiple of tdps->segment_size. In the type[] array filled
+ * by decode_withTree, code 0 marks a value rebuilt from stored bytes; any other
+ * code is a quantized offset from the previously decoded value. */
+void decompressDataSeries_float_1D_pwr(float** data, size_t dataSeriesLength, TightDataPointStorageF* tdps) 
+{
+	updateQuantizationInfo(tdps->intervals);
+	unsigned char tmpPrecBytes[4] = {0}; //used when needing to convert bytes to float values
+	unsigned char* bp = tdps->pwrErrBoundBytes;
+	size_t i, j, k = 0, p = 0, l = 0; // k: number of residual bits consumed so far,
+								// p: current byte index into residualMidBits,
+								// l: next index into leadNum
+	unsigned char* leadNum;
+	float interval = 0;// quantization step (2*realPrecision), refreshed per segment
+	
+	convertByteArray2IntArray_fast_2b(tdps->exactDataNum, tdps->leadNumArray, tdps->leadNumArray_size, &leadNum);
+
+	*data = (float*)malloc(sizeof(float)*dataSeriesLength);
+
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int));
+
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);	
+
+	unsigned char preBytes[4];
+	unsigned char curBytes[4];
+	
+	memset(preBytes, 0, 4);
+
+	size_t curByteIndex = 0;
+	int reqLength = 0, reqBytesLength = 0, resiBitsLength = 0, resiBits = 0; 
+	unsigned char leadingNum;	
+	float medianValue, exactData, predValue = 0, realPrecision = 0;
+	
+	medianValue = tdps->medianValue;
+	
+	int type_, updateReqLength = 0;
+	for (i = 0; i < dataSeriesLength; i++) 
+	{
+		if(i%tdps->segment_size==0) // start of a segment: load its error bound
+		{
+			tmpPrecBytes[0] = *(bp++);
+			tmpPrecBytes[1] = *(bp++);
+			tmpPrecBytes[2] = 0;
+			tmpPrecBytes[3] = 0;
+			realPrecision = bytesToFloat(tmpPrecBytes);
+			interval = realPrecision*2;
+			updateReqLength = 0; // reqLength has to be recomputed for the new bound
+		}
+		
+		type_ = type[i];
+		switch (type_) {
+		case 0:
+			// compute resiBits
+			if(updateReqLength==0)
+			{
+				computeReqLength_float(realPrecision, tdps->radExpo, &reqLength, &medianValue);
+				reqBytesLength = reqLength/8;
+				resiBitsLength = reqLength%8;	
+				updateReqLength = 1;	
+			}
+			resiBits = 0;
+			if (resiBitsLength != 0) {
+				int kMod8 = k % 8;
+				int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+				if (rightMovSteps > 0) {
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+				} else if (rightMovSteps < 0) {
+					int code1 = getLeftMovingCode(kMod8);
+					int code2 = getRightMovingCode(kMod8, resiBitsLength);
+					int leftMovSteps = -rightMovSteps;
+					rightMovSteps = 8 - leftMovSteps;
+					resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+					p++;
+					resiBits = resiBits
+							| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+				} else // rightMovSteps == 0
+				{
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code);
+					p++;
+				}
+				k += resiBitsLength;
+			}
+
+			// recover the exact data	
+			memset(curBytes, 0, 4);
+			leadingNum = leadNum[l++];
+			memcpy(curBytes, preBytes, leadingNum); // leading bytes are reused from the previous exact value
+			for (j = leadingNum; j < reqBytesLength; j++)
+				curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+			if (resiBitsLength != 0) {
+				unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+				curBytes[reqBytesLength] = resiByte;
+			}
+			
+			exactData = bytesToFloat(curBytes);
+			(*data)[i] = exactData + medianValue;
+			memcpy(preBytes,curBytes,4);
+			break;
+		default:
+			// NOTE(review): reads (*data)[i-1], so this assumes type[0] == 0 -- TODO confirm
+			predValue = (*data)[i-1];
+			(*data)[i] = predValue + (type_-exe_params->intvRadius)*interval;
+			break;
+		}
+	}
+	free(leadNum);
+	free(type);
+	return;
+}
+
+/* Expand the 2-byte-per-block precisions in tdps->pwrErrBoundBytes into a
+ * malloc'ed array of R1*R2 floats (row-major). blockSize is not used. */
+float* extractRealPrecision_2D_float(size_t R1, size_t R2, int blockSize, TightDataPointStorageF* tdps)
+{
+	const unsigned char* src = tdps->pwrErrBoundBytes;
+	size_t blockCount = R1*R2;
+	float* precisions = (float*)malloc(sizeof(float)*R1*R2);
+	unsigned char word[4] = {0};	/* bytes 2..3 stay zero */
+	size_t n;
+
+	for (n = 0; n < blockCount; n++)
+	{
+		word[0] = src[2*n];
+		word[1] = src[2*n + 1];
+		precisions[n] = bytesToFloat(word);
+	}
+	return precisions;
+}
+
+/* Decode an r1 x r2 float array (row-major) into a newly malloc'ed *data.
+ * The array is tiled into blockSize x blockSize blocks, each with its own
+ * error bound in pwrErrBound[]; element 0 is always read as an exact value,
+ * later elements are exact when their type[] code is 0 and predicted otherwise. */
+void decompressDataSeries_float_2D_pwr(float** data, size_t r1, size_t r2, TightDataPointStorageF* tdps) 
+{
+	updateQuantizationInfo(tdps->intervals);
+	size_t j, k = 0, p = 0, l = 0; // k: number of residual bits consumed so far,
+	// p: current byte index into residualMidBits,
+	// l: next index into leadNum
+	size_t dataSeriesLength = r1*r2;
+
+	unsigned char* leadNum;
+
+	convertByteArray2IntArray_fast_2b(tdps->exactDataNum, tdps->leadNumArray, tdps->leadNumArray_size, &leadNum);
+
+	*data = (float*)malloc(sizeof(float)*dataSeriesLength);
+
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int));
+
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);	
+
+	unsigned char preBytes[4];
+	unsigned char curBytes[4];
+
+	memset(preBytes, 0, 4);
+
+	size_t curByteIndex = 0;
+	int reqLength, reqBytesLength, resiBitsLength, resiBits; 
+	unsigned char leadingNum;	
+	float medianValue, exactData, realPrecision;
+	int type_;	
+	float pred1D, pred2D;
+	size_t ii, jj, II = 0, JJ = 0, updateReqLength = 1; // II/JJ: block row/column of the current element
+
+	int blockSize = computeBlockEdgeSize_2D(tdps->segment_size);
+	size_t R1 = 1+(r1-1)/blockSize; // number of block rows
+	size_t R2 = 1+(r2-1)/blockSize;		// number of block columns
+	float* pwrErrBound = extractRealPrecision_2D_float(R1, R2, blockSize, tdps);
+
+	realPrecision = pwrErrBound[0];	
+	computeReqLength_float(realPrecision, tdps->radExpo, &reqLength, &medianValue);
+	reqBytesLength = reqLength/8;
+	resiBitsLength = reqLength%8;
+
+	/* Process Row-0, data 0: always read as an exact value (type[0] is not consulted) */
+
+	// compute resiBits
+	resiBits = 0;
+	if (resiBitsLength != 0) {
+		int kMod8 = k % 8;
+		int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+		if (rightMovSteps > 0) {
+			int code = getRightMovingCode(kMod8, resiBitsLength);
+			resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+		} else if (rightMovSteps < 0) {
+			int code1 = getLeftMovingCode(kMod8);
+			int code2 = getRightMovingCode(kMod8, resiBitsLength);
+			int leftMovSteps = -rightMovSteps;
+			rightMovSteps = 8 - leftMovSteps;
+			resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+			p++;
+			resiBits = resiBits
+					| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+		} else // rightMovSteps == 0
+		{
+			int code = getRightMovingCode(kMod8, resiBitsLength);
+			resiBits = (tdps->residualMidBits[p] & code);
+			p++;
+		}
+		k += resiBitsLength;
+	}
+
+	// recover the exact data
+	memset(curBytes, 0, 4);
+	leadingNum = leadNum[l++];
+	memcpy(curBytes, preBytes, leadingNum);
+	for (j = leadingNum; j < reqBytesLength; j++)
+		curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+	if (resiBitsLength != 0) {
+		unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+		curBytes[reqBytesLength] = resiByte;
+	}
+
+	exactData = bytesToFloat(curBytes);
+	(*data)[0] = exactData + medianValue;
+	memcpy(preBytes,curBytes,4);
+
+	/* Process Row-0, data 1: predicted from data 0 unless its code is 0 */
+	type_ = type[1];
+	if (type_ != 0)
+	{
+		pred1D = (*data)[0];
+		(*data)[1] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+	}
+	else
+	{
+		// compute resiBits		
+		resiBits = 0;
+		if (resiBitsLength != 0) {
+			int kMod8 = k % 8;
+			int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+			if (rightMovSteps > 0) {
+				int code = getRightMovingCode(kMod8, resiBitsLength);
+				resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+			} else if (rightMovSteps < 0) {
+				int code1 = getLeftMovingCode(kMod8);
+				int code2 = getRightMovingCode(kMod8, resiBitsLength);
+				int leftMovSteps = -rightMovSteps;
+				rightMovSteps = 8 - leftMovSteps;
+				resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+				p++;
+				resiBits = resiBits
+						| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+			} else // rightMovSteps == 0
+			{
+				int code = getRightMovingCode(kMod8, resiBitsLength);
+				resiBits = (tdps->residualMidBits[p] & code);
+				p++;
+			}
+			k += resiBitsLength;
+		}
+
+		// recover the exact data
+		memset(curBytes, 0, 4);
+		leadingNum = leadNum[l++];
+		memcpy(curBytes, preBytes, leadingNum);
+		for (j = leadingNum; j < reqBytesLength; j++)
+			curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+		if (resiBitsLength != 0) {
+			unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+			curBytes[reqBytesLength] = resiByte;
+		}
+
+		exactData = bytesToFloat(curBytes);
+		(*data)[1] = exactData + medianValue;
+		memcpy(preBytes,curBytes,4);
+	}
+
+	/* Process Row-0, data 2 --> data r2-1: extrapolated from the two previous values */
+	for (jj = 2; jj < r2; jj++)
+	{	
+		if(jj%blockSize==0) // entering the next block column: switch to its error bound
+		{
+			II = 0;
+			JJ++;
+			realPrecision = pwrErrBound[JJ];
+			updateReqLength = 0;			
+		}
+
+		type_ = type[jj];
+		if (type_ != 0)
+		{
+			pred1D = 2*(*data)[jj-1] - (*data)[jj-2];						
+			(*data)[jj] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			// compute resiBits
+			if(updateReqLength==0)
+			{
+				computeReqLength_float(realPrecision, tdps->radExpo, &reqLength, &medianValue);
+				reqBytesLength = reqLength/8;
+				resiBitsLength = reqLength%8;				
+				updateReqLength = 1;
+			}
+			
+			resiBits = 0;
+			if (resiBitsLength != 0) {
+				int kMod8 = k % 8;
+				int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+				if (rightMovSteps > 0) {
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+				} else if (rightMovSteps < 0) {
+					int code1 = getLeftMovingCode(kMod8);
+					int code2 = getRightMovingCode(kMod8, resiBitsLength);
+					int leftMovSteps = -rightMovSteps;
+					rightMovSteps = 8 - leftMovSteps;
+					resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+					p++;
+					resiBits = resiBits
+							| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+				} else // rightMovSteps == 0
+				{
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code);
+					p++;
+				}
+				k += resiBitsLength;
+			}
+
+			// recover the exact data
+			memset(curBytes, 0, 4);
+			leadingNum = leadNum[l++];
+			memcpy(curBytes, preBytes, leadingNum);
+			for (j = leadingNum; j < reqBytesLength; j++)
+				curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+			if (resiBitsLength != 0) {
+				unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+				curBytes[reqBytesLength] = resiByte;
+			}
+
+			exactData = bytesToFloat(curBytes);
+			(*data)[jj] = exactData + medianValue;
+			memcpy(preBytes,curBytes,4);
+		}
+	}
+
+	size_t index;
+	/* Process Row-1 --> Row-r1-1 */
+	for (ii = 1; ii < r1; ii++)
+	{
+		/* Process row-ii data 0: predicted from the element above it */		
+		if(ii%blockSize==0)
+			II++;
+		JJ = 0;
+		realPrecision = pwrErrBound[II*R2+JJ];				
+		updateReqLength = 0;
+		
+		index = ii*r2;
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			pred1D = (*data)[index-r2];		
+			(*data)[index] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			// compute resiBits
+			if(updateReqLength==0)
+			{
+				computeReqLength_float(realPrecision, tdps->radExpo, &reqLength, &medianValue);
+				reqBytesLength = reqLength/8;
+				resiBitsLength = reqLength%8;				
+				updateReqLength = 1;
+			}
+			
+			resiBits = 0;
+			if (resiBitsLength != 0) {
+				int kMod8 = k % 8;
+				int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+				if (rightMovSteps > 0) {
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+				} else if (rightMovSteps < 0) {
+					int code1 = getLeftMovingCode(kMod8);
+					int code2 = getRightMovingCode(kMod8, resiBitsLength);
+					int leftMovSteps = -rightMovSteps;
+					rightMovSteps = 8 - leftMovSteps;
+					resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+					p++;
+					resiBits = resiBits
+							| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+				} else // rightMovSteps == 0
+				{
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code);
+					p++;
+				}
+				k += resiBitsLength;
+			}
+
+			// recover the exact data
+			memset(curBytes, 0, 4);
+			leadingNum = leadNum[l++];
+			memcpy(curBytes, preBytes, leadingNum);
+			for (j = leadingNum; j < reqBytesLength; j++)
+				curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+			if (resiBitsLength != 0) {
+				unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+				curBytes[reqBytesLength] = resiByte;
+			}
+
+			exactData = bytesToFloat(curBytes);
+			(*data)[index] = exactData + medianValue;
+			memcpy(preBytes,curBytes,4);
+		}
+
+		/* Process row-ii data 1 --> r2-1: left + above - above-left */
+		for (jj = 1; jj < r2; jj++)
+		{
+			index = ii*r2+jj;
+			
+			if(jj%blockSize==0)
+				JJ++;
+			realPrecision = pwrErrBound[II*R2+JJ];			
+			updateReqLength = 0;			// NOTE(review): reset for every element, so reqLength is recomputed for each exact value
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				pred2D = (*data)[index-1] + (*data)[index-r2] - (*data)[index-r2-1];
+				(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				// compute resiBits
+				if(updateReqLength==0)
+				{
+					computeReqLength_float(realPrecision, tdps->radExpo, &reqLength, &medianValue);
+					reqBytesLength = reqLength/8;
+					resiBitsLength = reqLength%8;				
+					updateReqLength = 1;
+				}				
+				
+				resiBits = 0;
+				if (resiBitsLength != 0) {
+					int kMod8 = k % 8;
+					int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+					if (rightMovSteps > 0) {
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+					} else if (rightMovSteps < 0) {
+						int code1 = getLeftMovingCode(kMod8);
+						int code2 = getRightMovingCode(kMod8, resiBitsLength);
+						int leftMovSteps = -rightMovSteps;
+						rightMovSteps = 8 - leftMovSteps;
+						resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+						p++;
+						resiBits = resiBits
+								| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+					} else // rightMovSteps == 0
+					{
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code);
+						p++;
+					}
+					k += resiBitsLength;
+				}
+
+				// recover the exact data
+				memset(curBytes, 0, 4);
+				leadingNum = leadNum[l++];
+				memcpy(curBytes, preBytes, leadingNum);
+				for (j = leadingNum; j < reqBytesLength; j++)
+					curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+				if (resiBitsLength != 0) {
+					unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+					curBytes[reqBytesLength] = resiByte;
+				}
+
+				exactData = bytesToFloat(curBytes);
+				(*data)[index] = exactData + medianValue;
+				memcpy(preBytes,curBytes,4);
+			}
+		}
+	}
+
+	free(pwrErrBound);
+	free(leadNum);
+	free(type);
+	return;
+}
+
+/* Expand the per-block precisions stored in tdps->pwrErrBoundBytes (two bytes
+ * each) into a malloc'ed array of R1*R2*R3 floats; entry (i,j,k) lands at
+ * index i*R2*R3 + j*R3 + k. The two stored bytes fill bytes 0..1 of a
+ * zero-initialized 4-byte buffer handed to bytesToFloat(). blockSize is
+ * not used. */
+float* extractRealPrecision_3D_float(size_t R1, size_t R2, size_t R3, int blockSize, TightDataPointStorageF* tdps)
+{
+	const unsigned char* src = tdps->pwrErrBoundBytes;
+	size_t blockCount = R1*R2*R3;
+	float* precisions = (float*)malloc(sizeof(float)*R1*R2*R3);
+	unsigned char word[4] = {0};
+	size_t n;
+
+	/* blocks are stored in the same order as the output array, so one flat
+	 * pass over all R1*R2*R3 entries is enough */
+	for (n = 0; n < blockCount; n++)
+	{
+		word[0] = src[2*n];
+		word[1] = src[2*n + 1];
+		precisions[n] = bytesToFloat(word);
+	}
+	return precisions;
+}
+
+void decompressDataSeries_float_3D_pwr(float** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageF* tdps) 
+{
+	updateQuantizationInfo(tdps->intervals);
+	size_t j, k = 0, p = 0, l = 0; // k is to track the location of residual_bit
+	// in resiMidBits, p is to track the
+	// byte_index of resiMidBits, l is for
+	// leadNum
+	size_t dataSeriesLength = r1*r2*r3;
+	size_t r23 = r2*r3;
+//	printf ("%d %d %d\n", r1, r2, r3);
+	unsigned char* leadNum;
+
+	convertByteArray2IntArray_fast_2b(tdps->exactDataNum, tdps->leadNumArray, tdps->leadNumArray_size, &leadNum);
+
+	*data = (float*)malloc(sizeof(float)*dataSeriesLength);
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int));
+
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);	
+
+	unsigned char preBytes[4];
+	unsigned char curBytes[4];
+
+	memset(preBytes, 0, 4);
+	size_t curByteIndex = 0;
+	int reqLength, reqBytesLength, resiBitsLength, resiBits; 
+	unsigned char leadingNum;
+	float medianValue, exactData, realPrecision;
+	int type_;
+	float pred1D, pred2D, pred3D;
+	size_t ii, jj, kk, II = 0, JJ = 0, KK = 0, updateReqLength = 1;
+
+	int blockSize = computeBlockEdgeSize_3D(tdps->segment_size);
+	size_t R1 = 1+(r1-1)/blockSize;
+	size_t R2 = 1+(r2-1)/blockSize;		
+	size_t R3 = 1+(r3-1)/blockSize;
+	size_t R23 = R2*R3;
+	float* pwrErrBound = extractRealPrecision_3D_float(R1, R2, R3, blockSize, tdps);
+
+	realPrecision = pwrErrBound[0];	
+	computeReqLength_float(realPrecision, tdps->radExpo, &reqLength, &medianValue);
+	reqBytesLength = reqLength/8;
+	resiBitsLength = reqLength%8;
+
+	///////////////////////////	Process layer-0 ///////////////////////////
+	/* Process Row-0 data 0*/
+	// compute resiBits
+	resiBits = 0;
+	if (resiBitsLength != 0) {
+		int kMod8 = k % 8;
+		int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+		if (rightMovSteps > 0) {
+			int code = getRightMovingCode(kMod8, resiBitsLength);
+			resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+		} else if (rightMovSteps < 0) {
+			int code1 = getLeftMovingCode(kMod8);
+			int code2 = getRightMovingCode(kMod8, resiBitsLength);
+			int leftMovSteps = -rightMovSteps;
+			rightMovSteps = 8 - leftMovSteps;
+			resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+			p++;
+			resiBits = resiBits
+					| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+		} else // rightMovSteps == 0
+		{
+			int code = getRightMovingCode(kMod8, resiBitsLength);
+			resiBits = (tdps->residualMidBits[p] & code);
+			p++;
+		}
+		k += resiBitsLength;
+	}
+
+	// recover the exact data
+	memset(curBytes, 0, 4);
+	leadingNum = leadNum[l++];
+	memcpy(curBytes, preBytes, leadingNum);
+	for (j = leadingNum; j < reqBytesLength; j++)
+		curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+	if (resiBitsLength != 0) {
+		unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+		curBytes[reqBytesLength] = resiByte;
+	}
+	exactData = bytesToFloat(curBytes);
+	(*data)[0] = exactData + medianValue;
+	memcpy(preBytes,curBytes,4);
+
+	/* Process Row-0, data 1 */
+	pred1D = (*data)[0];
+
+	type_ = type[1];
+	if (type_ != 0)
+	{
+		(*data)[1] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+	}
+	else
+	{
+		// compute resiBits
+		resiBits = 0;
+		if (resiBitsLength != 0) {
+			int kMod8 = k % 8;
+			int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+			if (rightMovSteps > 0) {
+				int code = getRightMovingCode(kMod8, resiBitsLength);
+				resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+			} else if (rightMovSteps < 0) {
+				int code1 = getLeftMovingCode(kMod8);
+				int code2 = getRightMovingCode(kMod8, resiBitsLength);
+				int leftMovSteps = -rightMovSteps;
+				rightMovSteps = 8 - leftMovSteps;
+				resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+				p++;
+				resiBits = resiBits
+						| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+			} else // rightMovSteps == 0
+			{
+				int code = getRightMovingCode(kMod8, resiBitsLength);
+				resiBits = (tdps->residualMidBits[p] & code);
+				p++;
+			}
+			k += resiBitsLength;
+		}
+
+		// recover the exact data
+		memset(curBytes, 0, 4);
+		leadingNum = leadNum[l++];
+		memcpy(curBytes, preBytes, leadingNum);
+		for (j = leadingNum; j < reqBytesLength; j++)
+			curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+		if (resiBitsLength != 0) {
+			unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+			curBytes[reqBytesLength] = resiByte;
+		}
+
+		exactData = bytesToFloat(curBytes);
+		(*data)[1] = exactData + medianValue;
+		memcpy(preBytes,curBytes,4);
+	}
+	/* Process Row-0, data 2 --> data r3-1 */
+	for (jj = 2; jj < r3; jj++)
+	{
+		if(jj%blockSize==0)
+		{
+			KK = 0;//dimension 1 (top)
+			II = 0;//dimension 2 (mid)
+			JJ++;
+			realPrecision = pwrErrBound[JJ];
+			updateReqLength = 0;			
+		}		
+		type_ = type[jj];
+		if (type_ != 0)
+		{
+			pred1D = 2*(*data)[jj-1] - (*data)[jj-2];
+			(*data)[jj] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			// compute resiBits
+			if(updateReqLength==0)
+			{
+				computeReqLength_float(realPrecision, tdps->radExpo, &reqLength, &medianValue);
+				reqBytesLength = reqLength/8;
+				resiBitsLength = reqLength%8;				
+				updateReqLength = 1;
+			}
+
+			resiBits = 0;
+			if (resiBitsLength != 0) {
+				int kMod8 = k % 8;
+				int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+				if (rightMovSteps > 0) {
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+				} else if (rightMovSteps < 0) {
+					int code1 = getLeftMovingCode(kMod8);
+					int code2 = getRightMovingCode(kMod8, resiBitsLength);
+					int leftMovSteps = -rightMovSteps;
+					rightMovSteps = 8 - leftMovSteps;
+					resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+					p++;
+					resiBits = resiBits
+							| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+				} else // rightMovSteps == 0
+				{
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code);
+					p++;
+				}
+				k += resiBitsLength;
+			}
+
+			// recover the exact data
+			memset(curBytes, 0, 4);
+			leadingNum = leadNum[l++];
+			memcpy(curBytes, preBytes, leadingNum);
+			for (j = leadingNum; j < reqBytesLength; j++)
+				curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+			if (resiBitsLength != 0) {
+				unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+				curBytes[reqBytesLength] = resiByte;
+			}
+
+			exactData = bytesToFloat(curBytes);
+			(*data)[jj] = exactData + medianValue;
+			memcpy(preBytes,curBytes,4);
+		}
+	}
+	size_t index;
+	/* Process Row-1 --> Row-r2-1 */
+	for (ii = 1; ii < r2; ii++)
+	{
+		/* Process row-ii data 0 */		
+		if(ii%blockSize==0)
+			II++;		
+		JJ = 0;
+		realPrecision = pwrErrBound[II*R3+JJ];
+		updateReqLength = 0;		
+
+		index = ii*r3;
+		
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			pred1D = (*data)[index-r3];			
+			(*data)[index] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			// compute resiBits
+			if(updateReqLength==0)
+			{
+				computeReqLength_float(realPrecision, tdps->radExpo, &reqLength, &medianValue);
+				reqBytesLength = reqLength/8;
+				resiBitsLength = reqLength%8;				
+				updateReqLength = 1;
+			}
+			
+			resiBits = 0;
+			if (resiBitsLength != 0) {
+				int kMod8 = k % 8;
+				int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+				if (rightMovSteps > 0) {
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+				} else if (rightMovSteps < 0) {
+					int code1 = getLeftMovingCode(kMod8);
+					int code2 = getRightMovingCode(kMod8, resiBitsLength);
+					int leftMovSteps = -rightMovSteps;
+					rightMovSteps = 8 - leftMovSteps;
+					resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+					p++;
+					resiBits = resiBits
+							| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+				} else // rightMovSteps == 0
+				{
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code);
+					p++;
+				}
+				k += resiBitsLength;
+			}
+
+			// recover the exact data
+			memset(curBytes, 0, 4);
+			leadingNum = leadNum[l++];
+			memcpy(curBytes, preBytes, leadingNum);
+			for (j = leadingNum; j < reqBytesLength; j++)
+				curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+			if (resiBitsLength != 0) {
+				unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+				curBytes[reqBytesLength] = resiByte;
+			}
+
+			exactData = bytesToFloat(curBytes);
+			(*data)[index] = exactData + medianValue;
+			memcpy(preBytes,curBytes,4);
+		}
+
+		/* Process row-ii data 1 --> r3-1*/
+		for (jj = 1; jj < r3; jj++)
+		{
+			index = ii*r3+jj;
+
+			if(jj%blockSize==0)
+				JJ++;
+			realPrecision = pwrErrBound[II*R3+JJ];			
+			updateReqLength = 0;			
+			
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				pred2D = (*data)[index-1] + (*data)[index-r3] - (*data)[index-r3-1];				
+				(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				// compute resiBits
+				if(updateReqLength==0)
+				{
+					computeReqLength_float(realPrecision, tdps->radExpo, &reqLength, &medianValue);
+					reqBytesLength = reqLength/8;
+					resiBitsLength = reqLength%8;				
+					updateReqLength = 1;
+				}
+
+				resiBits = 0;
+				if (resiBitsLength != 0) {
+					int kMod8 = k % 8;
+					int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+					if (rightMovSteps > 0) {
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+					} else if (rightMovSteps < 0) {
+						int code1 = getLeftMovingCode(kMod8);
+						int code2 = getRightMovingCode(kMod8, resiBitsLength);
+						int leftMovSteps = -rightMovSteps;
+						rightMovSteps = 8 - leftMovSteps;
+						resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+						p++;
+						resiBits = resiBits
+								| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+					} else // rightMovSteps == 0
+					{
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code);
+						p++;
+					}
+					k += resiBitsLength;
+				}
+
+				// recover the exact data
+				memset(curBytes, 0, 4);
+				leadingNum = leadNum[l++];
+				memcpy(curBytes, preBytes, leadingNum);
+				for (j = leadingNum; j < reqBytesLength; j++)
+					curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+				if (resiBitsLength != 0) {
+					unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+					curBytes[reqBytesLength] = resiByte;
+				}
+
+				exactData = bytesToFloat(curBytes);
+				(*data)[index] = exactData + medianValue;
+				memcpy(preBytes,curBytes,4);
+			}
+		}
+	}
+
+	///////////////////////////	Process layer-1 --> layer-r1-1 ///////////////////////////
+
+	for (kk = 1; kk < r1; kk++)
+	{
+		/* Process Row-0 data 0*/
+		index = kk*r23;		
+		if(kk%blockSize==0)
+			KK++;
+		II = 0;
+		JJ = 0;
+
+		realPrecision = pwrErrBound[KK*R23];			
+		updateReqLength = 0;			
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			pred1D = (*data)[index-r23];			
+			(*data)[index] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			// compute resiBits
+			if(updateReqLength==0)
+			{
+				computeReqLength_float(realPrecision, tdps->radExpo, &reqLength, &medianValue);
+				reqBytesLength = reqLength/8;
+				resiBitsLength = reqLength%8;				
+				updateReqLength = 1;
+			}
+
+			resiBits = 0;
+			if (resiBitsLength != 0) {
+				int kMod8 = k % 8;
+				int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+				if (rightMovSteps > 0) {
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+				} else if (rightMovSteps < 0) {
+					int code1 = getLeftMovingCode(kMod8);
+					int code2 = getRightMovingCode(kMod8, resiBitsLength);
+					int leftMovSteps = -rightMovSteps;
+					rightMovSteps = 8 - leftMovSteps;
+					resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+					p++;
+					resiBits = resiBits
+							| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+				} else // rightMovSteps == 0
+				{
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code);
+					p++;
+				}
+				k += resiBitsLength;
+			}
+
+			// recover the exact data
+			memset(curBytes, 0, 4);
+			leadingNum = leadNum[l++];
+			memcpy(curBytes, preBytes, leadingNum);
+			for (j = leadingNum; j < reqBytesLength; j++)
+				curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+			if (resiBitsLength != 0) {
+				unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+				curBytes[reqBytesLength] = resiByte;
+			}
+
+			exactData = bytesToFloat(curBytes);
+			(*data)[index] = exactData + medianValue;
+			memcpy(preBytes,curBytes,4);
+		}
+
+		/* Process Row-0 data 1 --> data r3-1 */
+		for (jj = 1; jj < r3; jj++)
+		{
+			index = kk*r23+jj;
+
+			if(jj%blockSize==0)
+				JJ++;
+
+			realPrecision = pwrErrBound[KK*R23+JJ];			
+			updateReqLength = 0;			
+			
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				pred2D = (*data)[index-1] + (*data)[index-r23] - (*data)[index-r23-1];			
+				(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				// compute resiBits
+				if(updateReqLength==0)
+				{
+					computeReqLength_float(realPrecision, tdps->radExpo, &reqLength, &medianValue);
+					reqBytesLength = reqLength/8;
+					resiBitsLength = reqLength%8;				
+					updateReqLength = 1;
+				}
+			
+				resiBits = 0;
+				if (resiBitsLength != 0) {
+					int kMod8 = k % 8;
+					int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+					if (rightMovSteps > 0) {
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+					} else if (rightMovSteps < 0) {
+						int code1 = getLeftMovingCode(kMod8);
+						int code2 = getRightMovingCode(kMod8, resiBitsLength);
+						int leftMovSteps = -rightMovSteps;
+						rightMovSteps = 8 - leftMovSteps;
+						resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+						p++;
+						resiBits = resiBits
+								| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+					} else // rightMovSteps == 0
+					{
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code);
+						p++;
+					}
+					k += resiBitsLength;
+				}
+
+				// recover the exact data
+				memset(curBytes, 0, 4);
+				leadingNum = leadNum[l++];
+				memcpy(curBytes, preBytes, leadingNum);
+				for (j = leadingNum; j < reqBytesLength; j++)
+					curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+				if (resiBitsLength != 0) {
+					unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+					curBytes[reqBytesLength] = resiByte;
+				}
+
+				exactData = bytesToFloat(curBytes);
+				(*data)[index] = exactData + medianValue;
+				memcpy(preBytes,curBytes,4);
+			}
+		}
+
+		/* Process Row-1 --> Row-r2-1 */
+		for (ii = 1; ii < r2; ii++)
+		{
+			/* Process Row-i data 0 */
+			index = kk*r23 + ii*r3;
+			
+			if(ii%blockSize==0)
+				II++;
+			JJ = 0;
+			
+			realPrecision = pwrErrBound[KK*R23+II*R3];			
+			updateReqLength = 0;						
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				pred2D = (*data)[index-r3] + (*data)[index-r23] - (*data)[index-r23-r3];				
+				(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				// compute resiBits
+				if(updateReqLength==0)
+				{
+					computeReqLength_float(realPrecision, tdps->radExpo, &reqLength, &medianValue);
+					reqBytesLength = reqLength/8;
+					resiBitsLength = reqLength%8;				
+					updateReqLength = 1;
+				}
+
+				resiBits = 0;
+				if (resiBitsLength != 0) {
+					int kMod8 = k % 8;
+					int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+					if (rightMovSteps > 0) {
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+					} else if (rightMovSteps < 0) {
+						int code1 = getLeftMovingCode(kMod8);
+						int code2 = getRightMovingCode(kMod8, resiBitsLength);
+						int leftMovSteps = -rightMovSteps;
+						rightMovSteps = 8 - leftMovSteps;
+						resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+						p++;
+						resiBits = resiBits
+								| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+					} else // rightMovSteps == 0
+					{
+						int code = getRightMovingCode(kMod8, resiBitsLength);
+						resiBits = (tdps->residualMidBits[p] & code);
+						p++;
+					}
+					k += resiBitsLength;
+				}
+
+				// recover the exact data
+				memset(curBytes, 0, 4);
+				leadingNum = leadNum[l++];
+				memcpy(curBytes, preBytes, leadingNum);
+				for (j = leadingNum; j < reqBytesLength; j++)
+					curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+				if (resiBitsLength != 0) {
+					unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+					curBytes[reqBytesLength] = resiByte;
+				}
+
+				exactData = bytesToFloat(curBytes);
+				(*data)[index] = exactData + medianValue;
+				memcpy(preBytes,curBytes,4);
+			}
+
+			/* Process Row-i data 1 --> data r3-1 */
+			for (jj = 1; jj < r3; jj++)
+			{
+				index = kk*r23 + ii*r3 + jj;
+				if(jj%blockSize==0)
+					JJ++;
+
+				realPrecision = pwrErrBound[KK*R23+II*R3+JJ];			
+				updateReqLength = 0;				
+
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					pred3D = (*data)[index-1] + (*data)[index-r3] + (*data)[index-r23]
+					- (*data)[index-r3-1] - (*data)[index-r23-r3] - (*data)[index-r23-1] + (*data)[index-r23-r3-1];					
+					(*data)[index] = pred3D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					// compute resiBits
+					if(updateReqLength==0)
+					{
+						computeReqLength_float(realPrecision, tdps->radExpo, &reqLength, &medianValue);
+						reqBytesLength = reqLength/8;
+						resiBitsLength = reqLength%8;				
+						updateReqLength = 1;
+					}
+				
+					resiBits = 0;
+					if (resiBitsLength != 0) {
+						int kMod8 = k % 8;
+						int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+						if (rightMovSteps > 0) {
+							int code = getRightMovingCode(kMod8, resiBitsLength);
+							resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+						} else if (rightMovSteps < 0) {
+							int code1 = getLeftMovingCode(kMod8);
+							int code2 = getRightMovingCode(kMod8, resiBitsLength);
+							int leftMovSteps = -rightMovSteps;
+							rightMovSteps = 8 - leftMovSteps;
+							resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+							p++;
+							resiBits = resiBits
+									| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+						} else // rightMovSteps == 0
+						{
+							int code = getRightMovingCode(kMod8, resiBitsLength);
+							resiBits = (tdps->residualMidBits[p] & code);
+							p++;
+						}
+						k += resiBitsLength;
+					}
+
+					// recover the exact data
+					memset(curBytes, 0, 4);
+					leadingNum = leadNum[l++];
+					memcpy(curBytes, preBytes, leadingNum);
+					for (j = leadingNum; j < reqBytesLength; j++)
+						curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+					if (resiBitsLength != 0) {
+						unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+						curBytes[reqBytesLength] = resiByte;
+					}
+					
+					exactData = bytesToFloat(curBytes);
+					(*data)[index] = exactData + medianValue;
+					memcpy(preBytes,curBytes,4);
+				}
+			}
+		}
+
+	}
+
+	free(pwrErrBound);
+	free(leadNum);
+	free(type);
+	return;
+}
+/* Decompresses a 1D float series whose points are predicted per value group (chosen by a signed group ID); *data is malloc'ed here and not freed by this function. */
+void decompressDataSeries_float_1D_pwrgroup(float** data, size_t dataSeriesLength, TightDataPointStorageF* tdps) 
+{
+	float *posGroups, *negGroups, *groups;
+	float pos_01_group = 0, neg_01_group = 0; // last value of group +1 / -1; zeroed so a prediction never reads an indeterminate value
+	int *posFlags, *negFlags;
+	
+	updateQuantizationInfo(tdps->intervals);
+	
+	unsigned char* leadNum;
+	double interval; // quantization step of the current group, assigned in the prediction branch
+	
+	convertByteArray2IntArray_fast_2b(tdps->exactDataNum, tdps->leadNumArray, tdps->leadNumArray_size, &leadNum);
+
+	*data = (float*)malloc(sizeof(float)*dataSeriesLength);
+	// type[i] == 0 marks an exactly stored value; any other code is a quantized offset from the group's previous value
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int));
+
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);	
+	// posGroups/negGroups hold the most recent value of each group; the flag arrays are only created and freed in this function
+	createRangeGroups_float(&posGroups, &negGroups, &posFlags, &negFlags);
+	
+	float realGroupPrecision;
+	float realPrecision = tdps->realPrecision;
+	char* groupID = decompressGroupIDArray(tdps->pwrErrBoundBytes, tdps->dataSeriesLength);
+	
+	//note that the groupID values here are [1,2,3,....,18] or [-1,-2,...,-18]
+	
+	double* groupErrorBounds = generateGroupErrBounds(confparams_dec->errorBoundMode, realPrecision, confparams_dec->pw_relBoundRatio);
+	exe_params->intvRadius = generateGroupMaxIntervalCount(groupErrorBounds); // NOTE(review): this value is overwritten by nbBins just below
+	// radius actually used for decoding: 1/pw_relBoundRatio rounded to nearest, bumped to the next even count
+	size_t nbBins = (size_t)(1/confparams_dec->pw_relBoundRatio + 0.5);
+	if(nbBins%2==1)
+		nbBins++;
+	exe_params->intvRadius = nbBins;
+
+	unsigned char preBytes[4];
+	unsigned char curBytes[4];
+	
+	memset(preBytes, 0, 4);
+
+	size_t curByteIndex = 0;
+	int reqLength, reqBytesLength = 0, resiBitsLength = 0, resiBits; 
+	unsigned char leadingNum;	
+	float medianValue, exactData, curValue, predValue;
+	
+	medianValue = tdps->medianValue;
+	
+	size_t i, j, k = 0, p = 0, l = 0; // k is to track the location of residual_bit
+							// in resiMidBits, p is to track the
+							// byte_index of resiMidBits, l is for
+							// leadNum
+							
+	int type_, updateReqLength = 0;
+	signed char rawGrpID = 0, indexGrpID = 0; // explicitly signed: group IDs may be negative and plain char can be unsigned on some ABIs
+	for (i = 0; i < dataSeriesLength; i++) 
+	{
+		rawGrpID = (signed char)groupID[i];
+		// select the slot that holds the previous value of this point's group
+		if(rawGrpID >= 2)
+		{
+			groups = posGroups;
+			indexGrpID = rawGrpID - 2;
+		}
+		else if(rawGrpID <= -2)
+		{
+			groups = negGroups;
+			indexGrpID = -rawGrpID - 2;		}
+		else if(rawGrpID == 1)
+		{
+			groups = &pos_01_group;
+			indexGrpID = 0;
+		}
+		else //rawGrpID == -1
+		{
+			groups = &neg_01_group;
+			indexGrpID = 0;			
+		}
+		
+		type_ = type[i];
+		switch (type_) {
+		case 0:
+			// the required bit length is computed once, on the first exactly stored value
+			if(updateReqLength==0)
+			{
+				computeReqLength_float(realPrecision, tdps->radExpo, &reqLength, &medianValue);
+				reqBytesLength = reqLength/8;
+				resiBitsLength = reqLength%8;	
+				updateReqLength = 1;	
+			}
+			resiBits = 0; // extract the next resiBitsLength residual bits from the packed residualMidBits stream
+			if (resiBitsLength != 0) {
+				int kMod8 = k % 8;
+				int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+				if (rightMovSteps > 0) {
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+				} else if (rightMovSteps < 0) { // residual is split across bytes p and p+1
+					int code1 = getLeftMovingCode(kMod8);
+					int code2 = getRightMovingCode(kMod8, resiBitsLength);
+					int leftMovSteps = -rightMovSteps;
+					rightMovSteps = 8 - leftMovSteps;
+					resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+					p++;
+					resiBits = resiBits
+							| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+				} else // rightMovSteps == 0
+				{
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code);
+					p++;
+				}
+				k += resiBitsLength;
+			}
+
+			// recover the exact data: leading bytes shared with the previous exact value, then stored bytes, then residual bits
+			memset(curBytes, 0, 4);
+			leadingNum = leadNum[l++];
+			memcpy(curBytes, preBytes, leadingNum);
+			for (j = leadingNum; j < reqBytesLength; j++)
+				curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+			if (resiBitsLength != 0) {
+				unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength));
+				curBytes[reqBytesLength] = resiByte;
+			}
+			
+			exactData = bytesToFloat(curBytes);
+			exactData = exactData + medianValue;
+			(*data)[i] = exactData;
+			memcpy(preBytes,curBytes,4);
+			// the exact value becomes the predictor for the next point of the same group
+			groups[indexGrpID] = exactData;
+			
+			break;
+		default:
+			predValue = groups[indexGrpID]; //Here, groups[indexGrpID] is the previous value.
+			realGroupPrecision = groupErrorBounds[indexGrpID];
+			interval = realGroupPrecision*2;		
+			
+			curValue = predValue + (type_-exe_params->intvRadius)*interval;
+			
+			//groupNum = computeGroupNum_float(curValue);
+			// a reconstructed value whose sign contradicts its group ID is forced to 0
+			if((curValue>0&&rawGrpID<0)||(curValue<0&&rawGrpID>0))
+				curValue = 0;
+			//else
+			//{
+			//	realGrpID = fabs(rawGrpID)-2;
+			//	if(groupNum<realGrpID)
+			//		curValue = rawGrpID>0?pow(2,realGrpID):-pow(2,realGrpID);
+			//	else if(groupNum>realGrpID)
+			//		curValue = rawGrpID>0?pow(2,groupNum):-pow(2,groupNum);				
+			//}	
+				
+			(*data)[i] = curValue;
+			groups[indexGrpID] = curValue;
+			break;		
+		}
+	}	
+	
+	free(leadNum);
+	free(type);
+	
+	free(posGroups);
+	free(negGroups);
+	free(posFlags);
+	free(negFlags);
+	free(groupErrorBounds);
+	free(groupID);
+}
+#pragma GCC diagnostic pop
diff --git a/thirdparty/SZ/sz/src/szd_float_ts.c b/thirdparty/SZ/sz/src/szd_float_ts.c
new file mode 100644
index 0000000000000000000000000000000000000000..272ba90113daf220ee2f6c2d4d3a3fbad37dd4df
--- /dev/null
+++ b/thirdparty/SZ/sz/src/szd_float_ts.c
@@ -0,0 +1,115 @@
+/**
+ *  @file szd_float_ts.c
+ *  @author Sheng Di and Dingwen Tao
+ *  @date Aug, 2016
+ *  @brief Decompression of 1D float data compressed in time-series (snapshot-to-snapshot) mode.
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdlib.h> 
+#include <stdio.h>
+#include <string.h>
+#include "szd_float.h"
+#include "TightDataPointStorageF.h"
+#include "sz.h"
+#include "Huffman.h"
+#include "szd_float_ts.h"
+/* Decompresses one 1D float snapshot in time-series mode; *data is malloc'ed here and the decoded values are also copied into multisteps->hist_data. */
+void decompressDataSeries_float_1D_ts(float** data, size_t dataSeriesLength, sz_multisteps* multisteps, TightDataPointStorageF* tdps) 
+{
+	float* lastSnapshotData = (float*)multisteps->hist_data; // history buffer read as the predictor in the default branch below
+	updateQuantizationInfo(tdps->intervals);
+	size_t i, j, k = 0, p = 0, l = 0; // k: running bit offset of the next residual
+								// in residualMidBits, p: byte index into
+								// residualMidBits, l: index of the next
+								// entry of leadNum
+	unsigned char* leadNum;
+	double interval = tdps->realPrecision*2; // spacing between adjacent quantization codes
+	// leadNum is filled by the call below and freed at the end of this function
+	convertByteArray2IntArray_fast_2b(tdps->exactDataNum, tdps->leadNumArray, tdps->leadNumArray_size, &leadNum);
+
+	*data = (float*)malloc(sizeof(float)*dataSeriesLength);
+	// type[i]: 0 means the value is stored exactly, any other value is a quantization code
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int));
+	
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);	
+
+	unsigned char preBytes[4]; // bytes of the previously decoded exact value
+	unsigned char curBytes[4]; // bytes of the exact value being rebuilt
+	
+	memset(preBytes, 0, 4);
+
+	size_t curByteIndex = 0; // read cursor into tdps->exactMidBytes
+	int reqBytesLength, resiBitsLength, resiBits; 
+	unsigned char leadingNum;	
+	float medianValue, exactData, predValue = 0;
+	// split the stored bit length of an exact value into whole bytes and leftover bits
+	reqBytesLength = tdps->reqLength/8;
+	resiBitsLength = tdps->reqLength%8;
+	medianValue = tdps->medianValue;
+	
+	int type_;
+	for (i = 0; i < dataSeriesLength; i++) {
+		type_ = type[i];
+		switch (type_) {
+		case 0:
+			// extract the next resiBitsLength residual bits from the packed residualMidBits stream
+			resiBits = 0;
+			if (resiBitsLength != 0) {
+				int kMod8 = k % 8;
+				int rightMovSteps = getRightMovingSteps(kMod8, resiBitsLength);
+				if (rightMovSteps > 0) { // all bits are taken from byte p, which is not advanced
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code) >> rightMovSteps;
+				} else if (rightMovSteps < 0) { // bits are split across bytes p and p+1
+					int code1 = getLeftMovingCode(kMod8);
+					int code2 = getRightMovingCode(kMod8, resiBitsLength);
+					int leftMovSteps = -rightMovSteps;
+					rightMovSteps = 8 - leftMovSteps;
+					resiBits = (tdps->residualMidBits[p] & code1) << leftMovSteps;
+					p++;
+					resiBits = resiBits
+							| ((tdps->residualMidBits[p] & code2) >> rightMovSteps);
+				} else // rightMovSteps == 0
+				{
+					int code = getRightMovingCode(kMod8, resiBitsLength);
+					resiBits = (tdps->residualMidBits[p] & code);
+					p++; // byte p is finished (presumably the residual ends on the byte boundary)
+				}
+				k += resiBitsLength;
+			}
+
+			// recover the exact data: leading bytes copied from the previous exact value, then stored bytes, then residual bits
+			memset(curBytes, 0, 4);
+			leadingNum = leadNum[l++];
+			memcpy(curBytes, preBytes, leadingNum);
+			for (j = leadingNum; j < reqBytesLength; j++)
+				curBytes[j] = tdps->exactMidBytes[curByteIndex++];
+			if (resiBitsLength != 0) {
+				unsigned char resiByte = (unsigned char) (resiBits << (8 - resiBitsLength)); // residual goes into the high bits of the next byte
+				curBytes[reqBytesLength] = resiByte;
+			}
+			
+			exactData = bytesToFloat(curBytes);
+			(*data)[i] = exactData + medianValue;
+			memcpy(preBytes,curBytes,4);
+			break;
+		default:
+			// predictor is the same index in the history buffer; predValue keeps its prior value (initially 0) in any other szMode
+			if(confparams_dec->szMode == SZ_TEMPORAL_COMPRESSION)
+				predValue = lastSnapshotData[i];
+			(*data)[i] = predValue + (type_-exe_params->intvRadius)*interval;
+			break;
+		}
+		//printf("%.30G\n",(*data)[i]);
+	}
+	// store the decoded snapshot as the new history data
+	memcpy(multisteps->hist_data, (*data), dataSeriesLength*sizeof(float));
+	
+	free(leadNum);
+	free(type);
+	return;
+}
diff --git a/thirdparty/SZ/sz/src/szd_int16.c b/thirdparty/SZ/sz/src/szd_int16.c
new file mode 100644
index 0000000000000000000000000000000000000000..3c402dca944a658172e6c0db502961fb9c2a1721
--- /dev/null
+++ b/thirdparty/SZ/sz/src/szd_int16.c
@@ -0,0 +1,915 @@
+/**
+ *  @file szd_int16.c
+ *  @author Sheng Di
+ *  @date Aug, 2017
+ *  @brief Decompression routines for int16 data.
+ *  (C) 2017 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdlib.h> 
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include "TightDataPointStorageI.h"
+#include "sz.h"
+#include "szd_int16.h"
+#include "Huffman.h"
+
+/**
+ * Decompresses the int16 data held in cmpBytes (cmpSize bytes) and returns it through newData.
+ * r5..r1 are the dimension sizes handed to computeDataLength() and computeDimension().
+ * @return SZ_SCES on success, SZ_MERR for an unexpected szMode, SZ_DERR when more than 4 dimensions are requested
+ * */
+int SZ_decompress_args_int16(int16_t** newData, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, unsigned char* cmpBytes, size_t cmpSize)
+{
+	int status = SZ_SCES;
+	size_t dataLength = computeDataLength(r5,r4,r3,r2,r1);
+	
+	//unsigned char* tmpBytes;
+	size_t targetUncompressSize = dataLength <<2; //i.e., *4
+	//tmpSize must be "much" smaller than dataLength
+	size_t i, tmpSize = 3+MetaDataByteLength+1+sizeof(int16_t)+exe_params->SZ_SIZE_TYPE; // kept as-is when the stream is used directly in the final else below
+	unsigned char* szTmpBytes;	
+		// inputs of exactly these two sizes skip the zlib check and are parsed directly; NOTE(review): presumably the fixed size of a constant-value stream - confirm
+		if(cmpSize!=4+2+4+MetaDataByteLength && cmpSize!=4+2+8+MetaDataByteLength)
+	{
+		int isZlib = isZlibFormat(cmpBytes[0], cmpBytes[1]); // the first two bytes decide whether the payload is zlib-wrapped
+		if(isZlib)
+			confparams_dec->szMode = SZ_BEST_COMPRESSION;
+		else
+			confparams_dec->szMode = SZ_BEST_SPEED;		
+		if(confparams_dec->szMode==SZ_BEST_SPEED)
+		{
+			tmpSize = cmpSize; // no zlib layer: parse the caller's buffer in place
+			szTmpBytes = cmpBytes;	
+		}
+		else if(confparams_dec->szMode==SZ_BEST_COMPRESSION || confparams_dec->szMode==SZ_DEFAULT_COMPRESSION)
+		{
+			if(targetUncompressSize<MIN_ZLIB_DEC_ALLOMEM_BYTES) //Considering the minimum size
+				targetUncompressSize = MIN_ZLIB_DEC_ALLOMEM_BYTES; 
+			tmpSize = zlib_uncompress5(cmpBytes, (unsigned long)cmpSize, &szTmpBytes, (unsigned long)targetUncompressSize+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE);//		(unsigned long)targetUncompressSize+8: consider the total length under lossless compression mode is actually 3+4+1+targetUncompressSize
+			//szTmpBytes = (unsigned char*)malloc(sizeof(unsigned char)*tmpSize);
+			//memcpy(szTmpBytes, tmpBytes, tmpSize);
+			//free(tmpBytes); //release useless memory		
+		}
+		else // NOTE(review): szMode was just set to one of the two values handled above, so this branch looks unreachable
+		{
+			printf("Wrong value of confparams_dec->szMode in the double compressed bytes.\n");
+			status = SZ_MERR;
+			return status;
+		}	
+	}
+	else
+		szTmpBytes = cmpBytes;
+	//TODO: convert szTmpBytes to data array.
+	TightDataPointStorageI* tdps;
+	int errBoundMode = new_TightDataPointStorageI_fromFlatBytes(&tdps, szTmpBytes, tmpSize);
+	//writeByteData(tdps->typeArray, tdps->typeArray_size, "decompress-typebytes.tbt");
+	int dim = computeDimension(r5,r4,r3,r2,r1);	
+	int intSize = sizeof(int16_t);
+	if(tdps->isLossless) // raw values follow a header of 4+MetaDataByteLength+SZ_SIZE_TYPE bytes
+	{
+		*newData = (int16_t*)malloc(intSize*dataLength);
+		if(sysEndianType==BIG_ENDIAN_SYSTEM)
+		{
+			memcpy(*newData, szTmpBytes+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE, dataLength*intSize);
+		}
+		else
+		{
+			unsigned char* p = szTmpBytes+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE;
+			for(i=0;i<dataLength;i++,p+=intSize) // decode each value with bytesToInt16_bigEndian instead of a plain copy
+				(*newData)[i] = bytesToInt16_bigEndian(p);
+		}		
+	}
+	else if (dim == 1)
+		getSnapshotData_int16_1D(newData,r1,tdps, errBoundMode);
+	else
+	if (dim == 2)
+		getSnapshotData_int16_2D(newData,r2,r1,tdps, errBoundMode);
+	else
+	if (dim == 3)
+		getSnapshotData_int16_3D(newData,r3,r2,r1,tdps, errBoundMode);
+	else
+	if (dim == 4)
+		getSnapshotData_int16_4D(newData,r4,r3,r2,r1,tdps, errBoundMode);
+	else
+	{
+		printf("Error: currently support only at most 4 dimensions!\n");
+		status = SZ_DERR;
+	}
+	free_TightDataPointStorageI2(tdps);
+	if(confparams_dec->szMode!=SZ_BEST_SPEED && cmpSize!=4+sizeof(int16_t)+exe_params->SZ_SIZE_TYPE+MetaDataByteLength) // free only the buffer obtained from zlib_uncompress5, never the caller's cmpBytes
+		free(szTmpBytes);
+	return status;
+}
+/* Decompresses a 1D int16 series into a newly malloc'ed *data: exact values are read from tdps->exactDataBytes,
+ * every other point is the previous element plus a quantized offset, clamped to [SZ_INT16_MIN, SZ_INT16_MAX]. */
+void decompressDataSeries_int16_1D(int16_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps) 
+{
+	updateQuantizationInfo(tdps->intervals);
+	size_t i;
+	double interval = tdps->realPrecision*2; // spacing between adjacent quantization codes
+	
+	*data = (int16_t*)malloc(sizeof(int16_t)*dataSeriesLength);
+	// type[i] == 0 marks an exactly stored value; any other code is a quantized offset from the prediction
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int));
+
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);	
+
+	//sdi:Debug
+	//writeUShortData(type, dataSeriesLength, "decompressStateBytes.sb");
+	
+	long predValue, tmp;
+	int16_t minValue, exactData;
+	
+	minValue = tdps->minValue;
+	
+	int exactByteSize = tdps->exactByteSize; // number of bytes stored per exact value
+	unsigned char* exactDataBytePointer = tdps->exactDataBytes; // read cursor over the exact values
+	
+	unsigned char curBytes[8] = {0,0,0,0,0,0,0,0};
+	
+	int rightShiftBits = computeRightShiftBits(exactByteSize, SZ_INT16);
+	if(rightShiftBits<0)
+	{
+		printf("Error: rightShift < 0!\n");
+		exit(EXIT_FAILURE); // fatal decode error: report failure to the parent process instead of status 0
+	}
+	int type_;
+	for (i = 0; i < dataSeriesLength; i++) {
+		type_ = type[i];
+		switch (type_) {
+		case 0:
+			// recover the exact data: load the stored bytes, shift them down, then add minValue back
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = bytesToInt16_bigEndian(curBytes);
+			exactData = (uint16_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[i] = exactData + minValue;
+			break;
+		default:
+			//predValue = 2 * (*data)[i-1] - (*data)[i-2];
+			predValue = (i > 0) ? (*data)[i-1] : 0; // element 0 has no predecessor; avoid reading (*data)[-1] on a malformed stream
+			tmp = predValue + (type_-exe_params->intvRadius)*interval;
+			if(tmp >= SZ_INT16_MIN&&tmp<SZ_INT16_MAX)
+				(*data)[i] = tmp;
+			else if(tmp < SZ_INT16_MIN)
+				(*data)[i] = SZ_INT16_MIN;
+			else
+				(*data)[i] = SZ_INT16_MAX;
+			break;
+		}
+		//printf("%.30G\n",(*data)[i]);
+	}
+	free(type);
+	return;
+}
+
+void decompressDataSeries_int16_2D(int16_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps) 
+{
+	updateQuantizationInfo(tdps->intervals);
+	//printf("tdps->intervals=%d, exe_params->intvRadius=%d\n", tdps->intervals, exe_params->intvRadius);
+	
+	size_t dataSeriesLength = r1*r2;
+	//	printf ("%d %d\n", r1, r2);
+
+	double realPrecision = tdps->realPrecision;
+
+	*data = (int16_t*)malloc(sizeof(int16_t)*dataSeriesLength);
+
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int));
+
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);	
+
+	int16_t minValue, exactData;
+
+	minValue = tdps->minValue;
+	
+	int exactByteSize = tdps->exactByteSize;
+	unsigned char* exactDataBytePointer = tdps->exactDataBytes;
+	
+	unsigned char curBytes[8] = {0,0,0,0,0,0,0,0};
+	
+	int rightShiftBits = computeRightShiftBits(exactByteSize, SZ_INT16);	
+	
+	long pred1D, pred2D, tmp;
+	size_t ii, jj;
+
+	/* Process Row-0, data 0 */
+
+	// recover the exact data
+	memcpy(curBytes, exactDataBytePointer, exactByteSize);
+	exactData = bytesToInt16_bigEndian(curBytes);
+	exactData = (uint16_t)exactData >> rightShiftBits;
+	exactDataBytePointer += exactByteSize;
+	(*data)[0] = exactData + minValue;
+
+	/* Process Row-0, data 1 */
+	int type_ = type[1]; 
+	if (type_ != 0)
+	{
+		pred1D = (*data)[0];
+		tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		if(tmp >= SZ_INT16_MIN&&tmp<SZ_INT16_MAX)
+			(*data)[1] = tmp;
+		else if(tmp < SZ_INT16_MIN)
+			(*data)[1] = SZ_INT16_MIN;
+		else
+			(*data)[1] = SZ_INT16_MAX;
+			
+	}
+	else
+	{
+		// recover the exact data
+		memcpy(curBytes, exactDataBytePointer, exactByteSize);
+		exactData = bytesToInt16_bigEndian(curBytes);
+		exactData = (uint16_t)exactData >> rightShiftBits;
+		exactDataBytePointer += exactByteSize;
+		(*data)[1] = exactData + minValue;
+	}
+
+	/* Process Row-0, data 2 --> data r2-1 */
+	for (jj = 2; jj < r2; jj++)
+	{
+		type_ = type[jj];
+		if (type_ != 0)
+		{
+			pred1D = 2*(*data)[jj-1] - (*data)[jj-2];				
+			tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			if(tmp >= SZ_INT16_MIN&&tmp<SZ_INT16_MAX)
+				(*data)[jj] = tmp;
+			else if(tmp < SZ_INT16_MIN)
+				(*data)[jj] = SZ_INT16_MIN;
+			else
+				(*data)[jj] = SZ_INT16_MAX;
+		}
+		else
+		{
+			// recover the exact data
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = bytesToInt16_bigEndian(curBytes);
+			exactData = (uint16_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[jj] = exactData + minValue;
+		}
+	}
+
+	size_t index;
+	/* Process Row-1 --> Row-r1-1 */
+	for (ii = 1; ii < r1; ii++)
+	{
+		/* Process row-ii data 0 */
+		index = ii*r2;
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			pred1D = (*data)[index-r2];		
+			tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			if(tmp >= SZ_INT16_MIN&&tmp<SZ_INT16_MAX)
+				(*data)[index] = tmp;
+			else if(tmp < SZ_INT16_MIN)
+				(*data)[index] = SZ_INT16_MIN;
+			else
+				(*data)[index] = SZ_INT16_MAX;
+		}
+		else
+		{
+			// recover the exact data
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = bytesToInt16_bigEndian(curBytes);
+			exactData = (uint16_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[index] = exactData + minValue;
+		}
+
+		/* Process row-ii data 1 --> r2-1*/
+		for (jj = 1; jj < r2; jj++)
+		{
+			index = ii*r2+jj;
+			pred2D = (*data)[index-1] + (*data)[index-r2] - (*data)[index-r2-1];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				tmp = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_INT16_MIN&&tmp<SZ_INT16_MAX)
+					(*data)[index] = tmp;
+				else if(tmp < SZ_INT16_MIN)
+					(*data)[index] = SZ_INT16_MIN;
+				else
+					(*data)[index] = SZ_INT16_MAX;
+			}
+			else
+			{
+				// recover the exact data
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToInt16_bigEndian(curBytes);
+				exactData = (uint16_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+		}
+	}
+
+	free(type);
+	return;
+}
+
+void decompressDataSeries_int16_3D(int16_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps) 
+{
+	updateQuantizationInfo(tdps->intervals); // presumably refreshes exe_params->intvRadius used below - TODO confirm
+	size_t dataSeriesLength = r1*r2*r3;
+	size_t r23 = r2*r3; // number of elements in one r1-layer
+	// Rebuilds an r1 x r2 x r3 int16 array (r3 is the fastest-varying index) into a newly malloc'd *data.
+	double realPrecision = tdps->realPrecision;
+	// *data is allocated here and not freed by this function; type is scratch space freed before returning.
+	*data = (int16_t*)malloc(sizeof(int16_t)*dataSeriesLength);
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int));
+	// Decode one type code per element from tdps->typeArray, using a Huffman tree built from tdps->stateNum.
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);	
+
+	int16_t minValue, exactData;
+
+	minValue = tdps->minValue; // offset added back to every exact value
+	
+	int exactByteSize = tdps->exactByteSize;
+	unsigned char* exactDataBytePointer = tdps->exactDataBytes; // read cursor; advances by exactByteSize per exact value
+	
+	unsigned char curBytes[8] = {0,0,0,0,0,0,0,0}; // staging buffer for one exact record
+	
+	int rightShiftBits = computeRightShiftBits(exactByteSize, SZ_INT16);	
+	
+	long pred1D, pred2D, pred3D, tmp; // wider than int16 so results can be range-checked before storing
+	size_t ii, jj, kk;
+	// Type code 0 = exact record; any other code = prediction + 2*(code - exe_params->intvRadius)*realPrecision, clamped to [SZ_INT16_MIN, SZ_INT16_MAX].
+	///////////////////////////	Process layer-0 ///////////////////////////
+	/* Process Row-0 data 0: always read as an exact record, type[0] is not consulted */
+
+	// recover the exact data: copy the record, decode it big-endian, shift right, add minValue
+	memcpy(curBytes, exactDataBytePointer, exactByteSize);
+	exactData = bytesToInt16_bigEndian(curBytes);
+	exactData = (uint16_t)exactData >> rightShiftBits;
+	exactDataBytePointer += exactByteSize;
+	(*data)[0] = exactData + minValue;
+
+	/* Process Row-0, data 1: predicted from data 0. NOTE(review): index 1 is accessed unconditionally - assumes r3 >= 2; confirm against callers */
+	pred1D = (*data)[0];
+
+	int type_ = type[1];
+	if (type_ != 0)
+	{
+		tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		if(tmp >= SZ_INT16_MIN&&tmp<SZ_INT16_MAX)
+			(*data)[1] = tmp;
+		else if(tmp < SZ_INT16_MIN)
+			(*data)[1] = SZ_INT16_MIN;
+		else
+			(*data)[1] = SZ_INT16_MAX;
+	}
+	else
+	{
+		memcpy(curBytes, exactDataBytePointer, exactByteSize);
+		exactData = bytesToInt16_bigEndian(curBytes);
+		exactData = (uint16_t)exactData >> rightShiftBits;
+		exactDataBytePointer += exactByteSize;
+		(*data)[1] = exactData + minValue;
+	}
+	/* Process Row-0, data 2 --> data r3-1: prediction is a linear extrapolation of the two previous elements */
+	for (jj = 2; jj < r3; jj++)
+	{
+		pred1D = 2*(*data)[jj-1] - (*data)[jj-2];
+
+		type_ = type[jj];
+		if (type_ != 0)
+		{
+			tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			if(tmp >= SZ_INT16_MIN&&tmp<SZ_INT16_MAX)
+				(*data)[jj] = tmp;
+			else if(tmp < SZ_INT16_MIN)
+				(*data)[jj] = SZ_INT16_MIN;
+			else
+				(*data)[jj] = SZ_INT16_MAX;		}
+		else
+		{
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = bytesToInt16_bigEndian(curBytes);
+			exactData = (uint16_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[jj] = exactData + minValue;
+		}
+	}
+
+	size_t index;
+	/* Process Row-1 --> Row-r2-1 (still inside layer 0) */
+	for (ii = 1; ii < r2; ii++)
+	{
+		/* Process row-ii data 0: predicted from the element one row above (index-r3) */
+		index = ii*r3;
+		pred1D = (*data)[index-r3];
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			if(tmp >= SZ_INT16_MIN&&tmp<SZ_INT16_MAX)
+				(*data)[index] = tmp;
+			else if(tmp < SZ_INT16_MIN)
+				(*data)[index] = SZ_INT16_MIN;
+			else
+				(*data)[index] = SZ_INT16_MAX;
+		}
+		else
+		{
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = bytesToInt16_bigEndian(curBytes);
+			exactData = (uint16_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[index] = exactData + minValue;
+		}
+
+		/* Process row-ii data 1 --> r3-1: prediction = left + above - above-left */
+		for (jj = 1; jj < r3; jj++)
+		{
+			index = ii*r3+jj;
+			pred2D = (*data)[index-1] + (*data)[index-r3] - (*data)[index-r3-1];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				tmp = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_INT16_MIN&&tmp<SZ_INT16_MAX)
+					(*data)[index] = tmp;
+				else if(tmp < SZ_INT16_MIN)
+					(*data)[index] = SZ_INT16_MIN;
+				else
+					(*data)[index] = SZ_INT16_MAX;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToInt16_bigEndian(curBytes);
+				exactData = (uint16_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+		}
+	}
+
+	///////////////////////////	Process layer-1 --> layer-r1-1 ///////////////////////////
+
+	for (kk = 1; kk < r1; kk++)
+	{
+		/* Process Row-0 data 0: predicted from the same position in the previous layer (index-r23) */
+		index = kk*r23;
+		pred1D = (*data)[index-r23];
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			if(tmp >= SZ_INT16_MIN&&tmp<SZ_INT16_MAX)
+				(*data)[index] = tmp;
+			else if(tmp < SZ_INT16_MIN)
+				(*data)[index] = SZ_INT16_MIN;
+			else
+				(*data)[index] = SZ_INT16_MAX;
+		}
+		else
+		{
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = bytesToInt16_bigEndian(curBytes);
+			exactData = (uint16_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[index] = exactData + minValue;
+		}
+
+		/* Process Row-0 data 1 --> data r3-1: prediction = left + previous layer - previous layer's left */
+		for (jj = 1; jj < r3; jj++)
+		{
+			index = kk*r23+jj;
+			pred2D = (*data)[index-1] + (*data)[index-r23] - (*data)[index-r23-1];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				tmp = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_INT16_MIN&&tmp<SZ_INT16_MAX)
+					(*data)[index] = tmp;
+				else if(tmp < SZ_INT16_MIN)
+					(*data)[index] = SZ_INT16_MIN;
+				else
+					(*data)[index] = SZ_INT16_MAX;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToInt16_bigEndian(curBytes);
+				exactData = (uint16_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+		}
+
+		/* Process Row-1 --> Row-r2-1 of layer kk */
+		for (ii = 1; ii < r2; ii++)
+		{
+			/* Process Row-i data 0: prediction = above + previous layer - previous layer's above */
+			index = kk*r23 + ii*r3;
+			pred2D = (*data)[index-r3] + (*data)[index-r23] - (*data)[index-r23-r3];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				tmp = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_INT16_MIN&&tmp<SZ_INT16_MAX)
+					(*data)[index] = tmp;
+				else if(tmp < SZ_INT16_MIN)
+					(*data)[index] = SZ_INT16_MIN;
+				else
+					(*data)[index] = SZ_INT16_MAX;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToInt16_bigEndian(curBytes);
+				exactData = (uint16_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+
+			/* Process Row-i data 1 --> data r3-1: prediction combines the 7 already-decoded neighbours of the 2x2x2 corner */
+			for (jj = 1; jj < r3; jj++)
+			{
+				index = kk*r23 + ii*r3 + jj;
+				pred3D = (*data)[index-1] + (*data)[index-r3] + (*data)[index-r23]
+					- (*data)[index-r3-1] - (*data)[index-r23-r3] - (*data)[index-r23-1] + (*data)[index-r23-r3-1];
+
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					tmp = pred3D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+					if(tmp >= SZ_INT16_MIN&&tmp<SZ_INT16_MAX)
+						(*data)[index] = tmp;
+					else if(tmp < SZ_INT16_MIN)
+						(*data)[index] = SZ_INT16_MIN;
+					else
+						(*data)[index] = SZ_INT16_MAX;
+				}
+				else
+				{
+					memcpy(curBytes, exactDataBytePointer, exactByteSize);
+					exactData = bytesToInt16_bigEndian(curBytes);
+					exactData = (uint16_t)exactData >> rightShiftBits;
+					exactDataBytePointer += exactByteSize;
+					(*data)[index] = exactData + minValue;
+				}
+			}
+		}
+	}
+
+	free(type);
+	return;
+}
+
+
+void decompressDataSeries_int16_4D(int16_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps)
+{
+	updateQuantizationInfo(tdps->intervals); // presumably refreshes exe_params->intvRadius used below - TODO confirm
+	size_t dataSeriesLength = r1*r2*r3*r4;
+	size_t r234 = r2*r3*r4; // elements in one r1-block (a complete r2 x r3 x r4 volume)
+	size_t r34 = r3*r4; // elements in one layer of such a volume
+	// Rebuilds an r1 x r2 x r3 x r4 int16 array (r4 is the fastest-varying index) into a newly malloc'd *data.
+	double realPrecision = tdps->realPrecision;
+	// *data is allocated here and not freed by this function; type is scratch space freed before returning.
+	*data = (int16_t*)malloc(sizeof(int16_t)*dataSeriesLength);
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int));
+	// Decode one type code per element from tdps->typeArray, using a Huffman tree built from tdps->stateNum.
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);	
+
+	int16_t minValue, exactData;
+
+	minValue = tdps->minValue; // offset added back to every exact value
+	
+	int exactByteSize = tdps->exactByteSize;
+	unsigned char* exactDataBytePointer = tdps->exactDataBytes; // read cursor; advances by exactByteSize per exact value
+	
+	unsigned char curBytes[8] = {0,0,0,0,0,0,0,0}; // staging buffer for one exact record
+	
+	int rightShiftBits = computeRightShiftBits(exactByteSize, SZ_INT16);	
+	// Type code 0 = exact record; any other code = prediction + 2*(code - exe_params->intvRadius)*realPrecision, clamped to [SZ_INT16_MIN, SZ_INT16_MAX].
+	int type_;
+
+	long pred1D, pred2D, pred3D, tmp; // wider than int16 so results can be range-checked before storing
+	size_t ii, jj, kk, ll;
+	size_t index;
+	// Each r1-block of r234 elements is decoded on its own: its first element is exact and no prediction reaches into the previous block.
+	for (ll = 0; ll < r1; ll++)
+	{
+		///////////////////////////	Process layer-0 ///////////////////////////
+		/* Process Row-0 data 0: always read as an exact record, its type code is not consulted */
+		index = ll*r234;
+
+		// recover the exact data: copy the record, decode it big-endian, shift right, add minValue
+		memcpy(curBytes, exactDataBytePointer, exactByteSize);
+		exactData = bytesToInt16_bigEndian(curBytes);
+		exactData = (uint16_t)exactData >> rightShiftBits;
+		exactDataBytePointer += exactByteSize;
+		(*data)[index] = exactData + minValue;
+
+		/* Process Row-0, data 1: predicted from data 0. NOTE(review): accessed unconditionally - assumes r4 >= 2; confirm against callers */
+		index = ll*r234+1;
+
+		pred1D = (*data)[index-1];
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			if(tmp >= SZ_INT16_MIN&&tmp<SZ_INT16_MAX)
+				(*data)[index] = tmp;
+			else if(tmp < SZ_INT16_MIN)
+				(*data)[index] = SZ_INT16_MIN;
+			else
+				(*data)[index] = SZ_INT16_MAX;
+		}
+		else
+		{
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = bytesToInt16_bigEndian(curBytes);
+			exactData = (uint16_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[index] = exactData + minValue;
+		}
+
+		/* Process Row-0, data 2 --> data r4-1: prediction is a linear extrapolation of the two previous elements */
+		for (jj = 2; jj < r4; jj++)
+		{
+			index = ll*r234+jj;
+
+			pred1D = 2*(*data)[index-1] - (*data)[index-2];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_INT16_MIN&&tmp<SZ_INT16_MAX)
+					(*data)[index] = tmp;
+				else if(tmp < SZ_INT16_MIN)
+					(*data)[index] = SZ_INT16_MIN;
+				else
+					(*data)[index] = SZ_INT16_MAX;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToInt16_bigEndian(curBytes);
+				exactData = (uint16_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+		}
+
+		/* Process Row-1 --> Row-r3-1 (still inside layer 0 of this block) */
+		for (ii = 1; ii < r3; ii++)
+		{
+			/* Process row-ii data 0: predicted from the element one row above (index-r4) */
+			index = ll*r234+ii*r4;
+
+			pred1D = (*data)[index-r4];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_INT16_MIN&&tmp<SZ_INT16_MAX)
+					(*data)[index] = tmp;
+				else if(tmp < SZ_INT16_MIN)
+					(*data)[index] = SZ_INT16_MIN;
+				else
+					(*data)[index] = SZ_INT16_MAX;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToInt16_bigEndian(curBytes);
+				exactData = (uint16_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+
+			/* Process row-ii data 1 --> r4-1: prediction = left + above - above-left */
+			for (jj = 1; jj < r4; jj++)
+			{
+				index = ll*r234+ii*r4+jj;
+
+				pred2D = (*data)[index-1] + (*data)[index-r4] - (*data)[index-r4-1];
+
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					tmp = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+					if(tmp >= SZ_INT16_MIN&&tmp<SZ_INT16_MAX)
+						(*data)[index] = tmp;
+					else if(tmp < SZ_INT16_MIN)
+						(*data)[index] = SZ_INT16_MIN;
+					else
+						(*data)[index] = SZ_INT16_MAX;
+				}
+				else
+				{
+					memcpy(curBytes, exactDataBytePointer, exactByteSize);
+					exactData = bytesToInt16_bigEndian(curBytes);
+					exactData = (uint16_t)exactData >> rightShiftBits;
+					exactDataBytePointer += exactByteSize;
+					(*data)[index] = exactData + minValue;
+				}
+			}
+		}
+
+		///////////////////////////	Process layer-1 --> layer-r2-1 ///////////////////////////
+
+		for (kk = 1; kk < r2; kk++)
+		{
+			/* Process Row-0 data 0: predicted from the same position in the previous layer (index-r34) */
+			index = ll*r234+kk*r34;
+
+			pred1D = (*data)[index-r34];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_INT16_MIN&&tmp<SZ_INT16_MAX)
+					(*data)[index] = tmp;
+				else if(tmp < SZ_INT16_MIN)
+					(*data)[index] = SZ_INT16_MIN;
+				else
+					(*data)[index] = SZ_INT16_MAX;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToInt16_bigEndian(curBytes);
+				exactData = (uint16_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+
+			/* Process Row-0 data 1 --> data r4-1: prediction = left + previous layer - previous layer's left */
+			for (jj = 1; jj < r4; jj++)
+			{
+				index = ll*r234+kk*r34+jj;
+
+				pred2D = (*data)[index-1] + (*data)[index-r34] - (*data)[index-r34-1];
+
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					tmp = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+					if(tmp >= SZ_INT16_MIN&&tmp<SZ_INT16_MAX)
+						(*data)[index] = tmp;
+					else if(tmp < SZ_INT16_MIN)
+						(*data)[index] = SZ_INT16_MIN;
+					else
+						(*data)[index] = SZ_INT16_MAX;
+				}
+				else
+				{
+					memcpy(curBytes, exactDataBytePointer, exactByteSize);
+					exactData = bytesToInt16_bigEndian(curBytes);
+					exactData = (uint16_t)exactData >> rightShiftBits;
+					exactDataBytePointer += exactByteSize;
+					(*data)[index] = exactData + minValue;				
+				}
+			}
+
+			/* Process Row-1 --> Row-r3-1 of layer kk */
+			for (ii = 1; ii < r3; ii++)
+			{
+				/* Process Row-i data 0: prediction = above + previous layer - previous layer's above */
+				index = ll*r234+kk*r34+ii*r4;
+
+				pred2D = (*data)[index-r4] + (*data)[index-r34] - (*data)[index-r34-r4];
+
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					tmp = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+					if(tmp >= SZ_INT16_MIN&&tmp<SZ_INT16_MAX)
+						(*data)[index] = tmp;
+					else if(tmp < SZ_INT16_MIN)
+						(*data)[index] = SZ_INT16_MIN;
+					else
+						(*data)[index] = SZ_INT16_MAX;
+				}
+				else
+				{
+					memcpy(curBytes, exactDataBytePointer, exactByteSize);
+					exactData = bytesToInt16_bigEndian(curBytes);
+					exactData = (uint16_t)exactData >> rightShiftBits;
+					exactDataBytePointer += exactByteSize;
+					(*data)[index] = exactData + minValue;
+				}
+
+				/* Process Row-i data 1 --> data r4-1: prediction combines the 7 already-decoded neighbours of the 2x2x2 corner */
+				for (jj = 1; jj < r4; jj++)
+				{
+					index = ll*r234+kk*r34+ii*r4+jj;
+
+					pred3D = (*data)[index-1] + (*data)[index-r4] + (*data)[index-r34]
+							- (*data)[index-r4-1] - (*data)[index-r34-r4] - (*data)[index-r34-1] + (*data)[index-r34-r4-1];
+
+
+					type_ = type[index];
+					if (type_ != 0)
+					{
+						tmp = pred3D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+						if(tmp >= SZ_INT16_MIN&&tmp<SZ_INT16_MAX)
+							(*data)[index] = tmp;
+						else if(tmp < SZ_INT16_MIN)
+							(*data)[index] = SZ_INT16_MIN;
+						else
+							(*data)[index] = SZ_INT16_MAX;
+					}
+					else
+					{
+						memcpy(curBytes, exactDataBytePointer, exactByteSize);
+						exactData = bytesToInt16_bigEndian(curBytes);
+						exactData = (uint16_t)exactData >> rightShiftBits;
+						exactDataBytePointer += exactByteSize;
+						(*data)[index] = exactData + minValue;
+					}
+				}
+			}
+		}
+	}
+
+	free(type);
+	return;
+}
+
+void getSnapshotData_int16_1D(int16_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps, int errBoundMode)
+{
+	size_t pos;
+	/* Varying data: run the regular 1D decoder (errBoundMode is not consulted here). */
+	if (!tdps->allSameData) {
+		decompressDataSeries_int16_1D(data, dataSeriesLength, tdps);
+		return;
+	}
+	int16_t fill = bytesToInt16_bigEndian(tdps->exactDataBytes); /* constant series: the one stored value */
+	int16_t* out = *data = (int16_t*)malloc(sizeof(int16_t)*dataSeriesLength);
+	for (pos = 0; pos < dataSeriesLength; pos++)
+		out[pos] = fill;
+}
+
+void getSnapshotData_int16_2D(int16_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps, int errBoundMode) 
+{
+	size_t cell, cellCount = r1*r2;
+	/* Only the all-same case is handled inline; everything else goes to the 2D decoder. */
+	if (!tdps->allSameData) {
+		decompressDataSeries_int16_2D(data, r1, r2, tdps);
+		return;
+	}
+	int16_t constant = bytesToInt16_bigEndian(tdps->exactDataBytes); /* value shared by every element */
+	int16_t* grid = *data = (int16_t*)malloc(sizeof(int16_t)*cellCount);
+	for (cell = 0; cell < cellCount; cell++)
+		grid[cell] = constant;
+}
+
+void getSnapshotData_int16_3D(int16_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps, int errBoundMode)
+{
+	size_t n, total = r1*r2*r3;
+	/* Non-constant data is rebuilt by the 3D decoder; errBoundMode plays no part here. */
+	if (!tdps->allSameData) {
+		decompressDataSeries_int16_3D(data, r1, r2, r3, tdps);
+		return;
+	}
+	int16_t same = bytesToInt16_bigEndian(tdps->exactDataBytes); /* the single value of a constant array */
+	int16_t* volume = *data = (int16_t*)malloc(sizeof(int16_t)*total);
+	for (n = 0; n < total; n++)
+		volume[n] = same;
+}
+
+void getSnapshotData_int16_4D(int16_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps, int errBoundMode)
+{
+	size_t k, elemCount = r1*r2*r3*r4;
+	/* Dispatch first: anything that is not all-same data is left to the 4D decoder. */
+	if (!tdps->allSameData) {
+		decompressDataSeries_int16_4D(data, r1, r2, r3, r4, tdps);
+		return;
+	}
+	int16_t repeated = bytesToInt16_bigEndian(tdps->exactDataBytes); /* value replicated over the output */
+	int16_t* dst = *data = (int16_t*)malloc(sizeof(int16_t)*elemCount);
+	for (k = 0; k < elemCount; k++)
+		dst[k] = repeated;
+}
diff --git a/thirdparty/SZ/sz/src/szd_int32.c b/thirdparty/SZ/sz/src/szd_int32.c
new file mode 100644
index 0000000000000000000000000000000000000000..43dc74e170671d4a873a849f97ae4a38dce06aa1
--- /dev/null
+++ b/thirdparty/SZ/sz/src/szd_int32.c
@@ -0,0 +1,788 @@
+/**
+ *  @file szd_int32.c
+ *  @author Sheng Di
+ *  @date Aug, 2017
+ *  @brief Decompression routines for SZ-compressed int32_t data (1D to 4D).
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdlib.h> 
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include "TightDataPointStorageI.h"
+#include "sz.h"
+#include "szd_int32.h"
+#include "Huffman.h"
+
+/**
+ * 
+ * 
+ * @return status SUCCESSFUL (SZ_SCES) or not (other error codes)
+ * */
+int SZ_decompress_args_int32(int32_t** newData, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, unsigned char* cmpBytes, size_t cmpSize)
+{
+	/*
+	 * Decompress the int32 stream in cmpBytes (cmpSize bytes) into a buffer that is
+	 * malloc'd here and returned through *newData; r5..r1 give the dimensions.
+	 * Side effect: confparams_dec->szMode is set from the detected stream format.
+	 * Returns SZ_SCES, or SZ_DERR when the data is neither lossless nor 1-4 dimensional.
+	 */
+	int status = SZ_SCES;
+	size_t dataLength = computeDataLength(r5,r4,r3,r2,r1);
+	size_t targetUncompressSize = dataLength <<2; /* 4 bytes per int32 element */
+	/* Default length: header plus one int32 value (the two sizes tested below). */
+	size_t i, tmpSize = 3+MetaDataByteLength+1+sizeof(int32_t)+exe_params->SZ_SIZE_TYPE;
+	unsigned char* szTmpBytes = cmpBytes;
+	/* Set only when zlib_uncompress5 hands us a buffer, so cmpBytes is never freed here. */
+	int ownsTmpBytes = 0;
+
+	if(cmpSize!=4+4+4+MetaDataByteLength && cmpSize!=4+4+8+MetaDataByteLength)
+	{
+		if(isZlibFormat(cmpBytes[0], cmpBytes[1]))
+		{
+			/* zlib-wrapped stream: inflate it into a temporary buffer first. */
+			confparams_dec->szMode = SZ_BEST_COMPRESSION;
+			if(targetUncompressSize<MIN_ZLIB_DEC_ALLOMEM_BYTES) //Considering the minimum size
+				targetUncompressSize = MIN_ZLIB_DEC_ALLOMEM_BYTES;
+			tmpSize = zlib_uncompress5(cmpBytes, (unsigned long)cmpSize, &szTmpBytes, (unsigned long)targetUncompressSize+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE);
+			ownsTmpBytes = 1;
+		}
+		else
+		{
+			/* Raw SZ stream: parse the caller's bytes in place. */
+			confparams_dec->szMode = SZ_BEST_SPEED;
+			tmpSize = cmpSize;
+		}
+	}
+
+	/* Build the storage descriptor from the flat byte buffer. */
+	TightDataPointStorageI* tdps;
+	int errBoundMode = new_TightDataPointStorageI_fromFlatBytes(&tdps, szTmpBytes, tmpSize);
+	int dim = computeDimension(r5,r4,r3,r2,r1);
+	int intSize = sizeof(int32_t);
+
+	if(tdps->isLossless)
+	{
+		/* Lossless payload: big-endian int32 values stored right after the header. */
+		unsigned char* p = szTmpBytes+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE;
+		*newData = (int32_t*)malloc(intSize*dataLength);
+		if(sysEndianType==BIG_ENDIAN_SYSTEM)
+			memcpy(*newData, p, dataLength*intSize);
+		else
+			for(i=0;i<dataLength;i++,p+=intSize)
+				(*newData)[i] = bytesToInt32_bigEndian(p);
+	}
+	else
+	{
+		switch(dim)
+		{
+		case 1:
+			getSnapshotData_int32_1D(newData,r1,tdps, errBoundMode);
+			break;
+		case 2:
+			getSnapshotData_int32_2D(newData,r2,r1,tdps, errBoundMode);
+			break;
+		case 3:
+			getSnapshotData_int32_3D(newData,r3,r2,r1,tdps, errBoundMode);
+			break;
+		case 4:
+			getSnapshotData_int32_4D(newData,r4,r3,r2,r1,tdps, errBoundMode);
+			break;
+		default:
+			printf("Error: currently support only at most 4 dimensions!\n");
+			status = SZ_DERR;
+			break;
+		}
+	}
+
+	free_TightDataPointStorageI2(tdps);
+	/* Release the inflated copy, if one was made above. */
+	if(ownsTmpBytes)
+		free(szTmpBytes);
+	return status;
+}
+
+
+void decompressDataSeries_int32_1D(int32_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps) 
+{
+	/*
+	 * Rebuild a 1D int32 series of dataSeriesLength elements into *data, which is
+	 * malloc'd here and left for the caller. A type code is Huffman-decoded for every
+	 * element: 0 means the value is read from tdps->exactDataBytes, any other code
+	 * means "previous element + (code - exe_params->intvRadius) * 2 * realPrecision".
+	 */
+	updateQuantizationInfo(tdps->intervals);
+	double interval = tdps->realPrecision*2;
+
+	int32_t* out = (int32_t*)malloc(sizeof(int32_t)*dataSeriesLength);
+	*data = out;
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int));
+
+	/* Decode one type code per element. */
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);
+
+	int32_t minValue = tdps->minValue;
+	int exactByteSize = tdps->exactByteSize;
+	unsigned char* exactCursor = tdps->exactDataBytes;
+	/* Staging buffer: one exact record is copied to its front before decoding. */
+	unsigned char curBytes[8] = {0,0,0,0,0,0,0,0};
+
+	int rightShiftBits = computeRightShiftBits(exactByteSize, SZ_INT32);
+	if(rightShiftBits<0)
+	{
+		printf("Error: rightShift < 0!\n");
+		/* Fatal decode error: report failure instead of the success status 0. */
+		exit(EXIT_FAILURE);
+	}
+
+	size_t i;
+	for (i = 0; i < dataSeriesLength; i++) {
+		int code = type[i];
+		/* Element 0 has no predecessor, so it is always taken from the exact records;
+		 * this avoids reading out[-1] if a damaged stream carries a non-zero code. */
+		if (code == 0 || i == 0) {
+			/* Exact value: big-endian record, shifted right, offset by minValue. */
+			memcpy(curBytes, exactCursor, exactByteSize);
+			int32_t exactData = bytesToInt32_bigEndian(curBytes);
+			exactData = (uint32_t)exactData >> rightShiftBits;
+			exactCursor += exactByteSize;
+			out[i] = exactData + minValue;
+		} else {
+			/* Predicted value: previous element moved by the quantized difference. */
+			int32_t predValue = out[i-1];
+			out[i] = predValue + (code-exe_params->intvRadius)*interval;
+		}
+	}
+	free(type);
+	return;
+}
+
+void decompressDataSeries_int32_2D(int32_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps) 
+{
+	/*
+	 * Rebuild an r1 x r2 int32 array (r2 elements per row, rows stored back to back)
+	 * into *data, which is malloc'd here and left allocated for the caller.
+	 *
+	 * One type code is Huffman-decoded per element. Code 0 takes the next record
+	 * from tdps->exactDataBytes; any other code adds
+	 * 2*(code - exe_params->intvRadius)*realPrecision to a value predicted from
+	 * neighbours that were decoded earlier.
+	 */
+	updateQuantizationInfo(tdps->intervals);
+
+	size_t total = r1*r2;
+	double realPrecision = tdps->realPrecision;
+
+	int32_t* out = (int32_t*)malloc(sizeof(int32_t)*total);
+	*data = out;
+	int* type = (int*)malloc(total*sizeof(int));
+
+	/* Decode the per-element type codes. */
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, total, type);
+	SZ_ReleaseHuffman(huffmanTree);
+
+	int32_t minValue = tdps->minValue;
+	int32_t exactData;
+	int exactByteSize = tdps->exactByteSize;
+	/* Read cursor over the exact records; moves forward by exactByteSize per record. */
+	unsigned char* exactCursor = tdps->exactDataBytes;
+	/* Staging buffer: one exact record is copied to its front before decoding. */
+	unsigned char curBytes[8] = {0,0,0,0,0,0,0,0};
+	int rightShiftBits = computeRightShiftBits(exactByteSize, SZ_INT32);
+
+	int32_t predLine, predPlane;
+	size_t row, col, pos;
+	int code;
+
+	/* Element (0,0): always an exact record; its type code is not consulted. */
+	/* Exact value = (big-endian record >> rightShiftBits) + minValue. */
+	memcpy(curBytes, exactCursor, exactByteSize);
+	exactData = bytesToInt32_bigEndian(curBytes);
+	exactData = (uint32_t)exactData >> rightShiftBits;
+	exactCursor += exactByteSize;
+	out[0] = exactData + minValue;
+
+	/* Element (0,1): predicted from element (0,0). */
+	/* NOTE(review): index 1 is accessed unconditionally - assumes r2 >= 2; confirm. */
+	code = type[1];
+	if (code == 0)
+	{
+		memcpy(curBytes, exactCursor, exactByteSize);
+		exactData = bytesToInt32_bigEndian(curBytes);
+		exactData = (uint32_t)exactData >> rightShiftBits;
+		exactCursor += exactByteSize;
+		out[1] = exactData + minValue;
+	}
+	else
+	{
+		predLine = out[0];
+		out[1] = predLine + 2 * (code - exe_params->intvRadius) * realPrecision;
+	}
+
+	/* Rest of row 0: linear extrapolation from the two previous elements. */
+	for (col = 2; col < r2; col++)
+	{
+		code = type[col];
+		if (code == 0)
+		{
+			memcpy(curBytes, exactCursor, exactByteSize);
+			exactData = bytesToInt32_bigEndian(curBytes);
+			exactData = (uint32_t)exactData >> rightShiftBits;
+			exactCursor += exactByteSize;
+			out[col] = exactData + minValue;
+		}
+		else
+		{
+			predLine = 2*out[col-1] - out[col-2];
+			out[col] = predLine + 2 * (code - exe_params->intvRadius) * realPrecision;
+		}
+	}
+
+	/* Rows 1 .. r1-1. */
+	for (row = 1; row < r1; row++)
+	{
+		/* First element of the row: predicted from the element directly above. */
+		pos = row*r2;
+		code = type[pos];
+		if (code == 0)
+		{
+			memcpy(curBytes, exactCursor, exactByteSize);
+			exactData = bytesToInt32_bigEndian(curBytes);
+			exactData = (uint32_t)exactData >> rightShiftBits;
+			exactCursor += exactByteSize;
+			out[pos] = exactData + minValue;
+		}
+		else
+		{
+			predLine = out[pos-r2];
+			out[pos] = predLine + 2 * (code - exe_params->intvRadius) * realPrecision;
+		}
+
+		/* Remaining elements of the row: left + above - above-left. */
+		for (col = 1; col < r2; col++)
+		{
+			pos = row*r2+col;
+			predPlane = out[pos-1] + out[pos-r2] - out[pos-r2-1];
+
+			code = type[pos];
+			if (code == 0)
+			{
+				memcpy(curBytes, exactCursor, exactByteSize);
+				exactData = bytesToInt32_bigEndian(curBytes);
+				exactData = (uint32_t)exactData >> rightShiftBits;
+				exactCursor += exactByteSize;
+				out[pos] = exactData + minValue;
+			}
+			else
+			{
+				out[pos] = predPlane + 2 * (code - exe_params->intvRadius) * realPrecision;
+			}
+		}
+	}
+
+	free(type);
+	return;
+}
+
+void decompressDataSeries_int32_3D(int32_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps) 
+{
+	updateQuantizationInfo(tdps->intervals);
+	size_t dataSeriesLength = r1*r2*r3;
+	size_t r23 = r2*r3;
+//	printf ("%d %d %d\n", r1, r2, r3);
+	double realPrecision = tdps->realPrecision;
+
+	*data = (int32_t*)malloc(sizeof(int32_t)*dataSeriesLength);
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int));
+
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);	
+
+	int32_t minValue, exactData;
+
+	minValue = tdps->minValue;
+	
+	int exactByteSize = tdps->exactByteSize;
+	unsigned char* exactDataBytePointer = tdps->exactDataBytes;
+	
+	unsigned char curBytes[8] = {0,0,0,0,0,0,0,0};
+	
+	int rightShiftBits = computeRightShiftBits(exactByteSize, SZ_INT32);	
+	
+	int32_t pred1D, pred2D, pred3D;
+	size_t ii, jj, kk;
+
+	///////////////////////////	Process layer-0 ///////////////////////////
+	/* Process Row-0 data 0*/
+
+	// recover the exact data
+	memcpy(curBytes, exactDataBytePointer, exactByteSize);
+	exactData = bytesToInt32_bigEndian(curBytes);
+	exactData = (uint32_t)exactData >> rightShiftBits;
+	exactDataBytePointer += exactByteSize;
+	(*data)[0] = exactData + minValue;
+
+	/* Process Row-0, data 1 */
+	pred1D = (*data)[0];
+
+	int type_ = type[1];
+	if (type_ != 0)
+	{
+		(*data)[1] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+	}
+	else
+	{
+		memcpy(curBytes, exactDataBytePointer, exactByteSize);
+		exactData = bytesToInt32_bigEndian(curBytes);
+		exactData = (uint32_t)exactData >> rightShiftBits;
+		exactDataBytePointer += exactByteSize;
+		(*data)[1] = exactData + minValue;
+	}
+	/* Process Row-0, data 2 --> data r3-1 */
+	for (jj = 2; jj < r3; jj++)
+	{
+		pred1D = 2*(*data)[jj-1] - (*data)[jj-2];
+
+		type_ = type[jj];
+		if (type_ != 0)
+		{
+			(*data)[jj] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = bytesToInt32_bigEndian(curBytes);
+			exactData = (uint32_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[jj] = exactData + minValue;
+		}
+	}
+
+	size_t index;
+	/* Process Row-1 --> Row-r2-1 */
+	for (ii = 1; ii < r2; ii++)
+	{
+		/* Process row-ii data 0 */
+		index = ii*r3;
+		pred1D = (*data)[index-r3];
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			(*data)[index] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = bytesToInt32_bigEndian(curBytes);
+			exactData = (uint32_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[index] = exactData + minValue;
+		}
+
+		/* Process row-ii data 1 --> r3-1*/
+		for (jj = 1; jj < r3; jj++)
+		{
+			index = ii*r3+jj;
+			pred2D = (*data)[index-1] + (*data)[index-r3] - (*data)[index-r3-1];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToInt32_bigEndian(curBytes);
+				exactData = (uint32_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+		}
+	}
+
+	///////////////////////////	Process layer-1 --> layer-r1-1 ///////////////////////////
+
+	for (kk = 1; kk < r1; kk++)
+	{
+		/* Process Row-0 data 0*/
+		index = kk*r23;
+		pred1D = (*data)[index-r23];
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			(*data)[index] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = bytesToInt32_bigEndian(curBytes);
+			exactData = (uint32_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[index] = exactData + minValue;
+		}
+
+		/* Process Row-0 data 1 --> data r3-1 */
+		for (jj = 1; jj < r3; jj++)
+		{
+			index = kk*r23+jj;
+			pred2D = (*data)[index-1] + (*data)[index-r23] - (*data)[index-r23-1];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToInt32_bigEndian(curBytes);
+				exactData = (uint32_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+		}
+
+		/* Process Row-1 --> Row-r2-1 */
+		for (ii = 1; ii < r2; ii++)
+		{
+			/* Process Row-i data 0 */
+			index = kk*r23 + ii*r3;
+			pred2D = (*data)[index-r3] + (*data)[index-r23] - (*data)[index-r23-r3];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToInt32_bigEndian(curBytes);
+				exactData = (uint32_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+
+			/* Process Row-i data 1 --> data r3-1 */
+			for (jj = 1; jj < r3; jj++)
+			{
+				index = kk*r23 + ii*r3 + jj;
+				pred3D = (*data)[index-1] + (*data)[index-r3] + (*data)[index-r23]
+					- (*data)[index-r3-1] - (*data)[index-r23-r3] - (*data)[index-r23-1] + (*data)[index-r23-r3-1];
+
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					(*data)[index] = pred3D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					memcpy(curBytes, exactDataBytePointer, exactByteSize);
+					exactData = bytesToInt32_bigEndian(curBytes);
+					exactData = (uint32_t)exactData >> rightShiftBits;
+					exactDataBytePointer += exactByteSize;
+					(*data)[index] = exactData + minValue;
+				}
+			}
+		}
+	}
+
+	free(type);
+	return;
+}
+
+
+/* Decode a 4D (r1 x r2 x r3 x r4) int32 series from tdps into *data, which is malloc'ed here and not freed. */
+void decompressDataSeries_int32_4D(int32_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps)
+{
+	updateQuantizationInfo(tdps->intervals);
+	size_t dataSeriesLength = r1*r2*r3*r4;
+	size_t r234 = r2*r3*r4;
+	size_t r34 = r3*r4;
+	/* element (ll,kk,ii,jj) is stored at index ll*r234 + kk*r34 + ii*r4 + jj */
+	double realPrecision = tdps->realPrecision;
+	/* NOTE(review): neither allocation below is checked for NULL before use */
+	*data = (int32_t*)malloc(sizeof(int32_t)*dataSeriesLength);
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int));
+	/* type[k] == 0: element k is stored exactly; otherwise it is its prediction plus 2*(type[k]-intvRadius)*realPrecision */
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);	
+
+	int32_t minValue, exactData;
+	minValue = tdps->minValue;
+	
+	int exactByteSize = tdps->exactByteSize;
+	unsigned char* exactDataBytePointer = tdps->exactDataBytes;
+	
+	unsigned char curBytes[8] = {0,0,0,0,0,0,0,0};
+	/* exact values: exactByteSize bytes each, copied to the front of curBytes, decoded by bytesToInt32_bigEndian, shifted right, then offset by minValue */
+	int rightShiftBits = computeRightShiftBits(exactByteSize, SZ_INT32);	
+	
+	int type_;
+
+	int32_t pred1D, pred2D, pred3D;
+	size_t ii, jj, kk, ll;
+	size_t index;
+	/* predictions only use already decoded elements of the same outermost slice ll */
+	for (ll = 0; ll < r1; ll++)
+	{
+		///////////////////////////	Process layer-0 ///////////////////////////
+		/* Process Row-0 data 0: the first element of slice ll is always read as an exact value */
+		index = ll*r234;
+
+		// recover the exact data
+		memcpy(curBytes, exactDataBytePointer, exactByteSize);
+		exactData = bytesToInt32_bigEndian(curBytes);
+		exactData = (uint32_t)exactData >> rightShiftBits;
+		exactDataBytePointer += exactByteSize;
+		(*data)[index] = exactData + minValue;
+
+		/* Process Row-0, data 1: predicted from data 0. NOTE(review): decoded even when r4 == 1 - confirm callers */
+		index = ll*r234+1;
+
+		pred1D = (*data)[index-1];
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			(*data)[index] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = bytesToInt32_bigEndian(curBytes);
+			exactData = (uint32_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[index] = exactData + minValue;
+		}
+
+		/* Process Row-0, data 2 --> data r4-1: linear extrapolation from the two previous elements */
+		for (jj = 2; jj < r4; jj++)
+		{
+			index = ll*r234+jj;
+
+			pred1D = 2*(*data)[index-1] - (*data)[index-2];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToInt32_bigEndian(curBytes);
+				exactData = (uint32_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+		}
+
+		/* Process Row-1 --> Row-r3-1 */
+		for (ii = 1; ii < r3; ii++)
+		{
+			/* Process row-ii data 0: predicted from the element one row above */
+			index = ll*r234+ii*r4;
+
+			pred1D = (*data)[index-r4];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToInt32_bigEndian(curBytes);
+				exactData = (uint32_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+
+			/* Process row-ii data 1 --> r4-1: left + above - above-left */
+			for (jj = 1; jj < r4; jj++)
+			{
+				index = ll*r234+ii*r4+jj;
+
+				pred2D = (*data)[index-1] + (*data)[index-r4] - (*data)[index-r4-1];
+
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					memcpy(curBytes, exactDataBytePointer, exactByteSize);
+					exactData = bytesToInt32_bigEndian(curBytes);
+					exactData = (uint32_t)exactData >> rightShiftBits;
+					exactDataBytePointer += exactByteSize;
+					(*data)[index] = exactData + minValue;
+				}
+			}
+		}
+
+		///////////////////////////	Process layer-1 --> layer-r2-1 ///////////////////////////
+
+		for (kk = 1; kk < r2; kk++)
+		{
+			/* Process Row-0 data 0: predicted from the same position in the previous layer */
+			index = ll*r234+kk*r34;
+
+			pred1D = (*data)[index-r34];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToInt32_bigEndian(curBytes);
+				exactData = (uint32_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+
+			/* Process Row-0 data 1 --> data r4-1: left + previous layer - previous layer's left */
+			for (jj = 1; jj < r4; jj++)
+			{
+				index = ll*r234+kk*r34+jj;
+
+				pred2D = (*data)[index-1] + (*data)[index-r34] - (*data)[index-r34-1];
+
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					memcpy(curBytes, exactDataBytePointer, exactByteSize);
+					exactData = bytesToInt32_bigEndian(curBytes);
+					exactData = (uint32_t)exactData >> rightShiftBits;
+					exactDataBytePointer += exactByteSize;
+					(*data)[index] = exactData + minValue;				
+				}
+			}
+
+			/* Process Row-1 --> Row-r3-1 */
+			for (ii = 1; ii < r3; ii++)
+			{
+				/* Process Row-i data 0: above + previous layer - previous layer's above */
+				index = ll*r234+kk*r34+ii*r4;
+
+				pred2D = (*data)[index-r4] + (*data)[index-r34] - (*data)[index-r34-r4];
+
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					memcpy(curBytes, exactDataBytePointer, exactByteSize);
+					exactData = bytesToInt32_bigEndian(curBytes);
+					exactData = (uint32_t)exactData >> rightShiftBits;
+					exactDataBytePointer += exactByteSize;
+					(*data)[index] = exactData + minValue;
+				}
+
+				/* Process Row-i data 1 --> data r4-1: three face neighbours minus three edge neighbours plus the corner neighbour */
+				for (jj = 1; jj < r4; jj++)
+				{
+					index = ll*r234+kk*r34+ii*r4+jj;
+
+					pred3D = (*data)[index-1] + (*data)[index-r4] + (*data)[index-r34]
+							- (*data)[index-r4-1] - (*data)[index-r34-r4] - (*data)[index-r34-1] + (*data)[index-r34-r4-1];
+
+
+					type_ = type[index];
+					if (type_ != 0)
+					{
+						(*data)[index] = pred3D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+					}
+					else
+					{
+						memcpy(curBytes, exactDataBytePointer, exactByteSize);
+						exactData = bytesToInt32_bigEndian(curBytes);
+						exactData = (uint32_t)exactData >> rightShiftBits;
+						exactDataBytePointer += exactByteSize;
+						(*data)[index] = exactData + minValue;
+					}
+				}
+			}
+		}
+	}
+
+	free(type);
+	return;
+}
+
+/* Produce the decoded 1D int32 series in *data: a constant fill when tdps->allSameData, else a full decode (errBoundMode is unused). */
+void getSnapshotData_int32_1D(int32_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps, int errBoundMode)
+{
+	size_t i;
+	if (tdps->allSameData) {
+		int32_t value = bytesToInt32_bigEndian(tdps->exactDataBytes);
+		*data = (int32_t*)malloc(sizeof(int32_t)*dataSeriesLength);
+		for (i = 0; *data != NULL && i < dataSeriesLength; i++) /* write nothing if malloc failed; *data stays NULL */
+			(*data)[i] = value;
+	} else {
+		decompressDataSeries_int32_1D(data, dataSeriesLength, tdps);
+	}
+}
+
+/* Produce the decoded r1 x r2 int32 series in *data: a constant fill when tdps->allSameData, else a full decode (errBoundMode is unused). */
+void getSnapshotData_int32_2D(int32_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps, int errBoundMode)
+{
+	size_t i, dataSeriesLength = r1*r2;
+	if (tdps->allSameData) {
+		int32_t value = bytesToInt32_bigEndian(tdps->exactDataBytes);
+		*data = (int32_t*)malloc(sizeof(int32_t)*dataSeriesLength);
+		for (i = 0; *data != NULL && i < dataSeriesLength; i++) /* write nothing if malloc failed; *data stays NULL */
+			(*data)[i] = value;
+	} else {
+		decompressDataSeries_int32_2D(data, r1, r2, tdps);
+	}
+}
+
+/* Produce the decoded r1 x r2 x r3 int32 series in *data: a constant fill when tdps->allSameData, else a full decode (errBoundMode is unused). */
+void getSnapshotData_int32_3D(int32_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps, int errBoundMode)
+{
+	size_t i, dataSeriesLength = r1*r2*r3;
+	if (tdps->allSameData) {
+		int32_t value = bytesToInt32_bigEndian(tdps->exactDataBytes);
+		*data = (int32_t*)malloc(sizeof(int32_t)*dataSeriesLength);
+		for (i = 0; *data != NULL && i < dataSeriesLength; i++) /* write nothing if malloc failed; *data stays NULL */
+			(*data)[i] = value;
+	} else {
+		decompressDataSeries_int32_3D(data, r1, r2, r3, tdps);
+	}
+}
+
+/* Produce the decoded r1 x r2 x r3 x r4 int32 series in *data: a constant fill when tdps->allSameData, else a full decode (errBoundMode is unused). */
+void getSnapshotData_int32_4D(int32_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps, int errBoundMode)
+{
+	size_t i, dataSeriesLength = r1*r2*r3*r4;
+	if (tdps->allSameData) {
+		int32_t value = bytesToInt32_bigEndian(tdps->exactDataBytes);
+		*data = (int32_t*)malloc(sizeof(int32_t)*dataSeriesLength);
+		for (i = 0; *data != NULL && i < dataSeriesLength; i++) /* write nothing if malloc failed; *data stays NULL */
+			(*data)[i] = value;
+	} else {
+		decompressDataSeries_int32_4D(data, r1, r2, r3, r4, tdps);
+	}
+}
diff --git a/thirdparty/SZ/sz/src/szd_int64.c b/thirdparty/SZ/sz/src/szd_int64.c
new file mode 100644
index 0000000000000000000000000000000000000000..aaa4a533fc195c4f2d52174cf7de5a333629a700
--- /dev/null
+++ b/thirdparty/SZ/sz/src/szd_int64.c
@@ -0,0 +1,788 @@
+/**
+ *  @file szd_int64.c
+ *  @author Sheng Di
+ *  @date Aug, 2017
+ *  @brief Decompression of SZ-compressed int64_t data (1D to 4D series).
+ *  (C) 2017 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdlib.h> 
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include "TightDataPointStorageI.h"
+#include "sz.h"
+#include "szd_int64.h"
+#include "Huffman.h"
+
+/**
+ * Decompress an SZ byte stream (cmpBytes, cmpSize bytes) into a newly allocated int64_t array (*newData).
+ * cmpBytes stays owned by the caller; only a buffer handed back by zlib_uncompress5 is freed here.
+ * @return status SUCCESSFUL (SZ_SCES) or not (other error codes)
+ * */
+int SZ_decompress_args_int64(int64_t** newData, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, unsigned char* cmpBytes, size_t cmpSize)
+{
+	int status = SZ_SCES;
+	size_t dataLength = computeDataLength(r5,r4,r3,r2,r1);
+	
+	int ownsTmpBytes = 0; //set to 1 only when szTmpBytes comes from zlib_uncompress5
+	size_t targetUncompressSize = dataLength*sizeof(int64_t); //the lossless branch below reads dataLength 8-byte values
+	//tmpSize must be "much" smaller than dataLength
+	size_t i, tmpSize = 3+MetaDataByteLength+1+sizeof(int64_t)+exe_params->SZ_SIZE_TYPE;
+	unsigned char* szTmpBytes;	
+		
+	if(cmpSize!=4+8+4+MetaDataByteLength && cmpSize!=4+8+8+MetaDataByteLength)
+	{
+		int isZlib = isZlibFormat(cmpBytes[0], cmpBytes[1]);
+		if(isZlib)
+			confparams_dec->szMode = SZ_BEST_COMPRESSION;
+		else
+			confparams_dec->szMode = SZ_BEST_SPEED;		
+		if(confparams_dec->szMode==SZ_BEST_SPEED)
+		{
+			tmpSize = cmpSize;
+			szTmpBytes = cmpBytes;	
+		}
+		else if(confparams_dec->szMode==SZ_BEST_COMPRESSION || confparams_dec->szMode==SZ_DEFAULT_COMPRESSION)
+		{
+			if(targetUncompressSize<MIN_ZLIB_DEC_ALLOMEM_BYTES) //Considering the minimum size
+				targetUncompressSize = MIN_ZLIB_DEC_ALLOMEM_BYTES; 
+			tmpSize = zlib_uncompress5(cmpBytes, (unsigned long)cmpSize, &szTmpBytes, (unsigned long)targetUncompressSize+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE);//		(unsigned long)targetUncompressSize+8: consider the total length under lossless compression mode is actually 3+4+1+targetUncompressSize
+			//szTmpBytes now refers to the inflated buffer rather than to cmpBytes;
+			//record that, so the release at the end never touches the caller's cmpBytes.
+			ownsTmpBytes = 1;
+		}
+		else
+		{
+			printf("Wrong value of confparams_dec->szMode in the double compressed bytes.\n");
+			status = SZ_MERR;
+			return status;
+		}	
+	}
+	else
+		szTmpBytes = cmpBytes;
+	//TODO: convert szTmpBytes to data array.
+	TightDataPointStorageI* tdps;
+	int errBoundMode = new_TightDataPointStorageI_fromFlatBytes(&tdps, szTmpBytes, tmpSize);
+	//writeByteData(tdps->typeArray, tdps->typeArray_size, "decompress-typebytes.tbt");
+	int dim = computeDimension(r5,r4,r3,r2,r1);	
+	int intSize = sizeof(int64_t);
+	if(tdps->isLossless)
+	{
+		*newData = (int64_t*)malloc(intSize*dataLength);
+		if(sysEndianType==BIG_ENDIAN_SYSTEM)
+		{
+			memcpy(*newData, szTmpBytes+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE, dataLength*intSize);
+		}
+		else
+		{
+			unsigned char* p = szTmpBytes+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE;
+			for(i=0;i<dataLength;i++,p+=intSize)
+				(*newData)[i] = bytesToInt64_bigEndian(p);
+		}		
+	}
+	else if (dim == 1)
+		getSnapshotData_int64_1D(newData,r1,tdps, errBoundMode);
+	else
+	if (dim == 2)
+		getSnapshotData_int64_2D(newData,r2,r1,tdps, errBoundMode);
+	else
+	if (dim == 3)
+		getSnapshotData_int64_3D(newData,r3,r2,r1,tdps, errBoundMode);
+	else
+	if (dim == 4)
+		getSnapshotData_int64_4D(newData,r4,r3,r2,r1,tdps, errBoundMode);
+	else
+	{
+		printf("Error: currently support only at most 4 dimensions!\n");
+		status = SZ_DERR;
+	}
+	free_TightDataPointStorageI2(tdps);
+	if(ownsTmpBytes)
+		free(szTmpBytes);
+	return status;
+}
+
+
+/* Decode a 1D int64 series of dataSeriesLength elements from tdps into *data (malloc'ed here; left NULL if allocation fails). */
+void decompressDataSeries_int64_1D(int64_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps) 
+{
+	updateQuantizationInfo(tdps->intervals);
+	size_t i;
+	double interval = tdps->realPrecision*2;
+	*data = (int64_t*)malloc(sizeof(int64_t)*dataSeriesLength);
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int));
+	if (*data == NULL || type == NULL)
+	{
+		//out of memory: release whichever buffer was obtained and signal failure through *data == NULL
+		free(type);
+		free(*data);
+		*data = NULL;
+		return;
+	}
+
+	//type[i] == 0 marks a value stored exactly; any other code is an offset from the previous value
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);	
+
+	int64_t minValue, exactData, predValue;
+	minValue = tdps->minValue;
+	int exactByteSize = tdps->exactByteSize;
+	unsigned char* exactDataBytePointer = tdps->exactDataBytes;
+	unsigned char curBytes[8] = {0,0,0,0,0,0,0,0};
+	int rightShiftBits = computeRightShiftBits(exactByteSize, SZ_INT64);
+	if(rightShiftBits<0)
+	{
+		printf("Error: rightShift < 0!\n");
+		exit(EXIT_FAILURE); //an error must not terminate the process with a success status
+	}
+	int type_;
+	for (i = 0; i < dataSeriesLength; i++) {
+		type_ = type[i];
+		switch (type_) {
+		case 0:
+			// recover the exact data: exactByteSize bytes, shifted right, then offset by minValue
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = bytesToInt64_bigEndian(curBytes);
+			exactData = (uint64_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[i] = exactData + minValue;
+			break;
+		default:
+			//previous-value prediction; element 0 has no predecessor, so never read (*data)[-1]
+			predValue = i > 0 ? (*data)[i-1] : 0;
+			(*data)[i] = predValue + (type_-exe_params->intvRadius)*interval;
+			break;
+		}
+	}
+	free(type);
+	return;
+}
+
+/* Decode a 2D (r1 rows x r2 columns) int64 series from tdps into *data, which is malloc'ed here and not freed. */
+void decompressDataSeries_int64_2D(int64_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps) 
+{
+	updateQuantizationInfo(tdps->intervals);
+	
+	size_t dataSeriesLength = r1*r2;
+	// element (ii,jj) is stored at index ii*r2+jj
+
+	double realPrecision = tdps->realPrecision;
+
+	*data = (int64_t*)malloc(sizeof(int64_t)*dataSeriesLength);
+	// NOTE(review): neither allocation is checked for NULL before use
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int));
+	// type[k] == 0: element k is stored exactly; otherwise it is its prediction plus 2*(type[k]-intvRadius)*realPrecision
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);	
+
+	int64_t minValue, exactData;
+
+	minValue = tdps->minValue;
+	
+	int exactByteSize = tdps->exactByteSize;
+	unsigned char* exactDataBytePointer = tdps->exactDataBytes;
+	
+	unsigned char curBytes[8] = {0,0,0,0,0,0,0,0};
+	// exact values: exactByteSize bytes each, copied to the front of curBytes, decoded by bytesToInt64_bigEndian, shifted right, then offset by minValue
+	int rightShiftBits = computeRightShiftBits(exactByteSize, SZ_INT64);	
+	
+	int64_t pred1D, pred2D;
+	size_t ii, jj;
+
+	/* Process Row-0, data 0: always read as an exact value */
+
+	// recover the exact data
+	memcpy(curBytes, exactDataBytePointer, exactByteSize);
+	exactData = bytesToInt64_bigEndian(curBytes);
+	exactData = (uint64_t)exactData >> rightShiftBits;
+	exactDataBytePointer += exactByteSize;
+	(*data)[0] = exactData + minValue;
+
+	/* Process Row-0, data 1: predicted from data 0. NOTE(review): type[1] is read unconditionally - assumes at least 2 elements */
+	int type_ = type[1]; 
+	if (type_ != 0)
+	{
+		pred1D = (*data)[0];
+		(*data)[1] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+	}
+	else
+	{
+		// recover the exact data
+		memcpy(curBytes, exactDataBytePointer, exactByteSize);
+		exactData = bytesToInt64_bigEndian(curBytes);
+		exactData = (uint64_t)exactData >> rightShiftBits;
+		exactDataBytePointer += exactByteSize;
+		(*data)[1] = exactData + minValue;
+	}
+
+	/* Process Row-0, data 2 --> data r2-1: linear extrapolation from the two previous elements */
+	for (jj = 2; jj < r2; jj++)
+	{
+		type_ = type[jj];
+		if (type_ != 0)
+		{
+			pred1D = 2*(*data)[jj-1] - (*data)[jj-2];				
+			(*data)[jj] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			// recover the exact data
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = bytesToInt64_bigEndian(curBytes);
+			exactData = (uint64_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[jj] = exactData + minValue;
+		}
+	}
+
+	size_t index;
+	/* Process Row-1 --> Row-r1-1 */
+	for (ii = 1; ii < r1; ii++)
+	{
+		/* Process row-ii data 0: predicted from the element one row above */
+		index = ii*r2;
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			pred1D = (*data)[index-r2];		
+			(*data)[index] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			// recover the exact data
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = bytesToInt64_bigEndian(curBytes);
+			exactData = (uint64_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[index] = exactData + minValue;
+		}
+
+		/* Process row-ii data 1 --> r2-1: left + above - above-left */
+		for (jj = 1; jj < r2; jj++)
+		{
+			index = ii*r2+jj;
+			pred2D = (*data)[index-1] + (*data)[index-r2] - (*data)[index-r2-1];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				// recover the exact data
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToInt64_bigEndian(curBytes);
+				exactData = (uint64_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+		}
+	}
+
+	free(type);
+	return;
+}
+
+/* Decode a 3D (r1 x r2 x r3) int64 series from tdps into *data, which is malloc'ed here and not freed. */
+void decompressDataSeries_int64_3D(int64_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps) 
+{
+	updateQuantizationInfo(tdps->intervals);
+	size_t dataSeriesLength = r1*r2*r3;
+	size_t r23 = r2*r3;
+	double realPrecision = tdps->realPrecision;
+	/* NOTE(review): neither allocation below is checked for NULL before use */
+	*data = (int64_t*)malloc(sizeof(int64_t)*dataSeriesLength);
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int));
+	/* type[k] == 0: element k is stored exactly; otherwise it is its prediction plus 2*(type[k]-intvRadius)*realPrecision */
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);	
+
+	int64_t minValue, exactData;
+
+	minValue = tdps->minValue;
+	
+	int exactByteSize = tdps->exactByteSize;
+	unsigned char* exactDataBytePointer = tdps->exactDataBytes;
+	
+	unsigned char curBytes[8] = {0,0,0,0,0,0,0,0};
+	/* exact values: exactByteSize bytes each, copied to the front of curBytes, decoded by bytesToInt64_bigEndian, shifted right, then offset by minValue */
+	int rightShiftBits = computeRightShiftBits(exactByteSize, SZ_INT64);	
+	
+	int64_t pred1D, pred2D, pred3D;
+	size_t ii, jj, kk;
+	/* element (kk,ii,jj) is stored at index kk*r23 + ii*r3 + jj */
+	///////////////////////////	Process layer-0 ///////////////////////////
+	/* Process Row-0 data 0: always read as an exact value */
+
+	// recover the exact data
+	memcpy(curBytes, exactDataBytePointer, exactByteSize);
+	exactData = bytesToInt64_bigEndian(curBytes);
+	exactData = (uint64_t)exactData >> rightShiftBits;
+	exactDataBytePointer += exactByteSize;
+	(*data)[0] = exactData + minValue;
+
+	/* Process Row-0, data 1: predicted from data 0. NOTE(review): type[1] is read unconditionally - assumes at least 2 elements */
+	pred1D = (*data)[0];
+
+	int type_ = type[1];
+	if (type_ != 0)
+	{
+		(*data)[1] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+	}
+	else
+	{
+		memcpy(curBytes, exactDataBytePointer, exactByteSize);
+		exactData = bytesToInt64_bigEndian(curBytes);
+		exactData = (uint64_t)exactData >> rightShiftBits;
+		exactDataBytePointer += exactByteSize;
+		(*data)[1] = exactData + minValue;
+	}
+	/* Process Row-0, data 2 --> data r3-1: linear extrapolation from the two previous elements */
+	for (jj = 2; jj < r3; jj++)
+	{
+		pred1D = 2*(*data)[jj-1] - (*data)[jj-2];
+
+		type_ = type[jj];
+		if (type_ != 0)
+		{
+			(*data)[jj] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = bytesToInt64_bigEndian(curBytes);
+			exactData = (uint64_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[jj] = exactData + minValue;
+		}
+	}
+
+	size_t index;
+	/* Process Row-1 --> Row-r2-1 */
+	for (ii = 1; ii < r2; ii++)
+	{
+		/* Process row-ii data 0: predicted from the element one row above */
+		index = ii*r3;
+		pred1D = (*data)[index-r3];
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			(*data)[index] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = bytesToInt64_bigEndian(curBytes);
+			exactData = (uint64_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[index] = exactData + minValue;
+		}
+
+		/* Process row-ii data 1 --> r3-1: left + above - above-left */
+		for (jj = 1; jj < r3; jj++)
+		{
+			index = ii*r3+jj;
+			pred2D = (*data)[index-1] + (*data)[index-r3] - (*data)[index-r3-1];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToInt64_bigEndian(curBytes);
+				exactData = (uint64_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+		}
+	}
+
+	///////////////////////////	Process layer-1 --> layer-r1-1 ///////////////////////////
+
+	for (kk = 1; kk < r1; kk++)
+	{
+		/* Process Row-0 data 0: predicted from the same position in the previous layer */
+		index = kk*r23;
+		pred1D = (*data)[index-r23];
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			(*data)[index] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = bytesToInt64_bigEndian(curBytes);
+			exactData = (uint64_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[index] = exactData + minValue;
+		}
+
+		/* Process Row-0 data 1 --> data r3-1: left + previous layer - previous layer's left */
+		for (jj = 1; jj < r3; jj++)
+		{
+			index = kk*r23+jj;
+			pred2D = (*data)[index-1] + (*data)[index-r23] - (*data)[index-r23-1];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToInt64_bigEndian(curBytes);
+				exactData = (uint64_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+		}
+
+		/* Process Row-1 --> Row-r2-1 */
+		for (ii = 1; ii < r2; ii++)
+		{
+			/* Process Row-i data 0: above + previous layer - previous layer's above */
+			index = kk*r23 + ii*r3;
+			pred2D = (*data)[index-r3] + (*data)[index-r23] - (*data)[index-r23-r3];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToInt64_bigEndian(curBytes);
+				exactData = (uint64_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+
+			/* Process Row-i data 1 --> data r3-1: three face neighbours minus three edge neighbours plus the corner neighbour */
+			for (jj = 1; jj < r3; jj++)
+			{
+				index = kk*r23 + ii*r3 + jj;
+				pred3D = (*data)[index-1] + (*data)[index-r3] + (*data)[index-r23]
+					- (*data)[index-r3-1] - (*data)[index-r23-r3] - (*data)[index-r23-1] + (*data)[index-r23-r3-1];
+
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					(*data)[index] = pred3D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					memcpy(curBytes, exactDataBytePointer, exactByteSize);
+					exactData = bytesToInt64_bigEndian(curBytes);
+					exactData = (uint64_t)exactData >> rightShiftBits;
+					exactDataBytePointer += exactByteSize;
+					(*data)[index] = exactData + minValue;
+				}
+			}
+		}
+	}
+
+	free(type);
+	return;
+}
+
+
+/* Decode a 4D (r1 x r2 x r3 x r4) int64 series from tdps into *data, which is malloc'ed here and not freed. */
+void decompressDataSeries_int64_4D(int64_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps)
+{
+	updateQuantizationInfo(tdps->intervals);
+	size_t dataSeriesLength = r1*r2*r3*r4;
+	size_t r234 = r2*r3*r4;
+	size_t r34 = r3*r4;
+	/* element (ll,kk,ii,jj) is stored at index ll*r234 + kk*r34 + ii*r4 + jj */
+	double realPrecision = tdps->realPrecision;
+	/* NOTE(review): neither allocation below is checked for NULL before use */
+	*data = (int64_t*)malloc(sizeof(int64_t)*dataSeriesLength);
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int));
+	/* type[k] == 0: element k is stored exactly; otherwise it is its prediction plus 2*(type[k]-intvRadius)*realPrecision */
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);	
+
+	int64_t minValue, exactData;
+	minValue = tdps->minValue;
+	
+	int exactByteSize = tdps->exactByteSize;
+	unsigned char* exactDataBytePointer = tdps->exactDataBytes;
+	
+	unsigned char curBytes[8] = {0,0,0,0,0,0,0,0};
+	/* exact values: exactByteSize bytes each, copied to the front of curBytes, decoded by bytesToInt64_bigEndian, shifted right, then offset by minValue */
+	int rightShiftBits = computeRightShiftBits(exactByteSize, SZ_INT64);	
+	
+	int type_;
+
+	int64_t pred1D, pred2D, pred3D;
+	size_t ii, jj, kk, ll;
+	size_t index;
+	/* predictions only use already decoded elements of the same outermost slice ll */
+	for (ll = 0; ll < r1; ll++)
+	{
+		///////////////////////////	Process layer-0 ///////////////////////////
+		/* Process Row-0 data 0: the first element of slice ll is always read as an exact value */
+		index = ll*r234;
+
+		// recover the exact data
+		memcpy(curBytes, exactDataBytePointer, exactByteSize);
+		exactData = bytesToInt64_bigEndian(curBytes);
+		exactData = (uint64_t)exactData >> rightShiftBits;
+		exactDataBytePointer += exactByteSize;
+		(*data)[index] = exactData + minValue;
+
+		/* Process Row-0, data 1: predicted from data 0. NOTE(review): decoded even when r4 == 1 - confirm callers */
+		index = ll*r234+1;
+
+		pred1D = (*data)[index-1];
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			(*data)[index] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = bytesToInt64_bigEndian(curBytes);
+			exactData = (uint64_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[index] = exactData + minValue;
+		}
+
+		/* Process Row-0, data 2 --> data r4-1: linear extrapolation from the two previous elements */
+		for (jj = 2; jj < r4; jj++)
+		{
+			index = ll*r234+jj;
+
+			pred1D = 2*(*data)[index-1] - (*data)[index-2];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToInt64_bigEndian(curBytes);
+				exactData = (uint64_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+		}
+
+		/* Process Row-1 --> Row-r3-1 */
+		for (ii = 1; ii < r3; ii++)
+		{
+			/* Process row-ii data 0: predicted from the element one row above */
+			index = ll*r234+ii*r4;
+
+			pred1D = (*data)[index-r4];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToInt64_bigEndian(curBytes);
+				exactData = (uint64_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+
+			/* Process row-ii data 1 --> r4-1: left + above - above-left */
+			for (jj = 1; jj < r4; jj++)
+			{
+				index = ll*r234+ii*r4+jj;
+
+				pred2D = (*data)[index-1] + (*data)[index-r4] - (*data)[index-r4-1];
+
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					memcpy(curBytes, exactDataBytePointer, exactByteSize);
+					exactData = bytesToInt64_bigEndian(curBytes);
+					exactData = (uint64_t)exactData >> rightShiftBits;
+					exactDataBytePointer += exactByteSize;
+					(*data)[index] = exactData + minValue;
+				}
+			}
+		}
+
+		///////////////////////////	Process layer-1 --> layer-r2-1 ///////////////////////////
+
+		for (kk = 1; kk < r2; kk++)
+		{
+			/* Process Row-0 data 0: predicted from the same position in the previous layer */
+			index = ll*r234+kk*r34;
+
+			pred1D = (*data)[index-r34];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToInt64_bigEndian(curBytes);
+				exactData = (uint64_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+
+			/* Process Row-0 data 1 --> data r4-1: left + previous layer - previous layer's left */
+			for (jj = 1; jj < r4; jj++)
+			{
+				index = ll*r234+kk*r34+jj;
+
+				pred2D = (*data)[index-1] + (*data)[index-r34] - (*data)[index-r34-1];
+
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					memcpy(curBytes, exactDataBytePointer, exactByteSize);
+					exactData = bytesToInt64_bigEndian(curBytes);
+					exactData = (uint64_t)exactData >> rightShiftBits;
+					exactDataBytePointer += exactByteSize;
+					(*data)[index] = exactData + minValue;				
+				}
+			}
+
+			/* Process Row-1 --> Row-r3-1 */
+			for (ii = 1; ii < r3; ii++)
+			{
+				/* Process Row-i data 0: above + previous layer - previous layer's above */
+				index = ll*r234+kk*r34+ii*r4;
+
+				pred2D = (*data)[index-r4] + (*data)[index-r34] - (*data)[index-r34-r4];
+
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					memcpy(curBytes, exactDataBytePointer, exactByteSize);
+					exactData = bytesToInt64_bigEndian(curBytes);
+					exactData = (uint64_t)exactData >> rightShiftBits;
+					exactDataBytePointer += exactByteSize;
+					(*data)[index] = exactData + minValue;
+				}
+
+				/* Process Row-i data 1 --> data r4-1: three face neighbours minus three edge neighbours plus the corner neighbour */
+				for (jj = 1; jj < r4; jj++)
+				{
+					index = ll*r234+kk*r34+ii*r4+jj;
+
+					pred3D = (*data)[index-1] + (*data)[index-r4] + (*data)[index-r34]
+							- (*data)[index-r4-1] - (*data)[index-r34-r4] - (*data)[index-r34-1] + (*data)[index-r34-r4-1];
+
+
+					type_ = type[index];
+					if (type_ != 0)
+					{
+						(*data)[index] = pred3D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+					}
+					else
+					{
+						memcpy(curBytes, exactDataBytePointer, exactByteSize);
+						exactData = bytesToInt64_bigEndian(curBytes);
+						exactData = (uint64_t)exactData >> rightShiftBits;
+						exactDataBytePointer += exactByteSize;
+						(*data)[index] = exactData + minValue;
+					}
+				}
+			}
+		}
+	}
+
+	free(type);
+	return;
+}
+
+/* Produce the decoded 1D int64 series in *data: a constant fill when tdps->allSameData, else a full decode (errBoundMode is unused). */
+void getSnapshotData_int64_1D(int64_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps, int errBoundMode)
+{
+	size_t i;
+	if (tdps->allSameData) {
+		int64_t value = bytesToInt64_bigEndian(tdps->exactDataBytes);
+		*data = (int64_t*)malloc(sizeof(int64_t)*dataSeriesLength);
+		for (i = 0; *data != NULL && i < dataSeriesLength; i++) /* write nothing if malloc failed; *data stays NULL */
+			(*data)[i] = value;
+	} else {
+		decompressDataSeries_int64_1D(data, dataSeriesLength, tdps);
+	}
+}
+
+/* Produce the decoded r1 x r2 int64 series in *data: a constant fill when tdps->allSameData, else a full decode (errBoundMode is unused). */
+void getSnapshotData_int64_2D(int64_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps, int errBoundMode)
+{
+	size_t i, dataSeriesLength = r1*r2;
+	if (tdps->allSameData) {
+		int64_t value = bytesToInt64_bigEndian(tdps->exactDataBytes);
+		*data = (int64_t*)malloc(sizeof(int64_t)*dataSeriesLength);
+		for (i = 0; *data != NULL && i < dataSeriesLength; i++) /* write nothing if malloc failed; *data stays NULL */
+			(*data)[i] = value;
+	} else {
+		decompressDataSeries_int64_2D(data, r1, r2, tdps);
+	}
+}
+
+/* Produce the decoded r1 x r2 x r3 int64 series in *data: a constant fill when tdps->allSameData, else a full decode (errBoundMode is unused). */
+void getSnapshotData_int64_3D(int64_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps, int errBoundMode)
+{
+	size_t i, dataSeriesLength = r1*r2*r3;
+	if (tdps->allSameData) {
+		int64_t value = bytesToInt64_bigEndian(tdps->exactDataBytes);
+		*data = (int64_t*)malloc(sizeof(int64_t)*dataSeriesLength);
+		for (i = 0; *data != NULL && i < dataSeriesLength; i++) /* write nothing if malloc failed; *data stays NULL */
+			(*data)[i] = value;
+	} else {
+		decompressDataSeries_int64_3D(data, r1, r2, r3, tdps);
+	}
+}
+
+/* Produce the decoded r1 x r2 x r3 x r4 int64 series in *data: a constant fill when tdps->allSameData, else a full decode (errBoundMode is unused). */
+void getSnapshotData_int64_4D(int64_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps, int errBoundMode)
+{
+	size_t i, dataSeriesLength = r1*r2*r3*r4;
+	if (tdps->allSameData) {
+		int64_t value = bytesToInt64_bigEndian(tdps->exactDataBytes);
+		*data = (int64_t*)malloc(sizeof(int64_t)*dataSeriesLength);
+		for (i = 0; *data != NULL && i < dataSeriesLength; i++) /* write nothing if malloc failed; *data stays NULL */
+			(*data)[i] = value;
+	} else {
+		decompressDataSeries_int64_4D(data, r1, r2, r3, r4, tdps);
+	}
+}
diff --git a/thirdparty/SZ/sz/src/szd_int8.c b/thirdparty/SZ/sz/src/szd_int8.c
new file mode 100644
index 0000000000000000000000000000000000000000..758e91733d8b9f0b34fef94e2262748c278c49d2
--- /dev/null
+++ b/thirdparty/SZ/sz/src/szd_int8.c
@@ -0,0 +1,912 @@
+/**
+ *  @file szd_int8.c
+ *  @author Sheng Di
+ *  @date Aug, 2017
+ *  @brief 
+ *  (C) 2017 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdlib.h> 
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include "TightDataPointStorageI.h"
+#include "sz.h"
+#include "szd_int8.h"
+#include "Huffman.h"
+
+/**
+ * Decompress an SZ-compressed byte stream into a newly allocated int8_t array.
+ * 
+ * @return SZ_SCES on success, or another status code (e.g. SZ_MERR, SZ_DERR) on failure.
+ * */
+int SZ_decompress_args_int8(int8_t** newData, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, unsigned char* cmpBytes, size_t cmpSize)
+{
+	int status = SZ_SCES;
+	size_t dataLength = computeDataLength(r5,r4,r3,r2,r1);
+	/* Decodes cmpBytes (cmpSize bytes) into an array malloc'ed here and returned through *newData; returns SZ_SCES, SZ_MERR or SZ_DERR. */
+	int ownsTmpBytes = 0; //becomes 1 only when zlib_uncompress5 produced szTmpBytes, i.e. when this call must free it
+	size_t targetUncompressSize = dataLength <<2; //i.e., *4
+	//tmpSize must be "much" smaller than dataLength
+	size_t i, tmpSize = 3+MetaDataByteLength+1+sizeof(int8_t)+exe_params->SZ_SIZE_TYPE;
+	unsigned char* szTmpBytes;	
+	//Inputs of exactly these two sizes are parsed as-is; anything else is probed for a zlib header first.
+	if(cmpSize!=4+1+4+MetaDataByteLength && cmpSize!=4+1+8+MetaDataByteLength)
+	{
+		int isZlib = isZlibFormat(cmpBytes[0], cmpBytes[1]);
+		if(isZlib)
+			confparams_dec->szMode = SZ_BEST_COMPRESSION;
+		else
+			confparams_dec->szMode = SZ_BEST_SPEED;		
+		if(confparams_dec->szMode==SZ_BEST_SPEED)
+		{
+			tmpSize = cmpSize;
+			szTmpBytes = cmpBytes;	
+		}
+		else if(confparams_dec->szMode==SZ_BEST_COMPRESSION || confparams_dec->szMode==SZ_DEFAULT_COMPRESSION)
+		{
+			if(targetUncompressSize<MIN_ZLIB_DEC_ALLOMEM_BYTES) //Considering the minimum size
+				targetUncompressSize = MIN_ZLIB_DEC_ALLOMEM_BYTES; 
+			tmpSize = zlib_uncompress5(cmpBytes, (unsigned long)cmpSize, &szTmpBytes, (unsigned long)targetUncompressSize+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE);//		(unsigned long)targetUncompressSize+8: consider the total length under lossless compression mode is actually 3+4+1+targetUncompressSize
+			//szTmpBytes now refers to the inflated copy rather than to the caller's cmpBytes,
+			//so this is the one case in which the function has a buffer of its own to release.
+			ownsTmpBytes = 1;
+		}
+		else
+		{
+			printf("Wrong value of confparams_dec->szMode in the double compressed bytes.\n");
+			status = SZ_MERR;
+			return status;
+		}	
+	}
+	else
+		szTmpBytes = cmpBytes;
+	//Parse the flat byte stream into a TightDataPointStorageI, then rebuild the array from it.
+	TightDataPointStorageI* tdps;
+	int errBoundMode = new_TightDataPointStorageI_fromFlatBytes(&tdps, szTmpBytes, tmpSize);
+	//Lossless payloads are copied out directly; otherwise the dimensionality selects the decoder.
+	int dim = computeDimension(r5,r4,r3,r2,r1);	
+	int intSize = sizeof(int8_t);
+	if(tdps->isLossless)
+	{
+		*newData = (int8_t*)malloc(intSize*dataLength);
+		if(sysEndianType==BIG_ENDIAN_SYSTEM)
+		{
+			memcpy(*newData, szTmpBytes+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE, dataLength*intSize);
+		}
+		else
+		{
+			unsigned char* p = szTmpBytes+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE;
+			for(i=0;i<dataLength;i++,p+=intSize)
+				(*newData)[i] = *p;
+		}		
+	}
+	else if (dim == 1)
+		getSnapshotData_int8_1D(newData,r1,tdps, errBoundMode);
+	else
+	if (dim == 2)
+		getSnapshotData_int8_2D(newData,r2,r1,tdps, errBoundMode);
+	else
+	if (dim == 3)
+		getSnapshotData_int8_3D(newData,r3,r2,r1,tdps, errBoundMode);
+	else
+	if (dim == 4)
+		getSnapshotData_int8_4D(newData,r4,r3,r2,r1,tdps, errBoundMode);
+	else
+	{
+		printf("Error: currently support only at most 4 dimensions!\n");
+		status = SZ_DERR;
+	}
+	free_TightDataPointStorageI2(tdps);
+	if(ownsTmpBytes) //decided locally, not from the global szMode (which the fixed-size path never updates), so the caller's cmpBytes is never freed here
+		free(szTmpBytes);
+	return status;
+}
+
+
+void decompressDataSeries_int8_1D(int8_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps) 
+{
+	updateQuantizationInfo(tdps->intervals);
+	double interval = tdps->realPrecision*2;
+	/* Rebuilds a 1D int8 series into a newly malloc'ed *data from tdps's Huffman-coded type array and its exact-value bytes. */
+	*data = (int8_t*)malloc(sizeof(int8_t)*dataSeriesLength);
+
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int));
+
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);	
+
+	/* type[i] == 0: element i is read from exactDataBytes and offset by minValue; */
+	/* otherwise element i is the previous element plus (type[i]-intvRadius)*interval, clamped to [SZ_INT8_MIN, SZ_INT8_MAX]. */
+	
+	long predValue, tmp;
+	int8_t minValue, exactData;
+	
+	minValue = tdps->minValue;
+	
+	int exactByteSize = tdps->exactByteSize;
+	unsigned char* exactDataBytePointer = tdps->exactDataBytes;
+	
+	unsigned char curBytes[8] = {0,0,0,0,0,0,0,0};
+	
+	int rightShiftBits = computeRightShiftBits(exactByteSize, SZ_INT8);
+	if(rightShiftBits<0)
+	{
+		printf("Error: rightShift < 0!\n");
+		exit(EXIT_FAILURE); // this is an error path: do not report success to the parent process
+	}
+	int type_;
+	for (size_t i = 0; i < dataSeriesLength; i++) {
+		type_ = type[i];
+		switch (type_) {
+		case 0:
+			// recover the exact data	
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = curBytes[0];
+			exactData = (uint8_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[i] = exactData + minValue;
+			break;
+		default:
+			// There is no previous element at i == 0, so predict from 0 instead of reading (*data)[-1].
+			// Presumably well-formed streams store element 0 exactly and never reach this case -- TODO confirm against the compressor.
+			predValue = (i > 0) ? (*data)[i-1] : 0;
+			tmp = predValue + (type_-exe_params->intvRadius)*interval;
+			if(tmp >= SZ_INT8_MIN&&tmp<SZ_INT8_MAX)
+				(*data)[i] = tmp;
+			else if(tmp < SZ_INT8_MIN)
+				(*data)[i] = SZ_INT8_MIN;
+			else
+				(*data)[i] = SZ_INT8_MAX;
+			break;
+		}
+	}
+	free(type);
+	return;
+}
+
+void decompressDataSeries_int8_2D(int8_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps) 
+{
+	updateQuantizationInfo(tdps->intervals);
+	/* Rebuild an r1 x r2 int8 array (r2 is the fastest-varying dimension) into a newly malloc'ed *data. */
+	
+	size_t dataSeriesLength = r1*r2;
+	/* type[k] != 0: element k = prediction + 2*(type[k]-intvRadius)*realPrecision, clamped to [SZ_INT8_MIN, SZ_INT8_MAX]; type[k] == 0: element k is read from exactDataBytes. */
+
+	double realPrecision = tdps->realPrecision;
+
+	*data = (int8_t*)malloc(sizeof(int8_t)*dataSeriesLength);
+
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int));
+
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);	
+
+	int8_t minValue, exactData;
+
+	minValue = tdps->minValue;
+	
+	int exactByteSize = tdps->exactByteSize;
+	unsigned char* exactDataBytePointer = tdps->exactDataBytes;
+	
+	unsigned char curBytes[8] = {0,0,0,0,0,0,0,0};
+	
+	int rightShiftBits = computeRightShiftBits(exactByteSize, SZ_INT8);	// NOTE(review): unlike the 1D decoder, no check for a negative result before it is used as a shift count
+	
+	long pred1D, pred2D, tmp;
+	size_t ii, jj;
+
+	/* Process Row-0, data 0: always read as an exact value; type[0] is not consulted */
+
+	// recover the exact data: first stored byte, shifted right by rightShiftBits, then offset by minValue
+	memcpy(curBytes, exactDataBytePointer, exactByteSize);
+	exactData = curBytes[0];
+	exactData = (uint8_t)exactData >> rightShiftBits;
+	exactDataBytePointer += exactByteSize;
+	(*data)[0] = exactData + minValue;
+
+	/* Process Row-0, data 1: predicted from data 0 alone */
+	int type_ = type[1]; // NOTE(review): type[1] is read and (*data)[1] written without checking that a second element exists
+	if (type_ != 0)
+	{
+		pred1D = (*data)[0];
+		tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		if(tmp >= SZ_INT8_MIN&&tmp<SZ_INT8_MAX)
+			(*data)[1] = tmp;
+		else if(tmp < SZ_INT8_MIN)
+			(*data)[1] = SZ_INT8_MIN;
+		else
+			(*data)[1] = SZ_INT8_MAX;
+			
+	}
+	else
+	{
+		// recover the exact data
+		memcpy(curBytes, exactDataBytePointer, exactByteSize);
+		exactData = curBytes[0];
+		exactData = (uint8_t)exactData >> rightShiftBits;
+		exactDataBytePointer += exactByteSize;
+		(*data)[1] = exactData + minValue;
+	}
+
+	/* Process Row-0, data 2 --> data r2-1: linear extrapolation from the two previous elements */
+	for (jj = 2; jj < r2; jj++)
+	{
+		type_ = type[jj];
+		if (type_ != 0)
+		{
+			pred1D = 2*(*data)[jj-1] - (*data)[jj-2];				
+			tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			if(tmp >= SZ_INT8_MIN&&tmp<SZ_INT8_MAX)
+				(*data)[jj] = tmp;
+			else if(tmp < SZ_INT8_MIN)
+				(*data)[jj] = SZ_INT8_MIN;
+			else
+				(*data)[jj] = SZ_INT8_MAX;
+		}
+		else
+		{
+			// recover the exact data
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = curBytes[0];
+			exactData = (uint8_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[jj] = exactData + minValue;
+		}
+	}
+
+	size_t index;
+	/* Process Row-1 --> Row-r1-1 */
+	for (ii = 1; ii < r1; ii++)
+	{
+		/* Process row-ii data 0: predicted from the element directly above (index-r2) */
+		index = ii*r2;
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			pred1D = (*data)[index-r2];		
+			tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			if(tmp >= SZ_INT8_MIN&&tmp<SZ_INT8_MAX)
+				(*data)[index] = tmp;
+			else if(tmp < SZ_INT8_MIN)
+				(*data)[index] = SZ_INT8_MIN;
+			else
+				(*data)[index] = SZ_INT8_MAX;
+		}
+		else
+		{
+			// recover the exact data
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = curBytes[0];
+			exactData = (uint8_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[index] = exactData + minValue;
+		}
+
+		/* Process row-ii data 1 --> r2-1: predicted as left + above - above-left */
+		for (jj = 1; jj < r2; jj++)
+		{
+			index = ii*r2+jj;
+			pred2D = (*data)[index-1] + (*data)[index-r2] - (*data)[index-r2-1];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				tmp = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_INT8_MIN&&tmp<SZ_INT8_MAX)
+					(*data)[index] = tmp;
+				else if(tmp < SZ_INT8_MIN)
+					(*data)[index] = SZ_INT8_MIN;
+				else
+					(*data)[index] = SZ_INT8_MAX;
+			}
+			else
+			{
+				// recover the exact data
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = curBytes[0];
+				exactData = (uint8_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+		}
+	}
+
+	free(type);
+	return;
+}
+
+void decompressDataSeries_int8_3D(int8_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps) 
+{
+	updateQuantizationInfo(tdps->intervals);
+	size_t dataSeriesLength = r1*r2*r3;
+	size_t r23 = r2*r3;
+	/* Rebuild an r1 x r2 x r3 int8 array (r3 fastest-varying, r23 elements per layer) into a newly malloc'ed *data. */
+	double realPrecision = tdps->realPrecision;
+
+	*data = (int8_t*)malloc(sizeof(int8_t)*dataSeriesLength);
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int));
+
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);	
+
+	int8_t minValue, exactData;
+
+	minValue = tdps->minValue;
+	
+	int exactByteSize = tdps->exactByteSize;
+	unsigned char* exactDataBytePointer = tdps->exactDataBytes;
+	
+	unsigned char curBytes[8] = {0,0,0,0,0,0,0,0};
+	
+	int rightShiftBits = computeRightShiftBits(exactByteSize, SZ_INT8);	// NOTE(review): unlike the 1D decoder, no check for a negative result before it is used as a shift count
+	
+	long pred1D, pred2D, pred3D, tmp;
+	size_t ii, jj, kk;
+
+	///////////////////////////	Process layer-0 ///////////////////////////
+	/* Process Row-0 data 0: always read as an exact value; type[0] is not consulted */
+
+	// recover the exact data: first stored byte, shifted right by rightShiftBits, then offset by minValue
+	memcpy(curBytes, exactDataBytePointer, exactByteSize);
+	exactData = curBytes[0];
+	exactData = (uint8_t)exactData >> rightShiftBits;
+	exactDataBytePointer += exactByteSize;
+	(*data)[0] = exactData + minValue;
+
+	/* Process Row-0, data 1: predicted from data 0 alone */
+	pred1D = (*data)[0];
+
+	int type_ = type[1]; // NOTE(review): type[1] is read and (*data)[1] written without checking that a second element exists
+	if (type_ != 0)
+	{
+		tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		if(tmp >= SZ_INT8_MIN&&tmp<SZ_INT8_MAX)
+			(*data)[1] = tmp;
+		else if(tmp < SZ_INT8_MIN)
+			(*data)[1] = SZ_INT8_MIN;
+		else
+			(*data)[1] = SZ_INT8_MAX;
+	}
+	else
+	{
+		memcpy(curBytes, exactDataBytePointer, exactByteSize);
+		exactData = curBytes[0];
+		exactData = (uint8_t)exactData >> rightShiftBits;
+		exactDataBytePointer += exactByteSize;
+		(*data)[1] = exactData + minValue;
+	}
+	/* Process Row-0, data 2 --> data r3-1: linear extrapolation from the two previous elements */
+	for (jj = 2; jj < r3; jj++)
+	{
+		pred1D = 2*(*data)[jj-1] - (*data)[jj-2];
+
+		type_ = type[jj];
+		if (type_ != 0)
+		{
+			tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			if(tmp >= SZ_INT8_MIN&&tmp<SZ_INT8_MAX)
+				(*data)[jj] = tmp;
+			else if(tmp < SZ_INT8_MIN)
+				(*data)[jj] = SZ_INT8_MIN;
+			else
+				(*data)[jj] = SZ_INT8_MAX;		}
+		else
+		{
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = curBytes[0];
+			exactData = (uint8_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[jj] = exactData + minValue;
+		}
+	}
+
+	size_t index;
+	/* Process Row-1 --> Row-r2-1 (still within layer 0) */
+	for (ii = 1; ii < r2; ii++)
+	{
+		/* Process row-ii data 0: predicted from the element one row back (index-r3) */
+		index = ii*r3;
+		pred1D = (*data)[index-r3];
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			if(tmp >= SZ_INT8_MIN&&tmp<SZ_INT8_MAX)
+				(*data)[index] = tmp;
+			else if(tmp < SZ_INT8_MIN)
+				(*data)[index] = SZ_INT8_MIN;
+			else
+				(*data)[index] = SZ_INT8_MAX;
+		}
+		else
+		{
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = curBytes[0];
+			exactData = (uint8_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[index] = exactData + minValue;
+		}
+
+		/* Process row-ii data 1 --> r3-1: predicted from the previous element and the previous row */
+		for (jj = 1; jj < r3; jj++)
+		{
+			index = ii*r3+jj;
+			pred2D = (*data)[index-1] + (*data)[index-r3] - (*data)[index-r3-1];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				tmp = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_INT8_MIN&&tmp<SZ_INT8_MAX)
+					(*data)[index] = tmp;
+				else if(tmp < SZ_INT8_MIN)
+					(*data)[index] = SZ_INT8_MIN;
+				else
+					(*data)[index] = SZ_INT8_MAX;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = curBytes[0];
+				exactData = (uint8_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+		}
+	}
+
+	///////////////////////////	Process layer-1 --> layer-r1-1 ///////////////////////////
+
+	for (kk = 1; kk < r1; kk++)
+	{
+		/* Process Row-0 data 0: predicted from the same position in the previous layer (index-r23) */
+		index = kk*r23;
+		pred1D = (*data)[index-r23];
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			if(tmp >= SZ_INT8_MIN&&tmp<SZ_INT8_MAX)
+				(*data)[index] = tmp;
+			else if(tmp < SZ_INT8_MIN)
+				(*data)[index] = SZ_INT8_MIN;
+			else
+				(*data)[index] = SZ_INT8_MAX;
+		}
+		else
+		{
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = curBytes[0];
+			exactData = (uint8_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[index] = exactData + minValue;
+		}
+
+		/* Process Row-0 data 1 --> data r3-1: predicted from the previous element and the previous layer */
+		for (jj = 1; jj < r3; jj++)
+		{
+			index = kk*r23+jj;
+			pred2D = (*data)[index-1] + (*data)[index-r23] - (*data)[index-r23-1];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				tmp = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_INT8_MIN&&tmp<SZ_INT8_MAX)
+					(*data)[index] = tmp;
+				else if(tmp < SZ_INT8_MIN)
+					(*data)[index] = SZ_INT8_MIN;
+				else
+					(*data)[index] = SZ_INT8_MAX;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = curBytes[0];
+				exactData = (uint8_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+		}
+
+		/* Process Row-1 --> Row-r2-1 */
+		for (ii = 1; ii < r2; ii++)
+		{
+			/* Process Row-i data 0: predicted from the previous row and the previous layer */
+			index = kk*r23 + ii*r3;
+			pred2D = (*data)[index-r3] + (*data)[index-r23] - (*data)[index-r23-r3];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				tmp = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_INT8_MIN&&tmp<SZ_INT8_MAX)
+					(*data)[index] = tmp;
+				else if(tmp < SZ_INT8_MIN)
+					(*data)[index] = SZ_INT8_MIN;
+				else
+					(*data)[index] = SZ_INT8_MAX;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = curBytes[0];
+				exactData = (uint8_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+
+			/* Process Row-i data 1 --> data r3-1: predicted from the seven already-decoded neighbours in the previous element/row/layer */
+			for (jj = 1; jj < r3; jj++)
+			{
+				index = kk*r23 + ii*r3 + jj;
+				pred3D = (*data)[index-1] + (*data)[index-r3] + (*data)[index-r23]
+					- (*data)[index-r3-1] - (*data)[index-r23-r3] - (*data)[index-r23-1] + (*data)[index-r23-r3-1];
+
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					tmp = pred3D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+					if(tmp >= SZ_INT8_MIN&&tmp<SZ_INT8_MAX)
+						(*data)[index] = tmp;
+					else if(tmp < SZ_INT8_MIN)
+						(*data)[index] = SZ_INT8_MIN;
+					else
+						(*data)[index] = SZ_INT8_MAX;
+				}
+				else
+				{
+					memcpy(curBytes, exactDataBytePointer, exactByteSize);
+					exactData = curBytes[0];
+					exactData = (uint8_t)exactData >> rightShiftBits;
+					exactDataBytePointer += exactByteSize;
+					(*data)[index] = exactData + minValue;
+				}
+			}
+		}
+	}
+
+	free(type);
+	return;
+}
+
+
+void decompressDataSeries_int8_4D(int8_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps)
+{
+	updateQuantizationInfo(tdps->intervals);
+	size_t dataSeriesLength = r1*r2*r3*r4;
+	size_t r234 = r2*r3*r4;
+	size_t r34 = r3*r4;
+	/* Rebuild an r1 x r2 x r3 x r4 int8 array (r4 fastest-varying) into a newly malloc'ed *data; each of the r1 blocks of r234 elements is decoded on its own. */
+	double realPrecision = tdps->realPrecision;
+
+	*data = (int8_t*)malloc(sizeof(int8_t)*dataSeriesLength);
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int));
+
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);	
+
+	int8_t minValue, exactData;
+
+	minValue = tdps->minValue;
+	
+	int exactByteSize = tdps->exactByteSize;
+	unsigned char* exactDataBytePointer = tdps->exactDataBytes;
+	
+	unsigned char curBytes[8] = {0,0,0,0,0,0,0,0};
+	
+	int rightShiftBits = computeRightShiftBits(exactByteSize, SZ_INT8);	// NOTE(review): unlike the 1D decoder, no check for a negative result before it is used as a shift count
+	
+	int type_;
+
+	long pred1D, pred2D, pred3D, tmp;
+	size_t ii, jj, kk, ll;
+	size_t index;
+
+	for (ll = 0; ll < r1; ll++)
+	{
+		///////////////////////////	Process layer-0 ///////////////////////////
+		/* Process Row-0 data 0: the first element of every block is read as an exact value; its type entry is not consulted */
+		index = ll*r234;
+		// recover the exact data: first stored byte, shifted right by rightShiftBits, then offset by minValue
+		memcpy(curBytes, exactDataBytePointer, exactByteSize);
+		exactData = curBytes[0];
+		exactData = (uint8_t)exactData >> rightShiftBits;
+		exactDataBytePointer += exactByteSize;
+		(*data)[index] = exactData + minValue;
+
+		/* Process Row-0, data 1: predicted from data 0 alone */
+		index = ll*r234+1; // NOTE(review): accessed without checking that the block holds a second element
+
+		pred1D = (*data)[index-1];
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			if(tmp >= SZ_INT8_MIN&&tmp<SZ_INT8_MAX)
+				(*data)[index] = tmp;
+			else if(tmp < SZ_INT8_MIN)
+				(*data)[index] = SZ_INT8_MIN;
+			else
+				(*data)[index] = SZ_INT8_MAX;
+		}
+		else
+		{
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = curBytes[0];
+			exactData = (uint8_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[index] = exactData + minValue;
+		}
+
+		/* Process Row-0, data 2 --> data r4-1: linear extrapolation from the two previous elements */
+		for (jj = 2; jj < r4; jj++)
+		{
+			index = ll*r234+jj;
+
+			pred1D = 2*(*data)[index-1] - (*data)[index-2];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_INT8_MIN&&tmp<SZ_INT8_MAX)
+					(*data)[index] = tmp;
+				else if(tmp < SZ_INT8_MIN)
+					(*data)[index] = SZ_INT8_MIN;
+				else
+					(*data)[index] = SZ_INT8_MAX;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = curBytes[0];
+				exactData = (uint8_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+		}
+
+		/* Process Row-1 --> Row-r3-1 (still within layer 0 of this block) */
+		for (ii = 1; ii < r3; ii++)
+		{
+			/* Process row-ii data 0: predicted from the element one row back (index-r4) */
+			index = ll*r234+ii*r4;
+
+			pred1D = (*data)[index-r4];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_INT8_MIN&&tmp<SZ_INT8_MAX)
+					(*data)[index] = tmp;
+				else if(tmp < SZ_INT8_MIN)
+					(*data)[index] = SZ_INT8_MIN;
+				else
+					(*data)[index] = SZ_INT8_MAX;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = curBytes[0];
+				exactData = (uint8_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+
+			/* Process row-ii data 1 --> r4-1: predicted from the previous element and the previous row */
+			for (jj = 1; jj < r4; jj++)
+			{
+				index = ll*r234+ii*r4+jj;
+
+				pred2D = (*data)[index-1] + (*data)[index-r4] - (*data)[index-r4-1];
+
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					tmp = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+					if(tmp >= SZ_INT8_MIN&&tmp<SZ_INT8_MAX)
+						(*data)[index] = tmp;
+					else if(tmp < SZ_INT8_MIN)
+						(*data)[index] = SZ_INT8_MIN;
+					else
+						(*data)[index] = SZ_INT8_MAX;
+				}
+				else
+				{
+					memcpy(curBytes, exactDataBytePointer, exactByteSize);
+					exactData = curBytes[0];
+					exactData = (uint8_t)exactData >> rightShiftBits;
+					exactDataBytePointer += exactByteSize;
+					(*data)[index] = exactData + minValue;
+				}
+			}
+		}
+
+		///////////////////////////	Process layer-1 --> layer-r2-1 ///////////////////////////
+
+		for (kk = 1; kk < r2; kk++)
+		{
+			/* Process Row-0 data 0: predicted from the same position in the previous layer (index-r34) */
+			index = ll*r234+kk*r34;
+
+			pred1D = (*data)[index-r34];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_INT8_MIN&&tmp<SZ_INT8_MAX)
+					(*data)[index] = tmp;
+				else if(tmp < SZ_INT8_MIN)
+					(*data)[index] = SZ_INT8_MIN;
+				else
+					(*data)[index] = SZ_INT8_MAX;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = curBytes[0];
+				exactData = (uint8_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+
+			/* Process Row-0 data 1 --> data r4-1: predicted from the previous element and the previous layer */
+			for (jj = 1; jj < r4; jj++)
+			{
+				index = ll*r234+kk*r34+jj;
+
+				pred2D = (*data)[index-1] + (*data)[index-r34] - (*data)[index-r34-1];
+
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					tmp = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+					if(tmp >= SZ_INT8_MIN&&tmp<SZ_INT8_MAX)
+						(*data)[index] = tmp;
+					else if(tmp < SZ_INT8_MIN)
+						(*data)[index] = SZ_INT8_MIN;
+					else
+						(*data)[index] = SZ_INT8_MAX;
+				}
+				else
+				{
+					memcpy(curBytes, exactDataBytePointer, exactByteSize);
+					exactData = curBytes[0];
+					exactData = (uint8_t)exactData >> rightShiftBits;
+					exactDataBytePointer += exactByteSize;
+					(*data)[index] = exactData + minValue;				
+				}
+			}
+
+			/* Process Row-1 --> Row-r3-1 */
+			for (ii = 1; ii < r3; ii++)
+			{
+				/* Process Row-i data 0: predicted from the previous row and the previous layer */
+				index = ll*r234+kk*r34+ii*r4;
+
+				pred2D = (*data)[index-r4] + (*data)[index-r34] - (*data)[index-r34-r4];
+
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					tmp = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+					if(tmp >= SZ_INT8_MIN&&tmp<SZ_INT8_MAX)
+						(*data)[index] = tmp;
+					else if(tmp < SZ_INT8_MIN)
+						(*data)[index] = SZ_INT8_MIN;
+					else
+						(*data)[index] = SZ_INT8_MAX;
+				}
+				else
+				{
+					memcpy(curBytes, exactDataBytePointer, exactByteSize);
+					exactData = curBytes[0];
+					exactData = (uint8_t)exactData >> rightShiftBits;
+					exactDataBytePointer += exactByteSize;
+					(*data)[index] = exactData + minValue;
+				}
+
+				/* Process Row-i data 1 --> data r4-1: predicted from the seven already-decoded neighbours in the previous element/row/layer */
+				for (jj = 1; jj < r4; jj++)
+				{
+					index = ll*r234+kk*r34+ii*r4+jj;
+
+					pred3D = (*data)[index-1] + (*data)[index-r4] + (*data)[index-r34]
+							- (*data)[index-r4-1] - (*data)[index-r34-r4] - (*data)[index-r34-1] + (*data)[index-r34-r4-1];
+
+					type_ = type[index];
+					if (type_ != 0)
+					{
+						tmp = pred3D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+						if(tmp >= SZ_INT8_MIN&&tmp<SZ_INT8_MAX)
+							(*data)[index] = tmp;
+						else if(tmp < SZ_INT8_MIN)
+							(*data)[index] = SZ_INT8_MIN;
+						else
+							(*data)[index] = SZ_INT8_MAX;
+					}
+					else
+					{
+						memcpy(curBytes, exactDataBytePointer, exactByteSize);
+						exactData = curBytes[0];
+						exactData = (uint8_t)exactData >> rightShiftBits;
+						exactDataBytePointer += exactByteSize;
+						(*data)[index] = exactData + minValue;
+					}
+				}
+			}
+		}
+	}
+
+	free(type);
+	return;
+}
+
+void getSnapshotData_int8_1D(int8_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps, int errBoundMode)
+{	
+	/* Non-constant snapshot: hand over to the full 1D decoder (errBoundMode is not consulted in this function). */
+	if (!tdps->allSameData) {
+		decompressDataSeries_int8_1D(data, dataSeriesLength, tdps);
+		return;
+	}
+	/* Constant snapshot: replicate the first byte of exactDataBytes into a new array of dataSeriesLength elements. */
+	const int8_t fill = tdps->exactDataBytes[0];
+	*data = (int8_t*)malloc(sizeof(int8_t)*dataSeriesLength);
+	for (size_t k = 0; k < dataSeriesLength; k++)
+		(*data)[k] = fill;
+}
+
+void getSnapshotData_int8_2D(int8_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps, int errBoundMode) 
+{
+	/* Non-constant snapshot: hand over to the full 2D decoder (errBoundMode is not consulted in this function). */
+	if (!tdps->allSameData) {
+		decompressDataSeries_int8_2D(data, r1, r2, tdps);
+		return;
+	}
+	/* Constant snapshot: replicate the first byte of exactDataBytes into a new r1*r2 array. */
+	const size_t count = r1*r2;
+	const int8_t fill = tdps->exactDataBytes[0];
+	*data = (int8_t*)malloc(sizeof(int8_t)*count);
+	for (size_t k = 0; k < count; k++) (*data)[k] = fill;
+}
+
+void getSnapshotData_int8_3D(int8_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps, int errBoundMode)
+{
+	/* Non-constant snapshot: hand over to the full 3D decoder (errBoundMode is not consulted in this function). */
+	if (!tdps->allSameData) {
+		decompressDataSeries_int8_3D(data, r1, r2, r3, tdps);
+		return;
+	}
+	/* Constant snapshot: replicate the first byte of exactDataBytes into a new r1*r2*r3 array. */
+	const size_t count = r1*r2*r3;
+	const int8_t fill = tdps->exactDataBytes[0];
+	*data = (int8_t*)malloc(sizeof(int8_t)*count);
+	for (size_t k = 0; k < count; k++) (*data)[k] = fill;
+}
+
+void getSnapshotData_int8_4D(int8_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps, int errBoundMode)
+{
+	/* Non-constant snapshot: hand over to the full 4D decoder (errBoundMode is not consulted in this function). */
+	if (!tdps->allSameData) {
+		decompressDataSeries_int8_4D(data, r1, r2, r3, r4, tdps);
+		return;
+	}
+	/* Constant snapshot: replicate the first byte of exactDataBytes into a new r1*r2*r3*r4 array. */
+	const size_t count = r1*r2*r3*r4;
+	const int8_t fill = tdps->exactDataBytes[0];
+	*data = (int8_t*)malloc(sizeof(int8_t)*count);
+	for (size_t k = 0; k < count; k++) (*data)[k] = fill;
+}
diff --git a/thirdparty/SZ/sz/src/szd_uint16.c b/thirdparty/SZ/sz/src/szd_uint16.c
new file mode 100644
index 0000000000000000000000000000000000000000..bdc746990aa25d1566bc5be27b949e38da305b16
--- /dev/null
+++ b/thirdparty/SZ/sz/src/szd_uint16.c
@@ -0,0 +1,914 @@
+/**
+ *  @file szd_uint16.c
+ *  @author Sheng Di
+ *  @date Aug, 2017
+ *  @brief Decompression routines for SZ-compressed uint16_t data (1D to 4D).
+ *  (C) 2017 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdlib.h> 
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include "TightDataPointStorageI.h"
+#include "sz.h"
+#include "szd_uint16.h"
+#include "Huffman.h"
+
+/**
+ * Decompress an SZ stream holding uint16_t samples.
+ *
+ * A stream whose size is not one of the two fixed small-stream sizes is probed
+ * with isZlibFormat(): if accepted it is inflated through zlib_uncompress5()
+ * (SZ_BEST_COMPRESSION), otherwise it is parsed in place (SZ_BEST_SPEED).
+ * confparams_dec->szMode is updated to the detected mode as a side effect.
+ *
+ * @param newData receives a malloc'ed result array; not written when more
+ *        than 4 dimensions are requested
+ * @param r5,r4,r3,r2,r1 dimension sizes, forwarded to computeDataLength()
+ *        and computeDimension()
+ * @param cmpBytes compressed stream; caller-owned, never freed here
+ * @param cmpSize length of cmpBytes in bytes
+ * @return SZ_SCES on success, SZ_DERR for an unsupported dimensionality
+ * */
+int SZ_decompress_args_uint16(uint16_t** newData, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, unsigned char* cmpBytes, size_t cmpSize)
+{
+	int status = SZ_SCES;
+	size_t dataLength = computeDataLength(r5,r4,r3,r2,r1);
+	size_t i;
+	/* Stream length assumed when a small stream is parsed straight from cmpBytes. */
+	size_t tmpSize = 3+MetaDataByteLength+1+sizeof(uint16_t)+exe_params->SZ_SIZE_TYPE;
+	unsigned char* szTmpBytes = cmpBytes;
+	/*
+	 * Ownership flag for szTmpBytes. The previous release test re-derived
+	 * ownership from confparams_dec->szMode and cmpSize; on the small-stream
+	 * path szMode is left over from an earlier call, so that test could end
+	 * up calling free() on the caller's cmpBytes.
+	 */
+	int ownsTmpBytes = 0;
+
+	if(cmpSize!=4+2+4+MetaDataByteLength && cmpSize!=4+2+8+MetaDataByteLength)
+	{
+		if(isZlibFormat(cmpBytes[0], cmpBytes[1]))
+		{
+			/* Inflation budget: 4 bytes per element, but at least MIN_ZLIB_DEC_ALLOMEM_BYTES. */
+			size_t targetUncompressSize = dataLength <<2;
+			if(targetUncompressSize<MIN_ZLIB_DEC_ALLOMEM_BYTES)
+				targetUncompressSize = MIN_ZLIB_DEC_ALLOMEM_BYTES;
+			confparams_dec->szMode = SZ_BEST_COMPRESSION;
+			szTmpBytes = NULL; /* never release cmpBytes, even if the call leaves the pointer untouched */
+			tmpSize = zlib_uncompress5(cmpBytes, (unsigned long)cmpSize, &szTmpBytes, (unsigned long)targetUncompressSize+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE);
+			ownsTmpBytes = 1;
+		}
+		else
+		{
+			confparams_dec->szMode = SZ_BEST_SPEED;
+			tmpSize = cmpSize;
+		}
+	}
+
+	TightDataPointStorageI* tdps;
+	int errBoundMode = new_TightDataPointStorageI_fromFlatBytes(&tdps, szTmpBytes, tmpSize);
+	int dim = computeDimension(r5,r4,r3,r2,r1);
+	int intSize = sizeof(uint16_t);
+	if(tdps->isLossless)
+	{
+		/* Raw samples follow the header in big-endian byte order. */
+		unsigned char* p = szTmpBytes+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE;
+		*newData = (uint16_t*)malloc(intSize*dataLength);
+		if(sysEndianType==BIG_ENDIAN_SYSTEM)
+			memcpy(*newData, p, dataLength*intSize);
+		else
+		{
+			for(i=0;i<dataLength;i++,p+=intSize)
+				(*newData)[i] = bytesToUInt16_bigEndian(p);
+		}
+	}
+	else if (dim == 1)
+		getSnapshotData_uint16_1D(newData,r1,tdps, errBoundMode);
+	else if (dim == 2)
+		getSnapshotData_uint16_2D(newData,r2,r1,tdps, errBoundMode);
+	else if (dim == 3)
+		getSnapshotData_uint16_3D(newData,r3,r2,r1,tdps, errBoundMode);
+	else if (dim == 4)
+		getSnapshotData_uint16_4D(newData,r4,r3,r2,r1,tdps, errBoundMode);
+	else
+	{
+		printf("Error: currently support only at most 4 dimensions!\n");
+		status = SZ_DERR;
+	}
+	free_TightDataPointStorageI2(tdps);
+	if(ownsTmpBytes)
+		free(szTmpBytes);
+	return status;
+}
+
+
+/*
+ * Decode a 1D uint16 series of dataSeriesLength elements into a newly malloc'ed *data.
+ * Each element carries a Huffman-decoded code: 0 means the value is rebuilt from
+ * tdps->exactDataBytes, any other code is a quantization bin relative to the previous element.
+ * The process is terminated through exit(0) if computeRightShiftBits() returns a negative shift.
+ */
+void decompressDataSeries_uint16_1D(uint16_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps)
+{
+	updateQuantizationInfo(tdps->intervals);
+	const double binWidth = tdps->realPrecision*2;
+
+	/* *data is not released here; codes is scratch and is freed before returning. */
+	uint16_t* out = (uint16_t*)malloc(sizeof(uint16_t)*dataSeriesLength);
+	*data = out;
+	int* codes = (int*)malloc(dataSeriesLength*sizeof(int));
+
+	/* One code per element, decoded from tdps->typeArray. */
+	HuffmanTree* tree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(tree, tdps->typeArray, dataSeriesLength, codes);
+	SZ_ReleaseHuffman(tree);
+
+	const uint16_t base = tdps->minValue;
+	const int exactByteSize = tdps->exactByteSize;
+	unsigned char* exactCursor = tdps->exactDataBytes;
+	unsigned char scratch[8] = {0,0,0,0,0,0,0,0};
+
+	const int rightShiftBits = computeRightShiftBits(exactByteSize, SZ_UINT16);
+	if(rightShiftBits<0)
+	{
+		/* NOTE: the exit status is 0 even though this is an error path. */
+		printf("Error: rightShift < 0!\n");
+		exit(0);
+	}
+
+	/* Elements are rebuilt strictly in order: a predicted element needs its predecessor. */
+	for (size_t n = 0; n < dataSeriesLength; n++)
+	{
+		const int code = codes[n];
+		if (code == 0)
+		{
+			/* Exact element: big-endian bytes, shifted down, then offset by minValue. */
+			memcpy(scratch, exactCursor, exactByteSize);
+			exactCursor += exactByteSize;
+			uint16_t exact = bytesToUInt16_bigEndian(scratch);
+			exact = (uint16_t)exact >> rightShiftBits;
+			out[n] = exact + base;
+			continue;
+		}
+
+		/* Predicted element: previous output plus (code - intvRadius) bins, clamped to [SZ_UINT16_MIN, SZ_UINT16_MAX]. */
+		const long pred = out[n-1];
+		const long value = pred + (code-exe_params->intvRadius)*binWidth;
+		if (value < SZ_UINT16_MIN)
+			out[n] = SZ_UINT16_MIN;
+		else if (value >= SZ_UINT16_MAX)
+			out[n] = SZ_UINT16_MAX;
+		else
+			out[n] = value;
+	}
+	free(codes);
+}
+/* Decode an r1 x r2 uint16 array (row-major, index = ii*r2+jj) into a newly malloc'ed *data. A type code of 0 marks a value rebuilt from tdps->exactDataBytes; any other code is a quantization bin added to a prediction from already decoded neighbours. */
+void decompressDataSeries_uint16_2D(uint16_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps) 
+{
+	updateQuantizationInfo(tdps->intervals);
+	// NOTE(review): type[1] and (*data)[1] are accessed unconditionally below, so at least two elements are assumed.
+	
+	size_t dataSeriesLength = r1*r2;
+	// r1 counts rows and r2 counts elements per row (see index = ii*r2+jj below).
+
+	double realPrecision = tdps->realPrecision;
+
+	*data = (uint16_t*)malloc(sizeof(uint16_t)*dataSeriesLength); // not released in this function
+
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int)); // one decoded code per element; freed before returning
+
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);	
+
+	uint16_t minValue, exactData;
+
+	minValue = tdps->minValue;
+	
+	int exactByteSize = tdps->exactByteSize;
+	unsigned char* exactDataBytePointer = tdps->exactDataBytes; // read cursor, advanced by exactByteSize per exact element
+	
+	unsigned char curBytes[8] = {0,0,0,0,0,0,0,0};
+	
+	int rightShiftBits = computeRightShiftBits(exactByteSize, SZ_UINT16);	// NOTE(review): not checked for < 0 here, unlike decompressDataSeries_uint16_1D
+	
+	long pred1D, pred2D, tmp;
+	size_t ii, jj;
+
+	/* Process Row-0, data 0: always read as an exact value (type[0] is not consulted) */
+
+	// recover the exact data: big-endian bytes, shifted right, then offset by minValue
+	memcpy(curBytes, exactDataBytePointer, exactByteSize);
+	exactData = bytesToUInt16_bigEndian(curBytes);
+	exactData = (uint16_t)exactData >> rightShiftBits;
+	exactDataBytePointer += exactByteSize;
+	(*data)[0] = exactData + minValue;
+
+	/* Process Row-0, data 1: predicted from data 0 */
+	int type_ = type[1]; 
+	if (type_ != 0)
+	{
+		pred1D = (*data)[0];
+		tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision; // prediction plus signed bin offset, converted to long
+		if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX) // clamp to [SZ_UINT16_MIN, SZ_UINT16_MAX]
+			(*data)[1] = tmp;
+		else if(tmp < SZ_UINT16_MIN)
+			(*data)[1] = SZ_UINT16_MIN;
+		else
+			(*data)[1] = SZ_UINT16_MAX;
+			
+	}
+	else
+	{
+		// recover the exact data
+		memcpy(curBytes, exactDataBytePointer, exactByteSize);
+		exactData = bytesToUInt16_bigEndian(curBytes);
+		exactData = (uint16_t)exactData >> rightShiftBits;
+		exactDataBytePointer += exactByteSize;
+		(*data)[1] = exactData + minValue;
+	}
+
+	/* Process Row-0, data 2 --> data r2-1: linear extrapolation from the two previous elements */
+	for (jj = 2; jj < r2; jj++)
+	{
+		type_ = type[jj];
+		if (type_ != 0)
+		{
+			pred1D = 2*(*data)[jj-1] - (*data)[jj-2];				
+			tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+				(*data)[jj] = tmp;
+			else if(tmp < SZ_UINT16_MIN)
+				(*data)[jj] = SZ_UINT16_MIN;
+			else
+				(*data)[jj] = SZ_UINT16_MAX;
+		}
+		else
+		{
+			// recover the exact data
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = bytesToUInt16_bigEndian(curBytes);
+			exactData = (uint16_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[jj] = exactData + minValue;
+		}
+	}
+
+	size_t index;
+	/* Process Row-1 --> Row-r1-1 */
+	for (ii = 1; ii < r1; ii++)
+	{
+		/* Process row-ii data 0: predicted from the element directly above (index-r2) */
+		index = ii*r2;
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			pred1D = (*data)[index-r2];		
+			tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+				(*data)[index] = tmp;
+			else if(tmp < SZ_UINT16_MIN)
+				(*data)[index] = SZ_UINT16_MIN;
+			else
+				(*data)[index] = SZ_UINT16_MAX;
+		}
+		else
+		{
+			// recover the exact data
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = bytesToUInt16_bigEndian(curBytes);
+			exactData = (uint16_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[index] = exactData + minValue;
+		}
+
+		/* Process row-ii data 1 --> r2-1: predicted as left + above - above-left */
+		for (jj = 1; jj < r2; jj++)
+		{
+			index = ii*r2+jj;
+			pred2D = (*data)[index-1] + (*data)[index-r2] - (*data)[index-r2-1];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				tmp = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+					(*data)[index] = tmp;
+				else if(tmp < SZ_UINT16_MIN)
+					(*data)[index] = SZ_UINT16_MIN;
+				else
+					(*data)[index] = SZ_UINT16_MAX;
+			}
+			else
+			{
+				// recover the exact data
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToUInt16_bigEndian(curBytes);
+				exactData = (uint16_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+		}
+	}
+
+	free(type);
+	return;
+}
+/* Decode an r1 x r2 x r3 uint16 array into a newly malloc'ed *data. A type code of 0 marks a value rebuilt from tdps->exactDataBytes; any other code is a quantization bin added to a prediction from already decoded neighbours. */
+void decompressDataSeries_uint16_3D(uint16_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps) 
+{
+	updateQuantizationInfo(tdps->intervals);
+	size_t dataSeriesLength = r1*r2*r3;
+	size_t r23 = r2*r3; // elements per layer (one kk step)
+// Layout is row-major: index = kk*r23 + ii*r3 + jj, with kk < r1, ii < r2, jj < r3.
+	double realPrecision = tdps->realPrecision;
+
+	*data = (uint16_t*)malloc(sizeof(uint16_t)*dataSeriesLength); // not released in this function
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int)); // one decoded code per element; freed before returning
+
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);	
+
+	uint16_t minValue, exactData;
+
+	minValue = tdps->minValue;
+	
+	int exactByteSize = tdps->exactByteSize;
+	unsigned char* exactDataBytePointer = tdps->exactDataBytes; // read cursor, advanced by exactByteSize per exact element
+	
+	unsigned char curBytes[8] = {0,0,0,0,0,0,0,0};
+	
+	int rightShiftBits = computeRightShiftBits(exactByteSize, SZ_UINT16);	// NOTE(review): not checked for < 0 here, unlike decompressDataSeries_uint16_1D
+	// NOTE(review): type[1] and (*data)[1] are accessed unconditionally below, so at least two elements are assumed.
+	long pred1D, pred2D, pred3D, tmp;
+	size_t ii, jj, kk;
+
+	///////////////////////////	Process layer-0 ///////////////////////////
+	/* Process Row-0 data 0: always read as an exact value (type[0] is not consulted) */
+
+	// recover the exact data: big-endian bytes, shifted right, then offset by minValue
+	memcpy(curBytes, exactDataBytePointer, exactByteSize);
+	exactData = bytesToUInt16_bigEndian(curBytes);
+	exactData = (uint16_t)exactData >> rightShiftBits;
+	exactDataBytePointer += exactByteSize;
+	(*data)[0] = exactData + minValue;
+
+	/* Process Row-0, data 1: predicted from data 0 */
+	pred1D = (*data)[0];
+
+	int type_ = type[1];
+	if (type_ != 0)
+	{
+		tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision; // prediction plus signed bin offset, converted to long
+		if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX) // clamp to [SZ_UINT16_MIN, SZ_UINT16_MAX]
+			(*data)[1] = tmp;
+		else if(tmp < SZ_UINT16_MIN)
+			(*data)[1] = SZ_UINT16_MIN;
+		else
+			(*data)[1] = SZ_UINT16_MAX;
+	}
+	else
+	{
+		memcpy(curBytes, exactDataBytePointer, exactByteSize);
+		exactData = bytesToUInt16_bigEndian(curBytes);
+		exactData = (uint16_t)exactData >> rightShiftBits;
+		exactDataBytePointer += exactByteSize;
+		(*data)[1] = exactData + minValue;
+	}
+	/* Process Row-0, data 2 --> data r3-1: linear extrapolation from the two previous elements */
+	for (jj = 2; jj < r3; jj++)
+	{
+		pred1D = 2*(*data)[jj-1] - (*data)[jj-2];
+
+		type_ = type[jj];
+		if (type_ != 0)
+		{
+			tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+				(*data)[jj] = tmp;
+			else if(tmp < SZ_UINT16_MIN)
+				(*data)[jj] = SZ_UINT16_MIN;
+			else
+				(*data)[jj] = SZ_UINT16_MAX;		}
+		else
+		{
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = bytesToUInt16_bigEndian(curBytes);
+			exactData = (uint16_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[jj] = exactData + minValue;
+		}
+	}
+
+	size_t index;
+	/* Process Row-1 --> Row-r2-1 */
+	for (ii = 1; ii < r2; ii++)
+	{
+		/* Process row-ii data 0: predicted from the element one row above (index-r3) */
+		index = ii*r3;
+		pred1D = (*data)[index-r3];
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+				(*data)[index] = tmp;
+			else if(tmp < SZ_UINT16_MIN)
+				(*data)[index] = SZ_UINT16_MIN;
+			else
+				(*data)[index] = SZ_UINT16_MAX;
+		}
+		else
+		{
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = bytesToUInt16_bigEndian(curBytes);
+			exactData = (uint16_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[index] = exactData + minValue;
+		}
+
+		/* Process row-ii data 1 --> r3-1: predicted as left + above - above-left */
+		for (jj = 1; jj < r3; jj++)
+		{
+			index = ii*r3+jj;
+			pred2D = (*data)[index-1] + (*data)[index-r3] - (*data)[index-r3-1];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				tmp = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+					(*data)[index] = tmp;
+				else if(tmp < SZ_UINT16_MIN)
+					(*data)[index] = SZ_UINT16_MIN;
+				else
+					(*data)[index] = SZ_UINT16_MAX;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToUInt16_bigEndian(curBytes);
+				exactData = (uint16_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+		}
+	}
+
+	///////////////////////////	Process layer-1 --> layer-r1-1 ///////////////////////////
+
+	for (kk = 1; kk < r1; kk++)
+	{
+		/* Process Row-0 data 0: predicted from the same position in the previous layer (index-r23) */
+		index = kk*r23;
+		pred1D = (*data)[index-r23];
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+				(*data)[index] = tmp;
+			else if(tmp < SZ_UINT16_MIN)
+				(*data)[index] = SZ_UINT16_MIN;
+			else
+				(*data)[index] = SZ_UINT16_MAX;
+		}
+		else
+		{
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = bytesToUInt16_bigEndian(curBytes);
+			exactData = (uint16_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[index] = exactData + minValue;
+		}
+
+		/* Process Row-0 data 1 --> data r3-1: left + previous layer - previous layer's left */
+		for (jj = 1; jj < r3; jj++)
+		{
+			index = kk*r23+jj;
+			pred2D = (*data)[index-1] + (*data)[index-r23] - (*data)[index-r23-1];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				tmp = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+					(*data)[index] = tmp;
+				else if(tmp < SZ_UINT16_MIN)
+					(*data)[index] = SZ_UINT16_MIN;
+				else
+					(*data)[index] = SZ_UINT16_MAX;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToUInt16_bigEndian(curBytes);
+				exactData = (uint16_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+		}
+
+		/* Process Row-1 --> Row-r2-1 */
+		for (ii = 1; ii < r2; ii++)
+		{
+			/* Process Row-i data 0: above + previous layer - previous layer's above */
+			index = kk*r23 + ii*r3;
+			pred2D = (*data)[index-r3] + (*data)[index-r23] - (*data)[index-r23-r3];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				tmp = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+					(*data)[index] = tmp;
+				else if(tmp < SZ_UINT16_MIN)
+					(*data)[index] = SZ_UINT16_MIN;
+				else
+					(*data)[index] = SZ_UINT16_MAX;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToUInt16_bigEndian(curBytes);
+				exactData = (uint16_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+
+			/* Process Row-i data 1 --> data r3-1: combination of the seven already decoded neighbours */
+			for (jj = 1; jj < r3; jj++)
+			{
+				index = kk*r23 + ii*r3 + jj;
+				pred3D = (*data)[index-1] + (*data)[index-r3] + (*data)[index-r23]
+					- (*data)[index-r3-1] - (*data)[index-r23-r3] - (*data)[index-r23-1] + (*data)[index-r23-r3-1];
+
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					tmp = pred3D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+					if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+						(*data)[index] = tmp;
+					else if(tmp < SZ_UINT16_MIN)
+						(*data)[index] = SZ_UINT16_MIN;
+					else
+						(*data)[index] = SZ_UINT16_MAX;
+				}
+				else
+				{
+					memcpy(curBytes, exactDataBytePointer, exactByteSize);
+					exactData = bytesToUInt16_bigEndian(curBytes);
+					exactData = (uint16_t)exactData >> rightShiftBits;
+					exactDataBytePointer += exactByteSize;
+					(*data)[index] = exactData + minValue;
+				}
+			}
+		}
+	}
+
+	free(type);
+	return;
+}
+/* Decode an r1 x r2 x r3 x r4 uint16 array (index = ll*r234 + kk*r34 + ii*r4 + jj) into a newly malloc'ed *data.
+ * Each of the r1 blocks of r234 elements is decoded on its own: its first element is read as an exact value and no prediction reaches into another block. */
+void decompressDataSeries_uint16_4D(uint16_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps)
+{
+	updateQuantizationInfo(tdps->intervals);
+	size_t dataSeriesLength = r1*r2*r3*r4;
+	size_t r234 = r2*r3*r4; // elements per ll step
+	size_t r34 = r3*r4; // elements per kk step
+
+	double realPrecision = tdps->realPrecision;
+
+	*data = (uint16_t*)malloc(sizeof(uint16_t)*dataSeriesLength); // not released in this function
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int)); // one decoded code per element; freed before returning
+
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);	
+
+	uint16_t minValue, exactData;
+
+	minValue = tdps->minValue;
+	
+	int exactByteSize = tdps->exactByteSize;
+	unsigned char* exactDataBytePointer = tdps->exactDataBytes; // read cursor, advanced by exactByteSize per exact element
+	
+	unsigned char curBytes[8] = {0,0,0,0,0,0,0,0};
+	
+	int rightShiftBits = computeRightShiftBits(exactByteSize, SZ_UINT16);	// NOTE(review): not checked for < 0 here, unlike decompressDataSeries_uint16_1D
+	// A type code of 0 marks a value rebuilt from exactDataBytes; any other code is a quantization bin added to a prediction.
+	int type_;
+
+	long pred1D, pred2D, pred3D, tmp;
+	size_t ii, jj, kk, ll;
+	size_t index;
+
+	for (ll = 0; ll < r1; ll++)
+	{
+		///////////////////////////	Process layer-0 ///////////////////////////
+		/* Process Row-0 data 0: always read as an exact value (type[index] is not consulted) */
+		index = ll*r234;
+
+		// recover the exact data: big-endian bytes, shifted right, then offset by minValue
+		memcpy(curBytes, exactDataBytePointer, exactByteSize);
+		exactData = bytesToUInt16_bigEndian(curBytes);
+		exactData = (uint16_t)exactData >> rightShiftBits;
+		exactDataBytePointer += exactByteSize;
+		(*data)[index] = exactData + minValue;
+
+		/* Process Row-0, data 1: predicted from data 0 */
+		index = ll*r234+1;
+
+		pred1D = (*data)[index-1];
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision; // prediction plus signed bin offset, converted to long
+			if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX) // clamp to [SZ_UINT16_MIN, SZ_UINT16_MAX]
+				(*data)[index] = tmp;
+			else if(tmp < SZ_UINT16_MIN)
+				(*data)[index] = SZ_UINT16_MIN;
+			else
+				(*data)[index] = SZ_UINT16_MAX;
+		}
+		else
+		{
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = bytesToUInt16_bigEndian(curBytes);
+			exactData = (uint16_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[index] = exactData + minValue;
+		}
+
+		/* Process Row-0, data 2 --> data r4-1: linear extrapolation from the two previous elements */
+		for (jj = 2; jj < r4; jj++)
+		{
+			index = ll*r234+jj;
+
+			pred1D = 2*(*data)[index-1] - (*data)[index-2];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+					(*data)[index] = tmp;
+				else if(tmp < SZ_UINT16_MIN)
+					(*data)[index] = SZ_UINT16_MIN;
+				else
+					(*data)[index] = SZ_UINT16_MAX;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToUInt16_bigEndian(curBytes);
+				exactData = (uint16_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+		}
+
+		/* Process Row-1 --> Row-r3-1 */
+		for (ii = 1; ii < r3; ii++)
+		{
+			/* Process row-ii data 0: predicted from the element one row above (index-r4) */
+			index = ll*r234+ii*r4;
+
+			pred1D = (*data)[index-r4];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+					(*data)[index] = tmp;
+				else if(tmp < SZ_UINT16_MIN)
+					(*data)[index] = SZ_UINT16_MIN;
+				else
+					(*data)[index] = SZ_UINT16_MAX;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToUInt16_bigEndian(curBytes);
+				exactData = (uint16_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+
+			/* Process row-ii data 1 --> r4-1: predicted as left + above - above-left */
+			for (jj = 1; jj < r4; jj++)
+			{
+				index = ll*r234+ii*r4+jj;
+
+				pred2D = (*data)[index-1] + (*data)[index-r4] - (*data)[index-r4-1];
+
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					tmp = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+					if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+						(*data)[index] = tmp;
+					else if(tmp < SZ_UINT16_MIN)
+						(*data)[index] = SZ_UINT16_MIN;
+					else
+						(*data)[index] = SZ_UINT16_MAX;
+				}
+				else
+				{
+					memcpy(curBytes, exactDataBytePointer, exactByteSize);
+					exactData = bytesToUInt16_bigEndian(curBytes);
+					exactData = (uint16_t)exactData >> rightShiftBits;
+					exactDataBytePointer += exactByteSize;
+					(*data)[index] = exactData + minValue;
+				}
+			}
+		}
+
+		///////////////////////////	Process layer-1 --> layer-r2-1 ///////////////////////////
+
+		for (kk = 1; kk < r2; kk++)
+		{
+			/* Process Row-0 data 0: predicted from the same position in the previous layer (index-r34) */
+			index = ll*r234+kk*r34;
+
+			pred1D = (*data)[index-r34];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				tmp = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+					(*data)[index] = tmp;
+				else if(tmp < SZ_UINT16_MIN)
+					(*data)[index] = SZ_UINT16_MIN;
+				else
+					(*data)[index] = SZ_UINT16_MAX;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToUInt16_bigEndian(curBytes);
+				exactData = (uint16_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+
+			/* Process Row-0 data 1 --> data r4-1: left + previous layer - previous layer's left */
+			for (jj = 1; jj < r4; jj++)
+			{
+				index = ll*r234+kk*r34+jj;
+
+				pred2D = (*data)[index-1] + (*data)[index-r34] - (*data)[index-r34-1];
+
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					tmp = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+					if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+						(*data)[index] = tmp;
+					else if(tmp < SZ_UINT16_MIN)
+						(*data)[index] = SZ_UINT16_MIN;
+					else
+						(*data)[index] = SZ_UINT16_MAX;
+				}
+				else
+				{
+					memcpy(curBytes, exactDataBytePointer, exactByteSize);
+					exactData = bytesToUInt16_bigEndian(curBytes);
+					exactData = (uint16_t)exactData >> rightShiftBits;
+					exactDataBytePointer += exactByteSize;
+					(*data)[index] = exactData + minValue;				
+				}
+			}
+
+			/* Process Row-1 --> Row-r3-1 */
+			for (ii = 1; ii < r3; ii++)
+			{
+				/* Process Row-i data 0: above + previous layer - previous layer's above */
+				index = ll*r234+kk*r34+ii*r4;
+
+				pred2D = (*data)[index-r4] + (*data)[index-r34] - (*data)[index-r34-r4];
+
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					tmp = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+					if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+						(*data)[index] = tmp;
+					else if(tmp < SZ_UINT16_MIN)
+						(*data)[index] = SZ_UINT16_MIN;
+					else
+						(*data)[index] = SZ_UINT16_MAX;
+				}
+				else
+				{
+					memcpy(curBytes, exactDataBytePointer, exactByteSize);
+					exactData = bytesToUInt16_bigEndian(curBytes);
+					exactData = (uint16_t)exactData >> rightShiftBits;
+					exactDataBytePointer += exactByteSize;
+					(*data)[index] = exactData + minValue;
+				}
+
+				/* Process Row-i data 1 --> data r4-1: combination of the seven already decoded neighbours */
+				for (jj = 1; jj < r4; jj++)
+				{
+					index = ll*r234+kk*r34+ii*r4+jj;
+
+					pred3D = (*data)[index-1] + (*data)[index-r4] + (*data)[index-r34]
+							- (*data)[index-r4-1] - (*data)[index-r34-r4] - (*data)[index-r34-1] + (*data)[index-r34-r4-1];
+
+
+					type_ = type[index];
+					if (type_ != 0)
+					{
+						tmp = pred3D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+						if(tmp >= SZ_UINT16_MIN&&tmp<SZ_UINT16_MAX)
+							(*data)[index] = tmp;
+						else if(tmp < SZ_UINT16_MIN)
+							(*data)[index] = SZ_UINT16_MIN;
+						else
+							(*data)[index] = SZ_UINT16_MAX;
+					}
+					else
+					{
+						memcpy(curBytes, exactDataBytePointer, exactByteSize);
+						exactData = bytesToUInt16_bigEndian(curBytes);
+						exactData = (uint16_t)exactData >> rightShiftBits;
+						exactDataBytePointer += exactByteSize;
+						(*data)[index] = exactData + minValue;
+					}
+				}
+			}
+		}
+	}
+
+	free(type);
+	return;
+}
+
+/* Materialise a 1D uint16 snapshot: full decode, or a constant fill when the stream is flagged allSameData. */
+void getSnapshotData_uint16_1D(uint16_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps, int errBoundMode)
+{
+	if (!tdps->allSameData) {
+		decompressDataSeries_uint16_1D(data, dataSeriesLength, tdps);
+		return;
+	}
+	/* Constant series: the big-endian value at exactDataBytes is replicated; errBoundMode is not consulted. */
+	const uint16_t fill = bytesToUInt16_bigEndian(tdps->exactDataBytes);
+	*data = (uint16_t*)malloc(sizeof(uint16_t)*dataSeriesLength);
+	for (size_t k = 0; k < dataSeriesLength; k++)
+		(*data)[k] = fill;
+}
+
+/* 2D uint16 snapshot (r1*r2 elements): full decode, or a constant fill of the big-endian value at exactDataBytes when allSameData is set. */
+void getSnapshotData_uint16_2D(uint16_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps, int errBoundMode)
+{
+	size_t k, total = r1*r2;
+	if (!tdps->allSameData) {
+		decompressDataSeries_uint16_2D(data, r1, r2, tdps);
+		return;
+	}
+	const uint16_t fill = bytesToUInt16_bigEndian(tdps->exactDataBytes);
+	*data = (uint16_t*)malloc(sizeof(uint16_t)*total);
+	for (k = 0; k < total; k++)
+		(*data)[k] = fill;
+}
+
+/* 3D uint16 snapshot (r1*r2*r3 elements): full decode, or a constant fill of the big-endian value at exactDataBytes when allSameData is set. */
+void getSnapshotData_uint16_3D(uint16_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps, int errBoundMode)
+{
+	size_t k, total = r1*r2*r3;
+	if (!tdps->allSameData) {
+		decompressDataSeries_uint16_3D(data, r1, r2, r3, tdps);
+		return;
+	}
+	const uint16_t fill = bytesToUInt16_bigEndian(tdps->exactDataBytes);
+	*data = (uint16_t*)malloc(sizeof(uint16_t)*total);
+	for (k = 0; k < total; k++)
+		(*data)[k] = fill;
+}
+
+/* 4D uint16 snapshot (r1*r2*r3*r4 elements): full decode, or a constant fill of the big-endian value at exactDataBytes when allSameData is set. */
+void getSnapshotData_uint16_4D(uint16_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps, int errBoundMode)
+{
+	size_t k, total = r1*r2*r3*r4;
+	if (!tdps->allSameData) {
+		decompressDataSeries_uint16_4D(data, r1, r2, r3, r4, tdps);
+		return;
+	}
+	const uint16_t fill = bytesToUInt16_bigEndian(tdps->exactDataBytes);
+	*data = (uint16_t*)malloc(sizeof(uint16_t)*total);
+	for (k = 0; k < total; k++)
+		(*data)[k] = fill;
+}
diff --git a/thirdparty/SZ/sz/src/szd_uint32.c b/thirdparty/SZ/sz/src/szd_uint32.c
new file mode 100644
index 0000000000000000000000000000000000000000..795eabe6248b4ac4460008a0aa9ae511a514486d
--- /dev/null
+++ b/thirdparty/SZ/sz/src/szd_uint32.c
@@ -0,0 +1,788 @@
+/**
+ *  @file szd_uint32.c
+ *  @author Sheng Di
+ *  @date Aug, 2017
+ *  @brief Decompression routines for SZ-compressed uint32_t arrays (1D to 4D).
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdlib.h> 
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include "TightDataPointStorageI.h"
+#include "sz.h"
+#include "szd_uint32.h"
+#include "Huffman.h"
+
+/**
+ * Decompress an SZ byte stream (cmpBytes, cmpSize) into a newly malloc'ed uint32_t array returned via *newData.
+ * r5..r1 are the dimension sizes handed to computeDataLength/computeDimension; at most 4 dimensions are decoded.
+ * @return status SUCCESSFUL (SZ_SCES) or not (other error codes)
+ * */
+int SZ_decompress_args_uint32(uint32_t** newData, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, unsigned char* cmpBytes, size_t cmpSize)
+{
+	int status = SZ_SCES;
+	size_t dataLength = computeDataLength(r5,r4,r3,r2,r1);
+	
+	//Size in bytes of the raw (uncompressed) values; used below to size the buffer requested from zlib_uncompress5.
+	size_t targetUncompressSize = dataLength <<2; //i.e., *4
+	//tmpSize must be "much" smaller than dataLength
+	size_t i, tmpSize = 3+MetaDataByteLength+1+sizeof(uint32_t)+exe_params->SZ_SIZE_TYPE;
+	unsigned char* szTmpBytes;	
+		
+	if(cmpSize!=4+4+4+MetaDataByteLength && cmpSize!=4+4+8+MetaDataByteLength)
+	{
+		int isZlib = isZlibFormat(cmpBytes[0], cmpBytes[1]);
+		if(isZlib)
+			confparams_dec->szMode = SZ_BEST_COMPRESSION;
+		else
+			confparams_dec->szMode = SZ_BEST_SPEED;		
+		if(confparams_dec->szMode==SZ_BEST_SPEED)
+		{
+			tmpSize = cmpSize;
+			szTmpBytes = cmpBytes;	
+		}
+		else if(confparams_dec->szMode==SZ_BEST_COMPRESSION || confparams_dec->szMode==SZ_DEFAULT_COMPRESSION)
+		{
+			if(targetUncompressSize<MIN_ZLIB_DEC_ALLOMEM_BYTES) //Considering the minimum size
+				targetUncompressSize = MIN_ZLIB_DEC_ALLOMEM_BYTES; 
+			tmpSize = zlib_uncompress5(cmpBytes, (unsigned long)cmpSize, &szTmpBytes, (unsigned long)targetUncompressSize+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE);//		(unsigned long)targetUncompressSize+8: consider the total length under lossless compression mode is actually 3+4+1+targetUncompressSize
+			//tmpSize receives zlib_uncompress5's return value and szTmpBytes the buffer it produced;
+			//that buffer is released by the free() at the end of this function.
+			//NOTE(review): presumably zlib_uncompress5 allocates the buffer itself -- confirm in its definition.
+		}
+		else //not expected to run: szMode was just assigned one of the two modes handled above
+		{
+			printf("Wrong value of confparams_dec->szMode in the double compressed bytes.\n");
+			status = SZ_MERR;
+			return status;
+		}	
+	}
+	else
+		szTmpBytes = cmpBytes;
+	//Parse the (possibly inflated) byte stream into a TightDataPointStorageI descriptor.
+	TightDataPointStorageI* tdps;
+	int errBoundMode = new_TightDataPointStorageI_fromFlatBytes(&tdps, szTmpBytes, tmpSize);
+	//errBoundMode is only forwarded to the getSnapshotData_uint32_* helpers below.
+	int dim = computeDimension(r5,r4,r3,r2,r1);	
+	int intSize = sizeof(uint32_t);
+	if(tdps->isLossless) //payload holds the raw values in big-endian order: copied as-is or converted one by one
+	{
+		*newData = (uint32_t*)malloc(intSize*dataLength);
+		if(sysEndianType==BIG_ENDIAN_SYSTEM)
+		{
+			memcpy(*newData, szTmpBytes+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE, dataLength*intSize);
+		}
+		else
+		{
+			unsigned char* p = szTmpBytes+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE;
+			for(i=0;i<dataLength;i++,p+=intSize)
+				(*newData)[i] = bytesToUInt32_bigEndian(p);
+		}		
+	}
+	else if (dim == 1)
+		getSnapshotData_uint32_1D(newData,r1,tdps, errBoundMode);
+	else
+	if (dim == 2)
+		getSnapshotData_uint32_2D(newData,r2,r1,tdps, errBoundMode);
+	else
+	if (dim == 3)
+		getSnapshotData_uint32_3D(newData,r3,r2,r1,tdps, errBoundMode);
+	else
+	if (dim == 4)
+		getSnapshotData_uint32_4D(newData,r4,r3,r2,r1,tdps, errBoundMode);
+	else
+	{
+		printf("Error: currently support only at most 4 dimensions!\n");
+		status = SZ_DERR;
+	}
+	free_TightDataPointStorageI2(tdps); //NOTE(review): if the zlib branch was skipped, szMode is whatever an earlier call left; the cmpSize test below is then the only guard against freeing the caller's cmpBytes -- confirm
+	if(confparams_dec->szMode!=SZ_BEST_SPEED && cmpSize!=4+sizeof(uint32_t)+exe_params->SZ_SIZE_TYPE+MetaDataByteLength)
+		free(szTmpBytes);	
+	return status;
+}
+
+
+void decompressDataSeries_uint32_1D(uint32_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps) 
+{
+	/*
+	 * Rebuild a 1D uint32_t series of dataSeriesLength elements into a freshly malloc'ed *data.
+	 * Every element has a decoded type code:
+	 *   0     -> the value is read from the exact-data byte stream and offset by minValue;
+	 *   other -> the value is the previous element plus (code - intvRadius) * 2 * realPrecision.
+	 * The temporary code array is released here; *data is handed back through the out-parameter.
+	 */
+
+	updateQuantizationInfo(tdps->intervals);
+	const double step = tdps->realPrecision*2;
+
+	uint32_t* out = (uint32_t*)malloc(sizeof(uint32_t)*dataSeriesLength);
+	*data = out;
+	int* codes = (int*)malloc(dataSeriesLength*sizeof(int));
+
+	/* Expand the encoded type array into one int code per element. */
+	HuffmanTree* tree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(tree, tdps->typeArray, dataSeriesLength, codes);
+	SZ_ReleaseHuffman(tree);
+
+	const uint32_t base = tdps->minValue;
+	const int nbytes = tdps->exactByteSize;
+	unsigned char* cursor = tdps->exactDataBytes;
+	/* Zero-filled staging word: only its first nbytes are overwritten per exact value. */
+	unsigned char scratch[8] = {0,0,0,0,0,0,0,0};
+
+	/* Right shift applied to each exact value after the big-endian read. */
+	const int shift = computeRightShiftBits(nbytes, SZ_UINT32);
+	if(shift<0)
+	{
+		printf("Error: rightShift < 0!\n");
+		exit(0);
+	}
+
+	for (size_t k = 0; k < dataSeriesLength; k++) {
+		const int code = codes[k];
+		if (code != 0) {
+			/* Quantized element: previous value plus a whole number of steps. */
+			const uint32_t prev = out[k-1];
+			out[k] = prev + (code-exe_params->intvRadius)*step;
+			continue;
+		}
+		/* Exact element: big-endian read of the staging word, shifted down, plus base. */
+		memcpy(scratch, cursor, nbytes);
+		uint32_t raw = bytesToUInt32_bigEndian(scratch);
+		raw = (uint32_t)raw >> shift;
+		cursor += nbytes;
+		out[k] = raw + base;
+	}
+
+	free(codes);
+	return;
+}
+
+void decompressDataSeries_uint32_2D(uint32_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps) 
+{
+	updateQuantizationInfo(tdps->intervals);
+	// Decode an r1 x r2 grid (r2 is the fastest-varying index) into a newly malloc'ed *data.
+	// Type code 0: value read from the exact-data stream; any other code: prediction + 2*(code - intvRadius)*realPrecision.
+	size_t dataSeriesLength = r1*r2;
+	// NOTE(review): elements 0 and 1 are decoded unconditionally below, so at least two elements are assumed.
+
+	double realPrecision = tdps->realPrecision;
+
+	*data = (uint32_t*)malloc(sizeof(uint32_t)*dataSeriesLength);
+
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int));
+
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);	
+
+	uint32_t minValue, exactData;
+
+	minValue = tdps->minValue;
+	
+	int exactByteSize = tdps->exactByteSize;
+	unsigned char* exactDataBytePointer = tdps->exactDataBytes;
+	
+	unsigned char curBytes[8] = {0,0,0,0,0,0,0,0};
+	
+	int rightShiftBits = computeRightShiftBits(exactByteSize, SZ_UINT32);	
+	// NOTE(review): the 1D decoder rejects rightShiftBits < 0; no such check is made here.
+	uint32_t pred1D, pred2D;
+	size_t ii, jj;
+
+	/* Process Row-0, data 0 */
+	// Element 0 is always read from the exact-data stream; type[0] is not consulted.
+	// recover the exact data
+	memcpy(curBytes, exactDataBytePointer, exactByteSize);
+	exactData = bytesToUInt32_bigEndian(curBytes);
+	exactData = (uint32_t)exactData >> rightShiftBits;
+	exactDataBytePointer += exactByteSize;
+	(*data)[0] = exactData + minValue;
+
+	/* Process Row-0, data 1 (prediction: element 0) */
+	int type_ = type[1]; 
+	if (type_ != 0)
+	{
+		pred1D = (*data)[0];
+		(*data)[1] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+	}
+	else
+	{
+		// recover the exact data
+		memcpy(curBytes, exactDataBytePointer, exactByteSize);
+		exactData = bytesToUInt32_bigEndian(curBytes);
+		exactData = (uint32_t)exactData >> rightShiftBits;
+		exactDataBytePointer += exactByteSize;
+		(*data)[1] = exactData + minValue;
+	}
+
+	/* Process Row-0, data 2 --> data r2-1 (prediction: 2*previous - the one before it) */
+	for (jj = 2; jj < r2; jj++)
+	{
+		type_ = type[jj];
+		if (type_ != 0)
+		{
+			pred1D = 2*(*data)[jj-1] - (*data)[jj-2];				
+			(*data)[jj] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			// recover the exact data
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = bytesToUInt32_bigEndian(curBytes);
+			exactData = (uint32_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[jj] = exactData + minValue;
+		}
+	}
+
+	size_t index;
+	/* Process Row-1 --> Row-r1-1 */
+	for (ii = 1; ii < r1; ii++)
+	{
+		/* Process row-ii data 0 (prediction: element directly above) */
+		index = ii*r2;
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			pred1D = (*data)[index-r2];		
+			(*data)[index] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			// recover the exact data
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = bytesToUInt32_bigEndian(curBytes);
+			exactData = (uint32_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[index] = exactData + minValue;
+		}
+
+		/* Process row-ii data 1 --> r2-1 (prediction: left + above - above-left) */
+		for (jj = 1; jj < r2; jj++)
+		{
+			index = ii*r2+jj;
+			pred2D = (*data)[index-1] + (*data)[index-r2] - (*data)[index-r2-1];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				// recover the exact data
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToUInt32_bigEndian(curBytes);
+				exactData = (uint32_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+		}
+	}
+
+	free(type);
+	return;
+}
+
+void decompressDataSeries_uint32_3D(uint32_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps) 
+{
+	updateQuantizationInfo(tdps->intervals);
+	size_t dataSeriesLength = r1*r2*r3;
+	size_t r23 = r2*r3;
+	// Decode an r1 x r2 x r3 grid (r3 fastest-varying, r1 slowest) into a newly malloc'ed *data.
+	double realPrecision = tdps->realPrecision;
+	// Type code 0: value read from the exact-data stream; any other code: prediction + 2*(code - intvRadius)*realPrecision.
+	*data = (uint32_t*)malloc(sizeof(uint32_t)*dataSeriesLength);
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int));
+
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);	
+
+	uint32_t minValue, exactData;
+
+	minValue = tdps->minValue;
+	
+	int exactByteSize = tdps->exactByteSize;
+	unsigned char* exactDataBytePointer = tdps->exactDataBytes;
+	
+	unsigned char curBytes[8] = {0,0,0,0,0,0,0,0};
+	
+	int rightShiftBits = computeRightShiftBits(exactByteSize, SZ_UINT32);	
+	// NOTE(review): the 1D decoder rejects rightShiftBits < 0; no such check is made here.
+	uint32_t pred1D, pred2D, pred3D;
+	size_t ii, jj, kk;
+
+	///////////////////////////	Process layer-0 ///////////////////////////
+	/* Process Row-0 data 0*/
+	// Element 0 is always read from the exact-data stream; type[0] is not consulted.
+	// recover the exact data
+	memcpy(curBytes, exactDataBytePointer, exactByteSize);
+	exactData = bytesToUInt32_bigEndian(curBytes);
+	exactData = (uint32_t)exactData >> rightShiftBits;
+	exactDataBytePointer += exactByteSize;
+	(*data)[0] = exactData + minValue;
+
+	/* Process Row-0, data 1 (decoded unconditionally, so at least two elements are assumed) */
+	pred1D = (*data)[0];
+
+	int type_ = type[1];
+	if (type_ != 0)
+	{
+		(*data)[1] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+	}
+	else
+	{
+		memcpy(curBytes, exactDataBytePointer, exactByteSize);
+		exactData = bytesToUInt32_bigEndian(curBytes);
+		exactData = (uint32_t)exactData >> rightShiftBits;
+		exactDataBytePointer += exactByteSize;
+		(*data)[1] = exactData + minValue;
+	}
+	/* Process Row-0, data 2 --> data r3-1 (prediction: 2*previous - the one before it) */
+	for (jj = 2; jj < r3; jj++)
+	{
+		pred1D = 2*(*data)[jj-1] - (*data)[jj-2];
+
+		type_ = type[jj];
+		if (type_ != 0)
+		{
+			(*data)[jj] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = bytesToUInt32_bigEndian(curBytes);
+			exactData = (uint32_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[jj] = exactData + minValue;
+		}
+	}
+
+	size_t index;
+	/* Process Row-1 --> Row-r2-1 */
+	for (ii = 1; ii < r2; ii++)
+	{
+		/* Process row-ii data 0 (prediction: same column, previous row) */
+		index = ii*r3;
+		pred1D = (*data)[index-r3];
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			(*data)[index] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = bytesToUInt32_bigEndian(curBytes);
+			exactData = (uint32_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[index] = exactData + minValue;
+		}
+
+		/* Process row-ii data 1 --> r3-1 (prediction: left + previous row - their diagonal) */
+		for (jj = 1; jj < r3; jj++)
+		{
+			index = ii*r3+jj;
+			pred2D = (*data)[index-1] + (*data)[index-r3] - (*data)[index-r3-1];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToUInt32_bigEndian(curBytes);
+				exactData = (uint32_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+		}
+	}
+
+	///////////////////////////	Process layer-1 --> layer-r1-1 ///////////////////////////
+
+	for (kk = 1; kk < r1; kk++)
+	{
+		/* Process Row-0 data 0 (prediction: same position in the previous layer) */
+		index = kk*r23;
+		pred1D = (*data)[index-r23];
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			(*data)[index] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = bytesToUInt32_bigEndian(curBytes);
+			exactData = (uint32_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[index] = exactData + minValue;
+		}
+
+		/* Process Row-0 data 1 --> data r3-1 (prediction: left + previous layer - their diagonal) */
+		for (jj = 1; jj < r3; jj++)
+		{
+			index = kk*r23+jj;
+			pred2D = (*data)[index-1] + (*data)[index-r23] - (*data)[index-r23-1];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToUInt32_bigEndian(curBytes);
+				exactData = (uint32_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+		}
+
+		/* Process Row-1 --> Row-r2-1 */
+		for (ii = 1; ii < r2; ii++)
+		{
+			/* Process Row-i data 0 (prediction: previous row + previous layer - their diagonal) */
+			index = kk*r23 + ii*r3;
+			pred2D = (*data)[index-r3] + (*data)[index-r23] - (*data)[index-r23-r3];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToUInt32_bigEndian(curBytes);
+				exactData = (uint32_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+
+			/* Process Row-i data 1 --> data r3-1 (prediction: the 7 already-decoded neighbours of the 2x2x2 cell) */
+			for (jj = 1; jj < r3; jj++)
+			{
+				index = kk*r23 + ii*r3 + jj;
+				pred3D = (*data)[index-1] + (*data)[index-r3] + (*data)[index-r23]
+					- (*data)[index-r3-1] - (*data)[index-r23-r3] - (*data)[index-r23-1] + (*data)[index-r23-r3-1];
+
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					(*data)[index] = pred3D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					memcpy(curBytes, exactDataBytePointer, exactByteSize);
+					exactData = bytesToUInt32_bigEndian(curBytes);
+					exactData = (uint32_t)exactData >> rightShiftBits;
+					exactDataBytePointer += exactByteSize;
+					(*data)[index] = exactData + minValue;
+				}
+			}
+		}
+	}
+
+	free(type);
+	return;
+}
+
+
+void decompressDataSeries_uint32_4D(uint32_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps)
+{
+	updateQuantizationInfo(tdps->intervals);
+	size_t dataSeriesLength = r1*r2*r3*r4;
+	size_t r234 = r2*r3*r4;
+	size_t r34 = r3*r4;
+	// Decode an r1 x r2 x r3 x r4 grid (r4 fastest-varying) into a newly malloc'ed *data; each of the r1 outermost slabs (r234 elements) is predicted only from elements of the same slab.
+	double realPrecision = tdps->realPrecision;
+	// Type code 0: value read from the exact-data stream; any other code: prediction + 2*(code - intvRadius)*realPrecision.
+	*data = (uint32_t*)malloc(sizeof(uint32_t)*dataSeriesLength);
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int));
+
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);	
+
+	uint32_t minValue, exactData;
+
+	minValue = tdps->minValue;
+	
+	int exactByteSize = tdps->exactByteSize;
+	unsigned char* exactDataBytePointer = tdps->exactDataBytes;
+	
+	unsigned char curBytes[8] = {0,0,0,0,0,0,0,0};
+	
+	int rightShiftBits = computeRightShiftBits(exactByteSize, SZ_UINT32);	
+	// NOTE(review): the 1D decoder rejects rightShiftBits < 0; no such check is made here.
+	int type_;
+
+	uint32_t pred1D, pred2D, pred3D;
+	size_t ii, jj, kk, ll;
+	size_t index;
+
+	for (ll = 0; ll < r1; ll++)
+	{
+		///////////////////////////	Process layer-0 ///////////////////////////
+		/* Process Row-0 data 0*/
+		index = ll*r234;
+		// The first element of every slab is read from the exact-data stream; its type code is not consulted.
+		// recover the exact data
+		memcpy(curBytes, exactDataBytePointer, exactByteSize);
+		exactData = bytesToUInt32_bigEndian(curBytes);
+		exactData = (uint32_t)exactData >> rightShiftBits;
+		exactDataBytePointer += exactByteSize;
+		(*data)[index] = exactData + minValue;
+
+		/* Process Row-0, data 1 (decoded unconditionally, so each slab is assumed to hold at least two elements) */
+		index = ll*r234+1;
+
+		pred1D = (*data)[index-1];
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			(*data)[index] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = bytesToUInt32_bigEndian(curBytes);
+			exactData = (uint32_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[index] = exactData + minValue;
+		}
+
+		/* Process Row-0, data 2 --> data r4-1 (prediction: 2*previous - the one before it) */
+		for (jj = 2; jj < r4; jj++)
+		{
+			index = ll*r234+jj;
+
+			pred1D = 2*(*data)[index-1] - (*data)[index-2];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToUInt32_bigEndian(curBytes);
+				exactData = (uint32_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+		}
+
+		/* Process Row-1 --> Row-r3-1 */
+		for (ii = 1; ii < r3; ii++)
+		{
+			/* Process row-ii data 0 (prediction: same column, previous row) */
+			index = ll*r234+ii*r4;
+
+			pred1D = (*data)[index-r4];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToUInt32_bigEndian(curBytes);
+				exactData = (uint32_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+
+			/* Process row-ii data 1 --> r4-1 (prediction: left + previous row - their diagonal) */
+			for (jj = 1; jj < r4; jj++)
+			{
+				index = ll*r234+ii*r4+jj;
+
+				pred2D = (*data)[index-1] + (*data)[index-r4] - (*data)[index-r4-1];
+
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					memcpy(curBytes, exactDataBytePointer, exactByteSize);
+					exactData = bytesToUInt32_bigEndian(curBytes);
+					exactData = (uint32_t)exactData >> rightShiftBits;
+					exactDataBytePointer += exactByteSize;
+					(*data)[index] = exactData + minValue;
+				}
+			}
+		}
+
+		///////////////////////////	Process layer-1 --> layer-r2-1 ///////////////////////////
+
+		for (kk = 1; kk < r2; kk++)
+		{
+			/* Process Row-0 data 0 (prediction: same position in the previous layer) */
+			index = ll*r234+kk*r34;
+
+			pred1D = (*data)[index-r34];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToUInt32_bigEndian(curBytes);
+				exactData = (uint32_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+
+			/* Process Row-0 data 1 --> data r4-1 (prediction: left + previous layer - their diagonal) */
+			for (jj = 1; jj < r4; jj++)
+			{
+				index = ll*r234+kk*r34+jj;
+
+				pred2D = (*data)[index-1] + (*data)[index-r34] - (*data)[index-r34-1];
+
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					memcpy(curBytes, exactDataBytePointer, exactByteSize);
+					exactData = bytesToUInt32_bigEndian(curBytes);
+					exactData = (uint32_t)exactData >> rightShiftBits;
+					exactDataBytePointer += exactByteSize;
+					(*data)[index] = exactData + minValue;				
+				}
+			}
+
+			/* Process Row-1 --> Row-r3-1 */
+			for (ii = 1; ii < r3; ii++)
+			{
+				/* Process Row-i data 0 (prediction: previous row + previous layer - their diagonal) */
+				index = ll*r234+kk*r34+ii*r4;
+
+				pred2D = (*data)[index-r4] + (*data)[index-r34] - (*data)[index-r34-r4];
+
+				type_ = type[index];
+				if (type_ != 0)
+				{
+					(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+				}
+				else
+				{
+					memcpy(curBytes, exactDataBytePointer, exactByteSize);
+					exactData = bytesToUInt32_bigEndian(curBytes);
+					exactData = (uint32_t)exactData >> rightShiftBits;
+					exactDataBytePointer += exactByteSize;
+					(*data)[index] = exactData + minValue;
+				}
+
+				/* Process Row-i data 1 --> data r4-1 (prediction: the 7 already-decoded neighbours of the 2x2x2 cell) */
+				for (jj = 1; jj < r4; jj++)
+				{
+					index = ll*r234+kk*r34+ii*r4+jj;
+
+					pred3D = (*data)[index-1] + (*data)[index-r4] + (*data)[index-r34]
+							- (*data)[index-r4-1] - (*data)[index-r34-r4] - (*data)[index-r34-1] + (*data)[index-r34-r4-1];
+
+
+					type_ = type[index];
+					if (type_ != 0)
+					{
+						(*data)[index] = pred3D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+					}
+					else
+					{
+						memcpy(curBytes, exactDataBytePointer, exactByteSize);
+						exactData = bytesToUInt32_bigEndian(curBytes);
+						exactData = (uint32_t)exactData >> rightShiftBits;
+						exactDataBytePointer += exactByteSize;
+						(*data)[index] = exactData + minValue;
+					}
+				}
+			}
+		}
+	}
+
+	free(type);
+	return;
+}
+
+void getSnapshotData_uint32_1D(uint32_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps, int errBoundMode)
+{
+	/* errBoundMode is not used here. Varying data is handed to the 1D decoder. */
+	if (!tdps->allSameData) {
+		decompressDataSeries_uint32_1D(data, dataSeriesLength, tdps);
+		return;
+	}
+	/* Constant data: read the one value in exactDataBytes and replicate it dataSeriesLength times. */
+	const uint32_t fill = bytesToUInt32_bigEndian(tdps->exactDataBytes);
+	uint32_t* out = *data = (uint32_t*)malloc(sizeof(uint32_t)*dataSeriesLength);
+	for (size_t k = 0; k < dataSeriesLength; k++)
+		out[k] = fill;
+}
+
+void getSnapshotData_uint32_2D(uint32_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps, int errBoundMode) 
+{
+	/* errBoundMode is not used here. Varying data is handed to the 2D decoder. */
+	if (!tdps->allSameData) {
+		decompressDataSeries_uint32_2D(data, r1, r2, tdps);
+		return;
+	}
+	/* Constant data: read the one value in exactDataBytes and replicate it r1*r2 times. */
+	const size_t count = r1*r2;
+	const uint32_t fill = bytesToUInt32_bigEndian(tdps->exactDataBytes);
+	uint32_t* out = *data = (uint32_t*)malloc(sizeof(uint32_t)*count);
+	for (size_t k = 0; k < count; k++) out[k] = fill;
+}
+
+void getSnapshotData_uint32_3D(uint32_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps, int errBoundMode)
+{
+	/* errBoundMode is not used here. Varying data is handed to the 3D decoder. */
+	if (!tdps->allSameData) {
+		decompressDataSeries_uint32_3D(data, r1, r2, r3, tdps);
+		return;
+	}
+	/* Constant data: read the one value in exactDataBytes and replicate it r1*r2*r3 times. */
+	const size_t count = r1*r2*r3;
+	const uint32_t fill = bytesToUInt32_bigEndian(tdps->exactDataBytes);
+	uint32_t* out = *data = (uint32_t*)malloc(sizeof(uint32_t)*count);
+	for (size_t k = 0; k < count; k++) out[k] = fill;
+}
+
+void getSnapshotData_uint32_4D(uint32_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps, int errBoundMode)
+{
+	/* errBoundMode is not used here. Varying data is handed to the 4D decoder. */
+	if (!tdps->allSameData) {
+		decompressDataSeries_uint32_4D(data, r1, r2, r3, r4, tdps);
+		return;
+	}
+	/* Constant data: read the one value in exactDataBytes and replicate it r1*r2*r3*r4 times. */
+	const size_t count = r1*r2*r3*r4;
+	const uint32_t fill = bytesToUInt32_bigEndian(tdps->exactDataBytes);
+	uint32_t* out = *data = (uint32_t*)malloc(sizeof(uint32_t)*count);
+	for (size_t k = 0; k < count; k++) out[k] = fill;
+}
diff --git a/thirdparty/SZ/sz/src/szd_uint64.c b/thirdparty/SZ/sz/src/szd_uint64.c
new file mode 100644
index 0000000000000000000000000000000000000000..df2b8383ea60dbccc1e868ed39ce0f46021546fe
--- /dev/null
+++ b/thirdparty/SZ/sz/src/szd_uint64.c
@@ -0,0 +1,788 @@
+/**
+ *  @file szd_uint64.c
+ *  @author Sheng Di
+ *  @date Aug, 2017
+ *  @brief Decompression routines for SZ-compressed uint64_t arrays (1D to 4D).
+ *  (C) 2016 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdlib.h> 
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include "TightDataPointStorageI.h"
+#include "sz.h"
+#include "szd_uint64.h"
+#include "Huffman.h"
+
+/**
+ * Decompress an SZ byte stream (cmpBytes, cmpSize) into a newly malloc'ed uint64_t array returned via *newData.
+ * r5..r1 are the dimension sizes handed to computeDataLength/computeDimension; at most 4 dimensions are decoded.
+ * @return status SUCCESSFUL (SZ_SCES) or not (other error codes)
+ * */
+int SZ_decompress_args_uint64(uint64_t** newData, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, unsigned char* cmpBytes, size_t cmpSize)
+{
+	int status = SZ_SCES;
+	size_t dataLength = computeDataLength(r5,r4,r3,r2,r1);
+	
+	//Size in bytes of the raw values; it must cover the dataLength*sizeof(uint64_t) bytes the lossless branch reads back.
+	size_t targetUncompressSize = dataLength <<3; //i.e., *8 (sizeof(uint64_t)); the former <<2 only covered 4-byte elements
+	//tmpSize must be "much" smaller than dataLength
+	size_t i, tmpSize = 3+MetaDataByteLength+1+sizeof(uint64_t)+exe_params->SZ_SIZE_TYPE;
+	unsigned char* szTmpBytes;	
+		
+	if(cmpSize!=4+8+4+MetaDataByteLength && cmpSize!=4+8+8+MetaDataByteLength)
+	{
+		int isZlib = isZlibFormat(cmpBytes[0], cmpBytes[1]);
+		if(isZlib)
+			confparams_dec->szMode = SZ_BEST_COMPRESSION;
+		else
+			confparams_dec->szMode = SZ_BEST_SPEED;		
+		if(confparams_dec->szMode==SZ_BEST_SPEED)
+		{
+			tmpSize = cmpSize;
+			szTmpBytes = cmpBytes;	
+		}
+		else if(confparams_dec->szMode==SZ_BEST_COMPRESSION || confparams_dec->szMode==SZ_DEFAULT_COMPRESSION)
+		{
+			if(targetUncompressSize<MIN_ZLIB_DEC_ALLOMEM_BYTES) //Considering the minimum size
+				targetUncompressSize = MIN_ZLIB_DEC_ALLOMEM_BYTES; 
+			tmpSize = zlib_uncompress5(cmpBytes, (unsigned long)cmpSize, &szTmpBytes, (unsigned long)targetUncompressSize+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE);//		(unsigned long)targetUncompressSize+8: consider the total length under lossless compression mode is actually 3+4+1+targetUncompressSize
+			//tmpSize receives zlib_uncompress5's return value and szTmpBytes the buffer it produced;
+			//that buffer is released by the free() at the end of this function.
+			//NOTE(review): presumably zlib_uncompress5 allocates the buffer itself -- confirm in its definition.
+		}
+		else //not expected to run: szMode was just assigned one of the two modes handled above
+		{
+			printf("Wrong value of confparams_dec->szMode in the double compressed bytes.\n");
+			status = SZ_MERR;
+			return status;
+		}	
+	}
+	else
+		szTmpBytes = cmpBytes;
+	//Parse the (possibly inflated) byte stream into a TightDataPointStorageI descriptor.
+	TightDataPointStorageI* tdps;
+	int errBoundMode = new_TightDataPointStorageI_fromFlatBytes(&tdps, szTmpBytes, tmpSize);
+	//errBoundMode is only forwarded to the getSnapshotData_uint64_* helpers below.
+	int dim = computeDimension(r5,r4,r3,r2,r1);	
+	int intSize = sizeof(uint64_t);
+	if(tdps->isLossless) //payload holds the raw values in big-endian order: copied as-is or converted one by one
+	{
+		*newData = (uint64_t*)malloc(intSize*dataLength);
+		if(sysEndianType==BIG_ENDIAN_SYSTEM)
+		{
+			memcpy(*newData, szTmpBytes+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE, dataLength*intSize);
+		}
+		else
+		{
+			unsigned char* p = szTmpBytes+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE;
+			for(i=0;i<dataLength;i++,p+=intSize)
+				(*newData)[i] = bytesToUInt64_bigEndian(p);
+		}		
+	}
+	else if (dim == 1)
+		getSnapshotData_uint64_1D(newData,r1,tdps, errBoundMode);
+	else
+	if (dim == 2)
+		getSnapshotData_uint64_2D(newData,r2,r1,tdps, errBoundMode);
+	else
+	if (dim == 3)
+		getSnapshotData_uint64_3D(newData,r3,r2,r1,tdps, errBoundMode);
+	else
+	if (dim == 4)
+		getSnapshotData_uint64_4D(newData,r4,r3,r2,r1,tdps, errBoundMode);
+	else
+	{
+		printf("Error: currently support only at most 4 dimensions!\n");
+		status = SZ_DERR;
+	}
+	free_TightDataPointStorageI2(tdps); //NOTE(review): if the zlib branch was skipped, szMode is whatever an earlier call left; the cmpSize test below is then the only guard against freeing the caller's cmpBytes -- confirm
+	if(confparams_dec->szMode!=SZ_BEST_SPEED && cmpSize!=4+sizeof(uint64_t)+exe_params->SZ_SIZE_TYPE+MetaDataByteLength)
+		free(szTmpBytes);
+	return status;
+}
+
+
+void decompressDataSeries_uint64_1D(uint64_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps) 
+{
+	/*
+	 * Rebuild a 1D uint64_t series of dataSeriesLength elements into a freshly malloc'ed *data.
+	 * Every element has a decoded type code:
+	 *   0     -> the value is read from the exact-data byte stream and offset by minValue;
+	 *   other -> the value is the previous element plus (code - intvRadius) * 2 * realPrecision.
+	 * The temporary code array is released here; *data is handed back through the out-parameter.
+	 */
+
+	updateQuantizationInfo(tdps->intervals);
+	const double step = tdps->realPrecision*2;
+
+	uint64_t* out = (uint64_t*)malloc(sizeof(uint64_t)*dataSeriesLength);
+	*data = out;
+	int* codes = (int*)malloc(dataSeriesLength*sizeof(int));
+
+	/* Expand the encoded type array into one int code per element. */
+	HuffmanTree* tree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(tree, tdps->typeArray, dataSeriesLength, codes);
+	SZ_ReleaseHuffman(tree);
+
+	const uint64_t base = tdps->minValue;
+	const int nbytes = tdps->exactByteSize;
+	unsigned char* cursor = tdps->exactDataBytes;
+	/* Zero-filled staging word: only its first nbytes are overwritten per exact value. */
+	unsigned char scratch[8] = {0,0,0,0,0,0,0,0};
+
+	/* Right shift applied to each exact value after the big-endian read. */
+	const int shift = computeRightShiftBits(nbytes, SZ_UINT64);
+	if(shift<0)
+	{
+		printf("Error: rightShift < 0!\n");
+		exit(0);
+	}
+
+	for (size_t k = 0; k < dataSeriesLength; k++) {
+		const int code = codes[k];
+		if (code != 0) {
+			/* Quantized element: previous value plus a whole number of steps. */
+			const uint64_t prev = out[k-1];
+			out[k] = prev + (code-exe_params->intvRadius)*step;
+			continue;
+		}
+		/* Exact element: big-endian read of the staging word, shifted down, plus base. */
+		memcpy(scratch, cursor, nbytes);
+		uint64_t raw = bytesToUInt64_bigEndian(scratch);
+		raw = (uint64_t)raw >> shift;
+		cursor += nbytes;
+		out[k] = raw + base;
+	}
+
+	free(codes);
+	return;
+}
+
+void decompressDataSeries_uint64_2D(uint64_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps) 
+{
+	updateQuantizationInfo(tdps->intervals);
+	// Decode an r1 x r2 grid (r2 is the fastest-varying index) into a newly malloc'ed *data.
+	// Type code 0: value read from the exact-data stream; any other code: prediction + 2*(code - intvRadius)*realPrecision.
+	size_t dataSeriesLength = r1*r2;
+	// NOTE(review): elements 0 and 1 are decoded unconditionally below, so at least two elements are assumed.
+
+	double realPrecision = tdps->realPrecision;
+
+	*data = (uint64_t*)malloc(sizeof(uint64_t)*dataSeriesLength);
+
+	int* type = (int*)malloc(dataSeriesLength*sizeof(int));
+
+	HuffmanTree* huffmanTree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(huffmanTree, tdps->typeArray, dataSeriesLength, type);
+	SZ_ReleaseHuffman(huffmanTree);	
+
+	uint64_t minValue, exactData;
+
+	minValue = tdps->minValue;
+	
+	int exactByteSize = tdps->exactByteSize;
+	unsigned char* exactDataBytePointer = tdps->exactDataBytes;
+	
+	unsigned char curBytes[8] = {0,0,0,0,0,0,0,0};
+	
+	int rightShiftBits = computeRightShiftBits(exactByteSize, SZ_UINT64);	
+	// NOTE(review): the 1D decoder rejects rightShiftBits < 0; no such check is made here.
+	uint64_t pred1D, pred2D;
+	size_t ii, jj;
+
+	/* Process Row-0, data 0 */
+	// Element 0 is always read from the exact-data stream; type[0] is not consulted.
+	// recover the exact data
+	memcpy(curBytes, exactDataBytePointer, exactByteSize);
+	exactData = bytesToUInt64_bigEndian(curBytes);
+	exactData = (uint64_t)exactData >> rightShiftBits;
+	exactDataBytePointer += exactByteSize;
+	(*data)[0] = exactData + minValue;
+
+	/* Process Row-0, data 1 (prediction: element 0) */
+	int type_ = type[1]; 
+	if (type_ != 0)
+	{
+		pred1D = (*data)[0];
+		(*data)[1] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+	}
+	else
+	{
+		// recover the exact data
+		memcpy(curBytes, exactDataBytePointer, exactByteSize);
+		exactData = bytesToUInt64_bigEndian(curBytes);
+		exactData = (uint64_t)exactData >> rightShiftBits;
+		exactDataBytePointer += exactByteSize;
+		(*data)[1] = exactData + minValue;
+	}
+
+	/* Process Row-0, data 2 --> data r2-1 (prediction: 2*previous - the one before it) */
+	for (jj = 2; jj < r2; jj++)
+	{
+		type_ = type[jj];
+		if (type_ != 0)
+		{
+			pred1D = 2*(*data)[jj-1] - (*data)[jj-2];				
+			(*data)[jj] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			// recover the exact data
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = bytesToUInt64_bigEndian(curBytes);
+			exactData = (uint64_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[jj] = exactData + minValue;
+		}
+	}
+
+	size_t index;
+	/* Process Row-1 --> Row-r1-1 */
+	for (ii = 1; ii < r1; ii++)
+	{
+		/* Process row-ii data 0 (prediction: element directly above) */
+		index = ii*r2;
+
+		type_ = type[index];
+		if (type_ != 0)
+		{
+			pred1D = (*data)[index-r2];		
+			(*data)[index] = pred1D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+		}
+		else
+		{
+			// recover the exact data
+			memcpy(curBytes, exactDataBytePointer, exactByteSize);
+			exactData = bytesToUInt64_bigEndian(curBytes);
+			exactData = (uint64_t)exactData >> rightShiftBits;
+			exactDataBytePointer += exactByteSize;
+			(*data)[index] = exactData + minValue;
+		}
+
+		/* Process row-ii data 1 --> r2-1 (prediction: left + above - above-left) */
+		for (jj = 1; jj < r2; jj++)
+		{
+			index = ii*r2+jj;
+			pred2D = (*data)[index-1] + (*data)[index-r2] - (*data)[index-r2-1];
+
+			type_ = type[index];
+			if (type_ != 0)
+			{
+				(*data)[index] = pred2D + 2 * (type_ - exe_params->intvRadius) * realPrecision;
+			}
+			else
+			{
+				// recover the exact data
+				memcpy(curBytes, exactDataBytePointer, exactByteSize);
+				exactData = bytesToUInt64_bigEndian(curBytes);
+				exactData = (uint64_t)exactData >> rightShiftBits;
+				exactDataBytePointer += exactByteSize;
+				(*data)[index] = exactData + minValue;
+			}
+		}
+	}
+
+	free(type);
+	return;
+}
+
+/**
+ * Fetches the next exactly-stored value from the exact-data byte stream.
+ *
+ * exactByteSize bytes are copied from *cursor into a zeroed 8-byte buffer, the
+ * buffer is decoded with bytesToUInt64_bigEndian(), the result is shifted right
+ * by rightShiftBits and minValue is added. *cursor is advanced by exactByteSize.
+ */
+static uint64_t szd_uint64_3D_nextExact(unsigned char** cursor, int exactByteSize, int rightShiftBits, uint64_t minValue)
+{
+	unsigned char buf[8] = {0,0,0,0,0,0,0,0};
+	uint64_t raw;
+
+	memcpy(buf, *cursor, exactByteSize);
+	*cursor += exactByteSize;
+
+	raw = bytesToUInt64_bigEndian(buf);
+	raw = (uint64_t)raw >> rightShiftBits;
+	return raw + minValue;
+}
+
+/**
+ * Rebuilds a 3D uint64_t data set of r1 x r2 x r3 elements (r3 varies fastest)
+ * from the fields of tdps.
+ *
+ * The per-element type codes are Huffman-decoded first. A code of 0 means the
+ * element is read from tdps->exactDataBytes; any other code means the element is
+ * a prediction from already-decoded neighbours plus
+ * 2 * (code - exe_params->intvRadius) * tdps->realPrecision.
+ *
+ * @param data  out: receives a malloc'ed array of r1*r2*r3 values (not freed here).
+ * @param r1    number of layers (slowest dimension).
+ * @param r2    number of rows per layer.
+ * @param r3    number of elements per row.
+ * @param tdps  decoded stream description; only read by this function.
+ */
+void decompressDataSeries_uint64_3D(uint64_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps) 
+{
+	updateQuantizationInfo(tdps->intervals);
+
+	size_t total = r1*r2*r3;
+	size_t plane = r2*r3;
+	double eb = tdps->realPrecision;
+
+	uint64_t* out = (uint64_t*)malloc(sizeof(uint64_t)*total);
+	*data = out;
+	int* codes = (int*)malloc(total*sizeof(int));
+
+	HuffmanTree* tree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(tree, tdps->typeArray, total, codes);
+	SZ_ReleaseHuffman(tree);
+
+	uint64_t base = tdps->minValue;
+	int nbytes = tdps->exactByteSize;
+	unsigned char* cursor = tdps->exactDataBytes;
+	int shift = computeRightShiftBits(nbytes, SZ_UINT64);
+
+	uint64_t pred;
+	int code;
+	size_t i, j, k, layer, row, idx;
+
+	/* ---- layer 0 ---- */
+
+	/* row 0, element 0: always stored exactly (its type code is not consulted) */
+	out[0] = szd_uint64_3D_nextExact(&cursor, nbytes, shift, base);
+
+	/* row 0, element 1: predicted by the previous element */
+	pred = out[0];
+	code = codes[1];
+	if (code != 0)
+		out[1] = pred + 2 * (code - exe_params->intvRadius) * eb;
+	else
+		out[1] = szd_uint64_3D_nextExact(&cursor, nbytes, shift, base);
+
+	/* row 0, elements 2..r3-1: linear extrapolation of the two previous elements */
+	for (j = 2; j < r3; j++)
+	{
+		pred = 2*out[j-1] - out[j-2];
+		code = codes[j];
+		if (code != 0)
+			out[j] = pred + 2 * (code - exe_params->intvRadius) * eb;
+		else
+			out[j] = szd_uint64_3D_nextExact(&cursor, nbytes, shift, base);
+	}
+
+	/* rows 1..r2-1 */
+	for (i = 1; i < r2; i++)
+	{
+		row = i*r3;
+
+		/* first element of the row: predicted by the element above it */
+		idx = row;
+		pred = out[idx-r3];
+		code = codes[idx];
+		if (code != 0)
+			out[idx] = pred + 2 * (code - exe_params->intvRadius) * eb;
+		else
+			out[idx] = szd_uint64_3D_nextExact(&cursor, nbytes, shift, base);
+
+		/* remaining elements: 2D predictor (left + above - above-left) */
+		for (j = 1; j < r3; j++)
+		{
+			idx = row+j;
+			pred = out[idx-1] + out[idx-r3] - out[idx-r3-1];
+			code = codes[idx];
+			if (code != 0)
+				out[idx] = pred + 2 * (code - exe_params->intvRadius) * eb;
+			else
+				out[idx] = szd_uint64_3D_nextExact(&cursor, nbytes, shift, base);
+		}
+	}
+
+	/* ---- layers 1..r1-1 ---- */
+
+	for (k = 1; k < r1; k++)
+	{
+		layer = k*plane;
+
+		/* row 0, element 0: predicted by the same position in the previous layer */
+		idx = layer;
+		pred = out[idx-plane];
+		code = codes[idx];
+		if (code != 0)
+			out[idx] = pred + 2 * (code - exe_params->intvRadius) * eb;
+		else
+			out[idx] = szd_uint64_3D_nextExact(&cursor, nbytes, shift, base);
+
+		/* row 0, elements 1..r3-1: 2D predictor across (element, layer) */
+		for (j = 1; j < r3; j++)
+		{
+			idx = layer+j;
+			pred = out[idx-1] + out[idx-plane] - out[idx-plane-1];
+			code = codes[idx];
+			if (code != 0)
+				out[idx] = pred + 2 * (code - exe_params->intvRadius) * eb;
+			else
+				out[idx] = szd_uint64_3D_nextExact(&cursor, nbytes, shift, base);
+		}
+
+		/* rows 1..r2-1 */
+		for (i = 1; i < r2; i++)
+		{
+			row = layer + i*r3;
+
+			/* first element of the row: 2D predictor across (row, layer) */
+			idx = row;
+			pred = out[idx-r3] + out[idx-plane] - out[idx-plane-r3];
+			code = codes[idx];
+			if (code != 0)
+				out[idx] = pred + 2 * (code - exe_params->intvRadius) * eb;
+			else
+				out[idx] = szd_uint64_3D_nextExact(&cursor, nbytes, shift, base);
+
+			/* remaining elements: full 3D predictor over the 7 decoded neighbours */
+			for (j = 1; j < r3; j++)
+			{
+				idx = row + j;
+				pred = out[idx-1] + out[idx-r3] + out[idx-plane]
+					- out[idx-r3-1] - out[idx-plane-r3] - out[idx-plane-1] + out[idx-plane-r3-1];
+				code = codes[idx];
+				if (code != 0)
+					out[idx] = pred + 2 * (code - exe_params->intvRadius) * eb;
+				else
+					out[idx] = szd_uint64_3D_nextExact(&cursor, nbytes, shift, base);
+			}
+		}
+	}
+
+	free(codes);
+}
+
+
+/**
+ * Fetches the next exactly-stored value from the exact-data byte stream.
+ *
+ * exactByteSize bytes are copied from *cursor into a zeroed 8-byte buffer, the
+ * buffer is decoded with bytesToUInt64_bigEndian(), the result is shifted right
+ * by rightShiftBits and minValue is added. *cursor is advanced by exactByteSize.
+ */
+static uint64_t szd_uint64_4D_nextExact(unsigned char** cursor, int exactByteSize, int rightShiftBits, uint64_t minValue)
+{
+	unsigned char buf[8] = {0,0,0,0,0,0,0,0};
+	uint64_t raw;
+
+	memcpy(buf, *cursor, exactByteSize);
+	*cursor += exactByteSize;
+
+	raw = bytesToUInt64_bigEndian(buf);
+	raw = (uint64_t)raw >> rightShiftBits;
+	return raw + minValue;
+}
+
+/**
+ * Rebuilds a 4D uint64_t data set of r1 x r2 x r3 x r4 elements (r4 varies
+ * fastest) from the fields of tdps.
+ *
+ * Each of the r1 outermost slices is decoded like an independent r2 x r3 x r4
+ * volume: its first element is always read from the exact-data stream and its
+ * predictors only reference elements of the same slice. A type code of 0 means
+ * "read the value from tdps->exactDataBytes"; any other code means "prediction
+ * plus 2 * (code - exe_params->intvRadius) * tdps->realPrecision".
+ *
+ * @param data  out: receives a malloc'ed array of r1*r2*r3*r4 values (not freed here).
+ * @param r1    number of outermost slices.
+ * @param r2    number of layers per slice.
+ * @param r3    number of rows per layer.
+ * @param r4    number of elements per row.
+ * @param tdps  decoded stream description; only read by this function.
+ */
+void decompressDataSeries_uint64_4D(uint64_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps)
+{
+	updateQuantizationInfo(tdps->intervals);
+
+	size_t total = r1*r2*r3*r4;
+	size_t r234 = r2*r3*r4;
+	size_t r34 = r3*r4;
+	double eb = tdps->realPrecision;
+
+	uint64_t* out = (uint64_t*)malloc(sizeof(uint64_t)*total);
+	*data = out;
+	int* codes = (int*)malloc(total*sizeof(int));
+
+	HuffmanTree* tree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(tree, tdps->typeArray, total, codes);
+	SZ_ReleaseHuffman(tree);
+
+	uint64_t base = tdps->minValue;
+	int nbytes = tdps->exactByteSize;
+	unsigned char* cursor = tdps->exactDataBytes;
+	int shift = computeRightShiftBits(nbytes, SZ_UINT64);
+
+	uint64_t pred;
+	int code;
+	size_t i, j, k, l;
+	size_t block, layer, row, idx;
+
+	for (l = 0; l < r1; l++)
+	{
+		block = l*r234;
+
+		/* ---- layer 0 of this slice ---- */
+
+		/* row 0, element 0: always stored exactly (its type code is not consulted) */
+		out[block] = szd_uint64_4D_nextExact(&cursor, nbytes, shift, base);
+
+		/* row 0, element 1: predicted by the previous element */
+		idx = block+1;
+		pred = out[idx-1];
+		code = codes[idx];
+		if (code != 0)
+			out[idx] = pred + 2 * (code - exe_params->intvRadius) * eb;
+		else
+			out[idx] = szd_uint64_4D_nextExact(&cursor, nbytes, shift, base);
+
+		/* row 0, elements 2..r4-1: linear extrapolation of the two previous elements */
+		for (j = 2; j < r4; j++)
+		{
+			idx = block+j;
+			pred = 2*out[idx-1] - out[idx-2];
+			code = codes[idx];
+			if (code != 0)
+				out[idx] = pred + 2 * (code - exe_params->intvRadius) * eb;
+			else
+				out[idx] = szd_uint64_4D_nextExact(&cursor, nbytes, shift, base);
+		}
+
+		/* rows 1..r3-1 */
+		for (i = 1; i < r3; i++)
+		{
+			row = block+i*r4;
+
+			/* first element of the row: predicted by the element above it */
+			idx = row;
+			pred = out[idx-r4];
+			code = codes[idx];
+			if (code != 0)
+				out[idx] = pred + 2 * (code - exe_params->intvRadius) * eb;
+			else
+				out[idx] = szd_uint64_4D_nextExact(&cursor, nbytes, shift, base);
+
+			/* remaining elements: 2D predictor (left + above - above-left) */
+			for (j = 1; j < r4; j++)
+			{
+				idx = row+j;
+				pred = out[idx-1] + out[idx-r4] - out[idx-r4-1];
+				code = codes[idx];
+				if (code != 0)
+					out[idx] = pred + 2 * (code - exe_params->intvRadius) * eb;
+				else
+					out[idx] = szd_uint64_4D_nextExact(&cursor, nbytes, shift, base);
+			}
+		}
+
+		/* ---- layers 1..r2-1 of this slice ---- */
+
+		for (k = 1; k < r2; k++)
+		{
+			layer = block+k*r34;
+
+			/* row 0, element 0: predicted by the same position in the previous layer */
+			idx = layer;
+			pred = out[idx-r34];
+			code = codes[idx];
+			if (code != 0)
+				out[idx] = pred + 2 * (code - exe_params->intvRadius) * eb;
+			else
+				out[idx] = szd_uint64_4D_nextExact(&cursor, nbytes, shift, base);
+
+			/* row 0, elements 1..r4-1: 2D predictor across (element, layer) */
+			for (j = 1; j < r4; j++)
+			{
+				idx = layer+j;
+				pred = out[idx-1] + out[idx-r34] - out[idx-r34-1];
+				code = codes[idx];
+				if (code != 0)
+					out[idx] = pred + 2 * (code - exe_params->intvRadius) * eb;
+				else
+					out[idx] = szd_uint64_4D_nextExact(&cursor, nbytes, shift, base);
+			}
+
+			/* rows 1..r3-1 */
+			for (i = 1; i < r3; i++)
+			{
+				row = layer+i*r4;
+
+				/* first element of the row: 2D predictor across (row, layer) */
+				idx = row;
+				pred = out[idx-r4] + out[idx-r34] - out[idx-r34-r4];
+				code = codes[idx];
+				if (code != 0)
+					out[idx] = pred + 2 * (code - exe_params->intvRadius) * eb;
+				else
+					out[idx] = szd_uint64_4D_nextExact(&cursor, nbytes, shift, base);
+
+				/* remaining elements: full 3D predictor over the 7 decoded neighbours */
+				for (j = 1; j < r4; j++)
+				{
+					idx = row+j;
+					pred = out[idx-1] + out[idx-r4] + out[idx-r34]
+						- out[idx-r4-1] - out[idx-r34-r4] - out[idx-r34-1] + out[idx-r34-r4-1];
+					code = codes[idx];
+					if (code != 0)
+						out[idx] = pred + 2 * (code - exe_params->intvRadius) * eb;
+					else
+						out[idx] = szd_uint64_4D_nextExact(&cursor, nbytes, shift, base);
+				}
+			}
+		}
+	}
+
+	free(codes);
+}
+
+/**
+ * Produces the decompressed 1D uint64_t array described by tdps.
+ *
+ * When tdps->allSameData is set, the single value decoded from
+ * tdps->exactDataBytes is replicated dataSeriesLength times; otherwise the work
+ * is delegated to decompressDataSeries_uint64_1D().
+ *
+ * @param data              out: receives the newly allocated array.
+ * @param dataSeriesLength  number of elements to produce.
+ * @param tdps              decoded stream description.
+ * @param errBoundMode      not used by this function.
+ */
+void getSnapshotData_uint64_1D(uint64_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps, int errBoundMode)
+{
+	if (!tdps->allSameData) {
+		decompressDataSeries_uint64_1D(data, dataSeriesLength, tdps);
+		return;
+	}
+
+	/* constant data set: one stored value fills the whole array */
+	uint64_t fill = bytesToUInt64_bigEndian(tdps->exactDataBytes);
+	uint64_t* out = (uint64_t*)malloc(sizeof(uint64_t)*dataSeriesLength);
+	*data = out;
+	for (size_t k = 0; k < dataSeriesLength; k++)
+		out[k] = fill;
+}
+
+/**
+ * Produces the decompressed 2D (r1 x r2) uint64_t array described by tdps.
+ *
+ * When tdps->allSameData is set, the single value decoded from
+ * tdps->exactDataBytes is replicated r1*r2 times; otherwise the work is
+ * delegated to decompressDataSeries_uint64_2D().
+ *
+ * @param data          out: receives the newly allocated array.
+ * @param r1, r2        dimension sizes.
+ * @param tdps          decoded stream description.
+ * @param errBoundMode  not used by this function.
+ */
+void getSnapshotData_uint64_2D(uint64_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps, int errBoundMode) 
+{
+	size_t total = r1*r2;
+
+	if (!tdps->allSameData) {
+		decompressDataSeries_uint64_2D(data, r1, r2, tdps);
+		return;
+	}
+
+	/* constant data set: one stored value fills the whole array */
+	uint64_t fill = bytesToUInt64_bigEndian(tdps->exactDataBytes);
+	uint64_t* out = (uint64_t*)malloc(sizeof(uint64_t)*total);
+	*data = out;
+	for (size_t k = 0; k < total; k++)
+		out[k] = fill;
+}
+
+/**
+ * Produces the decompressed 3D (r1 x r2 x r3) uint64_t array described by tdps.
+ *
+ * When tdps->allSameData is set, the single value decoded from
+ * tdps->exactDataBytes is replicated r1*r2*r3 times; otherwise the work is
+ * delegated to decompressDataSeries_uint64_3D().
+ *
+ * @param data          out: receives the newly allocated array.
+ * @param r1, r2, r3    dimension sizes.
+ * @param tdps          decoded stream description.
+ * @param errBoundMode  not used by this function.
+ */
+void getSnapshotData_uint64_3D(uint64_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps, int errBoundMode)
+{
+	size_t total = r1*r2*r3;
+
+	if (!tdps->allSameData) {
+		decompressDataSeries_uint64_3D(data, r1, r2, r3, tdps);
+		return;
+	}
+
+	/* constant data set: one stored value fills the whole array */
+	uint64_t fill = bytesToUInt64_bigEndian(tdps->exactDataBytes);
+	uint64_t* out = (uint64_t*)malloc(sizeof(uint64_t)*total);
+	*data = out;
+	for (size_t k = 0; k < total; k++)
+		out[k] = fill;
+}
+
+/**
+ * Produces the decompressed 4D (r1 x r2 x r3 x r4) uint64_t array described by tdps.
+ *
+ * When tdps->allSameData is set, the single value decoded from
+ * tdps->exactDataBytes is replicated r1*r2*r3*r4 times; otherwise the work is
+ * delegated to decompressDataSeries_uint64_4D().
+ *
+ * @param data            out: receives the newly allocated array.
+ * @param r1, r2, r3, r4  dimension sizes.
+ * @param tdps            decoded stream description.
+ * @param errBoundMode    not used by this function.
+ */
+void getSnapshotData_uint64_4D(uint64_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps, int errBoundMode)
+{
+	size_t total = r1*r2*r3*r4;
+
+	if (!tdps->allSameData) {
+		decompressDataSeries_uint64_4D(data, r1, r2, r3, r4, tdps);
+		return;
+	}
+
+	/* constant data set: one stored value fills the whole array */
+	uint64_t fill = bytesToUInt64_bigEndian(tdps->exactDataBytes);
+	uint64_t* out = (uint64_t*)malloc(sizeof(uint64_t)*total);
+	*data = out;
+	for (size_t k = 0; k < total; k++)
+		out[k] = fill;
+}
diff --git a/thirdparty/SZ/sz/src/szd_uint8.c b/thirdparty/SZ/sz/src/szd_uint8.c
new file mode 100644
index 0000000000000000000000000000000000000000..cd616352bc950d4723b16357f7fc9e5dec820fe8
--- /dev/null
+++ b/thirdparty/SZ/sz/src/szd_uint8.c
@@ -0,0 +1,913 @@
+/**
+ *  @file szd_uint8.c
+ *  @author Sheng Di
+ *  @date Aug, 2017
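+ *  @brief Decompression routines for uint8_t data (SZ_decompress_args_uint8 and the decompressDataSeries_uint8_* decoders).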
+ *  (C) 2017 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+#include <stdlib.h> 
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include "TightDataPointStorageI.h"
+#include "sz.h"
+#include "szd_uint8.h"
+#include "Huffman.h"
+
+/**
+ * Decompresses a compressed byte stream into a newly allocated uint8_t array.
+ *
+ * Unless cmpSize matches one of the two fixed "tiny stream" sizes, the first two
+ * bytes are tested with isZlibFormat(): a zlib stream is inflated first with
+ * zlib_uncompress5(), anything else is parsed in place. The stream is then turned
+ * into a TightDataPointStorageI and decoded either by a plain copy (lossless
+ * payload) or by the getSnapshotData_uint8_{1,2,3,4}D() routine matching the
+ * dimensionality reported by computeDimension().
+ *
+ * @param newData   out: receives the decompressed array, allocated here or by the
+ *                  snapshot routine that is called.
+ * @param r5,r4,r3,r2,r1  dimension sizes as passed to computeDataLength() and
+ *                  computeDimension(); r1 is forwarded as the last dimension argument.
+ * @param cmpBytes  compressed input; this function never frees it.
+ * @param cmpSize   number of bytes in cmpBytes.
+ * @return SZ_SCES on success, SZ_DERR if the data is not lossless and has more
+ *         than 4 dimensions, SZ_MERR if the compression mode is not recognised.
+ *
+ * Side effect: confparams_dec->szMode is overwritten for non-tiny streams.
+ */
+int SZ_decompress_args_uint8(uint8_t** newData, size_t r5, size_t r4, size_t r3, size_t r2, size_t r1, unsigned char* cmpBytes, size_t cmpSize)
+{
+	int status = SZ_SCES;
+	size_t dataLength = computeDataLength(r5,r4,r3,r2,r1);
+	
+	size_t targetUncompressSize = dataLength <<2; //i.e., *4
+	//tmpSize must be "much" smaller than dataLength
+	size_t i, tmpSize = 3+MetaDataByteLength+1+sizeof(uint8_t)+exe_params->SZ_SIZE_TYPE;
+	unsigned char* szTmpBytes;
+	// Set only when szTmpBytes was allocated by zlib_uncompress5(). The old
+	// check relied on the global szMode, which is left stale for tiny streams and
+	// could lead to free() being called on the caller's cmpBytes.
+	int ownsTmpBytes = 0;
+		
+	if(cmpSize!=4+1+4+MetaDataByteLength && cmpSize!=4+1+8+MetaDataByteLength)
+	{
+		int isZlib = isZlibFormat(cmpBytes[0], cmpBytes[1]);
+		if(isZlib)
+			confparams_dec->szMode = SZ_BEST_COMPRESSION;
+		else
+			confparams_dec->szMode = SZ_BEST_SPEED;		
+		if(confparams_dec->szMode==SZ_BEST_SPEED)
+		{
+			// not zlib-wrapped: parse the caller's buffer in place
+			tmpSize = cmpSize;
+			szTmpBytes = cmpBytes;	
+		}
+		else if(confparams_dec->szMode==SZ_BEST_COMPRESSION || confparams_dec->szMode==SZ_DEFAULT_COMPRESSION)
+		{
+			if(targetUncompressSize<MIN_ZLIB_DEC_ALLOMEM_BYTES) //Considering the minimum size
+				targetUncompressSize = MIN_ZLIB_DEC_ALLOMEM_BYTES; 
+			tmpSize = zlib_uncompress5(cmpBytes, (unsigned long)cmpSize, &szTmpBytes, (unsigned long)targetUncompressSize+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE);//		(unsigned long)targetUncompressSize+8: consider the total length under lossless compression mode is actually 3+4+1+targetUncompressSize
+			ownsTmpBytes = 1;
+		}
+		else
+		{
+			printf("Wrong value of confparams_dec->szMode in the double compressed bytes.\n");
+			status = SZ_MERR;
+			return status;
+		}	
+	}
+	else
+		szTmpBytes = cmpBytes; // tiny stream: parsed in place, tmpSize keeps its initial value
+
+	TightDataPointStorageI* tdps;
+	int errBoundMode = new_TightDataPointStorageI_fromFlatBytes(&tdps, szTmpBytes, tmpSize);
+	int dim = computeDimension(r5,r4,r3,r2,r1);	
+	int intSize = sizeof(uint8_t);
+	if(tdps->isLossless)
+	{
+		// raw payload follows the 4-byte prefix, the metadata and the size field
+		*newData = (uint8_t*)malloc(intSize*dataLength);
+		if(sysEndianType==BIG_ENDIAN_SYSTEM)
+		{
+			memcpy(*newData, szTmpBytes+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE, dataLength*intSize);
+		}
+		else
+		{
+			unsigned char* p = szTmpBytes+4+MetaDataByteLength+exe_params->SZ_SIZE_TYPE;
+			for(i=0;i<dataLength;i++,p+=intSize)
+				(*newData)[i] = *p;
+		}		
+	}
+	else if (dim == 1)
+		getSnapshotData_uint8_1D(newData,r1,tdps, errBoundMode);
+	else
+	if (dim == 2)
+		getSnapshotData_uint8_2D(newData,r2,r1,tdps, errBoundMode);
+	else
+	if (dim == 3)
+		getSnapshotData_uint8_3D(newData,r3,r2,r1,tdps, errBoundMode);
+	else
+	if (dim == 4)
+		getSnapshotData_uint8_4D(newData,r4,r3,r2,r1,tdps, errBoundMode);
+	else
+	{
+		printf("Error: currently support only at most 4 dimensions!\n");
+		status = SZ_DERR;
+	}
+	free_TightDataPointStorageI2(tdps);
+	// release the inflated buffer only if this call created it
+	if(ownsTmpBytes)
+		free(szTmpBytes);
+	return status;
+}
+
+
+/**
+ * Rebuilds a 1D uint8_t series of dataSeriesLength elements from tdps.
+ *
+ * The per-element type codes are Huffman-decoded first. A code of 0 means the
+ * element is read from tdps->exactDataBytes; any other code means the element is
+ * the previous element plus (code - exe_params->intvRadius) * 2 * realPrecision,
+ * clamped to [SZ_UINT8_MIN, SZ_UINT8_MAX].
+ *
+ * Prints an error and calls exit(0) if computeRightShiftBits() returns a
+ * negative shift.
+ *
+ * @param data              out: receives a malloc'ed array (not freed here).
+ * @param dataSeriesLength  number of elements to decode.
+ * @param tdps              decoded stream description; only read by this function.
+ */
+void decompressDataSeries_uint8_1D(uint8_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps) 
+{
+	updateQuantizationInfo(tdps->intervals);
+	double step = tdps->realPrecision*2;
+
+	uint8_t* out = (uint8_t*)malloc(sizeof(uint8_t)*dataSeriesLength);
+	*data = out;
+	int* codes = (int*)malloc(dataSeriesLength*sizeof(int));
+
+	HuffmanTree* tree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(tree, tdps->typeArray, dataSeriesLength, codes);
+	SZ_ReleaseHuffman(tree);
+
+	uint8_t base = tdps->minValue;
+	int nbytes = tdps->exactByteSize;
+	unsigned char* cursor = tdps->exactDataBytes;
+	unsigned char buf[8] = {0,0,0,0,0,0,0,0};
+
+	int shift = computeRightShiftBits(nbytes, SZ_UINT8);
+	if(shift<0)
+	{
+		printf("Error: rightShift < 0!\n");
+		exit(0);
+	}
+
+	for (size_t k = 0; k < dataSeriesLength; k++)
+	{
+		int code = codes[k];
+
+		if (code == 0)
+		{
+			/* exactly-stored element: take the next bytes of the exact stream */
+			uint8_t raw;
+			memcpy(buf, cursor, nbytes);
+			cursor += nbytes;
+			raw = buf[0];
+			raw = (uint8_t)raw >> shift;
+			out[k] = raw + base;
+			continue;
+		}
+
+		/* predicted element: previous value plus the offset selected by the code */
+		long pred = out[k-1];
+		long shifted = pred + (code-exe_params->intvRadius)*step;
+		if (shifted < SZ_UINT8_MIN)
+			out[k] = SZ_UINT8_MIN;
+		else if (shifted >= SZ_UINT8_MAX)
+			out[k] = SZ_UINT8_MAX;
+		else
+			out[k] = shifted;
+	}
+
+	free(codes);
+}
+
+/**
+ * Fetches the next exactly-stored uint8 value from the exact-data byte stream.
+ *
+ * exactByteSize bytes are copied from *cursor into a zeroed buffer; the first
+ * byte is shifted right by rightShiftBits and minValue is added (result wraps
+ * to 8 bits). *cursor is advanced by exactByteSize.
+ */
+static uint8_t szd_uint8_2D_nextExact(unsigned char** cursor, int exactByteSize, int rightShiftBits, uint8_t minValue)
+{
+	unsigned char buf[8] = {0,0,0,0,0,0,0,0};
+	uint8_t raw;
+
+	memcpy(buf, *cursor, exactByteSize);
+	*cursor += exactByteSize;
+
+	raw = buf[0];
+	raw = (uint8_t)raw >> rightShiftBits;
+	return raw + minValue;
+}
+
+/** Limits v to the range [SZ_UINT8_MIN, SZ_UINT8_MAX]. */
+static uint8_t szd_uint8_2D_clamp(long v)
+{
+	if (v < SZ_UINT8_MIN)
+		return SZ_UINT8_MIN;
+	if (v >= SZ_UINT8_MAX)
+		return SZ_UINT8_MAX;
+	return v;
+}
+
+/**
+ * Rebuilds a 2D uint8_t data set of r1 x r2 elements (r2 varies fastest) from
+ * the fields of tdps.
+ *
+ * A type code of 0 means the element is read from tdps->exactDataBytes; any
+ * other code means the element is a prediction from already-decoded neighbours
+ * plus 2 * (code - exe_params->intvRadius) * tdps->realPrecision, clamped to
+ * [SZ_UINT8_MIN, SZ_UINT8_MAX].
+ *
+ * @param data  out: receives a malloc'ed array of r1*r2 values (not freed here).
+ * @param r1    number of rows.
+ * @param r2    number of elements per row.
+ * @param tdps  decoded stream description; only read by this function.
+ */
+void decompressDataSeries_uint8_2D(uint8_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps) 
+{
+	updateQuantizationInfo(tdps->intervals);
+
+	size_t total = r1*r2;
+	double eb = tdps->realPrecision;
+
+	uint8_t* out = (uint8_t*)malloc(sizeof(uint8_t)*total);
+	*data = out;
+	int* codes = (int*)malloc(total*sizeof(int));
+
+	HuffmanTree* tree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(tree, tdps->typeArray, total, codes);
+	SZ_ReleaseHuffman(tree);
+
+	uint8_t base = tdps->minValue;
+	int nbytes = tdps->exactByteSize;
+	unsigned char* cursor = tdps->exactDataBytes;
+	int shift = computeRightShiftBits(nbytes, SZ_UINT8);
+
+	long pred;
+	int code;
+	size_t i, j, row, idx;
+
+	/* row 0, element 0: always stored exactly (its type code is not consulted) */
+	out[0] = szd_uint8_2D_nextExact(&cursor, nbytes, shift, base);
+
+	/* row 0, element 1: predicted by the previous element */
+	code = codes[1];
+	if (code != 0)
+	{
+		pred = out[0];
+		out[1] = szd_uint8_2D_clamp((long)(pred + 2 * (code - exe_params->intvRadius) * eb));
+	}
+	else
+		out[1] = szd_uint8_2D_nextExact(&cursor, nbytes, shift, base);
+
+	/* row 0, elements 2..r2-1: linear extrapolation of the two previous elements */
+	for (j = 2; j < r2; j++)
+	{
+		code = codes[j];
+		if (code != 0)
+		{
+			pred = 2*out[j-1] - out[j-2];
+			out[j] = szd_uint8_2D_clamp((long)(pred + 2 * (code - exe_params->intvRadius) * eb));
+		}
+		else
+			out[j] = szd_uint8_2D_nextExact(&cursor, nbytes, shift, base);
+	}
+
+	/* rows 1..r1-1 */
+	for (i = 1; i < r1; i++)
+	{
+		row = i*r2;
+
+		/* first element of the row: predicted by the element above it */
+		idx = row;
+		code = codes[idx];
+		if (code != 0)
+		{
+			pred = out[idx-r2];
+			out[idx] = szd_uint8_2D_clamp((long)(pred + 2 * (code - exe_params->intvRadius) * eb));
+		}
+		else
+			out[idx] = szd_uint8_2D_nextExact(&cursor, nbytes, shift, base);
+
+		/* remaining elements: 2D predictor (left + above - above-left) */
+		for (j = 1; j < r2; j++)
+		{
+			idx = row+j;
+			pred = out[idx-1] + out[idx-r2] - out[idx-r2-1];
+			code = codes[idx];
+			if (code != 0)
+				out[idx] = szd_uint8_2D_clamp((long)(pred + 2 * (code - exe_params->intvRadius) * eb));
+			else
+				out[idx] = szd_uint8_2D_nextExact(&cursor, nbytes, shift, base);
+		}
+	}
+
+	free(codes);
+}
+
+/**
+ * Fetches the next exactly-stored uint8 value from the exact-data byte stream.
+ *
+ * exactByteSize bytes are copied from *cursor into a zeroed buffer; the first
+ * byte is shifted right by rightShiftBits and minValue is added (result wraps
+ * to 8 bits). *cursor is advanced by exactByteSize.
+ */
+static uint8_t szd_uint8_3D_nextExact(unsigned char** cursor, int exactByteSize, int rightShiftBits, uint8_t minValue)
+{
+	unsigned char buf[8] = {0,0,0,0,0,0,0,0};
+	uint8_t raw;
+
+	memcpy(buf, *cursor, exactByteSize);
+	*cursor += exactByteSize;
+
+	raw = buf[0];
+	raw = (uint8_t)raw >> rightShiftBits;
+	return raw + minValue;
+}
+
+/** Limits v to the range [SZ_UINT8_MIN, SZ_UINT8_MAX]. */
+static uint8_t szd_uint8_3D_clamp(long v)
+{
+	if (v < SZ_UINT8_MIN)
+		return SZ_UINT8_MIN;
+	if (v >= SZ_UINT8_MAX)
+		return SZ_UINT8_MAX;
+	return v;
+}
+
+/**
+ * Rebuilds a 3D uint8_t data set of r1 x r2 x r3 elements (r3 varies fastest)
+ * from the fields of tdps.
+ *
+ * A type code of 0 means the element is read from tdps->exactDataBytes; any
+ * other code means the element is a prediction from already-decoded neighbours
+ * plus 2 * (code - exe_params->intvRadius) * tdps->realPrecision, clamped to
+ * [SZ_UINT8_MIN, SZ_UINT8_MAX].
+ *
+ * @param data  out: receives a malloc'ed array of r1*r2*r3 values (not freed here).
+ * @param r1    number of layers (slowest dimension).
+ * @param r2    number of rows per layer.
+ * @param r3    number of elements per row.
+ * @param tdps  decoded stream description; only read by this function.
+ */
+void decompressDataSeries_uint8_3D(uint8_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps) 
+{
+	updateQuantizationInfo(tdps->intervals);
+
+	size_t total = r1*r2*r3;
+	size_t plane = r2*r3;
+	double eb = tdps->realPrecision;
+
+	uint8_t* out = (uint8_t*)malloc(sizeof(uint8_t)*total);
+	*data = out;
+	int* codes = (int*)malloc(total*sizeof(int));
+
+	HuffmanTree* tree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(tree, tdps->typeArray, total, codes);
+	SZ_ReleaseHuffman(tree);
+
+	uint8_t base = tdps->minValue;
+	int nbytes = tdps->exactByteSize;
+	unsigned char* cursor = tdps->exactDataBytes;
+	int shift = computeRightShiftBits(nbytes, SZ_UINT8);
+
+	long pred;
+	int code;
+	size_t i, j, k, layer, row, idx;
+
+	/* ---- layer 0 ---- */
+
+	/* row 0, element 0: always stored exactly (its type code is not consulted) */
+	out[0] = szd_uint8_3D_nextExact(&cursor, nbytes, shift, base);
+
+	/* row 0, element 1: predicted by the previous element */
+	pred = out[0];
+	code = codes[1];
+	if (code != 0)
+		out[1] = szd_uint8_3D_clamp((long)(pred + 2 * (code - exe_params->intvRadius) * eb));
+	else
+		out[1] = szd_uint8_3D_nextExact(&cursor, nbytes, shift, base);
+
+	/* row 0, elements 2..r3-1: linear extrapolation of the two previous elements */
+	for (j = 2; j < r3; j++)
+	{
+		pred = 2*out[j-1] - out[j-2];
+		code = codes[j];
+		if (code != 0)
+			out[j] = szd_uint8_3D_clamp((long)(pred + 2 * (code - exe_params->intvRadius) * eb));
+		else
+			out[j] = szd_uint8_3D_nextExact(&cursor, nbytes, shift, base);
+	}
+
+	/* rows 1..r2-1 */
+	for (i = 1; i < r2; i++)
+	{
+		row = i*r3;
+
+		/* first element of the row: predicted by the element above it */
+		idx = row;
+		pred = out[idx-r3];
+		code = codes[idx];
+		if (code != 0)
+			out[idx] = szd_uint8_3D_clamp((long)(pred + 2 * (code - exe_params->intvRadius) * eb));
+		else
+			out[idx] = szd_uint8_3D_nextExact(&cursor, nbytes, shift, base);
+
+		/* remaining elements: 2D predictor (left + above - above-left) */
+		for (j = 1; j < r3; j++)
+		{
+			idx = row+j;
+			pred = out[idx-1] + out[idx-r3] - out[idx-r3-1];
+			code = codes[idx];
+			if (code != 0)
+				out[idx] = szd_uint8_3D_clamp((long)(pred + 2 * (code - exe_params->intvRadius) * eb));
+			else
+				out[idx] = szd_uint8_3D_nextExact(&cursor, nbytes, shift, base);
+		}
+	}
+
+	/* ---- layers 1..r1-1 ---- */
+
+	for (k = 1; k < r1; k++)
+	{
+		layer = k*plane;
+
+		/* row 0, element 0: predicted by the same position in the previous layer */
+		idx = layer;
+		pred = out[idx-plane];
+		code = codes[idx];
+		if (code != 0)
+			out[idx] = szd_uint8_3D_clamp((long)(pred + 2 * (code - exe_params->intvRadius) * eb));
+		else
+			out[idx] = szd_uint8_3D_nextExact(&cursor, nbytes, shift, base);
+
+		/* row 0, elements 1..r3-1: 2D predictor across (element, layer) */
+		for (j = 1; j < r3; j++)
+		{
+			idx = layer+j;
+			pred = out[idx-1] + out[idx-plane] - out[idx-plane-1];
+			code = codes[idx];
+			if (code != 0)
+				out[idx] = szd_uint8_3D_clamp((long)(pred + 2 * (code - exe_params->intvRadius) * eb));
+			else
+				out[idx] = szd_uint8_3D_nextExact(&cursor, nbytes, shift, base);
+		}
+
+		/* rows 1..r2-1 */
+		for (i = 1; i < r2; i++)
+		{
+			row = layer + i*r3;
+
+			/* first element of the row: 2D predictor across (row, layer) */
+			idx = row;
+			pred = out[idx-r3] + out[idx-plane] - out[idx-plane-r3];
+			code = codes[idx];
+			if (code != 0)
+				out[idx] = szd_uint8_3D_clamp((long)(pred + 2 * (code - exe_params->intvRadius) * eb));
+			else
+				out[idx] = szd_uint8_3D_nextExact(&cursor, nbytes, shift, base);
+
+			/* remaining elements: full 3D predictor over the 7 decoded neighbours */
+			for (j = 1; j < r3; j++)
+			{
+				idx = row + j;
+				pred = out[idx-1] + out[idx-r3] + out[idx-plane]
+					- out[idx-r3-1] - out[idx-plane-r3] - out[idx-plane-1] + out[idx-plane-r3-1];
+				code = codes[idx];
+				if (code != 0)
+					out[idx] = szd_uint8_3D_clamp((long)(pred + 2 * (code - exe_params->intvRadius) * eb));
+				else
+					out[idx] = szd_uint8_3D_nextExact(&cursor, nbytes, shift, base);
+			}
+		}
+	}
+
+	free(codes);
+}
+
+
+/* Decoder state shared by every sample of one decompressDataSeries_uint8_4D call. */
+typedef struct Uint8Decode4DState
+{
+	uint8_t *out;               /* destination array (the buffer stored in *data) */
+	const int *codes;           /* decoded quantization code of every sample */
+	unsigned char *exactCursor; /* next unread byte of tdps->exactDataBytes */
+	unsigned char scratch[8];   /* staging area for one verbatim sample; persists between samples */
+	double realPrecision;
+	int exactByteSize;
+	int rightShiftBits;
+	uint8_t minValue;
+} Uint8Decode4DState;
+
+/*
+ * Consume exactByteSize bytes from the exact-data stream and store the
+ * sample they encode at out[index]: first staged byte, shifted right by
+ * rightShiftBits, plus minValue.
+ */
+static void uint8Decode4D_putExact(Uint8Decode4DState *st, size_t index)
+{
+	uint8_t exact;
+
+	memcpy(st->scratch, st->exactCursor, st->exactByteSize);
+	exact = st->scratch[0];
+	exact = (uint8_t)exact >> st->rightShiftBits;
+	st->exactCursor += st->exactByteSize;
+	st->out[index] = exact + st->minValue;
+}
+
+/*
+ * Reconstruct out[index] from its quantization code. Code 0 means the sample
+ * was stored verbatim; any other code is an offset from the prediction
+ * `pred`, and the result is clamped to [SZ_UINT8_MIN, SZ_UINT8_MAX].
+ */
+static void uint8Decode4D_put(Uint8Decode4DState *st, size_t index, long pred)
+{
+	const int code = st->codes[index];
+	long value;
+
+	if (code == 0)
+	{
+		uint8Decode4D_putExact(st, index);
+		return;
+	}
+
+	value = pred + 2 * (code - exe_params->intvRadius) * st->realPrecision;
+	if (value < SZ_UINT8_MIN)
+		st->out[index] = SZ_UINT8_MIN;
+	else if (value < SZ_UINT8_MAX)
+		st->out[index] = value;
+	else
+		st->out[index] = SZ_UINT8_MAX;
+}
+
+/**
+ * Rebuild an r1 x r2 x r3 x r4 array of uint8_t samples from `tdps`.
+ *
+ * The output buffer is obtained with malloc and handed back through *data;
+ * it is not freed here. The per-sample codes are Huffman-decoded from
+ * tdps->typeArray into a temporary array that is released before returning.
+ * Each of the r1 slabs (r2*r3*r4 samples) is decoded independently: its
+ * first sample is always read verbatim, and every later sample is predicted
+ * from already decoded neighbours of the same slab (1-D, 2-D or 3-D stencil
+ * depending on which of its layer/row/column indices are zero).
+ *
+ * NOTE: the sample at offset 1 of every slab is processed unconditionally,
+ * i.e. also when r4 == 1, exactly as the statements are ordered below.
+ * Neither malloc result is checked.
+ */
+void decompressDataSeries_uint8_4D(uint8_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps)
+{
+	updateQuantizationInfo(tdps->intervals);
+
+	const size_t dataSeriesLength = r1*r2*r3*r4;
+	const size_t r234 = r2*r3*r4;
+	const size_t r34 = r3*r4;
+
+	Uint8Decode4DState st = { .scratch = {0,0,0,0,0,0,0,0} };
+	st.realPrecision = tdps->realPrecision;
+
+	*data = (uint8_t*)malloc(sizeof(uint8_t)*dataSeriesLength);
+	int *codes = (int*)malloc(dataSeriesLength*sizeof(int));
+
+	HuffmanTree *tree = createHuffmanTree(tdps->stateNum);
+	decode_withTree(tree, tdps->typeArray, dataSeriesLength, codes);
+	SZ_ReleaseHuffman(tree);
+
+	st.out = *data;
+	st.codes = codes;
+	st.minValue = tdps->minValue;
+	st.exactByteSize = tdps->exactByteSize;
+	st.exactCursor = tdps->exactDataBytes;
+	st.rightShiftBits = computeRightShiftBits(st.exactByteSize, SZ_UINT8);
+
+	uint8_t *out = st.out;
+
+	for (size_t ll = 0; ll < r1; ll++)
+	{
+		const size_t slab = ll*r234;
+		size_t index;
+
+		/* ---------------- layer 0 ---------------- */
+
+		/* row 0, sample 0: always stored verbatim */
+		uint8Decode4D_putExact(&st, slab);
+
+		/* row 0, sample 1: predicted by the previous sample */
+		index = slab + 1;
+		uint8Decode4D_put(&st, index, out[index-1]);
+
+		/* row 0, samples 2 .. r4-1: extrapolated from the two previous samples */
+		for (size_t jj = 2; jj < r4; jj++)
+		{
+			index = slab + jj;
+			uint8Decode4D_put(&st, index, 2*out[index-1] - out[index-2]);
+		}
+
+		/* rows 1 .. r3-1 */
+		for (size_t ii = 1; ii < r3; ii++)
+		{
+			const size_t row = slab + ii*r4;
+
+			/* sample 0: predicted by the sample one row up */
+			uint8Decode4D_put(&st, row, out[row-r4]);
+
+			/* samples 1 .. r4-1: 2-D stencil inside the layer */
+			for (size_t jj = 1; jj < r4; jj++)
+			{
+				index = row + jj;
+				uint8Decode4D_put(&st, index,
+					out[index-1] + out[index-r4] - out[index-r4-1]);
+			}
+		}
+
+		/* ---------------- layers 1 .. r2-1 ---------------- */
+		for (size_t kk = 1; kk < r2; kk++)
+		{
+			const size_t layer = slab + kk*r34;
+
+			/* row 0, sample 0: predicted by the sample one layer back */
+			uint8Decode4D_put(&st, layer, out[layer-r34]);
+
+			/* row 0, samples 1 .. r4-1: 2-D stencil across column and layer */
+			for (size_t jj = 1; jj < r4; jj++)
+			{
+				index = layer + jj;
+				uint8Decode4D_put(&st, index,
+					out[index-1] + out[index-r34] - out[index-r34-1]);
+			}
+
+			/* rows 1 .. r3-1 */
+			for (size_t ii = 1; ii < r3; ii++)
+			{
+				const size_t row = layer + ii*r4;
+
+				/* sample 0: 2-D stencil across row and layer */
+				uint8Decode4D_put(&st, row,
+					out[row-r4] + out[row-r34] - out[row-r34-r4]);
+
+				/* samples 1 .. r4-1: full 3-D stencil */
+				for (size_t jj = 1; jj < r4; jj++)
+				{
+					index = row + jj;
+					uint8Decode4D_put(&st, index,
+						out[index-1] + out[index-r4] + out[index-r34]
+						- out[index-r4-1] - out[index-r34-r4] - out[index-r34-1] + out[index-r34-r4-1]);
+				}
+			}
+		}
+	}
+
+	free(codes);
+	return;
+}
+
+/**
+ * Produce the 1-D uint8_t snapshot described by `tdps` in a buffer returned
+ * through *data.
+ *
+ * When tdps->allSameData is set, a malloc'ed buffer of dataSeriesLength
+ * bytes is filled with tdps->exactDataBytes[0]; otherwise the work is
+ * delegated to decompressDataSeries_uint8_1D. `errBoundMode` is not used.
+ */
+void getSnapshotData_uint8_1D(uint8_t** data, size_t dataSeriesLength, TightDataPointStorageI* tdps, int errBoundMode)
+{
+	if (!tdps->allSameData)
+	{
+		decompressDataSeries_uint8_1D(data, dataSeriesLength, tdps);
+		return;
+	}
+
+	const uint8_t fill = tdps->exactDataBytes[0];
+	uint8_t *out = (uint8_t*)malloc(sizeof(uint8_t)*dataSeriesLength);
+
+	*data = out;
+	for (size_t k = 0; k < dataSeriesLength; k++)
+		out[k] = fill;
+}
+
+/**
+ * Produce the r1 x r2 uint8_t snapshot described by `tdps` in a buffer
+ * returned through *data.
+ *
+ * When tdps->allSameData is set, a malloc'ed buffer of r1*r2 bytes is filled
+ * with tdps->exactDataBytes[0]; otherwise the work is delegated to
+ * decompressDataSeries_uint8_2D. `errBoundMode` is not used.
+ */
+void getSnapshotData_uint8_2D(uint8_t** data, size_t r1, size_t r2, TightDataPointStorageI* tdps, int errBoundMode)
+{
+	const size_t total = r1*r2;
+
+	if (!tdps->allSameData)
+	{
+		decompressDataSeries_uint8_2D(data, r1, r2, tdps);
+		return;
+	}
+
+	const uint8_t fill = tdps->exactDataBytes[0];
+	uint8_t *out = (uint8_t*)malloc(sizeof(uint8_t)*total);
+
+	*data = out;
+	for (size_t k = 0; k < total; k++)
+		out[k] = fill;
+}
+
+/**
+ * Produce the r1 x r2 x r3 uint8_t snapshot described by `tdps` in a buffer
+ * returned through *data.
+ *
+ * When tdps->allSameData is set, a malloc'ed buffer of r1*r2*r3 bytes is
+ * filled with tdps->exactDataBytes[0]; otherwise the work is delegated to
+ * decompressDataSeries_uint8_3D. `errBoundMode` is not used.
+ */
+void getSnapshotData_uint8_3D(uint8_t** data, size_t r1, size_t r2, size_t r3, TightDataPointStorageI* tdps, int errBoundMode)
+{
+	const size_t total = r1*r2*r3;
+
+	if (!tdps->allSameData)
+	{
+		decompressDataSeries_uint8_3D(data, r1, r2, r3, tdps);
+		return;
+	}
+
+	const uint8_t fill = tdps->exactDataBytes[0];
+	uint8_t *out = (uint8_t*)malloc(sizeof(uint8_t)*total);
+
+	*data = out;
+	for (size_t k = 0; k < total; k++)
+		out[k] = fill;
+}
+
+/**
+ * Produce the r1 x r2 x r3 x r4 uint8_t snapshot described by `tdps` in a
+ * buffer returned through *data.
+ *
+ * When tdps->allSameData is set, a malloc'ed buffer of r1*r2*r3*r4 bytes is
+ * filled with tdps->exactDataBytes[0]; otherwise the work is delegated to
+ * decompressDataSeries_uint8_4D. `errBoundMode` is not used.
+ */
+void getSnapshotData_uint8_4D(uint8_t** data, size_t r1, size_t r2, size_t r3, size_t r4, TightDataPointStorageI* tdps, int errBoundMode)
+{
+	const size_t total = r1*r2*r3*r4;
+
+	if (!tdps->allSameData)
+	{
+		decompressDataSeries_uint8_4D(data, r1, r2, r3, r4, tdps);
+		return;
+	}
+
+	const uint8_t fill = tdps->exactDataBytes[0];
+	uint8_t *out = (uint8_t*)malloc(sizeof(uint8_t)*total);
+
+	*data = out;
+	for (size_t k = 0; k < total; k++)
+		out[k] = fill;
+}
diff --git a/thirdparty/SZ/sz/src/szf.c b/thirdparty/SZ/sz/src/szf.c
new file mode 100644
index 0000000000000000000000000000000000000000..e3cca0bc2e26211b422acc5b6588e766fe548f92
--- /dev/null
+++ b/thirdparty/SZ/sz/src/szf.c
@@ -0,0 +1,567 @@
+/**
+ *  @file szf.c
+ *  @author Sheng Di
+ *  @date April, 2015
+ *  @brief the key C binding file to connect Fortran and C
+ *  (C) 2015 by Mathematics and Computer Science (MCS), Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+
+
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "sz.h"
+#include "szf.h"
+
+//special notice: all function names in this file must be lowercase!
+/**
+ * Fortran-callable wrapper around SZ_Init.
+ *
+ * The first *len characters of `configFile` are copied into a
+ * NUL-terminated stack buffer, which is passed to SZ_Init; the value
+ * SZ_Init returns is stored in *ierr.
+ */
+void sz_init_c_(char *configFile,int *len,int *ierr)
+{
+	char path[*len+1];
+
+	memcpy(path, configFile, (size_t)*len);
+	path[*len] = '\0';
+	*ierr = SZ_Init(path);
+}
+
+/**
+ * Fortran-callable wrapper: forwards to SZ_Finalize(). Takes no arguments
+ * and reports nothing back to the caller.
+ */
+void sz_finalize_c_()
+{
+	SZ_Finalize();
+}
+
+//compress with config (without args in function)
+/**
+ * Fortran-callable wrapper: compresses a 1-D float array with SZ_compress.
+ * Sizes are forwarded as (0, 0, 0, 0, *r1). The buffer SZ_compress returns
+ * is copied into `bytes` (*outSize bytes, no capacity check is done here)
+ * and then freed.
+ */
+void sz_compress_d1_float_(float* data, unsigned char *bytes, size_t *outSize, size_t *r1)
+{
+	unsigned char *packed = SZ_compress(SZ_FLOAT, data, outSize, 0, 0, 0, 0, *r1);
+	const size_t packedLen = *outSize;
+
+	memcpy(bytes, packed, packedLen);
+	free(packed);
+}
+
+/**
+ * Fortran-callable wrapper: compresses a 1-D float array with
+ * SZ_compress_rev, forwarding `reservedValue` unchanged. Sizes are
+ * forwarded as (0, 0, 0, 0, *r1). The returned buffer is copied into
+ * `bytes` (*outSize bytes, no capacity check is done here) and then freed.
+ */
+void sz_compress_d1_float_rev_(float* data, float *reservedValue, unsigned char *bytes, size_t *outSize, size_t *r1)
+{
+	unsigned char *packed = SZ_compress_rev(SZ_FLOAT, data, reservedValue, outSize, 0, 0, 0, 0, *r1);
+	const size_t packedLen = *outSize;
+
+	memcpy(bytes, packed, packedLen);
+	free(packed);
+}
+
+/**
+ * Fortran-callable wrapper: compresses a 2-D float array with SZ_compress.
+ * Sizes are forwarded as (0, 0, 0, *r2, *r1). The buffer SZ_compress
+ * returns is copied into `bytes` (*outSize bytes, no capacity check is done
+ * here) and then freed.
+ */
+void sz_compress_d2_float_(float* data, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2)
+{
+	unsigned char *packed = SZ_compress(SZ_FLOAT, data, outSize, 0, 0, 0, *r2, *r1);
+	const size_t packedLen = *outSize;
+
+	memcpy(bytes, packed, packedLen);
+	free(packed);
+}
+
+/**
+ * Fortran-callable wrapper: compresses a 2-D float array with
+ * SZ_compress_rev, forwarding `reservedValue` unchanged. Sizes are
+ * forwarded as (0, 0, 0, *r2, *r1). The returned buffer is copied into
+ * `bytes` (*outSize bytes, no capacity check is done here) and then freed.
+ */
+void sz_compress_d2_float_rev_(float* data, float *reservedValue, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2)
+{
+	unsigned char *packed = SZ_compress_rev(SZ_FLOAT, data, reservedValue, outSize, 0, 0, 0, *r2, *r1);
+	const size_t packedLen = *outSize;
+
+	memcpy(bytes, packed, packedLen);
+	free(packed);
+}
+
+/**
+ * Fortran-callable wrapper: compresses a 3-D float array with SZ_compress.
+ * Sizes are forwarded as (0, 0, *r3, *r2, *r1). The buffer SZ_compress
+ * returns is copied into `bytes` (*outSize bytes, no capacity check is done
+ * here) and then freed.
+ */
+void sz_compress_d3_float_(float* data, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2, size_t *r3)
+{
+	unsigned char *packed = SZ_compress(SZ_FLOAT, data, outSize, 0, 0, *r3, *r2, *r1);
+	const size_t packedLen = *outSize;
+
+	memcpy(bytes, packed, packedLen);
+	free(packed);
+}
+
+/**
+ * Fortran-callable wrapper: compresses a 3-D float array with
+ * SZ_compress_rev, forwarding `reservedValue` unchanged. Sizes are
+ * forwarded as (0, 0, *r3, *r2, *r1). The returned buffer is copied into
+ * `bytes` (*outSize bytes, no capacity check is done here) and then freed.
+ */
+void sz_compress_d3_float_rev_(float* data, float *reservedValue, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2, size_t *r3)
+{
+	unsigned char *packed = SZ_compress_rev(SZ_FLOAT, data, reservedValue, outSize, 0, 0, *r3, *r2, *r1);
+	const size_t packedLen = *outSize;
+
+	memcpy(bytes, packed, packedLen);
+	free(packed);
+}
+
+/**
+ * Fortran-callable wrapper: compresses a 4-D float array with SZ_compress.
+ * Sizes are forwarded as (0, *r4, *r3, *r2, *r1). The buffer SZ_compress
+ * returns is copied into `bytes` (*outSize bytes, no capacity check is done
+ * here) and then freed.
+ */
+void sz_compress_d4_float_(float* data, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2, size_t *r3, size_t *r4)
+{
+	unsigned char *packed = SZ_compress(SZ_FLOAT, data, outSize, 0, *r4, *r3, *r2, *r1);
+	const size_t packedLen = *outSize;
+
+	memcpy(bytes, packed, packedLen);
+	free(packed);
+}
+
+/**
+ * Fortran-callable wrapper: compresses a 4-D float array with
+ * SZ_compress_rev, forwarding `reservedValue` unchanged. Sizes are
+ * forwarded as (0, *r4, *r3, *r2, *r1). The returned buffer is copied into
+ * `bytes` (*outSize bytes, no capacity check is done here) and then freed.
+ */
+void sz_compress_d4_float_rev_(float* data, float *reservedValue, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2, size_t *r3, size_t *r4)
+{
+	unsigned char *packed = SZ_compress_rev(SZ_FLOAT, data, reservedValue, outSize, 0, *r4, *r3, *r2, *r1);
+	const size_t packedLen = *outSize;
+
+	memcpy(bytes, packed, packedLen);
+	free(packed);
+}
+
+/**
+ * Fortran-callable wrapper: compresses a 5-D float array with SZ_compress.
+ * Sizes are forwarded as (*r5, *r4, *r3, *r2, *r1). The buffer SZ_compress
+ * returns is copied into `bytes` (*outSize bytes, no capacity check is done
+ * here) and then freed.
+ */
+void sz_compress_d5_float_(float* data, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2, size_t *r3, size_t *r4, size_t *r5)
+{
+	unsigned char *packed = SZ_compress(SZ_FLOAT, data, outSize, *r5, *r4, *r3, *r2, *r1);
+	const size_t packedLen = *outSize;
+
+	memcpy(bytes, packed, packedLen);
+	free(packed);
+}
+
+/**
+ * Fortran-callable wrapper: compresses a 5-D float array with
+ * SZ_compress_rev, forwarding `reservedValue` unchanged. Sizes are
+ * forwarded as (*r5, *r4, *r3, *r2, *r1). The returned buffer is copied
+ * into `bytes` (*outSize bytes, no capacity check is done here) and then
+ * freed.
+ */
+void sz_compress_d5_float_rev_(float* data, float *reservedValue, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2, size_t *r3, size_t *r4, size_t *r5)
+{
+	unsigned char *packed = SZ_compress_rev(SZ_FLOAT, data, reservedValue, outSize, *r5, *r4, *r3, *r2, *r1);
+	const size_t packedLen = *outSize;
+
+	memcpy(bytes, packed, packedLen);
+	free(packed);
+}
+
+/**
+ * Fortran-callable wrapper: compresses a 1-D double array with SZ_compress.
+ * Sizes are forwarded as (0, 0, 0, 0, *r1). The buffer SZ_compress returns
+ * is copied into `bytes` (*outSize bytes, no capacity check is done here)
+ * and then freed.
+ */
+void sz_compress_d1_double_(double* data, unsigned char *bytes, size_t *outSize, size_t *r1)
+{
+	unsigned char *packed = SZ_compress(SZ_DOUBLE, data, outSize, 0, 0, 0, 0, *r1);
+	const size_t packedLen = *outSize;
+
+	memcpy(bytes, packed, packedLen);
+	free(packed);
+}
+
+/**
+ * Fortran-callable wrapper: compresses a 1-D double array with
+ * SZ_compress_rev, forwarding `reservedValue` unchanged. Sizes are
+ * forwarded as (0, 0, 0, 0, *r1). The returned buffer is copied into
+ * `bytes` (*outSize bytes, no capacity check is done here) and then freed.
+ */
+void sz_compress_d1_double_rev_(double* data, double *reservedValue, unsigned char *bytes, size_t *outSize, size_t *r1)
+{
+	unsigned char *packed = SZ_compress_rev(SZ_DOUBLE, data, reservedValue, outSize, 0, 0, 0, 0, *r1);
+	const size_t packedLen = *outSize;
+
+	memcpy(bytes, packed, packedLen);
+	free(packed);
+}
+
+/**
+ * Fortran-callable wrapper: compresses a 2-D double array with SZ_compress.
+ * Sizes are forwarded as (0, 0, 0, *r2, *r1). The buffer SZ_compress
+ * returns is copied into `bytes` (*outSize bytes, no capacity check is done
+ * here) and then freed.
+ */
+void sz_compress_d2_double_(double* data, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2)
+{
+	unsigned char *packed = SZ_compress(SZ_DOUBLE, data, outSize, 0, 0, 0, *r2, *r1);
+	const size_t packedLen = *outSize;
+
+	memcpy(bytes, packed, packedLen);
+	free(packed);
+}
+
+/**
+ * Fortran-callable wrapper: compresses a 2-D double array with
+ * SZ_compress_rev, forwarding `reservedValue` unchanged. Sizes are
+ * forwarded as (0, 0, 0, *r2, *r1). The returned buffer is copied into
+ * `bytes` (*outSize bytes, no capacity check is done here) and then freed.
+ */
+void sz_compress_d2_double_rev_(double* data, double *reservedValue, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2)
+{
+	unsigned char *packed = SZ_compress_rev(SZ_DOUBLE, data, reservedValue, outSize, 0, 0, 0, *r2, *r1);
+	const size_t packedLen = *outSize;
+
+	memcpy(bytes, packed, packedLen);
+	free(packed);
+}
+
+/**
+ * Fortran-callable wrapper: compresses a 3-D double array with SZ_compress.
+ * Sizes are forwarded as (0, 0, *r3, *r2, *r1). The buffer SZ_compress
+ * returns is copied into `bytes` (*outSize bytes, no capacity check is done
+ * here) and then freed.
+ */
+void sz_compress_d3_double_(double* data, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2, size_t *r3)
+{
+	unsigned char *packed = SZ_compress(SZ_DOUBLE, data, outSize, 0, 0, *r3, *r2, *r1);
+	const size_t packedLen = *outSize;
+
+	memcpy(bytes, packed, packedLen);
+	free(packed);
+}
+
+/**
+ * Fortran-callable wrapper: compresses a 3-D double array with
+ * SZ_compress_rev, forwarding `reservedValue` unchanged. Sizes are
+ * forwarded as (0, 0, *r3, *r2, *r1). The returned buffer is copied into
+ * `bytes` (*outSize bytes, no capacity check is done here) and then freed.
+ */
+void sz_compress_d3_double_rev_(double* data, double *reservedValue, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2, size_t *r3)
+{
+	unsigned char *packed = SZ_compress_rev(SZ_DOUBLE, data, reservedValue, outSize, 0, 0, *r3, *r2, *r1);
+	const size_t packedLen = *outSize;
+
+	memcpy(bytes, packed, packedLen);
+	free(packed);
+}
+
+/**
+ * Fortran-callable wrapper: compresses a 4-D double array with SZ_compress.
+ * Sizes are forwarded as (0, *r4, *r3, *r2, *r1). The buffer SZ_compress
+ * returns is copied into `bytes` (*outSize bytes, no capacity check is done
+ * here) and then freed.
+ */
+void sz_compress_d4_double_(double* data, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2, size_t *r3, size_t *r4)
+{
+	unsigned char *packed = SZ_compress(SZ_DOUBLE, data, outSize, 0, *r4, *r3, *r2, *r1);
+	const size_t packedLen = *outSize;
+
+	memcpy(bytes, packed, packedLen);
+	free(packed);
+}
+
+/**
+ * Fortran-callable wrapper: compresses a 4-D double array with
+ * SZ_compress_rev, forwarding `reservedValue` unchanged. Sizes are
+ * forwarded as (0, *r4, *r3, *r2, *r1). The returned buffer is copied into
+ * `bytes` (*outSize bytes, no capacity check is done here) and then freed.
+ */
+void sz_compress_d4_double_rev_(double* data, double *reservedValue, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2, size_t *r3, size_t *r4)
+{
+	unsigned char *packed = SZ_compress_rev(SZ_DOUBLE, data, reservedValue, outSize, 0, *r4, *r3, *r2, *r1);
+	const size_t packedLen = *outSize;
+
+	memcpy(bytes, packed, packedLen);
+	free(packed);
+}
+
+/**
+ * Fortran-callable wrapper: compresses a 5-D double array with SZ_compress.
+ * Sizes are forwarded as (*r5, *r4, *r3, *r2, *r1). The buffer SZ_compress
+ * returns is copied into `bytes` (*outSize bytes, no capacity check is done
+ * here) and then freed.
+ */
+void sz_compress_d5_double_(double* data, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2, size_t *r3, size_t *r4, size_t *r5)
+{
+	unsigned char *packed = SZ_compress(SZ_DOUBLE, data, outSize, *r5, *r4, *r3, *r2, *r1);
+	const size_t packedLen = *outSize;
+
+	memcpy(bytes, packed, packedLen);
+	free(packed);
+}
+
+/**
+ * Fortran-callable wrapper: compresses a 5-D double array with
+ * SZ_compress_rev, forwarding `reservedValue` unchanged. Sizes are
+ * forwarded as (*r5, *r4, *r3, *r2, *r1). The returned buffer is copied
+ * into `bytes` (*outSize bytes, no capacity check is done here) and then
+ * freed.
+ */
+void sz_compress_d5_double_rev_(double* data, double *reservedValue, unsigned char *bytes, size_t *outSize, size_t *r1, size_t *r2, size_t *r3, size_t *r4, size_t *r5)
+{
+	unsigned char *packed = SZ_compress_rev(SZ_DOUBLE, data, reservedValue, outSize, *r5, *r4, *r3, *r2, *r1);
+	const size_t packedLen = *outSize;
+
+	memcpy(bytes, packed, packedLen);
+	free(packed);
+}
+
+//compress with args
+
+/**
+ * Fortran-callable wrapper: compresses a 1-D float array with
+ * SZ_compress_args using the caller's *errBoundMode, *absErrBound and
+ * *relBoundRatio. The two arguments that follow are fixed at 0.1 and 1;
+ * sizes are forwarded as (0, 0, 0, 0, *r1). The returned buffer is copied
+ * into `bytes` (*outSize bytes, no capacity check is done here) and freed.
+ */
+void sz_compress_d1_float_args_(float* data, unsigned char *bytes, size_t *outSize, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1)
+{
+	unsigned char *packed = SZ_compress_args(SZ_FLOAT, data, outSize, *errBoundMode, *absErrBound, *relBoundRatio, 0.1, 1, 0, 0, 0, 0, *r1);
+	const size_t packedLen = *outSize;
+
+	memcpy(bytes, packed, packedLen);
+	free(packed);
+}
+
+/**
+ * Fortran-callable wrapper: compresses a 2-D float array with
+ * SZ_compress_args using the caller's *errBoundMode, *absErrBound and
+ * *relBoundRatio. The two arguments that follow are fixed at 0.1 and 1;
+ * sizes are forwarded as (0, 0, 0, *r2, *r1). The returned buffer is copied
+ * into `bytes` (*outSize bytes, no capacity check is done here) and freed.
+ */
+void sz_compress_d2_float_args_(float* data, unsigned char *bytes, size_t *outSize, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1, size_t *r2)
+{
+	unsigned char *packed = SZ_compress_args(SZ_FLOAT, data, outSize, *errBoundMode, *absErrBound, *relBoundRatio, 0.1, 1, 0, 0, 0, *r2, *r1);
+	const size_t packedLen = *outSize;
+
+	memcpy(bytes, packed, packedLen);
+	free(packed);
+}
+
+/**
+ * Fortran-callable wrapper: compresses a 3-D float array with
+ * SZ_compress_args using the caller's *errBoundMode, *absErrBound and
+ * *relBoundRatio. The two arguments that follow are fixed at 0.1 and 1;
+ * sizes are forwarded as (0, 0, *r3, *r2, *r1). The returned buffer is
+ * copied into `bytes` (*outSize bytes, no capacity check is done here) and
+ * freed.
+ */
+void sz_compress_d3_float_args_(float* data, unsigned char *bytes, size_t *outSize, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1, size_t *r2, size_t *r3)
+{
+	unsigned char *packed = SZ_compress_args(SZ_FLOAT, data, outSize, *errBoundMode, *absErrBound, *relBoundRatio, 0.1, 1, 0, 0, *r3, *r2, *r1);
+	const size_t packedLen = *outSize;
+
+	memcpy(bytes, packed, packedLen);
+	free(packed);
+}
+
+/**
+ * Fortran-callable wrapper: compresses a 4-D float array with
+ * SZ_compress_args using the caller's *errBoundMode, *absErrBound and
+ * *relBoundRatio. The two arguments that follow are fixed at 0.1 and 1;
+ * sizes are forwarded as (0, *r4, *r3, *r2, *r1). The returned buffer is
+ * copied into `bytes` (*outSize bytes, no capacity check is done here) and
+ * freed.
+ */
+void sz_compress_d4_float_args_(float* data, unsigned char *bytes, size_t *outSize, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1, size_t *r2, size_t *r3, size_t *r4)
+{
+	unsigned char *packed = SZ_compress_args(SZ_FLOAT, data, outSize, *errBoundMode, *absErrBound, *relBoundRatio, 0.1, 1, 0, *r4, *r3, *r2, *r1);
+	const size_t packedLen = *outSize;
+
+	memcpy(bytes, packed, packedLen);
+	free(packed);
+}
+
+/**
+ * Fortran-callable wrapper: compresses a 5-D float array with
+ * SZ_compress_args using the caller's *errBoundMode, *absErrBound and
+ * *relBoundRatio. The two arguments that follow are fixed at 0.1 and 1;
+ * sizes are forwarded as (*r5, *r4, *r3, *r2, *r1). The returned buffer is
+ * copied into `bytes` (*outSize bytes, no capacity check is done here) and
+ * freed.
+ */
+void sz_compress_d5_float_args_(float* data, unsigned char *bytes, size_t *outSize, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1, size_t *r2, size_t *r3, size_t *r4, size_t *r5)
+{
+	unsigned char *packed = SZ_compress_args(SZ_FLOAT, data, outSize, *errBoundMode, *absErrBound, *relBoundRatio, 0.1, 1, *r5, *r4, *r3, *r2, *r1);
+	const size_t packedLen = *outSize;
+
+	memcpy(bytes, packed, packedLen);
+	free(packed);
+}
+
+/**
+ * Fortran-callable wrapper: compresses a 1-D double array with
+ * SZ_compress_args using the caller's *errBoundMode, *absErrBound and
+ * *relBoundRatio. The two arguments that follow are fixed at 0.1 and 1;
+ * sizes are forwarded as (0, 0, 0, 0, *r1). The returned buffer is copied
+ * into `bytes` (*outSize bytes, no capacity check is done here) and freed.
+ */
+void sz_compress_d1_double_args_(double* data, unsigned char *bytes, size_t *outSize, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1)
+{
+	unsigned char *packed = SZ_compress_args(SZ_DOUBLE, data, outSize, *errBoundMode, *absErrBound, *relBoundRatio, 0.1, 1, 0, 0, 0, 0, *r1);
+	const size_t packedLen = *outSize;
+
+	memcpy(bytes, packed, packedLen);
+	free(packed);
+}
+
+/**
+ * Fortran-callable wrapper: compresses a 2-D double array with
+ * SZ_compress_args using the caller's *errBoundMode, *absErrBound and
+ * *relBoundRatio. The two arguments that follow are fixed at 0.1 and 1;
+ * sizes are forwarded as (0, 0, 0, *r2, *r1). The returned buffer is copied
+ * into `bytes` (*outSize bytes, no capacity check is done here) and freed.
+ */
+void sz_compress_d2_double_args_(double* data, unsigned char *bytes, size_t *outSize, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1, size_t *r2)
+{
+	unsigned char *packed = SZ_compress_args(SZ_DOUBLE, data, outSize, *errBoundMode, *absErrBound, *relBoundRatio, 0.1, 1, 0, 0, 0, *r2, *r1);
+	const size_t packedLen = *outSize;
+
+	memcpy(bytes, packed, packedLen);
+	free(packed);
+}
+
+/**
+ * Fortran-callable wrapper: compresses a 3-D double array with
+ * SZ_compress_args using the caller's *errBoundMode, *absErrBound and
+ * *relBoundRatio. The two arguments that follow are fixed at 0.1 and 1;
+ * sizes are forwarded as (0, 0, *r3, *r2, *r1). The returned buffer is
+ * copied into `bytes` (*outSize bytes, no capacity check is done here) and
+ * freed.
+ */
+void sz_compress_d3_double_args_(double* data, unsigned char *bytes, size_t *outSize, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1, size_t *r2, size_t *r3)
+{
+	unsigned char *packed = SZ_compress_args(SZ_DOUBLE, data, outSize, *errBoundMode, *absErrBound, *relBoundRatio, 0.1, 1, 0, 0, *r3, *r2, *r1);
+	const size_t packedLen = *outSize;
+
+	memcpy(bytes, packed, packedLen);
+	free(packed);
+}
+
+/**
+ * Fortran-callable wrapper: compresses a 4-D double array with
+ * SZ_compress_args using the caller's *errBoundMode, *absErrBound and
+ * *relBoundRatio. The two arguments that follow are fixed at 0.1 and 1;
+ * sizes are forwarded as (0, *r4, *r3, *r2, *r1). The returned buffer is
+ * copied into `bytes` (*outSize bytes, no capacity check is done here) and
+ * freed.
+ */
+void sz_compress_d4_double_args_(double* data, unsigned char *bytes, size_t *outSize, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1, size_t *r2, size_t *r3, size_t *r4)
+{
+	unsigned char *packed = SZ_compress_args(SZ_DOUBLE, data, outSize, *errBoundMode, *absErrBound, *relBoundRatio, 0.1, 1, 0, *r4, *r3, *r2, *r1);
+	const size_t packedLen = *outSize;
+
+	memcpy(bytes, packed, packedLen);
+	free(packed);
+}
+
+/**
+ * Fortran-callable wrapper: compresses a 5-D double array with
+ * SZ_compress_args using the caller's *errBoundMode, *absErrBound and
+ * *relBoundRatio. The two arguments that follow are fixed at 0.1 and 1;
+ * sizes are forwarded as (*r5, *r4, *r3, *r2, *r1). The returned buffer is
+ * copied into `bytes` (*outSize bytes, no capacity check is done here) and
+ * freed.
+ */
+void sz_compress_d5_double_args_(double* data, unsigned char *bytes, size_t *outSize, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1, size_t *r2, size_t *r3, size_t *r4, size_t *r5)
+{
+	unsigned char *packed = SZ_compress_args(SZ_DOUBLE, data, outSize, *errBoundMode, *absErrBound, *relBoundRatio, 0.1, 1, *r5, *r4, *r3, *r2, *r1);
+	const size_t packedLen = *outSize;
+
+	memcpy(bytes, packed, packedLen);
+	free(packed);
+}
+
+//--------------
+
+/**
+ * Fortran-callable wrapper: compresses a 1-D float array with
+ * SZ_compress_rev_args, forwarding `reservedValue` and the caller's
+ * *errBoundMode, *absErrBound and *relBoundRatio. Sizes are forwarded as
+ * (0, 0, 0, 0, *r1). The returned buffer is copied into `bytes` (*outSize
+ * bytes, no capacity check is done here) and then freed.
+ */
+void sz_compress_d1_float_rev_args_(float* data, float *reservedValue, unsigned char *bytes, size_t *outSize, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1)
+{
+	unsigned char *packed = SZ_compress_rev_args(SZ_FLOAT, data, reservedValue, outSize, *errBoundMode, *absErrBound, *relBoundRatio, 0, 0, 0, 0, *r1);
+	const size_t packedLen = *outSize;
+
+	memcpy(bytes, packed, packedLen);
+	free(packed);
+}
+
+/**
+ * Fortran-callable wrapper: compresses a 2-D float array with
+ * SZ_compress_rev_args, forwarding `reservedValue` and the caller's
+ * *errBoundMode, *absErrBound and *relBoundRatio. Sizes are forwarded as
+ * (0, 0, 0, *r2, *r1). The returned buffer is copied into `bytes` (*outSize
+ * bytes, no capacity check is done here) and then freed.
+ */
+void sz_compress_d2_float_rev_args_(float* data, float *reservedValue, unsigned char *bytes, size_t *outSize, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1, size_t *r2)
+{
+	unsigned char *packed = SZ_compress_rev_args(SZ_FLOAT, data, reservedValue, outSize, *errBoundMode, *absErrBound, *relBoundRatio, 0, 0, 0, *r2, *r1);
+	const size_t packedLen = *outSize;
+
+	memcpy(bytes, packed, packedLen);
+	free(packed);
+}
+
+/**
+ * Fortran-callable wrapper: compresses a 3-D float array with
+ * SZ_compress_rev_args, forwarding `reservedValue` and the caller's
+ * *errBoundMode, *absErrBound and *relBoundRatio. Sizes are forwarded as
+ * (0, 0, *r3, *r2, *r1). The returned buffer is copied into `bytes`
+ * (*outSize bytes, no capacity check is done here) and then freed.
+ */
+void sz_compress_d3_float_rev_args_(float* data, float *reservedValue, unsigned char *bytes, size_t *outSize, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1, size_t *r2, size_t *r3)
+{
+	unsigned char *packed = SZ_compress_rev_args(SZ_FLOAT, data, reservedValue, outSize, *errBoundMode, *absErrBound, *relBoundRatio, 0, 0, *r3, *r2, *r1);
+	const size_t packedLen = *outSize;
+
+	memcpy(bytes, packed, packedLen);
+	free(packed);
+}
+
+/**
+ * Fortran-callable wrapper: compresses a 4-D float array with
+ * SZ_compress_rev_args, forwarding `reservedValue` and the caller's
+ * *errBoundMode, *absErrBound and *relBoundRatio. Sizes are forwarded as
+ * (0, *r4, *r3, *r2, *r1). The returned buffer is copied into `bytes`
+ * (*outSize bytes, no capacity check is done here) and then freed.
+ */
+void sz_compress_d4_float_rev_args_(float* data, float *reservedValue, unsigned char *bytes, size_t *outSize, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1, size_t *r2, size_t *r3, size_t *r4)
+{
+	unsigned char *packed = SZ_compress_rev_args(SZ_FLOAT, data, reservedValue, outSize, *errBoundMode, *absErrBound, *relBoundRatio, 0, *r4, *r3, *r2, *r1);
+	const size_t packedLen = *outSize;
+
+	memcpy(bytes, packed, packedLen);
+	free(packed);
+}
+
+/**
+ * Fortran-callable wrapper: compresses a 5-D float array with
+ * SZ_compress_rev_args, forwarding `reservedValue` and the caller's
+ * *errBoundMode, *absErrBound and *relBoundRatio. Sizes are forwarded as
+ * (*r5, *r4, *r3, *r2, *r1). The returned buffer is copied into `bytes`
+ * (*outSize bytes, no capacity check is done here) and then freed.
+ */
+void sz_compress_d5_float_rev_args_(float* data, float *reservedValue, unsigned char *bytes, size_t *outSize, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1, size_t *r2, size_t *r3, size_t *r4, size_t *r5)
+{
+	unsigned char *packed = SZ_compress_rev_args(SZ_FLOAT, data, reservedValue, outSize, *errBoundMode, *absErrBound, *relBoundRatio, *r5, *r4, *r3, *r2, *r1);
+	const size_t packedLen = *outSize;
+
+	memcpy(bytes, packed, packedLen);
+	free(packed);
+}
+
+/**
+ * Fortran-callable wrapper: compresses a 1-D double array with
+ * SZ_compress_rev_args, forwarding `reservedValue` and the caller's
+ * *errBoundMode, *absErrBound and *relBoundRatio. Sizes are forwarded as
+ * (0, 0, 0, 0, *r1). The returned buffer is copied into `bytes` (*outSize
+ * bytes, no capacity check is done here) and then freed.
+ *
+ * NOTE(review): `reservedValue` is declared float* although the data type
+ * passed is SZ_DOUBLE (the d4/d5 double variants take double*) - confirm
+ * which width SZ_compress_rev_args reads.
+ */
+void sz_compress_d1_double_rev_args_(double* data, float *reservedValue, unsigned char *bytes, size_t *outSize, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1)
+{
+	unsigned char *packed = SZ_compress_rev_args(SZ_DOUBLE, data, reservedValue, outSize, *errBoundMode, *absErrBound, *relBoundRatio, 0, 0, 0, 0, *r1);
+	const size_t packedLen = *outSize;
+
+	memcpy(bytes, packed, packedLen);
+	free(packed);
+}
+
+/**
+ * Fortran-callable wrapper: compresses a 2-D double array with
+ * SZ_compress_rev_args, forwarding `reservedValue` and the caller's
+ * *errBoundMode, *absErrBound and *relBoundRatio. Sizes are forwarded as
+ * (0, 0, 0, *r2, *r1). The returned buffer is copied into `bytes` (*outSize
+ * bytes, no capacity check is done here) and then freed.
+ *
+ * NOTE(review): `reservedValue` is declared float* although the data type
+ * passed is SZ_DOUBLE (the d4/d5 double variants take double*) - confirm
+ * which width SZ_compress_rev_args reads.
+ */
+void sz_compress_d2_double_rev_args_(double* data, float *reservedValue, unsigned char *bytes, size_t *outSize, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1, size_t *r2)
+{
+	unsigned char *packed = SZ_compress_rev_args(SZ_DOUBLE, data, reservedValue, outSize, *errBoundMode, *absErrBound, *relBoundRatio, 0, 0, 0, *r2, *r1);
+	const size_t packedLen = *outSize;
+
+	memcpy(bytes, packed, packedLen);
+	free(packed);
+}
+
+/**
+ * Fortran-callable wrapper: compresses a 3-D double array with
+ * SZ_compress_rev_args, forwarding `reservedValue` and the caller's
+ * *errBoundMode, *absErrBound and *relBoundRatio. Sizes are forwarded as
+ * (0, 0, *r3, *r2, *r1). The returned buffer is copied into `bytes`
+ * (*outSize bytes, no capacity check is done here) and then freed.
+ *
+ * NOTE(review): `reservedValue` is declared float* although the data type
+ * passed is SZ_DOUBLE (the d4/d5 double variants take double*) - confirm
+ * which width SZ_compress_rev_args reads.
+ */
+void sz_compress_d3_double_rev_args_(double* data, float *reservedValue, unsigned char *bytes, size_t *outSize, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1, size_t *r2, size_t *r3)
+{
+	unsigned char *tmp_bytes = SZ_compress_rev_args(SZ_DOUBLE, data, reservedValue, outSize, *errBoundMode, *absErrBound, *relBoundRatio, 0, 0, *r3, *r2, *r1);
+	memcpy(bytes, tmp_bytes, *outSize);
+	/* Release the temporary stream like every sibling wrapper does; it was leaked before. */
+	free(tmp_bytes);
+}
+
+/**
+ * Fortran-callable wrapper: compresses a 4-D double array with
+ * SZ_compress_rev_args, forwarding `reservedValue` and the caller's
+ * *errBoundMode, *absErrBound and *relBoundRatio. Sizes are forwarded as
+ * (0, *r4, *r3, *r2, *r1). The returned buffer is copied into `bytes`
+ * (*outSize bytes, no capacity check is done here) and then freed.
+ */
+void sz_compress_d4_double_rev_args_(double* data, double *reservedValue, unsigned char *bytes, size_t *outSize, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1, size_t *r2, size_t *r3, size_t *r4)
+{
+	unsigned char *packed = SZ_compress_rev_args(SZ_DOUBLE, data, reservedValue, outSize, *errBoundMode, *absErrBound, *relBoundRatio, 0, *r4, *r3, *r2, *r1);
+	const size_t packedLen = *outSize;
+
+	memcpy(bytes, packed, packedLen);
+	free(packed);
+}
+
+/**
+ * Fortran-callable wrapper: compresses a 5-D double array with
+ * SZ_compress_rev_args, forwarding `reservedValue` and the caller's
+ * *errBoundMode, *absErrBound and *relBoundRatio. Sizes are forwarded as
+ * (*r5, *r4, *r3, *r2, *r1). The returned buffer is copied into `bytes`
+ * (*outSize bytes, no capacity check is done here) and then freed.
+ */
+void sz_compress_d5_double_rev_args_(double* data, double *reservedValue, unsigned char *bytes, size_t *outSize, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1, size_t *r2, size_t *r3, size_t *r4, size_t *r5)
+{
+	unsigned char *packed = SZ_compress_rev_args(SZ_DOUBLE, data, reservedValue, outSize, *errBoundMode, *absErrBound, *relBoundRatio, *r5, *r4, *r3, *r2, *r1);
+	const size_t packedLen = *outSize;
+
+	memcpy(bytes, packed, packedLen);
+	free(packed);
+}
+
+//decompress
+
+/**
+ * Fortran-callable wrapper: decompresses *byteLength bytes of `bytes` with
+ * SZ_decompress as a 1-D float array, sizes forwarded as (0, 0, 0, 0, *r1).
+ * *r1 floats are copied into the caller-provided `data`, then the buffer
+ * SZ_decompress returned is freed.
+ */
+void sz_decompress_d1_float_(unsigned char *bytes, size_t *byteLength, float *data, size_t *r1)
+{
+	float *decoded = SZ_decompress(SZ_FLOAT, bytes, *byteLength, 0, 0, 0, 0, *r1);
+	const size_t count = *r1;
+
+	memcpy(data, decoded, count*sizeof(float));
+	free(decoded);
+}
+
+/**
+ * Fortran-callable wrapper: decompresses *byteLength bytes of `bytes` with
+ * SZ_decompress as a 2-D float array, sizes forwarded as
+ * (0, 0, 0, *r2, *r1). (*r1)*(*r2) floats are copied into the
+ * caller-provided `data`, then the buffer SZ_decompress returned is freed.
+ */
+void sz_decompress_d2_float_(unsigned char *bytes, size_t *byteLength, float *data, size_t *r1, size_t *r2)
+{
+	float *decoded = SZ_decompress(SZ_FLOAT, bytes, *byteLength, 0, 0, 0, *r2, *r1);
+	const size_t count = (*r1)*(*r2);
+
+	memcpy(data, decoded, count*sizeof(float));
+	free(decoded);
+}
+
+/**
+ * Fortran-callable wrapper: decompresses *byteLength bytes of `bytes` with
+ * SZ_decompress as a 3-D float array, sizes forwarded as
+ * (0, 0, *r3, *r2, *r1). (*r1)*(*r2)*(*r3) floats are copied into the
+ * caller-provided `data`, then the buffer SZ_decompress returned is freed.
+ */
+void sz_decompress_d3_float_(unsigned char *bytes, size_t *byteLength, float *data, size_t *r1, size_t *r2, size_t *r3)
+{
+	float *decoded = SZ_decompress(SZ_FLOAT, bytes, *byteLength, 0, 0, *r3, *r2, *r1);
+	const size_t count = (*r1)*(*r2)*(*r3);
+
+	memcpy(data, decoded, count*sizeof(float));
+	free(decoded);
+}
+
+/**
+ * Fortran-callable wrapper: decompresses *byteLength bytes of `bytes` with
+ * SZ_decompress as a 4-D float array, sizes forwarded as
+ * (0, *r4, *r3, *r2, *r1). (*r1)*(*r2)*(*r3)*(*r4) floats are copied into
+ * the caller-provided `data`, then the buffer SZ_decompress returned is
+ * freed.
+ */
+void sz_decompress_d4_float_(unsigned char *bytes, size_t *byteLength, float *data, size_t *r1, size_t *r2, size_t *r3, size_t *r4)
+{
+	float *decoded = SZ_decompress(SZ_FLOAT, bytes, *byteLength, 0, *r4, *r3, *r2, *r1);
+	const size_t count = (*r1)*(*r2)*(*r3)*(*r4);
+
+	memcpy(data, decoded, count*sizeof(float));
+	free(decoded);
+}
+
+/**
+ * Fortran-callable wrapper: decompresses *byteLength bytes of `bytes` with
+ * SZ_decompress as a 5-D float array, sizes forwarded as
+ * (*r5, *r4, *r3, *r2, *r1). (*r1)*(*r2)*(*r3)*(*r4)*(*r5) floats are
+ * copied into the caller-provided `data`, then the buffer SZ_decompress
+ * returned is freed.
+ */
+void sz_decompress_d5_float_(unsigned char *bytes, size_t *byteLength, float *data, size_t *r1, size_t *r2, size_t *r3, size_t *r4, size_t *r5)
+{
+	float *decoded = SZ_decompress(SZ_FLOAT, bytes, *byteLength, *r5, *r4, *r3, *r2, *r1);
+	const size_t count = (*r1)*(*r2)*(*r3)*(*r4)*(*r5);
+
+	memcpy(data, decoded, count*sizeof(float));
+	free(decoded);
+}
+
+/**
+ * Fortran-callable wrapper: decompresses *byteLength bytes of `bytes` with
+ * SZ_decompress as a 1-D double array, sizes forwarded as
+ * (0, 0, 0, 0, *r1). *r1 doubles are copied into the caller-provided
+ * `data`, then the buffer SZ_decompress returned is freed.
+ */
+void sz_decompress_d1_double_(unsigned char *bytes, size_t *byteLength, double *data, size_t *r1)
+{
+	double *decoded = SZ_decompress(SZ_DOUBLE, bytes, *byteLength, 0, 0, 0, 0, *r1);
+	const size_t count = *r1;
+
+	memcpy(data, decoded, count*sizeof(double));
+	free(decoded);
+}
+
+/**
+ * Fortran-callable wrapper: decompresses *byteLength bytes of `bytes` with
+ * SZ_decompress as a 2-D double array, sizes forwarded as
+ * (0, 0, 0, *r2, *r1). (*r1)*(*r2) doubles are copied into the
+ * caller-provided `data`, then the buffer SZ_decompress returned is freed.
+ */
+void sz_decompress_d2_double_(unsigned char *bytes, size_t *byteLength, double *data, size_t *r1, size_t *r2)
+{
+	double *decoded = SZ_decompress(SZ_DOUBLE, bytes, *byteLength, 0, 0, 0, *r2, *r1);
+	const size_t count = (*r1)*(*r2);
+
+	memcpy(data, decoded, count*sizeof(double));
+	free(decoded);
+}
+
+/**
+ * Fortran-callable wrapper: decompresses *byteLength bytes of `bytes` with
+ * SZ_decompress as a 3-D double array, sizes forwarded as
+ * (0, 0, *r3, *r2, *r1). (*r1)*(*r2)*(*r3) doubles are copied into the
+ * caller-provided `data`, then the buffer SZ_decompress returned is freed.
+ */
+void sz_decompress_d3_double_(unsigned char *bytes, size_t *byteLength, double *data, size_t *r1, size_t *r2, size_t *r3)
+{
+	double *decoded = SZ_decompress(SZ_DOUBLE, bytes, *byteLength, 0, 0, *r3, *r2, *r1);
+	const size_t count = (*r1)*(*r2)*(*r3);
+
+	memcpy(data, decoded, count*sizeof(double));
+	free(decoded);
+}
+
+/**
+ * Fortran-callable wrapper: decompresses *byteLength bytes of `bytes` with
+ * SZ_decompress as a 4-D double array, sizes forwarded as
+ * (0, *r4, *r3, *r2, *r1). (*r1)*(*r2)*(*r3)*(*r4) doubles are copied into
+ * the caller-provided `data`, then the buffer SZ_decompress returned is
+ * freed.
+ */
+void sz_decompress_d4_double_(unsigned char *bytes, size_t *byteLength, double *data, size_t *r1, size_t *r2, size_t *r3, size_t *r4)
+{
+	double *decoded = SZ_decompress(SZ_DOUBLE, bytes, *byteLength, 0, *r4, *r3, *r2, *r1);
+	const size_t count = (*r1)*(*r2)*(*r3)*(*r4);
+
+	memcpy(data, decoded, count*sizeof(double));
+	free(decoded);
+}
+
+/**
+ * Fortran-callable wrapper: decompresses *byteLength bytes of `bytes` with
+ * SZ_decompress as a 5-D double array, sizes forwarded as
+ * (*r5, *r4, *r3, *r2, *r1). (*r1)*(*r2)*(*r3)*(*r4)*(*r5) doubles are
+ * copied into the caller-provided `data`, then the buffer SZ_decompress
+ * returned is freed.
+ */
+void sz_decompress_d5_double_(unsigned char *bytes, size_t *byteLength, double *data, size_t *r1, size_t *r2, size_t *r3, size_t *r4, size_t *r5)
+{
+	double *decoded = SZ_decompress(SZ_DOUBLE, bytes, *byteLength, *r5, *r4, *r3, *r2, *r1);
+	const size_t count = (*r1)*(*r2)*(*r3)*(*r4)*(*r5);
+
+	memcpy(data, decoded, count*sizeof(double));
+	free(decoded);
+}
+
+//-----------------TODO: batch mode-----------
+/**
+ * Fortran-callable wrapper: hands a 1-D float array to SZ_batchAddVar.
+ * The first *len characters of `varName` are copied into a NUL-terminated
+ * stack buffer that is used as the variable name; the error-bound settings
+ * are passed by value and sizes are forwarded as (0, 0, 0, 0, *r1).
+ */
+void sz_batchaddvar_d1_float_(char* varName, int *len, float* data, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1)
+{
+	char name[*len+1];
+
+	memcpy(name, varName, (size_t)*len);
+	name[*len] = '\0';
+	SZ_batchAddVar(name, SZ_FLOAT, data, *errBoundMode, *absErrBound, *relBoundRatio, 0, 0, 0, 0, *r1);
+}
+/**
+ * Fortran-callable wrapper: hands a 2-D float array to SZ_batchAddVar.
+ * The first *len characters of `varName` are copied into a NUL-terminated
+ * stack buffer that is used as the variable name; the error-bound settings
+ * are passed by value and sizes are forwarded as (0, 0, 0, *r2, *r1).
+ */
+void sz_batchaddvar_d2_float_(char* varName, int *len, float* data, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1, size_t *r2)
+{
+	char name[*len+1];
+
+	memcpy(name, varName, (size_t)*len);
+	name[*len] = '\0';
+	SZ_batchAddVar(name, SZ_FLOAT, data, *errBoundMode, *absErrBound, *relBoundRatio, 0, 0, 0, *r2, *r1);
+}
+/**
+ * Fortran-callable wrapper: hands a 3-D float array to SZ_batchAddVar.
+ * The first *len characters of `varName` are copied into a NUL-terminated
+ * stack buffer that is used as the variable name; the error-bound settings
+ * are passed by value and sizes are forwarded as (0, 0, *r3, *r2, *r1).
+ */
+void sz_batchaddvar_d3_float_(char* varName, int *len, float* data, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1, size_t *r2, size_t *r3)
+{
+	char name[*len+1];
+
+	memcpy(name, varName, (size_t)*len);
+	name[*len] = '\0';
+	SZ_batchAddVar(name, SZ_FLOAT, data, *errBoundMode, *absErrBound, *relBoundRatio, 0, 0, *r3, *r2, *r1);
+}
+/**
+ * Fortran-callable wrapper: hands a 4-D float array to SZ_batchAddVar.
+ * The first *len characters of `varName` are copied into a NUL-terminated
+ * stack buffer that is used as the variable name; the error-bound settings
+ * are passed by value and sizes are forwarded as (0, *r4, *r3, *r2, *r1).
+ */
+void sz_batchaddvar_d4_float_(char* varName, int *len, float* data, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1, size_t *r2, size_t *r3, size_t *r4)
+{
+	char name[*len+1];
+
+	memcpy(name, varName, (size_t)*len);
+	name[*len] = '\0';
+	SZ_batchAddVar(name, SZ_FLOAT, data, *errBoundMode, *absErrBound, *relBoundRatio, 0, *r4, *r3, *r2, *r1);
+}
+/**
+ * Fortran-callable wrapper: hands a 5-D float array to SZ_batchAddVar.
+ * The first *len characters of `varName` are copied into a NUL-terminated
+ * stack buffer that is used as the variable name; the error-bound settings
+ * are passed by value and sizes are forwarded as
+ * (*r5, *r4, *r3, *r2, *r1).
+ */
+void sz_batchaddvar_d5_float_(char* varName, int *len, float* data, int *errBoundMode, float *absErrBound, float *relBoundRatio, size_t *r1, size_t *r2, size_t *r3, size_t *r4, size_t *r5)
+{
+	char name[*len+1];
+
+	memcpy(name, varName, (size_t)*len);
+	name[*len] = '\0';
+	SZ_batchAddVar(name, SZ_FLOAT, data, *errBoundMode, *absErrBound, *relBoundRatio, *r5, *r4, *r3, *r2, *r1);
+}
+/**
+ * Fortran-callable wrapper: hands a 1-D double array to SZ_batchAddVar.
+ * The first *len characters of `varName` are copied into a NUL-terminated
+ * stack buffer that is used as the variable name; the error-bound settings
+ * are passed by value and sizes are forwarded as (0, 0, 0, 0, *r1).
+ */
+void sz_batchaddvar_d1_double_(char* varName, int *len, double* data, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1)
+{
+	char name[*len+1];
+
+	memcpy(name, varName, (size_t)*len);
+	name[*len] = '\0';
+	SZ_batchAddVar(name, SZ_DOUBLE, data, *errBoundMode, *absErrBound, *relBoundRatio, 0, 0, 0, 0, *r1);
+}
+/**
+ * Fortran-callable wrapper: hands a 2-D double array to SZ_batchAddVar.
+ * The first *len characters of `varName` are copied into a NUL-terminated
+ * stack buffer that is used as the variable name; the error-bound settings
+ * are passed by value and sizes are forwarded as (0, 0, 0, *r2, *r1).
+ */
+void sz_batchaddvar_d2_double_(char* varName, int *len, double* data, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1, size_t *r2)
+{
+	char name[*len+1];
+
+	memcpy(name, varName, (size_t)*len);
+	name[*len] = '\0';
+	SZ_batchAddVar(name, SZ_DOUBLE, data, *errBoundMode, *absErrBound, *relBoundRatio, 0, 0, 0, *r2, *r1);
+}
+/**
+ * Fortran-callable wrapper: hands a 3-D double array to SZ_batchAddVar.
+ * The first *len characters of `varName` are copied into a NUL-terminated
+ * stack buffer that is used as the variable name; the error-bound settings
+ * are passed by value and sizes are forwarded as (0, 0, *r3, *r2, *r1).
+ */
+void sz_batchaddvar_d3_double_(char* varName, int *len, double* data, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1, size_t *r2, size_t *r3)
+{
+	char name[*len+1];
+
+	memcpy(name, varName, (size_t)*len);
+	name[*len] = '\0';
+	SZ_batchAddVar(name, SZ_DOUBLE, data, *errBoundMode, *absErrBound, *relBoundRatio, 0, 0, *r3, *r2, *r1);
+}
+/**
+ * Fortran-callable wrapper: hands a 4-D double array to SZ_batchAddVar.
+ * The first *len characters of `varName` are copied into a NUL-terminated
+ * stack buffer that is used as the variable name; the error-bound settings
+ * are passed by value and sizes are forwarded as (0, *r4, *r3, *r2, *r1).
+ */
+void sz_batchaddvar_d4_double_(char* varName, int *len, double* data, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1, size_t *r2, size_t *r3, size_t *r4)
+{
+	char name[*len+1];
+
+	memcpy(name, varName, (size_t)*len);
+	name[*len] = '\0';
+	SZ_batchAddVar(name, SZ_DOUBLE, data, *errBoundMode, *absErrBound, *relBoundRatio, 0, *r4, *r3, *r2, *r1);
+}
+/**
+ * Fortran-callable wrapper: hands a 5-D double array to SZ_batchAddVar.
+ * The first *len characters of `varName` are copied into a NUL-terminated
+ * stack buffer that is used as the variable name; the error-bound settings
+ * are passed by value and sizes are forwarded as
+ * (*r5, *r4, *r3, *r2, *r1).
+ */
+void sz_batchaddvar_d5_double_(char* varName, int *len, double* data, int *errBoundMode, double *absErrBound, double *relBoundRatio, size_t *r1, size_t *r2, size_t *r3, size_t *r4, size_t *r5)
+{
+	char name[*len+1];
+
+	memcpy(name, varName, (size_t)*len);
+	name[*len] = '\0';
+	SZ_batchAddVar(name, SZ_DOUBLE, data, *errBoundMode, *absErrBound, *relBoundRatio, *r5, *r4, *r3, *r2, *r1);
+}
+/**
+ * Fortran-callable wrapper around SZ_batchDelVar.
+ * The first *len characters of `varName` are copied into a NUL-terminated
+ * stack buffer; the value SZ_batchDelVar returns for that name is stored
+ * in *errState.
+ */
+void sz_batchdelvar_c_(char* varName, int *len, int *errState)
+{
+	char name[*len+1];
+
+	memcpy(name, varName, (size_t)*len);
+	name[*len] = '\0';
+	*errState = SZ_batchDelVar(name);
+}
+/**
+ * Fortran-callable wrapper around SZ_batch_compress.
+ * The buffer SZ_batch_compress returns is copied into `bytes` (*outSize
+ * bytes, no capacity check is done here) and then freed.
+ */
+void sz_batch_compress_c_(unsigned char* bytes, size_t *outSize)
+{
+	unsigned char *packed = SZ_batch_compress(outSize);
+	const size_t packedLen = *outSize;
+
+	memcpy(bytes, packed, packedLen);
+	free(packed);
+}
+/**
+ * Fortran-callable wrapper: forwards `bytes`, the length *byteLength (by
+ * value) and the `ierr` pointer to SZ_batch_decompress. Nothing is copied
+ * or freed here.
+ */
+void sz_batch_decompress_c_(unsigned char* bytes, size_t *byteLength, int *ierr)
+{
+	const size_t streamLen = *byteLength;
+
+	SZ_batch_decompress(bytes, streamLen, ierr);
+}
+
+/**
+ * Fortran-callable wrapper: looks up the extents of a named variable.
+ * The first *len characters of `varName` are copied into a NUL-terminated
+ * stack buffer and passed to SZ_getVarData together with the r5..r1 output
+ * pointers (its return value is ignored). *dim is then set to
+ * computeDimension(*r5, *r4, *r3, *r2, *r1).
+ */
+void sz_getvardim_c_(char* varName, int *len, int *dim, size_t *r1, size_t *r2, size_t *r3, size_t *r4, size_t *r5)
+{
+	char name[*len+1];
+
+	memcpy(name, varName, (size_t)*len);
+	name[*len] = '\0';
+
+	SZ_getVarData(name, r5, r4, r3, r2, r1);
+	*dim = computeDimension(*r5, *r4, *r3, *r2, *r1);
+}
+
+/**
+ * Fortran-callable wrapper: stores the value returned by
+ * compute_total_batch_size() in *totalSize.
+ */
+void compute_total_batch_size_c_(size_t *totalSize)
+{
+	const size_t total = compute_total_batch_size();
+
+	*totalSize = total;
+}
+
+/**
+ * Fortran-callable wrapper: copies the float data of a named variable into
+ * the caller-provided `data`.
+ *
+ * The first *len characters of `varName` are copied into a NUL-terminated
+ * stack buffer and passed to SZ_getVarData, which also reports the five
+ * extents. computeDataLength(r5, r4, r3, r2, r1) elements are copied (no
+ * capacity check on `data`) and the pointer SZ_getVarData returned is
+ * then freed.
+ */
+void sz_getvardata_float_(char* varName, int *len, float* data)
+{
+	size_t r1, r2, r3, r4, r5;
+	char name[*len+1];
+
+	memcpy(name, varName, (size_t)*len);
+	name[*len] = '\0';
+
+	float* tmp_data = (float*)SZ_getVarData(name, &r5, &r4, &r3, &r2, &r1);
+	/* Keep the element count in size_t: an int would narrow it for large arrays before the byte count is formed. */
+	size_t count = computeDataLength(r5, r4, r3, r2, r1);
+	memcpy(data, tmp_data, count*sizeof(float));
+	free(tmp_data);
+}
+/**
+ * Fortran-callable wrapper: copies the double data of a named variable into
+ * the caller-provided `data`.
+ *
+ * The first *len characters of `varName` are copied into a NUL-terminated
+ * stack buffer and passed to SZ_getVarData, which also reports the five
+ * extents. computeDataLength(r5, r4, r3, r2, r1) elements are copied (no
+ * capacity check on `data`).
+ *
+ * NOTE(review): unlike sz_getvardata_float_, the pointer returned by
+ * SZ_getVarData is deliberately not freed here; who owns that buffer is
+ * not visible in this file - confirm which of the two wrappers is right.
+ */
+void sz_getvardata_double_(char* varName, int *len, double* data)
+{
+	size_t r1, r2, r3, r4, r5;
+	char name[*len+1];
+
+	memcpy(name, varName, (size_t)*len);
+	name[*len] = '\0';
+
+	double* tmp_data = (double*)SZ_getVarData(name, &r5, &r4, &r3, &r2, &r1);
+	/* Keep the element count in size_t: an int would narrow it for large arrays before the byte count is formed. */
+	size_t count = computeDataLength(r5, r4, r3, r2, r1);
+	memcpy(data, tmp_data, count*sizeof(double));
+}
+
+/**
+ * Fortran-callable wrapper: forwards the value *mode to SZ_freeVarSet.
+ */
+void sz_freevarset_c_(int *mode)
+{
+	const int releaseMode = *mode;
+
+	SZ_freeVarSet(releaseMode);
+}
+